Repository: ray-project/xgboost_ray Branch: master Commit: e9049256575e Files: 85 Total size: 505.3 KB Directory structure: gitextract_x7_6mxkx/ ├── .flake8 ├── .github/ │ └── workflows/ │ ├── gpu.yaml │ └── test.yaml ├── .gitignore ├── LICENSE ├── README.md ├── format.sh ├── requirements/ │ ├── lint-requirements.txt │ └── test-requirements.txt ├── run_ci_examples.sh ├── run_ci_tests.sh ├── setup.py └── xgboost_ray/ ├── __init__.py ├── callback.py ├── compat/ │ ├── __init__.py │ └── tracker.py ├── data_sources/ │ ├── __init__.py │ ├── _distributed.py │ ├── csv.py │ ├── dask.py │ ├── data_source.py │ ├── modin.py │ ├── numpy.py │ ├── object_store.py │ ├── pandas.py │ ├── parquet.py │ ├── partitioned.py │ ├── petastorm.py │ └── ray_dataset.py ├── elastic.py ├── examples/ │ ├── __init__.py │ ├── create_test_data.py │ ├── higgs.py │ ├── higgs_parquet.py │ ├── readme.py │ ├── readme_sklearn_api.py │ ├── simple.py │ ├── simple_dask.py │ ├── simple_modin.py │ ├── simple_objectstore.py │ ├── simple_partitioned.py │ ├── simple_predict.py │ ├── simple_ray_dataset.py │ ├── simple_tune.py │ ├── train_on_test_data.py │ └── train_with_ml_dataset.py ├── main.py ├── matrix.py ├── session.py ├── sklearn.py ├── tests/ │ ├── __init__.py │ ├── conftest.py │ ├── env_info.sh │ ├── fault_tolerance.py │ ├── release/ │ │ ├── benchmark_cpu_gpu.py │ │ ├── benchmark_ft.py │ │ ├── cluster_cpu.yaml │ │ ├── cluster_ft.yaml │ │ ├── cluster_gpu.yaml │ │ ├── create_learnable_data.py │ │ ├── create_test_data.py │ │ ├── custom_objective_metric.py │ │ ├── run_e2e_gpu.sh │ │ ├── setup_xgboost.sh │ │ ├── start_cpu_cluster.sh │ │ ├── start_ft_cluster.sh │ │ ├── start_gpu_cluster.sh │ │ ├── submit_cpu_gpu_benchmark.sh │ │ ├── submit_ft_benchmark.sh │ │ ├── tune_cluster.yaml │ │ └── tune_placement.py │ ├── test_client.py │ ├── test_colocation.py │ ├── test_data_source.py │ ├── test_end_to_end.py │ ├── test_fault_tolerance.py │ ├── test_matrix.py │ ├── test_sklearn.py │ ├── test_sklearn_matrix.py │ ├── 
test_tune.py │ ├── test_xgboost_api.py │ └── utils.py ├── tune.py ├── util.py └── xgb.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .flake8 ================================================ [flake8] max-line-length = 88 inline-quotes = " ignore = C408 C417 E121 E123 E126 E203 E226 E24 E704 W503 W504 W605 I N B001 B002 B003 B004 B005 B007 B008 B009 B010 B011 B012 B013 B014 B015 B016 B017 avoid-escape = no # Error E731 is ignored because of the migration from YAPF to Black. # See https://github.com/ray-project/ray/issues/21315 for more information. per-file-ignores = rllib/evaluation/worker_set.py:E731 rllib/evaluation/sampler.py:E731 ================================================ FILE: .github/workflows/gpu.yaml ================================================ name: GPU on manual trigger on: workflow_dispatch jobs: test_gpu: runs-on: ubuntu-latest timeout-minutes: 20 steps: - uses: actions/checkout@v3 - name: Set up Python 3.8 uses: actions/setup-python@v3 with: python-version: 3.8 - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install -U anyscale pyyaml - name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Set anyscale project env: ANYSCALE_PROJECT: ${{ secrets.ANYSCALE_PROJECT }} run: | echo "project_id: ${ANYSCALE_PROJECT}" > ./xgboost_ray/tests/release/.anyscale.yaml - name: Run end to end GPU test env: ANYSCALE_CLI_TOKEN: ${{ secrets.ANYSCALE_CLI_TOKEN }} run: | pushd ./xgboost_ray/tests/release ./run_e2e_gpu.sh popd || true ================================================ FILE: .github/workflows/test.yaml ================================================ name: pytest on push on: push: pull_request: schedule: - cron: "0 5 * * *" jobs: test_lint: runs-on: ubuntu-latest timeout-minutes: 3 steps: - uses: actions/checkout@v3 - name: Set up Python 3.8 uses: actions/setup-python@v3 
with: python-version: 3.8 - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install codecov if [ -f requirements/lint-requirements.txt ]; then python -m pip install -r requirements/lint-requirements.txt; fi - name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Run format script run: | ls -alp ./format.sh --all test_linux_ray_master: runs-on: ubuntu-latest timeout-minutes: 160 strategy: matrix: python-version: ["3.8", "3.9", "3.10"] include: - python-version: "3.8" ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl - python-version: "3.9" ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl - python-version: "3.10" ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install codecov python -m pip install -U ${{ matrix.ray-wheel }} if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi - name: Install package run: | python -m pip install -e . 
- name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Run tests uses: nick-invision/retry@v2 with: timeout_minutes: 45 max_attempts: 3 command: bash ./run_ci_tests.sh - name: Run examples uses: nick-invision/retry@v2 with: timeout_minutes: 10 max_attempts: 3 command: bash ./run_ci_examples.sh test_linux_ray_release: runs-on: ubuntu-latest timeout-minutes: 160 strategy: matrix: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install codecov python -m pip install -U ray if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi - name: Install package run: | python -m pip install -e . - name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Run tests uses: nick-invision/retry@v2 with: timeout_minutes: 45 max_attempts: 3 command: bash ./run_ci_tests.sh - name: Run examples uses: nick-invision/retry@v2 with: timeout_minutes: 10 max_attempts: 3 command: bash ./run_ci_examples.sh test_linux_compat: # Test compatibility when some optional libraries are missing # Test runs on latest ray release runs-on: ubuntu-latest timeout-minutes: 160 strategy: matrix: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install codecov python -m pip install -U ray if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi - name: Uninstall unavailable dependencies # Disables modin and Ray Tune (via tabulate) run: | python -m pip uninstall -y modin python -m 
pip uninstall -y tabulate - name: Install package run: | python -m pip install -e . - name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Run tests uses: nick-invision/retry@v2 with: timeout_minutes: 45 max_attempts: 3 command: bash ./run_ci_tests.sh --no-tune - name: Run examples uses: nick-invision/retry@v2 with: timeout_minutes: 10 max_attempts: 3 command: bash ./run_ci_examples.sh --no-tune test_linux_cutting_edge: # Tests on cutting edge, i.e. latest Ray master, latest XGBoost master runs-on: ubuntu-latest timeout-minutes: 160 strategy: matrix: # no new versions for xgboost are published for 3.6 python-version: ["3.8", "3.9", "3.10"] include: - python-version: "3.8" ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl - python-version: "3.9" ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl - python-version: "3.10" ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install codecov python -m pip install -U ${{ matrix.ray-wheel }} if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi - name: Install Ubuntu system dependencies run: | sudo apt-get install -y --no-install-recommends ninja-build - name: Install package run: | python -m pip install -e . 
- name: Clone XGBoost repo uses: actions/checkout@v3 with: repository: dmlc/xgboost path: xgboost submodules: true - name: Install XGBoost from source shell: bash -l {0} run: | pushd ${GITHUB_WORKSPACE}/xgboost/python-package python --version python setup.py sdist pip install -v ./dist/xgboost-*.tar.gz popd - name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Run tests uses: nick-invision/retry@v2 with: timeout_minutes: 45 max_attempts: 3 command: bash ./run_ci_tests.sh - name: Run examples uses: nick-invision/retry@v2 with: timeout_minutes: 10 max_attempts: 3 command: bash ./run_ci_examples.sh test_linux_xgboost_legacy: # Tests on XGBoost 0.90 and latest Ray release runs-on: ubuntu-latest timeout-minutes: 160 strategy: matrix: python-version: [3.8] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install codecov python -m pip install -U ray if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi - name: Install package run: | python -m pip install -e . 
- name: Install legacy XGBoost run: | python -m pip install xgboost==0.90 - name: Print environment info run: | ./xgboost_ray/tests/env_info.sh - name: Run tests uses: nick-invision/retry@v2 with: timeout_minutes: 45 max_attempts: 3 command: bash ./run_ci_tests.sh - name: Run examples uses: nick-invision/retry@v2 with: timeout_minutes: 10 max_attempts: 3 command: bash ./run_ci_examples.sh ================================================ FILE: .gitignore ================================================ # Python byte code files *.pyc python/.eggs # Backup files *.bak # Emacs temporary files *~ *# # Debug symbols *.pdb # Visual Studio files /packages *.suo *.user *.VC.db *.VC.opendb # Protobuf-generated files *_pb2.py *.pb.h *.pb.cc # Ray cluster configuration scripts/nodes.txt # OS X folder attributes .DS_Store # Debug files *.dSYM/ *.su # Python setup files *.egg-info # Compressed files *.gz # Datasets from examples **/MNIST_data/ **/cifar-10-batches-bin/ # Generated documentation files /doc/_build /doc/source/_static/thumbs /doc/source/tune/generated_guides/ # User-specific stuff: .idea/ # Pytest Cache **/.pytest_cache **/.cache .benchmarks python-driver-* # Vscode .vscode/ *.iml # python virtual env venv # pyenv version file .python-version # Vim .*.swp *.swp tags # Emacs .#* # tools tools/prometheus* # ray project files project-id .mypy_cache/ # XGBoost models from examples *.xgb # Downloaded test data *.csv *.csv.gz *.parquet # Byte-compiled files __pycache__/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- Code in python/ray/rllib/{evolution_strategies, dqn} adapted from https://github.com/openai (MIT License) Copyright (c) 2016 OpenAI (http://openai.com) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- Code in python/ray/rllib/impala/vtrace.py from https://github.com/deepmind/scalable_agent Copyright 2018 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- Code in python/ray/rllib/ars is adapted from https://github.com/modestyachts/ARS Copyright (c) 2018, ARS contributors (Horia Mania, Aurelia Guy, Benjamin Recht) All rights reserved. Redistribution and use of ARS in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------ Code in python/ray/prometheus_exporter.py is adapted from https://github.com/census-instrumentation/opencensus-python/blob/master/contrib/opencensus-ext-prometheus/opencensus/ext/prometheus/stats_exporter/__init__.py # Copyright 2018, OpenCensus Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
================================================ FILE: README.md ================================================ # Distributed XGBoost on Ray ![Build Status](https://github.com/ray-project/xgboost_ray/workflows/pytest%20on%20push/badge.svg) [![docs.ray.io](https://img.shields.io/badge/docs-ray.io-blue)](https://docs.ray.io/en/master/xgboost-ray.html) XGBoost-Ray is a distributed backend for [XGBoost](https://xgboost.readthedocs.io/en/latest/), built on top of the [distributed computing framework Ray](https://ray.io). XGBoost-Ray - enables [multi-node](#usage) and [multi-GPU](#multi-gpu-training) training - integrates seamlessly with the distributed [hyperparameter optimization](#hyperparameter-tuning) library [Ray Tune](http://tune.io) - comes with advanced [fault tolerance handling](#fault-tolerance) mechanisms, and - supports [distributed dataframes and distributed data loading](#distributed-data-loading) All releases are tested on large clusters and workloads. ## Installation You can install the latest XGBoost-Ray release from PyPI: ```bash pip install "xgboost_ray" ``` If you'd like to install the latest master, use this command instead: ```bash pip install "git+https://github.com/ray-project/xgboost_ray.git#egg=xgboost_ray" ``` ## Usage XGBoost-Ray provides a drop-in replacement for XGBoost's `train` function. To pass data, instead of using `xgb.DMatrix` you will have to use `xgboost_ray.RayDMatrix`. You can also use a scikit-learn interface - see the next section. Just as in the original `xgb.train()` function, the [training parameters](https://xgboost.readthedocs.io/en/stable/parameter.html) are passed as the `params` dictionary. Ray-specific distributed training parameters are configured with a `xgboost_ray.RayParams` object. For instance, you can set the `num_actors` property to specify how many distributed actors you would like to use.
Here is a simplified example (which requires `sklearn`): **Training:** ```python from xgboost_ray import RayDMatrix, RayParams, train from sklearn.datasets import load_breast_cancer train_x, train_y = load_breast_cancer(return_X_y=True) train_set = RayDMatrix(train_x, train_y) evals_result = {} bst = train( { "objective": "binary:logistic", "eval_metric": ["logloss", "error"], }, train_set, evals_result=evals_result, evals=[(train_set, "train")], verbose_eval=False, ray_params=RayParams( num_actors=2, # Number of remote actors cpus_per_actor=1)) bst.save_model("model.xgb") print("Final training error: {:.4f}".format( evals_result["train"]["error"][-1])) ``` **Prediction:** ```python from xgboost_ray import RayDMatrix, RayParams, predict from sklearn.datasets import load_breast_cancer import xgboost as xgb data, labels = load_breast_cancer(return_X_y=True) dpred = RayDMatrix(data, labels) bst = xgb.Booster(model_file="model.xgb") pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2)) print(pred_ray) ``` ### scikit-learn API XGBoost-Ray also features a scikit-learn API fully mirroring the pure XGBoost scikit-learn API, providing a completely drop-in replacement. The following estimators are available: - `RayXGBClassifier` - `RayXGBRegressor` - `RayXGBRFClassifier` - `RayXGBRFRegressor` - `RayXGBRanker` Example usage of `RayXGBClassifier`: ```python from xgboost_ray import RayXGBClassifier, RayParams from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split seed = 42 X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.25, random_state=42 ) clf = RayXGBClassifier( n_jobs=4, # In XGBoost-Ray, n_jobs sets the number of actors random_state=seed ) # scikit-learn API will automatically convert the data # to RayDMatrix format as needed. # You can also pass X as a RayDMatrix, in which case # y will be ignored.
clf.fit(X_train, y_train) pred_ray = clf.predict(X_test) print(pred_ray) pred_proba_ray = clf.predict_proba(X_test) print(pred_proba_ray) # It is also possible to pass a RayParams object # to fit/predict/predict_proba methods - will override # n_jobs set during initialization clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2)) pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2)) print(pred_ray) ``` Things to keep in mind: - The `n_jobs` parameter controls the number of actors spawned. You can pass a `RayParams` object to the `fit`/`predict`/`predict_proba` methods as the `ray_params` argument for greater control over resource allocation. Doing so will override the value of `n_jobs` with the value of the `ray_params.num_actors` attribute. For more information, refer to the [Resources](#resources) section below. - By default `n_jobs` is set to `1`, which means the training will **not** be distributed. Make sure to either set `n_jobs` to a higher value or pass a `RayParams` object as outlined above in order to take advantage of XGBoost-Ray's functionality. - After calling `fit`, additional evaluation results (e.g. training time, number of rows, callback results) will be available under the `additional_results_` attribute. - XGBoost-Ray's scikit-learn API is based on XGBoost 1.4. While we try to support older XGBoost versions, please note that this library is only fully tested and supported for XGBoost >= 1.4. For more information on the scikit-learn API, refer to the [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn). ## Data loading Data is passed to XGBoost-Ray via a `RayDMatrix` object. The `RayDMatrix` lazily loads data and stores it sharded in the Ray object store. The Ray XGBoost actors then access these shards to run their training on. A `RayDMatrix` supports various data and file types, like Pandas DataFrames, NumPy arrays, CSV files and Parquet files.
Example loading multiple parquet files: ```python import glob from xgboost_ray import RayDMatrix, RayFileType # We can also pass a list of files path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet"))) # This argument will be passed to `pd.read_parquet()` columns = [ "passenger_count", "trip_distance", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "total_amount" ] dtrain = RayDMatrix( path, label="passenger_count", # Will select this column as the label columns=columns, # ignore=["total_amount"], # Optional list of columns to ignore filetype=RayFileType.PARQUET) ``` ## Hyperparameter Tuning XGBoost-Ray integrates with [Ray Tune](https://tune.io) to provide distributed hyperparameter tuning for your distributed XGBoost models. You can run multiple XGBoost-Ray training runs in parallel, each with a different hyperparameter configuration, and each training run parallelized by itself. All you have to do is move your training code to a function, and pass the function to `tune.run`. Internally, `train` will detect if `tune` is being used and will automatically report results to tune. Example using XGBoost-Ray with Ray Tune: ```python from xgboost_ray import RayDMatrix, RayParams, train from sklearn.datasets import load_breast_cancer num_actors = 4 num_cpus_per_actor = 1 ray_params = RayParams( num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) def train_model(config): train_x, train_y = load_breast_cancer(return_X_y=True) train_set = RayDMatrix(train_x, train_y) evals_result = {} bst = train( params=config, dtrain=train_set, evals_result=evals_result, evals=[(train_set, "train")], verbose_eval=False, ray_params=ray_params) bst.save_model("model.xgb") from ray import tune # Specify the hyperparameter search space. 
config = { "tree_method": "approx", "objective": "binary:logistic", "eval_metric": ["logloss", "error"], "eta": tune.loguniform(1e-4, 1e-1), "subsample": tune.uniform(0.5, 1.0), "max_depth": tune.randint(1, 9) } # Make sure to use the `get_tune_resources` method to set the `resources_per_trial` analysis = tune.run( train_model, config=config, metric="train-error", mode="min", num_samples=4, resources_per_trial=ray_params.get_tune_resources()) print("Best hyperparameters", analysis.best_config) ``` Also see examples/simple_tune.py for another example. ## Fault tolerance XGBoost-Ray leverages the stateful Ray actor model to enable fault tolerant training. There are currently two modes implemented. ### Non-elastic training (warm restart) When an actor or node dies, XGBoost-Ray will retain the state of the remaining actors. In non-elastic training, the failed actors will be replaced as soon as resources are available again. Only these actors will reload their parts of the data. Training will resume once all actors are ready for training again. You can set this mode in the `RayParams`: ```python from xgboost_ray import RayParams ray_params = RayParams( elastic_training=False, # Use non-elastic training max_actor_restarts=2, # How often are actors allowed to fail ) ``` ### Elastic training In elastic training, XGBoost-Ray will continue training with fewer actors (and on fewer data) when a node or actor dies. The missing actors are staged in the background, and are reintegrated into training once they are back and loaded their data. This mode will train on fewer data for a period of time, which can impact accuracy. In practice, we found these effects to be minor, especially for large shuffled datasets. The immediate benefit is that training time is reduced significantly to almost the same level as if no actors died. 
Thus, especially when data loading takes a large part of the total training time, this setting can dramatically speed up training times for large distributed jobs. You can configure this mode in the `RayParams`: ```python from xgboost_ray import RayParams ray_params = RayParams( elastic_training=True, # Use elastic training max_failed_actors=3, # Only allow at most 3 actors to die at the same time max_actor_restarts=2, # How often are actors allowed to fail ) ``` ## Resources By default, XGBoost-Ray tries to determine the number of CPUs available and distributes them evenly across actors. In the case of very large clusters or clusters with many different machine sizes, it makes sense to limit the number of CPUs per actor by setting the `cpus_per_actor` argument. Consider always setting this explicitly. The number of XGBoost actors always has to be set manually with the `num_actors` argument. ### Multi GPU training XGBoost-Ray enables multi GPU training. The XGBoost core backend will automatically leverage NCCL2 for cross-device communication. All you have to do is to start one actor per GPU and set XGBoost's `tree_method` to a GPU-compatible option, eg. `gpu_hist` (see XGBoost documentation for more details.) For instance, if you have 2 machines with 4 GPUs each, you will want to start 8 remote actors, and set `gpus_per_actor=1`. There is usually no benefit in allocating less (e.g. 0.5) or more than one GPU per actor. You should divide the CPUs evenly across actors per machine, so if your machines have 16 CPUs in addition to the 4 GPUs, each actor should have 4 CPUs to use. ```python from xgboost_ray import RayParams ray_params = RayParams( num_actors=8, gpus_per_actor=1, cpus_per_actor=4, # Divide evenly across actors per machine ) ``` ### How many remote actors should I use? This depends on your workload and your cluster setup. Generally there is no inherent benefit of running more than one remote actor per node for CPU-only training. 
This is because XGBoost core can already leverage multiple CPUs via threading. However, there are some cases when you should consider starting more than one actor per node: - For [multi GPU training](#multi-gpu-training), each GPU should have a separate remote actor. Thus, if your machine has 24 CPUs and 4 GPUs, you will want to start 4 remote actors with 6 CPUs and 1 GPU each - In a **heterogeneous cluster**, you might want to find the [greatest common divisor](https://en.wikipedia.org/wiki/Greatest_common_divisor) for the number of CPUs. E.g. for a cluster with three nodes of 4, 8, and 12 CPUs, respectively, you should set the number of actors to 6 and the CPUs per actor to 4. ## Distributed data loading XGBoost-Ray can leverage both centralized and distributed data loading. In **centralized data loading**, the data is partitioned by the head node and stored in the object store. Each remote actor then retrieves their partitions by querying the Ray object store. Centralized loading is used when you pass centralized in-memory dataframes, such as Pandas dataframes or Numpy arrays, or when you pass a single source file, such as a single CSV or Parquet file. ```python from xgboost_ray import RayDMatrix # This will use centralized data loading, as only one source file is specified # `label_col` is a column in the CSV, used as the target label ray_params = RayDMatrix("./source_file.csv", label="label_col") ``` In **distributed data loading**, each remote actor loads their data directly from the source (e.g. local hard disk, NFS, HDFS, S3), without a central bottleneck. The data is still stored in the object store, but locally to each actor. This mode is used automatically when loading data from multiple CSV or Parquet files. Please note that we do not check or enforce partition sizes in this case - it is your job to make sure the data is evenly distributed across the source files. 
```python from xgboost_ray import RayDMatrix # This will use distributed data loading, as four source files are specified # Please note that you cannot schedule more than four actors in this case. # `label_col` is a column in the Parquet files, used as the target label ray_params = RayDMatrix([ "hdfs:///tmp/part1.parquet", "hdfs:///tmp/part2.parquet", "hdfs:///tmp/part3.parquet", "hdfs:///tmp/part4.parquet", ], label="label_col") ``` Lastly, XGBoost-Ray supports **distributed dataframe** representations, such as [Ray Datasets](https://docs.ray.io/en/latest/data/dataset.html), [Modin](https://modin.readthedocs.io/en/latest/) and [Dask dataframes](https://docs.dask.org/en/latest/dataframe.html) (used with [Dask on Ray](https://docs.ray.io/en/master/dask-on-ray.html)). Here, XGBoost-Ray will check on which nodes the distributed partitions are currently located, and will assign partitions to actors in order to minimize cross-node data transfer. Please note that we also assume here that partition sizes are uniform. ```python from xgboost_ray import RayDMatrix # This will try to allocate the existing Modin partitions # to co-located Ray actors. If this is not possible, data will # be transferred across nodes ray_params = RayDMatrix(existing_modin_df) ``` ### Data sources The following data sources can be used with a `RayDMatrix` object. 
| Type | Centralized loading | Distributed loading | |------------------------------------------------------------------|---------------------|---------------------| | Numpy array | Yes | No | | Pandas dataframe | Yes | No | | Single CSV | Yes | No | | Multi CSV | Yes | Yes | | Single Parquet | Yes | No | | Multi Parquet | Yes | Yes | | [Ray Dataset](https://docs.ray.io/en/latest/data/dataset.html) | Yes | Yes | | [Petastorm](https://github.com/uber/petastorm) | Yes | Yes | | [Dask dataframe](https://docs.dask.org/en/latest/dataframe.html) | Yes | Yes | | [Modin dataframe](https://modin.readthedocs.io/en/latest/) | Yes | Yes | ## Memory usage XGBoost uses a compute-optimized datastructure, the `DMatrix`, to hold training data. When converting a dataset to a `DMatrix`, XGBoost creates intermediate copies and ends up holding a complete copy of the full data. The data will be converted into the local dataformat (on a 64 bit system these are 64 bit floats.) Depending on the system and original dataset dtype, this matrix can thus occupy more memory than the original dataset. The **peak memory usage** for CPU-based training is at least **3x** the dataset size (assuming dtype `float32` on a 64bit system) plus about **400,000 KiB** for other resources, like operating system requirements and storing of intermediate results. **Example** - Machine type: AWS m5.xlarge (4 vCPUs, 16 GiB RAM) - Usable RAM: ~15,350,000 KiB - Dataset: 1,250,000 rows with 1024 features, dtype float32. Total size: 5,000,000 KiB - XGBoost DMatrix size: ~10,000,000 KiB This dataset will fit exactly on this node for training. Note that the DMatrix size might be lower on a 32 bit system. **GPUs** Generally, the same memory requirements exist for GPU-based training. Additionally, the GPU must have enough memory to hold the dataset. In the example above, the GPU must have at least 10,000,000 KiB (about 9.6 GiB) memory. 
However, empirically we found that using a `DeviceQuantileDMatrix` seems to show more peak GPU memory usage, possibly for intermediate storage when loading data (about 10%). **Best practices** In order to reduce peak memory usage, consider the following suggestions: - Store data as `float32` or less. More precision is often not needed, and keeping data in a smaller format will help reduce peak memory usage for initial data loading. - Pass the `dtype` when loading data from CSV. Otherwise, floating point values will be loaded as `np.float64` by default, increasing peak memory usage by 33%. ## Placement Strategies XGBoost-Ray leverages Ray's Placement Group API (https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html) to implement placement strategies for better fault tolerance. By default, a SPREAD strategy is used for training, which attempts to spread all of the training workers across the nodes in a cluster on a best-effort basis. This improves fault tolerance since it minimizes the number of worker failures when a node goes down, but comes at a cost of increased inter-node communication. To disable this strategy, set the `RXGB_USE_SPREAD_STRATEGY` environment variable to 0. If disabled, no particular placement strategy will be used. Note that this strategy is used only when `elastic_training` is not used. If `elastic_training` is set to `True`, no placement strategy is used. When XGBoost-Ray is used with Ray Tune for hyperparameter tuning, a PACK strategy is used. This strategy attempts to place all workers for each trial on the same node on a best-effort basis. This means that if a node goes down, it will be less likely to impact multiple trials. When placement strategies are used, XGBoost-Ray will wait for 100 seconds for the required resources to become available, and will fail if the required resources cannot be reserved and the cluster cannot autoscale to increase the number of resources. You can change the `RXGB_PLACEMENT_GROUP_TIMEOUT_S` environment variable to modify how long this timeout should be.
## More examples For complete end to end examples, please have a look at the [examples folder](https://github.com/ray-project/xgboost_ray/tree/master/xgboost_ray/examples/): - [Simple sklearn breastcancer dataset example](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/simple.py) (requires `sklearn`) - [HIGGS classification example](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/higgs.py) ([download dataset (2.6 GB)](https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz)) - [HIGGS classification example with Parquet](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/higgs_parquet.py) (uses the same dataset) - [Test data classification](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/train_on_test_data.py) (uses a self-generated dataset) ## Resources * [XGBoost-Ray documentation](https://xgboost.readthedocs.io/en/stable/tutorials/ray.html) * [Ray community slack](https://forms.gle/9TSdDYUgxYs8SA9e8) ================================================ FILE: format.sh ================================================ #!/usr/bin/env bash # Black + Clang formatter (if installed). This script formats all changed files from the last mergebase. # You are encouraged to run this locally before pushing changes for review. # Cause the script to exit if a single command fails set -euo pipefail FLAKE8_VERSION_REQUIRED="3.9.1" BLACK_VERSION_REQUIRED="22.10.0" SHELLCHECK_VERSION_REQUIRED="0.7.1" ISORT_VERSION_REQUIRED="5.10.1" check_python_command_exist() { VERSION="" case "$1" in black) VERSION=$BLACK_VERSION_REQUIRED ;; flake8) VERSION=$FLAKE8_VERSION_REQUIRED ;; isort) VERSION=$ISORT_VERSION_REQUIRED ;; *) echo "$1 is not a required dependency" exit 1 esac if ! [ -x "$(command -v "$1")" ]; then echo "$1 not installed. Install the python package with: pip install $1==$VERSION" exit 1 fi } check_docstyle() { echo "Checking docstyle..." 
violations=$(git ls-files | grep '.py$' | xargs grep -E '^[ ]+[a-z_]+ ?\([a-zA-Z]+\): ' | grep -v 'str(' | grep -v noqa || true) if [[ -n "$violations" ]]; then echo echo "=== Found Ray docstyle violations ===" echo "$violations" echo echo "Per the Google pydoc style, omit types from pydoc args as they are redundant: https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#code-style " echo "If this is a false positive, you can add a '# noqa' comment to the line to ignore." exit 1 fi return 0 } check_python_command_exist black check_python_command_exist flake8 check_python_command_exist isort # this stops git rev-parse from failing if we run this from the .git directory builtin cd "$(dirname "${BASH_SOURCE:-$0}")" ROOT="$(git rev-parse --show-toplevel)" builtin cd "$ROOT" || exit 1 # NOTE(edoakes): black version differs based on installation method: # Option 1) 'black, 21.12b0 (compiled: no)' # Option 2) 'black, version 21.12b0' # For newer versions (at least 22.10.0), a second line is printed which must be dropped: # # black, 22.10.0 (compiled: yes) # Python (CPython) 3.9.13 BLACK_VERSION_STR=$(black --version) if [[ "$BLACK_VERSION_STR" == *"compiled"* ]] then BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $2}') else BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $3}') fi FLAKE8_VERSION=$(flake8 --version | head -n 1 | awk '{print $1}') ISORT_VERSION=$(isort --version | grep VERSION | awk '{print $2}') # params: tool name, tool version, required version tool_version_check() { if [ "$2" != "$3" ]; then echo "WARNING: Ray uses $1 $3, You currently are using $2. This might generate different results." 
fi } tool_version_check "flake8" "$FLAKE8_VERSION" "$FLAKE8_VERSION_REQUIRED" tool_version_check "black" "$BLACK_VERSION" "$BLACK_VERSION_REQUIRED" tool_version_check "isort" "$ISORT_VERSION" "$ISORT_VERSION_REQUIRED" if command -v shellcheck >/dev/null; then SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}') tool_version_check "shellcheck" "$SHELLCHECK_VERSION" "$SHELLCHECK_VERSION_REQUIRED" else echo "INFO: Ray uses shellcheck for shell scripts, which is not installed. You may install shellcheck=$SHELLCHECK_VERSION_REQUIRED with your system package manager." fi if command -v clang-format >/dev/null; then CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}') tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.0" else echo "WARNING: clang-format is not installed!" fi if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. Install with: pip install flake8-quotes" fi if [[ $(flake8 --version) != *"flake8-bugbear"* ]]; then echo "WARNING: Ray uses flake8 with flake8-bugbear. Might error without it. Install with: pip install flake8-bugbear" fi SHELLCHECK_FLAGS=( --exclude=1090 # "Can't follow non-constant source. Use a directive to specify location." --exclude=1091 # "Not following {file} due to some error" --exclude=2207 # "Prefer mapfile or read -a to split command output (or quote to avoid splitting)." -- these aren't compatible with macOS's old Bash ) BLACK_EXCLUDES=( '--force-exclude' 'python/ray/cloudpickle/*|'` `'python/build/*|'` `'python/ray/core/src/ray/gcs/*|'` `'python/ray/thirdparty_files/*|'` `'python/ray/_private/thirdparty/*|'` `'python/ray/serve/tests/test_config_files/syntax_error\.py' ) GIT_LS_EXCLUDES=( ':(exclude)python/ray/cloudpickle/' ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py' ) # TODO(barakmich): This should be cleaned up. 
# Format and lint every tracked file in the repository.
#
# Runs isort and then black over all tracked '*.py' files (minus
# GIT_LS_EXCLUDES), runs flake8 over the same files when flake8 is installed,
# and finally runs shellcheck (when installed) over all tracked '*.sh' files
# plus any other tracked file that starts with a sh/bash shebang.
# Positional arguments are accepted but not used.
format_all_scripts() {
    # Record flake8 availability as an exit-status style flag (0 = installed).
    # Probing inside `if` keeps `set -e` from aborting the script when flake8
    # is missing. The flag is compared numerically below: a bare
    # `[ $HAS_FLAKE8 ]` is always true, since "0" and "1" are both non-empty.
    if command -v flake8 &> /dev/null; then
        HAS_FLAKE8=0
    else
        HAS_FLAKE8=1
    fi

    # Run isort before black to fix imports and let black deal with file format.
    echo "$(date)" "isort...."
    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
      isort
    echo "$(date)" "Black...."
    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
      black "${BLACK_EXCLUDES[@]}"
    if [ "$HAS_FLAKE8" -eq 0 ]; then
      echo "$(date)" "Flake8...."
      git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \
        flake8 --config=.flake8
    fi

    if command -v shellcheck >/dev/null; then
      local shell_files non_shell_files
      non_shell_files=($(git ls-files -- ':(exclude)*.sh'))
      shell_files=($(git ls-files -- '*.sh'))
      # Also pick up extension-less scripts by looking for a sh/bash shebang.
      if [ 0 -lt "${#non_shell_files[@]}" ]; then
        shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true))
      fi
      if [ 0 -lt "${#shell_files[@]}" ]; then
        echo "$(date)" "shellcheck scripts...."
        shellcheck_scripts "${shell_files[@]}"
      fi
    fi
}
git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.cc' '*.h' &>/dev/null; then git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.cc' '*.h' | xargs -P 5 \ clang-format -i fi fi if command -v shellcheck >/dev/null; then local shell_files non_shell_files non_shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- ':(exclude)*.sh')) shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.sh')) if [ 0 -lt "${#non_shell_files[@]}" ]; then shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) fi if [ 0 -lt "${#shell_files[@]}" ]; then shellcheck_scripts "${shell_files[@]}" fi fi } # This flag formats individual files. --files *must* be the first command line # arg to use this option. if [ "${1-}" == '--files' ]; then format_files "${@:2}" # If `--all` or `--scripts` are passed, then any further arguments are ignored. # Format the entire python directory and other scripts. elif [ "${1-}" == '--all-scripts' ]; then format_all_scripts "${@}" if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi # Format the all Python, C++, Java and other script files. elif [ "${1-}" == '--all' ]; then format_all_scripts "${@}" if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi else # Add the upstream remote if it doesn't exist if ! git remote -v | grep -q upstream; then git remote add 'upstream' 'https://github.com/ray-project/xgboost_ray.git' fi # Only fetch master since that's the branch we're diffing against. git fetch upstream master || true # Format only the files that changed in last commit. format_changed fi check_docstyle if ! git diff --quiet &>/dev/null; then echo 'Reformatted changed files. Please review and stage the changes.' 
#!/bin/bash
# Run every example script as a CI smoke test, then the Ray Client tests.
#
# Usage: run_ci_examples.sh [--no-tune]
#   --no-tune   skip the Ray Tune example (simple_tune.py)
# Any other argument aborts with exit code 1. Because of `set -e`, the first
# failing example stops the run.
set -e

TUNE=1

for i in "$@"
do
echo "$i"
case "$i" in
  --no-tune)
    TUNE=0
    ;;
  *)
    echo "unknown arg, $i"
    exit 1
    ;;
esac
done

pushd xgboost_ray/examples/ || exit 1
# Best effort: make sure no stale local Ray instance interferes with the examples.
ray stop || true
echo "================"
echo "Running examples"
echo "================"
echo "running readme.py" && python readme.py
echo "running readme_sklearn_api.py" && python readme_sklearn_api.py
echo "running simple.py" && python simple.py --smoke-test
echo "running simple_predict.py" && python simple_predict.py
echo "running simple_dask.py" && python simple_dask.py --smoke-test
echo "running simple_modin.py" && python simple_modin.py --smoke-test
echo "running simple_objectstore.py" && python simple_objectstore.py --smoke-test
# Run the Ray Dataset example itself; this step previously re-ran
# simple_objectstore.py, so simple_ray_dataset.py was never exercised.
echo "running simple_ray_dataset.py" && python simple_ray_dataset.py --smoke-test
echo "running simple_partitioned.py" && python simple_partitioned.py --smoke-test

if [ "$TUNE" = "1" ]; then
  echo "running simple_tune.py" && python simple_tune.py --smoke-test
else
  echo "skipping tune example"
fi

echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test
popd || exit 1

pushd xgboost_ray/tests || exit 1
echo "running examples with Ray Client"
python -m pytest -v --durations=0 -x test_client.py
popd || exit 1
"""Packaging script for xgboost_ray."""
from setuptools import find_packages, setup

setup(
    name="xgboost_ray",
    # ``include`` must be an iterable of glob patterns. A bare string is
    # unpacked character by character, and its lone "*" then matches every
    # package, so the filter silently did nothing.
    packages=find_packages(where=".", include=["xgboost_ray*"]),
    version="0.1.20",
    author="Ray Team",
    description="A Ray backend for distributed XGBoost",
    license="Apache 2.0",
    long_description="A distributed backend for XGBoost built on top of "
    "distributed computing framework Ray.",
    url="https://github.com/ray-project/xgboost_ray",
    install_requires=[
        "ray>=2.7",
        "numpy>=1.16",
        "pandas",
        "wrapt>=1.12.1",
        "xgboost>=0.90",
        "packaging",
    ],
)
@PublicAPI(stability="beta")
class DistributedCallback(ABC):
    """Distributed callbacks for RayXGBoostActors.

    The hooks of these callbacks are executed on the remote Ray actors
    at different points in time. They can be used to set environment
    variables or to prepare the training/prediction environment in other
    ways. Distributed callback objects are de-serialized on each actor
    and are then independent of each other - changing the state of one
    callback will not alter the state of the other copies on different actors.

    Callbacks can be passed to xgboost_ray via
    :class:`RayParams <xgboost_ray.main.RayParams>` using the
    ``distributed_callbacks`` parameter.

    Every hook in this base class is a no-op that returns ``None``;
    subclasses override only the hooks they need.
    """

    def on_init(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Initialization hook for ``actor``. Does nothing by default."""

    def before_data_loading(
        self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs
    ):
        """Hook for the point before ``data`` is loaded. No-op by default."""

    def after_data_loading(
        self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs
    ):
        """Hook for the point after ``data`` is loaded. No-op by default."""

    def before_train(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Hook for the point before training. No-op by default."""

    def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs):
        """Hook for the point after training; receives ``result_dict``.

        No-op by default.
        """

    def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Hook for the point before prediction. No-op by default."""

    def after_predict(
        self,
        actor: "RayXGBoostActor",
        predictions: Union[pd.Series, pd.DataFrame],
        *args,
        **kwargs
    ):
        """Hook for the point after prediction; receives ``predictions``.

        No-op by default.
        """
try:
    from xgboost.callback import TrainingCallback

    LEGACY_CALLBACK = False
except ImportError:

    class TrainingCallback:
        """Stand-in base class used when ``xgboost.callback`` does not
        provide ``TrainingCallback``.

        Instances are callable and forward a callback environment to the
        ``before_iteration`` / ``after_iteration`` methods a subclass
        defines, using the keyword arguments ``model``, ``epoch`` and
        ``evals_log``.
        """

        def __init__(self):
            if not hasattr(self, "before_iteration"):
                return
            # XGBoost < 1.0 inspects the instance ``__dict__`` to decide
            # whether a callback runs before or after an iteration. Keep the
            # real hook under ``_before_iteration`` and store a plain ``True``
            # flag under the original name in the instance dict.
            self._before_iteration = self.before_iteration
            self.__dict__["before_iteration"] = True

        def __call__(self, callback_env: "xgb.core.CallbackEnv"):
            # Dispatch to whichever hooks exist, "before" hook first.
            for hook_name in ("_before_iteration", "after_iteration"):
                if not hasattr(self, hook_name):
                    continue
                getattr(self, hook_name)(
                    model=callback_env.model,
                    epoch=callback_env.iteration,
                    evals_log=callback_env.evaluation_result_list,
                )

        def before_training(self, model):
            """No-op hook; returns ``None``."""

        def after_training(self, model):
            """No-op hook; returns ``None``."""

    LEGACY_CALLBACK = True
class ExSocket(object):
    """
    Extension of socket to handle recv and send of special data

    Wraps a connected socket-like object (anything exposing ``recv`` and
    ``sendall``) and adds helpers for native-endian 4-byte integers and
    length-prefixed strings.
    """

    def __init__(self, sock):
        self.sock = sock

    def recvall(self, nbytes):
        """Read exactly ``nbytes`` bytes and return them.

        Raises:
            ConnectionError: if the peer closes the connection before
                ``nbytes`` bytes were received.
        """
        res = []
        nread = 0
        while nread < nbytes:
            chunk = self.sock.recv(min(nbytes - nread, 1024))
            if not chunk:
                # ``recv`` returning an empty bytes object means the peer
                # closed the connection; without this check the loop would
                # spin forever.
                raise ConnectionError(
                    "connection closed after %d of %d expected bytes"
                    % (nread, nbytes)
                )
            nread += len(chunk)
            res.append(chunk)
        return b"".join(res)

    def recvint(self):
        """Receive one native-endian 4-byte signed integer."""
        return struct.unpack("@i", self.recvall(4))[0]

    def sendint(self, n):
        """Send ``n`` as a native-endian 4-byte signed integer."""
        self.sock.sendall(struct.pack("@i", n))

    def sendstr(self, s):
        """Send ``s`` as a length-prefixed, encoded byte string."""
        payload = s.encode()
        # The prefix must be the number of *bytes* on the wire, because
        # ``recvstr`` reads exactly that many bytes; ``len(s)`` is wrong for
        # non-ASCII text.
        self.sendint(len(payload))
        self.sock.sendall(payload)

    def recvstr(self):
        """Receive a length-prefixed byte string and decode it."""
        slen = self.recvint()
        return self.recvall(slen).decode()
def get_family(addr):
    """Return the address family of the first ``getaddrinfo`` result for ``addr``."""
    first_entry = socket.getaddrinfo(addr, None)[0]
    # Each getaddrinfo entry is (family, type, proto, canonname, sockaddr).
    family = first_entry[0]
    return family
class RabitTracker(object):
    """
    tracker for rabit

    Binds a listening TCP socket on ``hostIP`` and, once started, accepts
    worker connections, assigns ranks and hands every worker its tree and
    ring neighbours.
    """

    def __init__(self, hostIP, nslave, port=9091, port_end=9999):
        """Bind to the first free port in ``[port, port_end)`` and listen.

        Raises:
            socket.error: if binding fails for a reason other than
                "address already in use", or if no port in the range is free.
        """
        sock = socket.socket(get_family(hostIP), socket.SOCK_STREAM)
        try:
            for _port in range(port, port_end):
                try:
                    sock.bind((hostIP, _port))
                    self.port = _port
                    break
                except socket.error as e:
                    # 98 / 48 are EADDRINUSE on Linux / macOS: try next port.
                    if e.errno in [98, 48]:
                        continue
                    raise
            else:
                # Without this, the code would go on to listen on an unbound
                # socket and later fail on the missing ``self.port``.
                raise socket.error(
                    "no free port in range [%d, %d) on %s" % (port, port_end, hostIP)
                )
            sock.listen(256)
        except Exception:
            # Do not leak the file descriptor when setup fails.
            sock.close()
            raise
        self.sock = sock
        self.hostIP = hostIP
        self.thread = None
        self.start_time = None
        self.end_time = None
        self.nslave = nslave
        logging.info("start listen on %s:%d", hostIP, self.port)

    def __del__(self):
        # ``sock`` is missing when ``__init__`` raised before assigning it.
        sock = getattr(self, "sock", None)
        if sock is not None:
            sock.close()

    @staticmethod
    def get_neighbor(rank, nslave):
        """Return the binary-tree neighbours (parent, then children) of ``rank``."""
        rank = rank + 1
        ret = []
        if rank > 1:
            ret.append(rank // 2 - 1)
        if rank * 2 - 1 < nslave:
            ret.append(rank * 2 - 1)
        if rank * 2 < nslave:
            ret.append(rank * 2)
        return ret

    def slave_envs(self):
        """
        get environment variables for slaves
        can be passed in as args or envs
        """
        return {"DMLC_TRACKER_URI": self.hostIP, "DMLC_TRACKER_PORT": self.port}

    def get_tree(self, nslave):
        """Return ``(tree_map, parent_map)`` for ``nslave`` ranks.

        ``tree_map`` maps a rank to its neighbour list and ``parent_map``
        maps a rank to its parent rank (``-1`` for rank 0).
        """
        tree_map = {}
        parent_map = {}
        for r in range(nslave):
            tree_map[r] = self.get_neighbor(r, nslave)
            parent_map[r] = (r + 1) // 2 - 1
        return tree_map, parent_map

    def find_share_ring(self, tree_map, parent_map, r):
        """
        get a ring structure that tends to share nodes with the tree
        return a list starting from r
        """
        nset = set(tree_map[r])
        cset = nset - set([parent_map[r]])
        if not cset:
            return [r]
        rlst = [r]
        cnt = 0
        for v in cset:
            vlst = self.find_share_ring(tree_map, parent_map, v)
            cnt += 1
            # The last child's sub-ring is walked backwards.
            if cnt == len(cset):
                vlst.reverse()
            rlst += vlst
        return rlst

    def get_ring(self, tree_map, parent_map):
        """
        get a ring connection used to recover local data
        """
        assert parent_map[0] == -1
        rlst = self.find_share_ring(tree_map, parent_map, 0)
        assert len(rlst) == len(tree_map)
        ring_map = {}
        nslave = len(tree_map)
        for r in range(nslave):
            rprev = (r + nslave - 1) % nslave
            rnext = (r + 1) % nslave
            ring_map[rlst[r]] = (rlst[rprev], rlst[rnext])
        return ring_map

    def get_link_map(self, nslave):
        """
        get the link map, this is a bit hacky, call for better algorithm
        to place similar nodes together

        Returns ``(tree_map, parent_map, ring_map)`` with ranks renumbered
        so that walking the ring forward from rank 0 visits 0, 1, 2, ...
        """
        tree_map, parent_map = self.get_tree(nslave)
        ring_map = self.get_ring(tree_map, parent_map)
        rmap = {0: 0}
        k = 0
        for i in range(nslave - 1):
            k = ring_map[k][1]
            rmap[k] = i + 1

        ring_map_ = {}
        tree_map_ = {}
        parent_map_ = {}
        for k, v in ring_map.items():
            ring_map_[rmap[k]] = (rmap[v[0]], rmap[v[1]])
        for k, v in tree_map.items():
            tree_map_[rmap[k]] = [rmap[x] for x in v]
        for k, v in parent_map.items():
            if k != 0:
                parent_map_[rmap[k]] = rmap[v]
            else:
                parent_map_[rmap[k]] = -1
        return tree_map_, parent_map_, ring_map_

    def accept_slaves(self, nslave):
        """Serve worker connections until ``nslave`` workers sent "shutdown".

        Handles the worker commands "print", "shutdown", "start" and
        "recover"; blocks on ``self.sock.accept()``.
        """
        # set of nodes that finishs the job
        shutdown = {}
        # set of nodes that is waiting for connections
        wait_conn = {}
        # maps job id to rank
        job_map = {}
        # list of workers that is pending to be assigned rank
        pending = []
        # lazy initialize tree_map
        tree_map = None

        while len(shutdown) != nslave:
            fd, s_addr = self.sock.accept()
            s = SlaveEntry(fd, s_addr)
            if s.cmd == "print":
                msg = s.sock.recvstr()
                print(msg.strip(), flush=True)
                continue
            if s.cmd == "shutdown":
                assert s.rank >= 0 and s.rank not in shutdown
                assert s.rank not in wait_conn
                shutdown[s.rank] = s
                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                continue
            assert s.cmd == "start" or s.cmd == "recover"
            # lazily initialize the slaves
            if tree_map is None:
                assert s.cmd == "start"
                if s.world_size > 0:
                    nslave = s.world_size
                tree_map, parent_map, ring_map = self.get_link_map(nslave)
                # set of nodes that is pending for getting up
                todo_nodes = list(range(nslave))
            else:
                assert s.world_size == -1 or s.world_size == nslave
            if s.cmd == "recover":
                assert s.rank >= 0

            rank = s.decide_rank(job_map)
            # batch assignment of ranks
            if rank == -1:
                assert todo_nodes
                pending.append(s)
                if len(pending) == len(todo_nodes):
                    # Sort by host so workers on the same host get
                    # neighbouring ranks.
                    pending.sort(key=lambda x: x.host)
                    for s in pending:
                        rank = todo_nodes.pop(0)
                        if s.jobid != "NULL":
                            job_map[s.jobid] = rank
                        s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
                        if s.wait_accept > 0:
                            wait_conn[rank] = s
                        logging.debug(
                            "Received %s signal from %s; assign rank %d",
                            s.cmd,
                            s.host,
                            s.rank,
                        )
                if not todo_nodes:
                    logging.info("@tracker All of %d nodes getting started", nslave)
                    self.start_time = time.time()
            else:
                s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                if s.wait_accept > 0:
                    wait_conn[rank] = s
        logging.info("@tracker All nodes finishes job")
        self.end_time = time.time()
        logging.info(
            "@tracker %s secs between node start and job finish",
            str(self.end_time - self.start_time),
        )

    def start(self, nslave):
        """Run ``accept_slaves(nslave)`` in a background daemon thread."""

        def run():
            self.accept_slaves(nslave)

        self.thread = Thread(target=run, args=(), daemon=True)
        self.thread.start()

    def join(self):
        """Block until the tracker thread has finished."""
        # Join with a timeout in a loop rather than one unbounded join.
        while self.thread.is_alive():
            self.thread.join(100)

    def alive(self):
        """Return whether the tracker thread is still running."""
        return self.thread.is_alive()
def assign_partitions_to_actors(
    ip_to_parts: Dict[str, Any], actor_rank_ips: Dict[int, str]
) -> Dict[int, Sequence[Any]]:
    """Assign partitions from a distributed dataframe to actors.

    This function collects distributed partitions and evenly distributes
    them to actors, trying to minimize data transfer by respecting
    co-locality.

    This function currently does _not_ take partition sizes into account
    for distributing data. It assumes that all partitions have (more or less)
    the same length.

    Instead, partitions are evenly distributed. E.g. for 8 partitions and 3
    actors, each actor gets assigned 2 or 3 partitions. Which partitions are
    assigned depends on the data locality.

    The algorithm is as follows: For any number of data partitions, get the
    Ray object references to the shards and the IP addresses where they
    currently live.

    Calculate the minimum and maximum amount of partitions per actor. These
    numbers should differ by at most 1. Also calculate how many actors will
    get more partitions assigned than the other actors.

    First, each actor gets assigned up to ``max_parts_per_actor`` co-located
    partitions. Only up to ``num_actors_with_max_parts`` actors get the
    maximum number of partitions, the rest try to fill the minimum.

    The rest of the partitions (all of which cannot be assigned to a
    co-located actor) are assigned to actors until there are none left.

    Args:
        ip_to_parts: Mapping from an IP address to the list of partitions
            located there. The lists are consumed (popped from) while
            partitions are assigned. IPs that have actors but no partitions
            may be absent.
        actor_rank_ips: Mapping from actor rank to the actor's IP address.

    Returns:
        Mapping from actor rank to its list of assigned partitions. Every
        rank in ``actor_rank_ips`` gets an entry, possibly an empty list.

    Raises:
        ZeroDivisionError: If ``actor_rank_ips`` is empty.
        RuntimeError: If partitions are left over that no actor can take.
    """
    num_partitions = sum(len(parts) for parts in ip_to_parts.values())
    num_actors = len(actor_rank_ips)
    min_parts_per_actor, num_actors_with_max_parts = divmod(
        num_partitions, num_actors
    )
    max_parts_per_actor = max(
        1, min_parts_per_actor + (1 if num_actors_with_max_parts else 0)
    )

    # This is our result dict that maps actor objects to a list of partitions
    actor_to_partitions = defaultdict(list)

    # First we loop through the actors and assign them partitions from their
    # own IPs. Do this until no actor can take another co-located partition.
    partition_assigned = True
    while partition_assigned:
        partition_assigned = False

        # Loop through each actor once, assigning
        for rank, actor_ip in actor_rank_ips.items():
            # ``.get`` keeps this working for plain dicts that have no entry
            # for an actor's IP, and avoids inserting empty entries into a
            # caller-owned ``defaultdict``.
            parts_on_ip = ip_to_parts.get(actor_ip)
            num_parts_left_on_ip = len(parts_on_ip) if parts_on_ip else 0
            num_actor_parts = len(actor_to_partitions[rank])

            if num_parts_left_on_ip > 0 and num_actor_parts < max_parts_per_actor:
                if num_actor_parts >= min_parts_per_actor:
                    # Only allow up to `num_actors_with_max_parts` actors to
                    # have the maximum number of partitions assigned.
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                actor_to_partitions[rank].append(parts_on_ip.pop(0))
                partition_assigned = True

    # The rest of the partitions, no matter where they are located, could not
    # be assigned to co-located actors. Thus, we assign them
    # to actors who still need partitions.
    rest_parts = list(itertools.chain(*ip_to_parts.values()))
    partition_assigned = True
    while len(rest_parts) > 0 and partition_assigned:
        partition_assigned = False
        for rank in actor_rank_ips:
            num_actor_parts = len(actor_to_partitions[rank])
            if num_actor_parts < max_parts_per_actor:
                if num_actor_parts >= min_parts_per_actor:
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                actor_to_partitions[rank].append(rest_parts.pop(0))
                partition_assigned = True
            if len(rest_parts) <= 0:
                break

    if len(rest_parts) != 0:
        raise RuntimeError(
            "There are still partitions left to assign, but no actor "
            "has capacity for more. This is probably a bug. Please go "
            "to https://github.com/ray-project/xgboost_ray to report it."
        )

    return actor_to_partitions
class Dask(DataSource):
    """Read from distributed Dask dataframe.

    A Dask dataframe is a distributed drop-in replacement for pandas.

    Dask dataframes are stored on multiple actors, making them
    suitable for distributed loading.
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a Dask DataFrame or Series.

        Always returns False when dask is not installed.
        """
        if not DASK_INSTALLED:
            return False
        from dask.dataframe import DataFrame as DaskDataFrame
        from dask.dataframe import Series as DaskSeries

        return isinstance(data, (DaskDataFrame, DaskSeries))

    @ensure_ray_dask_initialized
    @staticmethod
    def load_data(
        data: Any,  # dask.pandas.DataFrame
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Union[Sequence[int], Sequence[Tuple[int]]]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Materialize (part of) a Dask dataframe as a pandas dataframe.

        Args:
            data: Dask dataframe to load from.
            ignore: Column names to drop from the result.
            indices: Either row positions (integers), or one-element tuples
                ``(partition_id,)`` selecting whole Dask partitions.

        Returns:
            Pandas DataFrame with the selected rows/partitions and without
            the ignored columns.
        """
        _assert_dask_installed()

        import dask.dataframe as dd

        if indices is not None and len(indices) > 0 and isinstance(indices[0], tuple):
            # We got a list of partition IDs belonging to Dask partitions
            local_df = dd.concat([data.partitions[i] for (i,) in indices]).compute()
        else:
            # Dask does not support iloc() for row selection, so we have to
            # compute a local pandas dataframe first
            local_df = data.compute()

            if indices:
                local_df = local_df.iloc[indices]

        # Drop ignored columns on both paths; previously the partition path
        # returned early and silently kept them.
        if ignore:
            local_df = local_df[local_df.columns.difference(ignore)]

        return local_df

    @ensure_ray_dask_initialized
    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Convert a Dask DataFrame, Series or Array to a pandas Series.

        Any other input is delegated to ``DataSource.convert_to_series``.
        """
        _assert_dask_installed()
        from dask.array import Array as DaskArray
        from dask.dataframe import DataFrame as DaskDataFrame
        from dask.dataframe import Series as DaskSeries

        if isinstance(data, DaskDataFrame):
            return pd.Series(data.compute().squeeze())
        elif isinstance(data, DaskSeries):
            return data.compute()
        elif isinstance(data, DaskArray):
            return pd.Series(data.compute())

        return DataSource.convert_to_series(data)

    @ensure_ray_dask_initialized
    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]  # dask.dataframe.DataFrame
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Return ``data`` together with a rank -> partition-ID assignment.

        Partitions are assigned via ``assign_partitions_to_actors`` based on
        the IPs reported for the actors and for the partitions.
        """
        _assert_dask_installed()

        actor_rank_ips = get_actor_rank_ips(actors)

        # Get IPs and partitions
        ip_to_parts = get_ip_to_parts(data)

        return data, assign_partitions_to_actors(ip_to_parts, actor_rank_ips)

    @ensure_ray_dask_initialized
    @staticmethod
    def get_n(data: Any):
        """
        For naive distributed loading we just return the number of rows
        here. Loading by shard is achieved via `get_actor_shards()`
        """
        return len(data)
xgboost_ray supports reading from various sources, such as files (e.g. CSV, Parquet) or distributed datasets (Modin). This abstract class defines an interface to read from these sources. New data sources can be added by implementing this interface. ``DataSource`` classes are not instantiated. Instead, static and class methods are called directly. """ supports_central_loading = True supports_distributed_loading = False needs_partitions = True @staticmethod def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: """Check if the supplied data matches this data source. Args: data: Dataset. filetype: RayFileType of the provided dataset. Some DataSource implementations might require that this is explicitly set (e.g. if multiple sources can read CSV files). Returns: Boolean indicating if this data source belongs to/is compatible with the data. """ return False @staticmethod def get_filetype(data: Any) -> Optional[RayFileType]: """Method to help infer the filetype. Returns None if the supplied data type (usually a filename) is not covered by this data source, otherwise the filetype is returned. Args: data: Data set Returns: RayFileType or None. """ return None @staticmethod def load_data( data: Any, ignore: Optional[Sequence[str]] = None, indices: Optional[Sequence[Any]] = None, **kwargs ) -> pd.DataFrame: """ Load data into a pandas dataframe. Ignore specific columns, and optionally select specific indices. Args: data: Input data ignore: Column names to ignore indices: Indices to select. What an index indicates depends on the data source. Returns: Pandas DataFrame. """ raise NotImplementedError @staticmethod def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): """Optionally update feature names before training/prediction Args: matrix: xgboost DMatrix object. feature_names: Feature names manually passed to the ``RayDMatrix`` object. 
""" pass @staticmethod def convert_to_series(data: Any) -> pd.Series: """Convert data from the data source type to a pandas series""" if isinstance(data, pd.DataFrame): return pd.Series(data.squeeze()) if not isinstance(data, pd.Series): return pd.Series(data) return data @classmethod def get_column( cls, data: pd.DataFrame, column: Any ) -> Tuple[pd.Series, Optional[Union[str, List]]]: """Helper method wrapping around convert to series. This method should usually not be overwritten. """ if isinstance(column, str) or isinstance(column, List): return data[column], column elif column is not None: return cls.convert_to_series(column), None return column, None @staticmethod def get_n(data: Any): """Get length of data source partitions for sharding.""" return len(data) @staticmethod def get_actor_shards( data: Any, actors: Sequence[ActorHandle] ) -> Tuple[Any, Optional[Dict[int, Any]]]: """Get a dict mapping actor ranks to shards. Args: data: Data to shard. Returns: Returns a tuple of which the first element indicates the new data object that will overwrite the existing data object in the RayDMatrix (e.g. when the object is not serializable). The second element is a dict mapping actor ranks to shards. These objects are usually passed to the ``load_data()`` method for distributed loading, so that method needs to be able to deal with the respective data. 
class Modin(DataSource):
    """Data source for distributed Modin dataframes.

    `Modin <https://github.com/modin-project/modin>`_ is a distributed
    drop-in replacement for pandas supporting Ray as a backend.

    Modin dataframes are stored on multiple actors, making them
    suitable for distributed loading.
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Tell whether ``data`` is a Modin DataFrame or Series.

        Always False when a usable Modin installation was not detected.
        """
        if MODIN_INSTALLED:
            # Has to be imported again.
            from modin.pandas import DataFrame as ModinDataFrame  # noqa: F811
            from modin.pandas import Series as ModinSeries  # noqa: F811

            return isinstance(data, (ModinDataFrame, ModinSeries))
        return False

    @staticmethod
    def load_data(
        data: Any,  # modin.pandas.DataFrame
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Union[Sequence[int], Sequence[ObjectRef]]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Load a Modin dataframe, or a list of its partitions, into pandas.

        When ``indices`` holds Ray ``ObjectRef`` objects, loading is handed
        to ``ObjectStore.load_data``. Otherwise ``indices`` (if given) select
        rows by position before the frame is converted to pandas. Columns
        listed in ``ignore`` are dropped.
        """
        _assert_modin_installed()

        got_partition_refs = (
            indices is not None
            and len(indices) > 0
            and isinstance(indices[0], ObjectRef)
        )
        if got_partition_refs:
            # We got a list of ObjectRefs belonging to Modin partitions
            return ObjectStore.load_data(data=indices, indices=None, ignore=ignore)

        selected = data.iloc[indices] if indices else data
        pandas_df = selected._to_pandas()

        if not ignore:
            return pandas_df
        kept_columns = pandas_df.columns.difference(ignore)
        return pandas_df[kept_columns]

    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Turn a Modin DataFrame or Series into a pandas Series.

        Other inputs fall back to ``DataSource.convert_to_series``.
        """
        _assert_modin_installed()
        # Has to be imported again.
        from modin.pandas import DataFrame as ModinDataFrame  # noqa: F811
        from modin.pandas import Series as ModinSeries  # noqa: F811

        if isinstance(data, ModinDataFrame):
            squeezed = data._to_pandas().squeeze()
            return pd.Series(squeezed)
        if isinstance(data, ModinSeries):
            return data._to_pandas()
        return DataSource.convert_to_series(data)

    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]  # modin.pandas.DataFrame
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Map actor ranks to the Modin partitions they should load.

        Returns a tuple whose first element is ``None`` and whose second
        element is the rank -> partitions assignment produced by
        ``assign_partitions_to_actors``.
        """
        _assert_modin_installed()

        # Has to be imported again.
        from modin.distributed.dataframe.pandas import unwrap_partitions  # noqa: F811

        rank_to_ip = get_actor_rank_ips(actors)

        # Row partitions, each paired with a reference to the IP it lives on
        ip_part_pairs = unwrap_partitions(data, axis=0, get_ip=True)
        ip_refs, partitions = zip(*ip_part_pairs)
        resolved_ips = ray.get(list(ip_refs))

        # Group the partitions by the IP they are stored on
        parts_by_ip = defaultdict(list)
        for part_ip, partition in zip(resolved_ips, partitions):
            parts_by_ip[part_ip].append(partition)

        # Modin dataframes are not serializable, so pass None here
        # as the first return value
        shard_assignment = assign_partitions_to_actors(parts_by_ip, rank_to_ip)
        return None, shard_assignment

    @staticmethod
    def get_n(data: Any):
        """
        For naive distributed loading we just return the number of rows
        here. Loading by shard is achieved via `get_actor_shards()`
        """
        num_rows = len(data)
        return num_rows
class ObjectStore(DataSource):
    """Read pandas dataframes and series from the Ray object store."""

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True for an ``ObjectRef`` or a sequence made only of them.

        Note that an empty sequence is accepted as well, because ``all()``
        over no elements is True.
        """
        if isinstance(data, Sequence):
            return all(isinstance(d, ObjectRef) for d in data)
        return isinstance(data, ObjectRef)

    @staticmethod
    def load_data(
        data: Sequence[ObjectRef],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Fetch the referenced pandas objects and concatenate them.

        Args:
            data: Object references to load. A single ``ObjectRef`` is also
                accepted, matching ``is_data_type`` and
                ``convert_to_series``.
            ignore: Column names to leave out of the result.
            indices: Positions within ``data`` selecting which references
                to fetch. ``None`` fetches all of them.
        """
        if isinstance(data, ObjectRef):
            # A bare reference would resolve to a single pandas object,
            # which ``pd.concat`` rejects; treat it as a one-element list.
            data = [data]

        if indices is not None:
            data = [data[i] for i in indices]

        local_df = ray.get(data)

        return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore)

    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Fetch the referenced object(s) and convert them to one series."""
        if isinstance(data, ObjectRef):
            data = ray.get(data)
        else:
            data = pd.concat(ray.get(data), copy=False)
        return DataSource.convert_to_series(data)


class Pandas(DataSource):
    """Read from pandas dataframes and series."""

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a pandas ``DataFrame`` or ``Series``."""
        return isinstance(data, (pd.DataFrame, pd.Series))

    @staticmethod
    def load_data(
        data: Any,
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Return ``data`` without the ignored columns, optionally row-sliced.

        Args:
            data: pandas object to read from.
            ignore: Column names to leave out of the result.
            indices: Row positions to select. ``None`` or an empty
                selection returns all rows.
        """
        local_df = data

        if ignore:
            # NOTE(review): ``Index.difference`` sorts its result by default,
            # so the remaining columns may be reordered - confirm that
            # callers do not depend on the original column order.
            local_df = local_df[local_df.columns.difference(ignore)]

        # Explicit length check instead of truthiness so that array-like
        # indices (e.g. numpy arrays) do not raise on ``bool()``.
        if indices is not None and len(indices) > 0:
            return local_df.iloc[indices]

        return local_df
class Parquet(DataSource):
    """Read one or many Parquet files."""

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if the declared file type is Parquet."""
        return filetype == RayFileType.PARQUET

    @staticmethod
    def get_filetype(data: Any) -> Optional[RayFileType]:
        """Detect Parquet input from a ``.parquet`` path suffix.

        Returns ``None`` for anything that is not a string ending in
        ``.parquet`` (instead of failing on objects without ``endswith``).
        """
        if isinstance(data, str) and data.endswith(".parquet"):
            return RayFileType.PARQUET
        return None

    @staticmethod
    def load_data(
        data: Union[str, Sequence[str]],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Read one file, or the selected files of a collection.

        Args:
            data: A single path, or an iterable of paths (one per shard).
            ignore: Column names to leave out of the result.
            indices: Positions within ``data`` selecting which files to
                read. ``None`` or an empty selection reads every file.
                Ignored when ``data`` is a single path.
            **kwargs: Forwarded to ``pandas.read_parquet``.
        """
        if isinstance(data, Iterable) and not isinstance(data, str):
            # Build the lookup set once so each shard is checked in O(1).
            has_selection = indices is not None and len(indices) > 0
            wanted = set(indices) if has_selection else None

            shards = []
            for i, shard in enumerate(data):
                if wanted is not None and i not in wanted:
                    continue

                shard_df = pd.read_parquet(shard, **kwargs)
                shards.append(Pandas.load_data(shard_df, ignore=ignore))
            return pd.concat(shards, copy=False)

        local_df = pd.read_parquet(data, **kwargs)
        return Pandas.load_data(local_df, ignore=ignore)

    @staticmethod
    def get_n(data: Any):
        """Return the number of items in ``data``.

        NOTE(review): for a single path string this counts characters;
        presumably only called with a collection of paths - confirm.
        """
        return len(list(data))


class Partitioned(DataSource):
    """Read from distributed data structure implementing __partitioned__.

    __partitioned__ provides meta data about how the data is partitioned and
    distributed across several compute nodes, making objects that support it
    suitable for distributed loading.

    Also see the __partitioned__ spec:
    https://github.com/IntelPython/DPPY-Spec/blob/draft/partitioned/Partitioned.md
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` exposes a ``__partitioned__`` attribute."""
        return hasattr(data, "__partitioned__")

    @staticmethod
    def load_data(
        data: Any,  # __partitioned__ dict
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[ObjectRef]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Fetch partitions through the dict's ``get`` accessor and join them.

        Args:
            data: The ``__partitioned__`` dict (not the owning object).
            ignore: Column names to leave out of the result.
            indices: Partition handles to fetch. ``None`` or empty fetches
                every row partition listed in the dict.
        """
        assert isinstance(data, dict), "Expected __partitioned__ dict"
        _get = data["get"]

        if indices is None or len(indices) == 0:
            tiling = data["partition_tiling"]
            ndims = len(tiling)
            # we need tuples to access partitions in the right order
            pos_suffix = (0,) * (ndims - 1)
            parts = data["partitions"]
            # get the full data, e.g. all shards/partitions
            local_df = [
                _get(parts[(i,) + pos_suffix]["data"]) for i in range(tiling[0])
            ]
        else:
            # here we got a list of futures for partitions
            local_df = _get(indices)

        # The type of the first partition decides how everything is joined.
        if isinstance(local_df[0], pd.DataFrame):
            return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore)
        else:
            return Numpy.load_data(np.concatenate(local_df), ignore=ignore)

    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]  # partitioned.pandas.DataFrame
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Assign the row partitions of ``data`` to actors by location.

        Returns:
            The ``__partitioned__`` dict and the result of
            ``assign_partitions_to_actors`` for the location table.

        Raises:
            RuntimeError: If the tiling is not 1d/2d or is split along any
                dimension other than the first.
        """
        assert hasattr(data, "__partitioned__")

        actor_rank_ips = get_actor_rank_ips(actors)

        # Get accessor func and partitions
        parted = data.__partitioned__
        parts = parted["partitions"]
        tiling = parted["partition_tiling"]
        ndims = len(tiling)
        if ndims < 1 or ndims > 2 or any(tiling[x] != 1 for x in range(1, ndims)):
            raise RuntimeError(
                "Only row-wise partitionings of 1d/2d structures supported."
            )

        # Now build a table mapping from IP to list of partitions
        ip_to_parts = defaultdict(list)
        # we need tuples to access partitions in the right order
        pos_suffix = (0,) * (ndims - 1)
        for i in range(tiling[0]):
            part = parts[(i,) + pos_suffix]  # this works for 1d and 2d
            ip_to_parts[part["location"][0]].append(part["data"])

        # __partitioned__ is serializable, so pass it here
        # as the first return value
        return parted, assign_partitions_to_actors(ip_to_parts, actor_rank_ips)

    @staticmethod
    def get_n(data: Any):
        """Get length of data source partitions for sharding."""
        return data.__partitioned__["shape"][0]
def _assert_petastorm_installed():
    """Guard for code paths that need the ``petastorm`` package.

    Raises:
        RuntimeError: If the module-level ``PETASTORM_INSTALLED`` flag is
            falsy.
    """
    if PETASTORM_INSTALLED:
        return

    raise RuntimeError(
        "Tried to use Petastorm as a data source, but petastorm is not "
        "installed. This function shouldn't have been called. "
        "\nFIX THIS by installing petastorm: `pip install petastorm`. "
        "\nPlease also raise an issue on our GitHub: "
        "https://github.com/ray-project/xgboost_ray as this part of "
        "the code should not have been reached."
    )


class Petastorm(DataSource):
    """Data source that reads Parquet data through Petastorm.

    Petastorm is a machine learning training and evaluation library; this
    class uses its batch reader to load datasets.
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if Petastorm is installed and was chosen as file type."""
        return PETASTORM_INSTALLED and filetype == RayFileType.PETASTORM

    @staticmethod
    def get_filetype(data: Any) -> Optional[RayFileType]:
        """Detect Petastorm input from URL scheme and ``.parquet`` suffix.

        ``data`` may be one URL or a list of URLs; every URL has to start
        with ``s3://``, ``gs://``, ``hdfs://`` or ``file://`` and end in
        ``.parquet``.
        """
        if not PETASTORM_INSTALLED:
            return None

        urls = data if isinstance(data, list) else [data]
        supported_schemes = ("s3://", "gs://", "hdfs://", "file://")

        for url in urls:
            if not (url.endswith(".parquet") and url.startswith(supported_schemes)):
                return None

        return RayFileType.PETASTORM

    @staticmethod
    def load_data(
        data: Union[str, Sequence[str]],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Read batches with ``petastorm.make_batch_reader`` into one frame.

        Args:
            data: URL(s) handed to the batch reader.
            ignore: Column names to leave out of the result.
            indices: Batch positions to keep; falsy keeps every batch.
        """
        _assert_petastorm_installed()

        shards = []
        with petastorm.make_batch_reader(data) as reader:
            for i, batch in enumerate(reader):
                if indices and i not in indices:
                    continue
                shards.append(pd.DataFrame(batch._asdict()))

        local_df = pd.concat(shards, copy=False)

        if not ignore:
            return local_df
        return local_df[local_df.columns.difference(ignore)]

    @staticmethod
    def get_n(data: Any):
        """Return the number of items in ``data``."""
        return len(list(data))


def _assert_ray_data_available():
    """Guard for code paths that need ``ray.data``.

    Raises:
        RuntimeError: If the module-level ``RAY_DATASET_AVAILABLE`` flag is
            falsy.
    """
    if RAY_DATASET_AVAILABLE:
        return

    raise RuntimeError(
        "Tried to use Ray datasets as a data source, but your version "
        "of Ray does not support it. "
        "\nFIX THIS by upgrading Ray: `pip install -U ray`. "
        "\nPlease also raise an issue on our GitHub: "
        "https://github.com/ray-project/xgboost_ray as this part of "
        "the code should not have been reached."
    )


def _datasets_to_pandas(data) -> pd.DataFrame:
    """Convert one Ray dataset, or an iterable of them, to pandas."""
    if isinstance(data, ray.data.dataset.Dataset):
        return data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT)

    frames = [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data]
    return pd.concat(frames, copy=False)


class RayDataset(DataSource):
    """Data source for distributed Ray datasets."""

    supports_central_loading = True
    supports_distributed_loading = True
    needs_partitions = False

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a ``ray.data`` ``Dataset``."""
        if not RAY_DATASET_AVAILABLE:
            return False

        return isinstance(data, ray.data.dataset.Dataset)

    @staticmethod
    def load_data(
        data: "ray.data.dataset.Dataset",
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[
            Union[Sequence[int], Sequence["ray.data.dataset.Dataset"]]
        ] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Materialize the dataset (or the given splits) as pandas.

        Args:
            data: Dataset to read from.
            ignore: Column names to leave out of the result.
            indices: Either dataset splits to load instead of ``data``, or
                positions used to index into ``data``.
        """
        _assert_ray_data_available()

        if indices is not None:
            got_splits = len(indices) > 0 and isinstance(
                indices[0], ray.data.dataset.Dataset
            )
            if got_splits:
                # The "indices" are the datasets belonging to a partition.
                data = indices
            else:
                data = [data[i] for i in indices]

        return Pandas.load_data(_datasets_to_pandas(data), ignore=ignore)

    @staticmethod
    def convert_to_series(
        data: Union["ray.data.dataset.Dataset", Sequence["ray.data.dataset.Dataset"]]
    ) -> pd.Series:
        """Materialize the dataset(s) and convert the result to a series."""
        _assert_ray_data_available()

        return DataSource.convert_to_series(_datasets_to_pandas(data))

    @staticmethod
    def get_actor_shards(
        data: "ray.data.dataset.Dataset", actors: Sequence[ActorHandle]
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Split the dataset into one equal split per actor.

        Returns:
            ``(None, {index: [split]})`` keyed by position in the split list.
        """
        _assert_ray_data_available()

        # assign_partitions_to_actors is not used here: the assignment of
        # splits to actors is handled by the locality_hints argument.
        splits = data.split(
            len(actors),
            equal=True,
            locality_hints=actors,
        )

        shards = {}
        for i, split in enumerate(splits):
            shards[i] = [split]
        return None, shards

    @staticmethod
    def get_n(data: "ray.data.dataset.Dataset"):
        """Return number of distributed blocks."""
        return data._plan.initial_num_blocks()
def _maybe_schedule_new_actors(
    training_state: _TrainingState,
    num_cpus_per_actor: int,
    num_gpus_per_actor: int,
    resources_per_actor: Optional[Dict],
    ray_params: RayParams,
    load_data: List[RayDMatrix],
) -> bool:
    """Schedule new actors for elastic training if resources are available.

    Potentially starts new actors and triggers data loading.

    Args:
        training_state: Mutable training state. Its ``pending_actors`` and
            ``last_resource_check_at`` entries are updated here.
        num_cpus_per_actor: Forwarded to ``_create_actor``.
        num_gpus_per_actor: Forwarded to ``_create_actor``.
        resources_per_actor: Forwarded to ``_create_actor``.
        ray_params: Supplies ``elastic_training``, ``num_actors``,
            ``checkpoint_frequency`` and ``distributed_callbacks``.
        load_data: Matrices handed to each ``_PrepareActorTask``.

    Returns:
        True if at least one actor was newly scheduled, False otherwise.
    """

    # This is only enabled for elastic training.
    if not ray_params.elastic_training:
        return False

    # Ranks whose actor slot is empty and that are not already being prepared.
    missing_actor_ranks = [
        rank
        for rank, actor in enumerate(training_state.actors)
        if actor is None and rank not in training_state.pending_actors
    ]

    # If all actors are alive, there is nothing to do.
    if not missing_actor_ranks:
        return False

    now = time.time()

    # Check periodically every n seconds.
    if (
        now
        < training_state.last_resource_check_at + ENV.ELASTIC_RESTART_RESOURCE_CHECK_S
    ):
        return False

    training_state.last_resource_check_at = now

    new_pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = {}
    for rank in missing_actor_ranks:
        # Actor rank should not be already pending
        if rank in training_state.pending_actors or rank in new_pending_actors:
            continue

        # Try to schedule this actor
        actor = _create_actor(
            rank=rank,
            num_actors=ray_params.num_actors,
            num_cpus_per_actor=num_cpus_per_actor,
            num_gpus_per_actor=num_gpus_per_actor,
            resources_per_actor=resources_per_actor,
            placement_group=training_state.placement_group,
            queue=training_state.queue,
            checkpoint_frequency=ray_params.checkpoint_frequency,
            distributed_callbacks=ray_params.distributed_callbacks,
        )

        # The task object is what `_update_scheduled_actor_states` later
        # polls with `is_ready()`.
        task = _PrepareActorTask(
            actor,
            queue=training_state.queue,
            stop_event=training_state.stop_event,
            load_data=load_data,
        )

        new_pending_actors[rank] = (actor, task)
        logger.debug(
            f"Re-scheduled actor with rank {rank}. Waiting for "
            f"placement and data loading before promoting it "
            f"to training."
        )
    if new_pending_actors:
        training_state.pending_actors.update(new_pending_actors)
        logger.info(
            f"Re-scheduled {len(new_pending_actors)} actors for "
            f"training. Once data loading finished, they will be "
            f"integrated into training again."
        )
    return bool(new_pending_actors)


def _update_scheduled_actor_states(training_state: _TrainingState):
    """Update status of scheduled actors in elastic training.

    If actors finished their preparation tasks, promote them to
    proper training actors (set the `training_state.actors` entry).

    Also schedule a `RayXGBoostActorAvailable` exception so that training
    is restarted with the new actors.

    Args:
        training_state: Mutable training state; ``actors``,
            ``pending_actors`` and ``restart_training_at`` are updated.

    Raises:
        RayXGBoostActorAvailable: When ``restart_training_at`` is set and
            lies in the past.
    """
    now = time.time()
    actor_became_ready = False

    # Wrap in list so we can alter the `training_state.pending_actors` dict
    for rank in list(training_state.pending_actors.keys()):
        actor, task = training_state.pending_actors[rank]
        if task.is_ready():
            # Promote to proper actor
            training_state.actors[rank] = actor
            del training_state.pending_actors[rank]
            actor_became_ready = True

    if actor_became_ready:
        if not training_state.pending_actors:
            # No other actors are pending, so let's restart right away.
            # (A timestamp in the past makes the check below fire now.)
            training_state.restart_training_at = now - 1.0

        # If an actor became ready but other actors are pending, we wait
        # for n seconds before restarting, as chances are that they become
        # ready as well (e.g. if a large node came up).
        grace_period = ENV.ELASTIC_RESTART_GRACE_PERIOD_S
        if training_state.restart_training_at is None:
            logger.debug(
                f"A RayXGBoostActor became ready for training. Waiting "
                f"{grace_period} seconds before triggering training restart."
            )
            training_state.restart_training_at = now + grace_period

    if training_state.restart_training_at is not None:
        if now > training_state.restart_training_at:
            # Clear the deadline before raising so it only fires once.
            training_state.restart_training_at = None
            raise RayXGBoostActorAvailable(
                "A new RayXGBoostActor became available for training. "
                "Triggering restart."
            )


def _get_actor_alive_status(
    actors: List[ActorHandle], callback: Callable[[ActorHandle], None]
):
    """Loop through all actors. Invoke a callback on dead actors.

    Args:
        actors: Actor handles by rank; ``None`` entries are counted as dead
            without invoking the callback.
        callback: Called with the handle of every actor whose ``pid`` call
            raised.

    Returns:
        Tuple ``(alive, dead)`` with the respective counts.
    """
    # Maps each pending `pid` call to the rank of the actor it was sent to.
    obj_to_rank = {}

    alive = 0
    dead = 0

    for rank, actor in enumerate(actors):
        if actor is None:
            dead += 1
            continue
        obj = actor.pid.remote()
        obj_to_rank[obj] = rank

    not_ready = list(obj_to_rank.keys())
    while not_ready:
        # Poll without blocking and loop until every call has resolved.
        ready, not_ready = ray.wait(not_ready, timeout=0)

        for obj in ready:
            try:
                pid = ray.get(obj)
                rank = obj_to_rank[obj]
                logger.debug(f"Actor {actors[rank]} with PID {pid} is alive.")
                alive += 1
            except Exception:
                # Any failure to fetch the PID is treated as a dead actor.
                rank = obj_to_rank[obj]
                logger.debug(f"Actor {actors[rank]} is _not_ alive.")
                dead += 1
                callback(actors[rank])
    logger.info(f"Actor status: {alive} alive, {dead} dead " f"({alive+dead} total)")

    return alive, dead


def main():
    """Create ``example.parquet`` via ``create_parquet`` with fixed settings."""
    create_parquet(
        "example.parquet",
        num_rows=1_000_000,
        num_partitions=100,
        num_features=8,
        num_classes=2,
    )
def download_higgs(target_file):
    """Download the gzipped HIGGS CSV from the UCI archive.

    Args:
        target_file: Local path the archive is written to.

    Returns:
        True if ``target_file`` exists after the download attempt.
    """
    url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00280/HIGGS.csv.gz"
    )

    try:
        import urllib.request
    except ImportError as e:
        raise ValueError(
            f"Automatic downloading of the HIGGS dataset requires `urllib`."
            f"\nFIX THIS by running `pip install urllib` or manually "
            f"downloading the dataset from {url}."
        ) from e

    print(f"Downloading HIGGS dataset to {target_file}")
    urllib.request.urlretrieve(url, target_file)
    return os.path.exists(target_file)


def main():
    """Train a single-actor XGBoost model on the HIGGS CSV and save it.

    Raises:
        RuntimeError: If the dataset had to be downloaded and is still
            missing afterwards.
    """
    # Example adapted from this blog post:
    # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7
    # This uses the HIGGS dataset. Download here:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz

    if not os.path.exists(FILENAME_CSV):
        # Do not wrap the download in `assert`: assertions are stripped
        # under `python -O`, which would silently skip the download.
        if not download_higgs(FILENAME_CSV):
            raise RuntimeError("Downloading of HIGGS dataset failed.")
        print("HIGGS dataset downloaded.")
    else:
        print("HIGGS dataset found locally.")

    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]

    dtrain = RayDMatrix(os.path.abspath(FILENAME_CSV), label="label", names=colnames)

    config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }

    evals_result = {}

    start = time.time()
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        ray_params=RayParams(max_actor_restarts=1, num_actors=1),
        num_boost_round=100,
        evals=[(dtrain, "train")],
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("higgs.xgb")
    print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1]))


def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs):
    """Convert a CSV file to a snappy-compressed Parquet file in chunks.

    Args:
        in_file: Path of the CSV file to read.
        out_file: Path of the Parquet file to write.
        chunksize: Number of CSV rows read (and written) per chunk.
        **csv_kwargs: Forwarded to ``pandas.read_csv``.

    Returns:
        False if ``out_file`` already exists or the CSV yielded no chunks
        (nothing is written in either case), True after a conversion.
    """
    if os.path.exists(out_file):
        return False

    print(f"Converting CSV {in_file} to PARQUET {out_file}")
    csv_stream = pd.read_csv(
        in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs
    )

    parquet_schema = None
    parquet_writer = None

    try:
        for i, chunk in enumerate(csv_stream):
            print("Chunk", i)
            if parquet_writer is None:
                # Guess the schema of the CSV file from the first chunk
                parquet_schema = pa.Table.from_pandas(df=chunk).schema
                # Open a Parquet file for writing
                parquet_writer = pq.ParquetWriter(
                    out_file, parquet_schema, compression="snappy"
                )
            # Write CSV chunk to the parquet file
            table = pa.Table.from_pandas(chunk, schema=parquet_schema)
            parquet_writer.write_table(table)
    finally:
        # Close the writer even if a chunk fails, and tolerate the case
        # where no chunk was read and no writer was ever opened.
        if parquet_writer is not None:
            parquet_writer.close()

    return parquet_writer is not None
def main():
    """Train a single-actor XGBoost model on the HIGGS Parquet file.

    Downloads the CSV and converts it to Parquet first if the Parquet file
    is not present locally.
    """
    # Example adapted from this blog post:
    # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7
    # This uses the HIGGS dataset. Download here:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz

    # "label" followed by "feature-01" ... "feature-28"
    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]

    if not os.path.exists(FILENAME_PARQUET):
        if not os.path.exists(FILENAME_CSV):
            download_higgs(FILENAME_CSV)
            print("Downloaded HIGGS csv dataset")
        print("Converting HIGGS csv dataset to parquet")
        csv_to_parquet(FILENAME_CSV, FILENAME_PARQUET, names=colnames)

    # Here we load the Parquet file
    parquet_path = os.path.abspath(FILENAME_PARQUET)
    dtrain = RayDMatrix(parquet_path, label="label", columns=colnames)

    config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    scaling = RayParams(max_actor_restarts=1, num_actors=1)

    evals_result = {}

    started_at = time.time()
    booster = train(
        config,
        dtrain,
        evals_result=evals_result,
        ray_params=scaling,
        num_boost_round=100,
        evals=[(dtrain, "train")],
    )
    taken = time.time() - started_at
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    booster.save_model("higgs.xgb")
    final_error = evals_result["train"]["error"][-1]
    print(f"Final training error: {final_error:.4f}")


def readme_simple():
    """README example: train on the breast cancer data with two actors."""
    from sklearn.datasets import load_breast_cancer

    from xgboost_ray import RayDMatrix, RayParams, train

    features, labels = load_breast_cancer(return_X_y=True)
    dtrain = RayDMatrix(features, labels)

    xgb_params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    evals_result = {}
    booster = train(
        xgb_params,
        dtrain,
        evals_result=evals_result,
        evals=[(dtrain, "train")],
        verbose_eval=False,
        ray_params=RayParams(num_actors=2, cpus_per_actor=1),
    )

    booster.save_model("model.xgb")
    final_error = evals_result["train"]["error"][-1]
    print(f"Final training error: {final_error:.4f}")


def readme_predict():
    """README example: predict with the model saved as ``model.xgb``."""
    import xgboost as xgb
    from sklearn.datasets import load_breast_cancer

    from xgboost_ray import RayDMatrix, RayParams, predict

    features, labels = load_breast_cancer(return_X_y=True)

    dpred = RayDMatrix(features, labels)

    booster = xgb.Booster(model_file="model.xgb")
    predictions = predict(booster, dpred, ray_params=RayParams(num_actors=2))

    print(predictions)


def readme_tune():
    """README example: hyperparameter search with Ray Tune."""
    from sklearn.datasets import load_breast_cancer

    from xgboost_ray import RayDMatrix, RayParams, train

    num_actors = 4
    num_cpus_per_actor = 1

    ray_params = RayParams(num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)

    def train_model(config):
        # One trial: train with the sampled config and save the model.
        features, labels = load_breast_cancer(return_X_y=True)
        dtrain = RayDMatrix(features, labels)

        evals_result = {}
        booster = train(
            params=config,
            dtrain=dtrain,
            evals_result=evals_result,
            evals=[(dtrain, "train")],
            verbose_eval=False,
            ray_params=ray_params,
        )
        booster.save_model("model.xgb")

    from ray import tune

    # Specify the hyperparameter search space.
    search_space = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }

    # Make sure to use the `get_tune_resources` method to set the
    # `resources_per_trial`
    trial_resources = ray_params.get_tune_resources()
    analysis = tune.run(
        train_model,
        config=search_space,
        metric="train-error",
        mode="min",
        num_samples=4,
        resources_per_trial=trial_resources,
    )
    print("Best hyperparameters", analysis.best_config)
def readme_sklearn_api():
    """README example: the scikit-learn style ``RayXGBClassifier`` API."""
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from xgboost_ray import RayParams, RayXGBClassifier

    seed = 42

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.25, random_state=seed
    )

    # In XGBoost-Ray, n_jobs sets the number of actors
    clf = RayXGBClassifier(n_jobs=4, random_state=seed)

    # scikit-learn API will automatically convert the data
    # to RayDMatrix format as needed.
    # You can also pass X as a RayDMatrix, in which case
    # y will be ignored.

    clf.fit(X_train, y_train)

    predicted_labels = clf.predict(X_test)
    print(predicted_labels)

    predicted_probabilities = clf.predict_proba(X_test)
    print(predicted_probabilities)

    # It is also possible to pass a RayParams object
    # to fit/predict/predict_proba methods - will override
    # n_jobs set during initialization

    clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2))

    predicted_labels = clf.predict(X_test, ray_params=RayParams(num_actors=2))
    print(predicted_labels)


def main(cpus_per_actor, num_actors):
    """Train on the breast cancer data and report the validation error.

    Args:
        cpus_per_actor: Number of CPUs per training actor.
        num_actors: Number of training actors.
    """
    # Load dataset
    features, labels = datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.25
    )

    dtrain = RayDMatrix(train_x, train_y)
    dtest = RayDMatrix(test_x, test_y)

    evals_result = {}

    # Set XGBoost config.
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    scaling = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )

    # Train the classifier
    booster = train(
        params=xgboost_params,
        dtrain=dtrain,
        evals=[(dtest, "eval")],
        evals_result=evals_result,
        ray_params=scaling,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("simple.xgb")
    final_error = evals_result["eval"]["error"][-1]
    print(f"Final validation error: {final_error:.4f}")
def main(cpus_per_actor, num_actors):
    """Train on a small synthetic Dask dataframe and report training error.

    Does nothing but print a hint when Dask is not installed.

    Args:
        cpus_per_actor: Number of CPUs per training actor.
        num_actors: Number of training actors.
    """
    if not DASK_INSTALLED:
        print("Dask is not installed. Install with `pip install dask`")
        return

    # Local import so the installation check comes first
    import dask
    import dask.dataframe as dd
    from ray.util.dask import ray_dask_get

    dask.config.set(scheduler=ray_dask_get)

    # Generate dataset
    x = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    y = np.tile(np.repeat(range(2), 4), 4)

    # Flip some bits to reduce max accuracy
    flipped = np.random.choice(32, size=6, replace=False)
    y[flipped] = 1 - y[flipped]

    frame = pd.DataFrame(x)
    frame["label"] = y

    # Split into 4 partitions
    dask_df = dd.from_pandas(frame, npartitions=4)

    dtrain = RayDMatrix(dask_df, "label")

    evals_result = {}

    # Set XGBoost config.
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    scaling = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )

    # Train the classifier
    booster = train(
        params=xgboost_params,
        dtrain=dtrain,
        evals=[(dtrain, "train")],
        evals_result=evals_result,
        ray_params=scaling,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("dask.xgb")
    final_error = evals_result["train"]["error"][-1]
    print(f"Final training error: {final_error:.4f}")
if __name__ == "__main__":
    # Command-line entry point: parse the options, connect to (or start) Ray,
    # then run the Modin training example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Smoke test: call ray.init() with num_actors + 1 CPUs.
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        # Ray Client mode.
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)
if __name__ == "__main__":
    # Command-line entry point: parse the options, connect to (or start) Ray,
    # then run the object-store training example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Smoke test: call ray.init() with num_actors + 1 CPUs.
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        # Ray Client mode.
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)
class Parted:
    """Minimal holder whose only job is to carry a ``__partitioned__`` attribute.

    Args:
        parted: The partition description to expose. It is stored as-is
            (no copy is made).
    """

    def __init__(self, parted):
        # The attribute name ends in a double underscore, so Python applies
        # no private-name mangling and it is readable from outside the class.
        setattr(self, "__partitioned__", parted)
def main():
    """Compare plain XGBoost prediction with distributed prediction.

    Loads the booster stored in ``simple.xgb`` (relative to the current
    working directory), predicts on the breast cancer dataset once with
    ``xgb.Booster.predict`` and once with ``xgboost_ray.predict`` on two
    actors, asserts both results are equal and prints the predictions.

    Raises:
        ValueError: If ``simple.xgb`` does not exist.
    """
    if not os.path.exists("simple.xgb"):
        raise ValueError(
            "Model file not found: `simple.xgb`"
            "\nFIX THIS by running `python simple.py` first to "
            "train the model."
        )

    # Load dataset
    data, labels = datasets.load_breast_cancer(return_X_y=True)

    dmat_xgb = xgb.DMatrix(data, labels)
    dmat_ray = RayDMatrix(data, labels)

    bst = xgb.Booster(model_file="simple.xgb")

    pred_xgb = bst.predict(dmat_xgb)
    pred_ray = predict(bst, dmat_ray, ray_params=RayParams(num_actors=2))

    # Distributed prediction must reproduce the local result exactly.
    np.testing.assert_array_equal(pred_xgb, pred_ray)
    print(pred_ray)
if __name__ == "__main__":
    # Command-line entry point: parse the options, connect to (or start) Ray,
    # then run the Ray Datasets training example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Smoke test: call ray.init() with num_actors + 1 CPUs.
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        # Ray Client mode.
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)
datasets.load_breast_cancer(return_X_y=True) # Split into train and test set train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) train_set = RayDMatrix(train_x, train_y) test_set = RayDMatrix(test_x, test_y) evals_result = {} bst = train( params=config, dtrain=train_set, evals=[(test_set, "eval")], evals_result=evals_result, ray_params=ray_params, verbose_eval=False, num_boost_round=10, ) model_path = "tuned.xgb" bst.save_model(model_path) print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) def main(cpus_per_actor, num_actors, num_samples): # Set XGBoost config. config = { "tree_method": "approx", "objective": "binary:logistic", "eval_metric": ["logloss", "error"], "eta": tune.loguniform(1e-4, 1e-1), "subsample": tune.uniform(0.5, 1.0), "max_depth": tune.randint(1, 9), } ray_params = RayParams( max_actor_restarts=1, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, num_actors=num_actors, ) analysis = tune.run( tune.with_parameters(train_breast_cancer, ray_params=ray_params), # Use the `get_tune_resources` helper function to set the resources. resources_per_trial=ray_params.get_tune_resources(), config=config, num_samples=num_samples, metric="eval-error", mode="min", ) # Load the best model checkpoint. 
if __name__ == "__main__":
    # Command-line entry point: parse the options, connect to (or start) Ray,
    # then run the Tune hyperparameter search example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per XGBoost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=1,
        help="Sets number of XGBoost workers to use.",
    )
    parser.add_argument(
        "--num-samples", type=int, default=4, help="Number of samples to use for Tune."
    )
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Smoke test: call ray.init() with num_actors * num_samples CPUs.
        ray.init(num_cpus=args.num_actors * args.num_samples)
    elif args.server_address:
        # Ray Client mode.
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors, args.num_samples)
if __name__ == "__main__":
    # Command-line entry point: pick the input data set, start Ray, train,
    # and report the total wall-clock time.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )
    args = parser.parse_args()

    temp_dir, path = None, None
    if args.smoke_test:
        # Small data set created in a temporary directory for this run only.
        temp_dir, path = create_parquet_in_tempdir(
            "smoketest.parquet",
            num_rows=1_000,
            num_features=4,
            num_classes=2,
            num_partitions=2,
        )
    else:
        # Data set expected next to this script (see the note at the top of
        # the file about `create_test_data.py`).
        path = os.path.join(os.path.dirname(__file__), "parted.parquet")

    try:
        import ray

        ray.init()

        start = time.time()
        main(path)
        taken = time.time() - start
        print(f"TOTAL TIME TAKEN: {taken:.2f} seconds")
    finally:
        # Remove the smoke-test data even when Ray start-up or training
        # fails, so a failed run does not leave the directory behind.
        if args.smoke_test:
            shutil.rmtree(temp_dir)
def main(fname, num_actors=2):
    """Train on a parquet data set read through ``read_parquet``.

    Args:
        fname: Location of the parquet data, passed to ``read_parquet``.
        num_actors: Used both as the number of shards to read and as the
            number of training actors.

    The trained booster is written to ``test_data.xgb``; the training time
    and the final training error are printed.
    """
    shards = read_parquet(fname, num_shards=num_actors)
    train_matrix = RayDMatrix(shards, label="labels", ignore=["partition"])

    xgb_config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    ray_config = RayParams(max_actor_restarts=1, num_actors=num_actors)
    history = {}

    # Only the call to train() is timed.
    began = time.time()
    booster = train(
        xgb_config,
        train_matrix,
        evals_result=history,
        ray_params=ray_config,
        num_boost_round=10,
        evals=[(train_matrix, "train")],
    )
    elapsed = time.time() - began
    print(f"TRAIN TIME TAKEN: {elapsed:.2f} seconds")

    booster.save_model("test_data.xgb")

    # Last recorded value of the "error" metric on the training set.
    final_error = history["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))
def _get_environ(item: str, old_val: Any):
    """Return the ``RXGB_<item>`` environment override for a setting, if any.

    Args:
        item: Setting name; the environment variable consulted is
            ``RXGB_`` followed by this name.
        old_val: Current value. Its type selects how the environment string
            is converted.

    Returns:
        ``old_val`` when the variable is unset. Otherwise the variable's
        value converted to ``bool`` (via ``int``), ``int`` or ``float`` to
        match the type of ``old_val``, or the raw string for any other type.

    Raises:
        ValueError: If the variable is set but cannot be parsed as the
            required numeric type.
    """
    raw_value = os.environ.get(f"RXGB_{item}")
    if raw_value is None:
        return old_val

    # ``bool`` is checked first because ``bool`` is a subclass of ``int``.
    converters = (
        (bool, lambda text: bool(int(text))),
        (int, int),
        (float, float),
    )
    for expected_type, convert in converters:
        if isinstance(old_val, expected_type):
            return convert(raw_value)
    return raw_value
class _RabitTrackerCompatMixin:
    """Map the worker-based method names onto the legacy ``slave`` names.

    Each method here forwards to the identically-purposed legacy method of
    the object it is mixed into (``accept_slaves`` / ``slave_envs``).
    """

    def accept_workers(self, n_workers: int):
        """Forward to ``accept_slaves(n_workers)`` and return its result."""
        legacy_accept = self.accept_slaves
        return legacy_accept(n_workers)

    def worker_envs(self):
        """Forward to ``slave_envs()`` and return its result."""
        legacy_envs = self.slave_envs
        return legacy_envs()
class _RabitContextBase:
    """This context is used by local training actors to connect to the
    Rabit tracker.

    Args:
        actor_id: Unique actor ID. Any value that can be formatted into a
            string is accepted (e.g. an ``int`` or a ``str``).
        args: Arguments for Rabit initialisation. These are
            environment variables to configure Rabit clients. The dict is
            updated in place with a ``DMLC_TASK_ID`` entry and kept as
            ``self.args``.
    """

    def __init__(self, actor_id: Union[int, str], args: dict):
        # Format rather than concatenate: `"..." + actor_id` raises a
        # TypeError for integer IDs.
        args["DMLC_TASK_ID"] = f"[xgboost.ray]:{actor_id}"
        self.args = args
def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
    """Build the local XGBoost matrix for one loaded data shard.

    Args:
        data: The ``RayDMatrix`` (or subclass) describing the data set. Its
            type selects which XGBoost matrix class is constructed, and its
            ``feature_names``, ``feature_types``, ``missing`` and
            ``enable_categorical`` attributes are read here.
        param: Loaded shard data keyed by field name (``data``, ``label``,
            ``weight``, ...). Each field is either a single object or a list
            of shards. **This dict is modified in place.**

    Returns:
        The constructed XGBoost matrix, after
        ``data.update_matrix_properties`` has been applied to it.
    """
    if QUANTILE_AVAILABLE and isinstance(data, RayQuantileDMatrix):
        if isinstance(param["data"], list):
            qdm_param = _prepare_dmatrix_params(param)
            param.update(qdm_param)
        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical
        matrix = xgb.QuantileDMatrix(**param)

    # NOTE(review): this is a plain `if`, not `elif`. For a RayQuantileDMatrix
    # that is not also a RayDeviceQuantileDMatrix the `else` branch below runs
    # as well and rebinds `matrix` to a plain `xgb.DMatrix`. Left unchanged
    # here -- confirm whether that is intended.
    if not LEGACY_MATRIX and isinstance(data, RayDeviceQuantileDMatrix):
        # If we only got a single data shard, create a list so we can
        # iterate over it. Every per-shard field is checked on its own;
        # `base_margin` used to be guarded by a re-check of `data`, which was
        # already a list by then, so it was never wrapped.
        for key in (
            "data",
            "label",
            "weight",
            "feature_weights",
            "qid",
            "base_margin",
        ):
            if not isinstance(param[key], list):
                param[key] = [param[key]]

        # Label bounds are always passed as a single empty entry here.
        param["label_lower_bound"] = [None]
        param["label_upper_bound"] = [None]

        dm_param = {
            "feature_names": data.feature_names,
            "feature_types": data.feature_types,
            "missing": data.missing,
        }
        if data.enable_categorical is not None:
            dm_param["enable_categorical"] = data.enable_categorical
        param.update(dm_param)
        it = RayDataIter(**param)
        matrix = xgb.DeviceQuantileDMatrix(it, **dm_param)
    else:
        if isinstance(param["data"], list):
            dm_param = _prepare_dmatrix_params(param)
            param.update(dm_param)

        # These fields are not passed to the DMatrix constructor; on
        # non-legacy matrices they are applied through `set_info` below.
        ll = param.pop("label_lower_bound", None)
        lu = param.pop("label_upper_bound", None)
        fw = param.pop("feature_weights", None)

        if LEGACY_MATRIX:
            param.pop("base_margin", None)

        # Drop `qid` when this xgboost version's DMatrix does not accept it.
        if "qid" not in inspect.signature(xgb.DMatrix).parameters:
            param.pop("qid", None)

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.DMatrix(**param)

        if not LEGACY_MATRIX:
            matrix.set_info(
                label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw
            )

    data.update_matrix_properties(matrix)
    return matrix
def _validate_ray_params(ray_params: Union[None, RayParams, dict]) -> RayParams:
    """Normalize and sanity-check the user supplied ``ray_params``.

    Args:
        ray_params: ``None`` (use defaults), a dict of ``RayParams`` keyword
            arguments, or a ``RayParams`` instance.

    Returns:
        A ``RayParams`` instance. A passed-in instance is returned itself and
        may have its ``verbose`` field filled in.

    Raises:
        ValueError: If ``ray_params`` has an unsupported type, or if
            ``num_actors`` is not greater than 0.
    """
    if isinstance(ray_params, RayParams):
        resolved = ray_params
    elif ray_params is None:
        resolved = RayParams()
    elif isinstance(ray_params, dict):
        resolved = RayParams(**ray_params)
    else:
        raise ValueError(
            f"`ray_params` must be a `RayParams` instance, a dict, or None, "
            f"but it was {type(ray_params)}."
            f"\nFIX THIS preferably by passing a `RayParams` instance as "
            f"the `ray_params` parameter."
        )

    actor_count = resolved.num_actors
    if actor_count <= 0:
        raise ValueError(
            "The `num_actors` parameter is set to 0. Please always specify "
            "the number of distributed actors you want to use."
            "\nFIX THIS by passing a `RayParams(num_actors=X)` argument "
            "to your call to xgboost_ray."
        )
    if actor_count < 2:
        # A single actor is allowed, but the user is told nothing is
        # distributed in that case.
        warnings.warn(
            f"`num_actors` in `ray_params` is smaller than 2 "
            f"({resolved.num_actors}). XGBoost will NOT be distributed!"
        )

    # Only pick a default verbosity when the user left it unset: verbose
    # unless running inside a Tune session.
    if resolved.verbose is None and RAY_TUNE_INSTALLED:
        resolved.verbose = not _in_ray_tune_session()

    return resolved
Defaults to ``5``, saving checkpoints every 5 boosting rounds. """ def __init__( self, rank: int, num_actors: int, queue: Optional[Queue] = None, stop_event: Optional[Event] = None, checkpoint_frequency: int = 5, distributed_callbacks: Optional[List[DistributedCallback]] = None, ): self.queue = queue init_session(rank, self.queue) self.rank = rank self.num_actors = num_actors self.checkpoint_frequency = checkpoint_frequency self._data: Dict[RayDMatrix, dict] = {} self._local_n: Dict[RayDMatrix, int] = {} self._stop_event = stop_event self._distributed_callbacks = DistributedCallbackContainer( distributed_callbacks ) self._distributed_callbacks.on_init(self) _set_omp_num_threads() logger.debug(f"Initialized remote XGBoost actor with rank {self.rank}") def set_queue(self, queue: Queue): self.queue = queue set_session_queue(self.queue) def set_stop_event(self, stop_event: Event): self._stop_event = stop_event def _get_stop_event(self): return self._stop_event def pid(self): """Get process PID. Used for checking if still alive""" return os.getpid() def ip(self): """Get node IP address.""" return get_node_ip_address() def _save_checkpoint_callback(self): """Send checkpoints to driver""" this = self class _SaveInternalCheckpointCallback(TrainingCallback): def after_iteration(self, model, epoch, evals_log): if get_rabit_rank() == 0 and epoch % this.checkpoint_frequency == 0: put_queue(_Checkpoint(epoch, pickle.dumps(model))) def after_training(self, model): if get_rabit_rank() == 0: put_queue(_Checkpoint(-1, pickle.dumps(model))) return model return _SaveInternalCheckpointCallback() def _stop_callback(self): """Stop if event is set""" this = self # Keep track of initial stop event. Since we're training in a thread, # the stop event might be overwritten, which should he handled # as if the previous stop event was set. 
initial_stop_event = self._stop_event class _StopCallback(TrainingCallback): def after_iteration(self, model, epoch, evals_log): try: if ( this._stop_event.is_set() or this._get_stop_event() is not initial_stop_event ): if LEGACY_CALLBACK: raise EarlyStopException(epoch) # Returning True stops training return True except RayActorError: if LEGACY_CALLBACK: raise EarlyStopException(epoch) return True return _StopCallback() def load_data(self, data: RayDMatrix): if data in self._data: return self._distributed_callbacks.before_data_loading(self, data) param = data.get_data(self.rank, self.num_actors) if isinstance(param["data"], list): self._local_n[data] = sum(len(a) for a in param["data"]) else: self._local_n[data] = len(param["data"]) # set nthread for dmatrix conversion param["nthread"] = int(_ray_get_actor_cpus()) self._data[data] = param self._distributed_callbacks.after_data_loading(self, data) def train( self, rabit_args: List[str], return_bst: bool, params: Dict[str, Any], dtrain: RayDMatrix, evals: Tuple[RayDMatrix, str], *args, **kwargs, ) -> Dict[str, Any]: self._distributed_callbacks.before_train(self) num_threads = _set_omp_num_threads() local_params = params.copy() if "xgb_model" in kwargs: if isinstance(kwargs["xgb_model"], bytes): # bytearray type gets lost in remote actor call kwargs["xgb_model"] = bytearray(kwargs["xgb_model"]) if "nthread" not in local_params and "n_jobs" not in local_params: if num_threads > 0: local_params["nthread"] = num_threads local_params["n_jobs"] = num_threads else: local_params["nthread"] = _ray_get_actor_cpus() local_params["n_jobs"] = local_params["nthread"] if dtrain not in self._data: self.load_data(dtrain) for deval, _name in evals: if deval not in self._data: self.load_data(deval) evals_result = dict() if "callbacks" in kwargs: callbacks = kwargs["callbacks"] or [] else: callbacks = [] callbacks.append(self._save_checkpoint_callback()) callbacks.append(self._stop_callback()) kwargs["callbacks"] = callbacks 
result_dict = {} error_dict = {} # We run xgb.train in a thread to be able to react to the stop event. def _train(): try: with _RabitContext(str(id(self)), rabit_args): local_dtrain = _get_dmatrix(dtrain, self._data[dtrain]) if not local_dtrain.get_label().size: raise RuntimeError( "Training data has no label set. Please make sure " "to set the `label` argument when initializing " "`RayDMatrix()` for data you would like " "to train on." ) local_evals = [] for deval, name in evals: local_evals.append( (_get_dmatrix(deval, self._data[deval]), name) ) if LEGACY_CALLBACK: for xgb_callback in kwargs.get("callbacks", []): if isinstance(xgb_callback, TrainingCallback): xgb_callback.before_training(None) bst = xgb.train( local_params, local_dtrain, *args, evals=local_evals, evals_result=evals_result, **kwargs, ) if LEGACY_CALLBACK: for xgb_callback in kwargs.get("callbacks", []): if isinstance(xgb_callback, TrainingCallback): xgb_callback.after_training(bst) result_dict.update( { "bst": bst, "evals_result": evals_result, "train_n": self._local_n[dtrain], } ) except EarlyStopException: # Usually this should be caught by XGBoost core. # Silent fail, will be raised as RayXGBoostTrainingStopped. 
return except XGBoostError as e: error_dict.update({"exception": e}) return thread = threading.Thread(target=_train) thread.daemon = True thread.start() while thread.is_alive(): thread.join(timeout=0) if self._stop_event.is_set(): raise RayXGBoostTrainingStopped("Training was interrupted.") time.sleep(0.1) if not result_dict: raise_from = error_dict.get("exception", None) raise RayXGBoostTrainingError("Training failed.") from raise_from thread.join() self._distributed_callbacks.after_train(self, result_dict) if not return_bst: result_dict.pop("bst", None) return result_dict def predict(self, model: xgb.Booster, data: RayDMatrix, **kwargs): self._distributed_callbacks.before_predict(self) _set_omp_num_threads() if data not in self._data: self.load_data(data) local_data = _get_dmatrix(data, self._data[data]) predictions = model.predict(local_data, **kwargs) if predictions.ndim == 1: callback_predictions = pd.Series(predictions) else: callback_predictions = pd.DataFrame(predictions) self._distributed_callbacks.after_predict(self, callback_predictions) return predictions @ray.remote class _RemoteRayXGBoostActor(RayXGBoostActor): pass class _PrepareActorTask(MultiActorTask): def __init__( self, actor: ActorHandle, queue: Queue, stop_event: Event, load_data: List[RayDMatrix], ): futures = [] futures.append(actor.set_queue.remote(queue)) futures.append(actor.set_stop_event.remote(stop_event)) for data in load_data: futures.append(actor.load_data.remote(data)) super(_PrepareActorTask, self).__init__(futures) def _autodetect_resources( ray_params: RayParams, use_tree_method: bool = False ) -> Tuple[int, int]: gpus_per_actor = ray_params.gpus_per_actor cpus_per_actor = ray_params.cpus_per_actor # Automatically set gpus_per_actor if left at the default value if gpus_per_actor == -1: gpus_per_actor = 0 if use_tree_method: gpus_per_actor = 1 # Automatically set cpus_per_actor if left at the default value # Will be set to the number of cluster CPUs divided by the number of # 
actors, bounded by the minimum number of CPUs across actors nodes. if cpus_per_actor <= 0: cluster_cpus = _ray_get_cluster_cpus() or 1 cpus_per_actor = max( 1, min( int(_get_min_node_cpus() or 1), int(cluster_cpus // ray_params.num_actors), ), ) return cpus_per_actor, gpus_per_actor def _create_actor( rank: int, num_actors: int, num_cpus_per_actor: int, num_gpus_per_actor: int, resources_per_actor: Optional[Dict] = None, placement_group: Optional[PlacementGroup] = None, queue: Optional[Queue] = None, checkpoint_frequency: int = 5, distributed_callbacks: Optional[Sequence[DistributedCallback]] = None, ) -> ActorHandle: # Send DEFAULT_PG here, which changed in Ray >= 1.5.0 # If we send `None`, this will ignore the parent placement group and # lead to errors e.g. when used within Ray Tune actor_cls = _RemoteRayXGBoostActor.options( num_cpus=num_cpus_per_actor, num_gpus=num_gpus_per_actor, resources=resources_per_actor, scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=placement_group or DEFAULT_PG, placement_group_capture_child_tasks=True, ), ) return actor_cls.remote( rank=rank, num_actors=num_actors, queue=queue, checkpoint_frequency=checkpoint_frequency, distributed_callbacks=distributed_callbacks, ) def _trigger_data_load(actor, dtrain, evals): wait_load = [actor.load_data.remote(dtrain)] for deval, _name in evals: wait_load.append(actor.load_data.remote(deval)) return wait_load def _handle_queue(queue: Queue, checkpoint: _Checkpoint, callback_returns: Dict): """Handle results obtained from workers through the remote Queue object. Remote actors supply these results via the ``xgboost_ray.session.put_queue()`` function. These can be: - Callables. These will be called immediately with no arguments. - ``_Checkpoint`` objects. These will update the latest checkpoint object on the driver. - Any other type. 
def _shutdown(
    actors: List[ActorHandle],
    pending_actors: Optional[Dict[int, Tuple[ActorHandle, _PrepareActorTask]]] = None,
    queue: Optional[Queue] = None,
    event: Optional[Event] = None,
    placement_group: Optional[PlacementGroup] = None,
    force: bool = False,
):
    """Terminate training actors and auxiliary Ray resources.

    Args:
        actors: List of actor handles (entries may be None). All entries
            are set to None in place.
        pending_actors: Optional mapping whose values are
            ``(actor, task)`` tuples; these actors are terminated as well.
        queue: Optional queue to shut down.
        event: Optional event to shut down.
        placement_group: Optional placement group to remove.
        force: If True, actors are killed right away. Otherwise they are
            asked to terminate and only killed if they did not all
            finish within 5 seconds.
    """
    alive_actors = [a for a in actors if a is not None]
    if pending_actors:
        alive_actors += [a for (a, _) in pending_actors.values()]

    if force:
        for actor in alive_actors:
            ray.kill(actor)
    else:
        done_refs = [a.__ray_terminate__.remote() for a in alive_actors]
        # Wait 5 seconds for actors to die gracefully. `num_returns` has to
        # cover all refs: with the default of 1, `ray.wait` returns after the
        # first actor finished and reports every other one as not done.
        _, not_done = ray.wait(done_refs, num_returns=len(done_refs), timeout=5)
        if not_done:
            # If all actors are not able to die gracefully, then kill them.
            for actor in alive_actors:
                ray.kill(actor)

    # Clear the caller's list in place.
    actors[:] = [None] * len(actors)

    if queue:
        queue.shutdown()
    if event:
        event.shutdown()
    if placement_group:
        remove_placement_group(placement_group)
logger.debug("Waiting for placement group to start.") timeout = ENV.PLACEMENT_GROUP_TIMEOUT_S ready, _ = ray.wait([pg.ready()], timeout=timeout) if ready: logger.debug("Placement group has started.") else: raise TimeoutError( f"Placement group creation timed out after {timeout} seconds. " "Make sure your cluster either has enough resources or use " "an autoscaling cluster. Current resources " f"available: {ray.available_resources()}, resources requested " f"by the placement group: {pg.bundle_specs}. " "You can change the timeout by setting the " "RXGB_PLACEMENT_GROUP_TIMEOUT_S environment variable." ) return pg def _create_communication_processes(added_tune_callback: bool = False): # Have to explicitly set num_cpus to 0. placement_option = {"num_cpus": 0} current_pg = get_current_placement_group() if current_pg is not None: # If we are already in a placement group, let's use it # Also, if we are specifically in Tune, let's # ensure that we force Queue and # StopEvent onto same bundle as the Trainable. placement_option.update( { "placement_group": current_pg, "placement_group_bundle_index": 0 if added_tune_callback else -1, } ) else: # Create Queue and Event actors and make sure to colocate with # driver node. 
node_id = ray.get_runtime_context().get_node_id() placement_option.update( { "scheduling_strategy": NodeAffinitySchedulingStrategy( node_id=node_id, soft=ENV.COMMUNICATION_SOFT_PLACEMENT, ) } ) queue = Queue(actor_options=placement_option) # Queue actor stop_event = Event(actor_options=placement_option) # Stop event actor return queue, stop_event def _validate_kwargs_for_func(kwargs: Dict[str, Any], func: Callable, func_name: str): """Raise exception if kwargs are not valid for a given function.""" sig = inspect.signature(func) try: sig.bind_partial(**kwargs) except TypeError as e: # Try to find set of invalid kwargs valid_keys = inspect.getfullargspec(func)[0] invalid_kwargs = [k for k in kwargs if k not in valid_keys] raise TypeError( f"Got invalid keyword arguments to be passed to `{func_name}`. " f"Please check these arguments: {invalid_kwargs}" ) from e @dataclass class _TrainingState: actors: List[Optional[ActorHandle]] queue: Queue stop_event: Event checkpoint: _Checkpoint additional_results: Dict training_started_at: float = 0.0 placement_group: Optional[PlacementGroup] = None failed_actor_ranks: set = field(default_factory=set) # Last time we checked resources to schedule new actors last_resource_check_at: float = 0 pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = field( default_factory=dict ) restart_training_at: Optional[float] = None def _train( params: Dict, dtrain: RayDMatrix, *args, evals=(), ray_params: RayParams, cpus_per_actor: int, gpus_per_actor: int, _training_state: _TrainingState, **kwargs, ) -> Tuple[xgb.Booster, Dict, Dict]: """This is the local train function wrapped by :func:`train() `. This function can be thought of one invocation of a multi-actor xgboost training run. It starts the required number of actors, triggers data loading, collects the results, and handles (i.e. registers) actor failures - but it does not handle fault tolerance or general training setup. 
Generally, this function is called one or multiple times by the :func:`train() ` function. It is called exactly once if no errors occur. It is called more than once if errors occurred (e.g. an actor died) and failure handling is enabled. """ from xgboost_ray.elastic import ( _get_actor_alive_status, _maybe_schedule_new_actors, _update_scheduled_actor_states, ) # Do not modify original parameters params = params.copy() # Un-schedule possible scheduled restarts _training_state.restart_training_at = None if "nthread" in params or "n_jobs" in params: if ("nthread" in params and params["nthread"] > cpus_per_actor) or ( "n_jobs" in params and params["n_jobs"] > cpus_per_actor ): raise ValueError( "Specified number of threads greater than number of CPUs. " "\nFIX THIS by passing a lower value for the `nthread` " "parameter or a higher number for `cpus_per_actor`." ) else: params["nthread"] = cpus_per_actor params["n_jobs"] = cpus_per_actor if ray_params.verbose: maybe_log = logger.info params.setdefault("verbosity", 1) else: maybe_log = logger.debug params.setdefault("verbosity", 0) # This is a callback that handles actor failures. # We identify the rank of the failed actor, add this to a set of # failed actors (which we might want to restart later), and set its # entry in the actor list to None. def handle_actor_failure(actor_id): rank = _training_state.actors.index(actor_id) _training_state.failed_actor_ranks.add(rank) _training_state.actors[rank] = None # Here we create new actors. In the first invocation of _train(), this # will be all actors. In future invocations, this may be less than # the num_actors setting, depending on the failure mode. newly_created = 0 for i in list(_training_state.failed_actor_ranks): if _training_state.actors[i] is not None: raise RuntimeError( f"Trying to create actor with rank {i}, but it already " f"exists." 
) actor = _create_actor( rank=i, num_actors=ray_params.num_actors, num_cpus_per_actor=cpus_per_actor, num_gpus_per_actor=gpus_per_actor, resources_per_actor=ray_params.resources_per_actor, placement_group=_training_state.placement_group, queue=_training_state.queue, checkpoint_frequency=ray_params.checkpoint_frequency, distributed_callbacks=ray_params.distributed_callbacks, ) # Set actor entry in our list _training_state.actors[i] = actor # Remove from this set so it is not created again _training_state.failed_actor_ranks.remove(i) newly_created += 1 alive_actors = sum(1 for a in _training_state.actors if a is not None) maybe_log( f"[RayXGBoost] Created {newly_created} new actors " f"({alive_actors} total actors). Waiting until actors " f"are ready for training." ) # For distributed datasets (e.g. Modin), this will initialize # (and fix) the assignment of data shards to actor ranks dtrain.assert_enough_shards_for_actors(num_actors=ray_params.num_actors) dtrain.assign_shards_to_actors(_training_state.actors) for deval, _ in evals: deval.assert_enough_shards_for_actors(num_actors=ray_params.num_actors) deval.assign_shards_to_actors(_training_state.actors) load_data = [dtrain] + [eval[0] for eval in evals] prepare_actor_tasks = [ _PrepareActorTask( actor, # Maybe we got a new Queue actor, so send it to all actors. queue=_training_state.queue, # Maybe we got a new Event actor, so send it to all actors. stop_event=_training_state.stop_event, # Trigger data loading load_data=load_data, ) for actor in _training_state.actors if actor is not None ] start_wait = time.time() last_status = start_wait try: # Construct list before calling any() to force evaluation ready_states = [task.is_ready() for task in prepare_actor_tasks] while not all(ready_states): if time.time() >= last_status + ENV.STATUS_FREQUENCY_S: wait_time = time.time() - start_wait logger.info( f"Waiting until actors are ready " f"({wait_time:.0f} seconds passed)." 
) last_status = time.time() time.sleep(0.1) ready_states = [task.is_ready() for task in prepare_actor_tasks] except Exception as exc: _training_state.stop_event.set() _get_actor_alive_status(_training_state.actors, handle_actor_failure) raise RayActorError from exc maybe_log("[RayXGBoost] Starting XGBoost training.") # Start Rabit tracker for gradient sharing rabit_process, rabit_args = _start_rabit_tracker(alive_actors) # Load checkpoint if we have one. In that case we need to adjust the # number of training rounds. if _training_state.checkpoint.value: kwargs["xgb_model"] = pickle.loads(_training_state.checkpoint.value) if _training_state.checkpoint.iteration == -1: # -1 means training already finished. logger.error( "Trying to load continue from checkpoint, but the checkpoint" "indicates training already finished. Returning last" "checkpointed model instead." ) return kwargs["xgb_model"], {}, _training_state.additional_results # The callback_returns dict contains actor-rank indexed lists of # results obtained through the `put_queue` function, usually # sent via callbacks. callback_returns = _training_state.additional_results.get("callback_returns") if callback_returns is None: callback_returns = [list() for _ in range(len(_training_state.actors))] _training_state.additional_results["callback_returns"] = callback_returns _training_state.training_started_at = time.time() # Trigger the train function live_actors = [actor for actor in _training_state.actors if actor is not None] training_futures = [ actor.train.remote( rabit_args, i == 0, params, dtrain, evals, *args, **kwargs # return_bst ) for i, actor in enumerate(live_actors) ] # Failure handling loop. Here we wait until all training tasks finished. # If a training task fails, we stop training on the remaining actors, # check which ones are still alive, and raise the error. # The train() wrapper function will then handle the error. 
start_wait = time.time() last_status = start_wait # When the number of trees/dataset size is very small, # training can be too fast and finish before the queue Actor # gets to process the calls it has recieved. This is a very rare edge # case, but it can show up in CI. # In order to mitigate, if the queue has not been handled before, # we simply wait a moment before checking it one last time. has_queue_been_handled = False try: not_ready = training_futures while not_ready: if _training_state.queue: has_queue_been_handled = True _handle_queue( queue=_training_state.queue, checkpoint=_training_state.checkpoint, callback_returns=callback_returns, ) if ray_params.elastic_training and not ENV.ELASTIC_RESTART_DISABLED: _maybe_schedule_new_actors( training_state=_training_state, num_cpus_per_actor=cpus_per_actor, num_gpus_per_actor=gpus_per_actor, resources_per_actor=ray_params.resources_per_actor, ray_params=ray_params, load_data=load_data, ) # This may raise RayXGBoostActorAvailable _update_scheduled_actor_states(_training_state) if time.time() >= last_status + ENV.STATUS_FREQUENCY_S: wait_time = time.time() - start_wait logger.info( f"Training in progress " f"({wait_time:.0f} seconds since last restart)." 
) last_status = time.time() ready, not_ready = ray.wait( not_ready, num_returns=len(not_ready), timeout=1 ) ray.get(ready) # Get items from queue one last time if not has_queue_been_handled: time.sleep(1) if _training_state.queue: _handle_queue( queue=_training_state.queue, checkpoint=_training_state.checkpoint, callback_returns=callback_returns, ) # The inner loop should catch all exceptions except Exception as exc: logger.debug(f"Caught exception in training loop: {exc}") # Stop all other actors from training _training_state.stop_event.set() # Check which actors are still alive _get_actor_alive_status(_training_state.actors, handle_actor_failure) # Todo: Try to fetch newer checkpoint, store in `_checkpoint` # Shut down rabit _stop_rabit_tracker(rabit_process) raise RayActorError from exc # Training is now complete. # Stop Rabit tracking process _stop_rabit_tracker(rabit_process) # Get all results from all actors. all_results: List[Dict[str, Any]] = ray.get(training_futures) # All results should be the same because of Rabit tracking. But only # the first one actually returns its bst object. bst = all_results[0]["bst"] evals_result = all_results[0]["evals_result"] if callback_returns: _training_state.additional_results["callback_returns"] = callback_returns total_n = sum(res["train_n"] or 0 for res in all_results) _training_state.additional_results["total_n"] = total_n return bst, evals_result, _training_state.additional_results @PublicAPI(stability="beta") def train( params: Dict, dtrain: RayDMatrix, num_boost_round: int = 10, *args, evals: Union[List[Tuple[RayDMatrix, str]], Tuple[RayDMatrix, str]] = (), evals_result: Optional[Dict] = None, additional_results: Optional[Dict] = None, ray_params: Union[None, RayParams, Dict] = None, _remote: Optional[bool] = None, **kwargs, ) -> xgb.Booster: """Distributed XGBoost training via Ray. 
This function will connect to a Ray cluster, create ``num_actors`` remote actors, send data shards to them, and have them train an XGBoost classifier. The XGBoost parameters will be shared and combined via Rabit's all-reduce protocol. If running inside a Ray Tune session, this function will automatically handle results to tune for hyperparameter search. Failure handling: XGBoost on Ray supports automatic failure handling that can be configured with the :class:`ray_params ` argument. If an actor or local training task dies, the Ray actor is marked as dead, and there are three options on how to proceed. First, if ``ray_params.elastic_training`` is ``True`` and the number of dead actors is below ``ray_params.max_failed_actors``, training will continue right away with fewer actors. No data will be loaded again and the latest available checkpoint will be used. A maximum of ``ray_params.max_actor_restarts`` restarts will be tried before exiting. Second, if ``ray_params.elastic_training`` is ``False`` and the number of restarts is below ``ray_params.max_actor_restarts``, Ray will try to schedule the dead actor again, load the data shard on this actor, and then continue training from the latest checkpoint. Third, if none of the above is the case, training is aborted. Args: params: parameter dict passed to ``xgboost.train()`` dtrain: Data object containing the training data. evals: ``evals`` tuple passed to ``xgboost.train()``. evals_result: Dict to store evaluation results in. additional_results: Dict to store additional results. ray_params: Parameters to configure Ray-specific behavior. See :class:`RayParams` for a list of valid configuration parameters. _remote: Whether to run the driver process in a remote function. This is enabled by default in Ray client mode. **kwargs: Keyword arguments will be passed to the local `xgb.train()` calls. Returns: An ``xgboost.Booster`` object. 
""" os.environ.setdefault("RAY_IGNORE_UNHANDLED_ERRORS", "1") if platform.system() == "Windows": raise RuntimeError( "xgboost-ray training currently does not support " "Windows." ) if xgb is None: raise ImportError( "xgboost package is not installed. XGBoost-Ray WILL NOT WORK. " 'FIX THIS by running `pip install "xgboost-ray"`.' ) if _remote is None: _remote = _is_client_connected() and not _in_ray_tune_session() if not ray.is_initialized(): ray.init() if _remote: # Run this function as a remote function to support Ray client mode. @ray.remote(num_cpus=0) def _wrapped(*args, **kwargs): _evals_result = {} _additional_results = {} bst = train( *args, num_boost_round=num_boost_round, evals_result=_evals_result, additional_results=_additional_results, **kwargs, ) return bst, _evals_result, _additional_results # Make sure that train is called on the server node. _wrapped = force_on_current_node(_wrapped) bst, train_evals_result, train_additional_results = ray.get( _wrapped.remote( params, dtrain, *args, evals=evals, ray_params=ray_params, _remote=False, **kwargs, ) ) if isinstance(evals_result, dict): evals_result.update(train_evals_result) if isinstance(additional_results, dict): additional_results.update(train_additional_results) return bst _maybe_print_legacy_warning() # may raise TypeError _validate_kwargs_for_func(kwargs, xgb.train, "xgb.train()") start_time = time.time() ray_params = _validate_ray_params(ray_params) max_actor_restarts = ( ray_params.max_actor_restarts if ray_params.max_actor_restarts >= 0 else float("inf") ) _assert_ray_support() if not isinstance(dtrain, RayDMatrix): raise ValueError( "The `dtrain` argument passed to `train()` is not a RayDMatrix, " "but of type {}. 
" "\nFIX THIS by instantiating a RayDMatrix first: " "`dtrain = RayDMatrix(data=data, label=label)`.".format(type(dtrain)) ) if RAY_TUNE_INSTALLED: added_tune_callback = _try_add_tune_callback(kwargs) else: added_tune_callback = False # Tune currently does not support elastic training. if ( added_tune_callback and ray_params.elastic_training and not bool(os.getenv("RXGB_ALLOW_ELASTIC_TUNE", "0")) ): raise ValueError( "Elastic Training cannot be used with Ray Tune. " "Please disable elastic_training in RayParams in " "order to use xgboost_ray with Tune." ) if added_tune_callback or get_current_placement_group(): # Don't autodetect resources when used with Tune. cpus_per_actor = ray_params.cpus_per_actor gpus_per_actor = max(0, ray_params.gpus_per_actor) else: cpus_per_actor, gpus_per_actor = _autodetect_resources( ray_params=ray_params, use_tree_method="tree_method" in params and params["tree_method"] is not None and params["tree_method"].startswith("gpu"), ) tree_method = params.get("tree_method", "auto") or "auto" # preemptively raise exceptions with bad params if tree_method == "exact": raise ValueError("`exact` tree method doesn't support distributed training.") if params.get("updater", None) == "grow_colmaker": raise ValueError( "`grow_colmaker` updater doesn't support distributed training." ) if gpus_per_actor > 0 and not tree_method.startswith("gpu_"): warnings.warn( f"GPUs have been assigned to the actors, but the current XGBoost " f"tree method is set to `{tree_method}`. Thus, GPUs will " f"currently not be used. To enable GPUs usage, please set the " f"`tree_method` to a GPU-compatible option, " f"e.g. `gpu_hist`." ) if gpus_per_actor == 0 and cpus_per_actor == 0: raise ValueError( "cpus_per_actor and gpus_per_actor both cannot be " "0. Are you sure your cluster has CPUs available?" ) if ray_params.elastic_training and ray_params.max_failed_actors == 0: raise ValueError( "Elastic training enabled but the maximum number of failed " "actors is set to 0. 
This means that elastic training is " "effectively disabled. Please set `RayParams.max_failed_actors` " "to something larger than 0 to enable elastic training." ) if ray_params.elastic_training and ray_params.max_actor_restarts == 0: raise ValueError( "Elastic training enabled but the maximum number of actor " "restarts is set to 0. This means that elastic training is " "effectively disabled. Please set `RayParams.max_actor_restarts` " "to something larger than 0 to enable elastic training." ) if not dtrain.has_label: raise ValueError( "Training data has no label set. Please make sure to set " "the `label` argument when initializing `RayDMatrix()` " "for data you would like to train on." ) if not dtrain.loaded and not dtrain.distributed: dtrain.load_data(ray_params.num_actors) for deval, _name in evals: if not deval.has_label: raise ValueError( "Evaluation data has no label set. Please make sure to set " "the `label` argument when initializing `RayDMatrix()` " "for data you would like to evaluate on." ) if not deval.loaded and not deval.distributed: deval.load_data(ray_params.num_actors) bst = None train_evals_result = {} train_additional_results = {} tries = 0 checkpoint = _Checkpoint() # Keep track of latest checkpoint current_results = {} # Keep track of additional results actors = [None] * ray_params.num_actors # All active actors pending_actors = {} # Create the Queue and Event actors. queue, stop_event = _create_communication_processes(added_tune_callback) placement_strategy = None if not ray_params.elastic_training: if added_tune_callback or get_current_placement_group(): # Tune is using placement groups, so the strategy has already # been set. Don't create an additional placement_group here. 
placement_strategy = None elif bool(ENV.USE_SPREAD_STRATEGY): placement_strategy = "SPREAD" if placement_strategy is not None: pg = _create_placement_group( cpus_per_actor, gpus_per_actor, ray_params.resources_per_actor, ray_params.num_actors, placement_strategy, ) else: pg = None start_actor_ranks = set(range(ray_params.num_actors)) # Start these total_training_time = 0.0 boost_rounds_left = num_boost_round last_checkpoint_value = checkpoint.value while tries <= max_actor_restarts: # Only update number of iterations if the checkpoint changed # If it didn't change, we already subtracted the iterations. if checkpoint.iteration >= 0 and checkpoint.value != last_checkpoint_value: boost_rounds_left -= checkpoint.iteration + 1 last_checkpoint_value = checkpoint.value logger.debug(f"Boost rounds left: {boost_rounds_left}") training_state = _TrainingState( actors=actors, queue=queue, stop_event=stop_event, checkpoint=checkpoint, additional_results=current_results, training_started_at=0.0, placement_group=pg, failed_actor_ranks=start_actor_ranks, pending_actors=pending_actors, ) try: bst, train_evals_result, train_additional_results = _train( params, dtrain, boost_rounds_left, *args, evals=evals, ray_params=ray_params, cpus_per_actor=cpus_per_actor, gpus_per_actor=gpus_per_actor, _training_state=training_state, **kwargs, ) if training_state.training_started_at > 0.0: total_training_time += time.time() - training_state.training_started_at break except (RayActorError, RayTaskError) as exc: if training_state.training_started_at > 0.0: total_training_time += time.time() - training_state.training_started_at alive_actors = sum(1 for a in actors if a is not None) start_again = False if ray_params.elastic_training: if alive_actors < ray_params.num_actors - ray_params.max_failed_actors: raise RuntimeError( "A Ray actor died during training and the maximum " "number of dead actors in elastic training was " "reached. Shutting down training." 
) from exc # Do not start new actors before resuming training # (this might still restart actors during training) start_actor_ranks.clear() if exc.__cause__ and isinstance( exc.__cause__, RayXGBoostActorAvailable ): # New actor available, integrate into training loop logger.info( f"A new actor became available. Re-starting training " f"from latest checkpoint with new actor. " f"This will use {alive_actors} existing actors and " f"start {len(start_actor_ranks)} new actors. " f"Sleeping for 10 seconds for cleanup." ) tries -= 1 # This is deliberate so shouldn't count start_again = True elif tries + 1 <= max_actor_restarts: if exc.__cause__ and isinstance( exc.__cause__, RayXGBoostTrainingError ): logger.warning(f"Caught exception: {exc.__cause__}") logger.warning( f"A Ray actor died during training. Trying to " f"continue training on the remaining actors. " f"This will use {alive_actors} existing actors and " f"start {len(start_actor_ranks)} new actors. " f"Sleeping for 10 seconds for cleanup." ) start_again = True elif tries + 1 <= max_actor_restarts: if exc.__cause__ and isinstance(exc.__cause__, RayXGBoostTrainingError): logger.warning(f"Caught exception: {exc.__cause__}") logger.warning( f"A Ray actor died during training. Trying to restart " f"and continue training from last checkpoint " f"(restart {tries + 1} of {max_actor_restarts}). " f"This will use {alive_actors} existing actors and start " f"{len(start_actor_ranks)} new actors. " f"Sleeping for 10 seconds for cleanup." ) start_again = True if start_again: time.sleep(5) queue.shutdown() stop_event.shutdown() time.sleep(5) queue, stop_event = _create_communication_processes() else: raise RuntimeError( f"A Ray actor died during training and the maximum number " f"of retries ({max_actor_restarts}) is exhausted." 
def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams, **kwargs):
    """Run a single distributed prediction attempt.

    Starts ``ray_params.num_actors`` actors, triggers the data load on each
    of them, puts ``model`` into the object store and calls ``predict`` on
    every actor. If loading or predicting raises, the actors are shut down
    with ``force=True`` and the exception is re-raised; on success they are
    shut down with ``force=False``.

    Args:
        model: Booster used for prediction.
        data: Matrix holding the prediction data.
        ray_params: Actor count, per-actor resources, verbosity and
            distributed callbacks.
        **kwargs: Forwarded to each actor's ``predict`` call.

    Returns:
        The per-actor results merged by ``combine_data`` according to
        ``data.sharding``.
    """
    _assert_ray_support()

    maybe_log = logger.info if ray_params.verbose else logger.debug

    if not ray.is_initialized():
        ray.init()

    # A negative GPU setting is passed on to the actors as zero GPUs.
    gpus_per_actor = ray_params.gpus_per_actor
    if gpus_per_actor < 0:
        gpus_per_actor = 0

    # Create remote actors
    actors = []
    for rank in range(ray_params.num_actors):
        actors.append(
            _create_actor(
                rank=rank,
                num_actors=ray_params.num_actors,
                num_cpus_per_actor=ray_params.cpus_per_actor,
                num_gpus_per_actor=gpus_per_actor,
                resources_per_actor=ray_params.resources_per_actor,
                distributed_callbacks=ray_params.distributed_callbacks,
            )
        )
    maybe_log(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers: collect every load future before waiting.
    load_futures = [
        future
        for actor in actors
        for future in _trigger_data_load(actor, data, [])
    ]

    try:
        ray.get(load_futures)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(actors, force=True)
        raise

    # Put model into object store so all actors share one copy
    model_ref = ray.put(model)

    maybe_log("[RayXGBoost] Starting XGBoost prediction.")

    # Predict on every actor
    prediction_futures = []
    for actor in actors:
        prediction_futures.append(actor.predict.remote(model_ref, data, **kwargs))

    try:
        actor_results = ray.get(prediction_futures)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(actors=actors, force=True)
        raise

    _shutdown(actors=actors, force=False)

    return combine_data(data.sharding, actor_results)
" "\nFIX THIS by instantiating a RayDMatrix first: " "`data = RayDMatrix(data=data)`.".format(type(data)) ) tries = 0 while tries <= max_actor_restarts: try: return _predict(model, data, ray_params=ray_params, **kwargs) except RayActorError: if tries + 1 <= max_actor_restarts: logger.warning( "A Ray actor died during prediction. Trying to restart " "prediction from scratch. " "Sleeping for 10 seconds for cleanup." ) time.sleep(10) else: raise RuntimeError( "A Ray actor died during prediction and the maximum " "number of retries ({}) is exhausted.".format(max_actor_restarts) ) tries += 1 return None ================================================ FILE: xgboost_ray/matrix.py ================================================ import glob import uuid from enum import Enum from typing import ( TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Type, Union, ) from ray.actor import ActorHandle try: import cupy as cp except ImportError: cp = None import os import numpy as np import pandas as pd import ray from ray import logger from ray.util.annotations import DeveloperAPI, PublicAPI from xgboost_ray.data_sources import DataSource, RayFileType, data_sources try: from ray.data.dataset import Dataset as RayDataset except ImportError: class RayDataset: pass try: from xgboost.core import DataIter LEGACY_MATRIX = False except ImportError: DataIter = object LEGACY_MATRIX = True try: from xgboost.core import QuantileDmatrix QUANTILE_AVAILABLE = True except ImportError: QuantileDmatrix = object QUANTILE_AVAILABLE = False if TYPE_CHECKING: from xgboost_ray.xgb import xgboost as xgb Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series] def concat_dataframes(dfs: List[Optional[pd.DataFrame]]): filtered = [df for df in dfs if df is not None] return pd.concat(filtered, ignore_index=True, copy=False) def ensure_sorted_by_qid( df: pd.DataFrame, qid: Data ) -> Tuple[Union[np.array, str], pd.DataFrame]: _qid: pd.Series = None if isinstance(qid, str): 
_qid = df[qid] elif isinstance(qid, np.ndarray): _qid = pd.Series(qid) elif isinstance(qid, pd.DataFrame): if len(df.shape) != 2 and df.shape[1] != 1: raise ValueError( f"qid argument of type pd.DataFrame is expected" "to contains only 1 column of data " f"but the qid passed in is of shape {df.shape}." ) _qid = qid.iloc[:, 0] elif isinstance(qid, pd.Series): _qid = qid # pandas < 2.0 if getattr(_qid, "is_monotonic", False): return _qid, df # pandas >= 2.0 elif getattr(_qid, "is_monotonic_increasing", False) or getattr( _qid, "is_monotonic_decreasing", False ): return _qid, df else: if isinstance(qid, str): return qid, df.sort_values([qid]) else: # case when qid is not part of df return _qid.sort_values(), df.set_index(_qid).sort_index().reset_index( drop=True ) @PublicAPI(stability="beta") class RayShardingMode(Enum): """Enum for different modes of sharding the data. ``RayShardingMode.INTERLEAVED`` will divide the data per row, i.e. every i-th row will be passed to the first worker, every (i+1)th row to the second worker, etc. ``RayShardingMode.BATCH`` will divide the data in batches, i.e. the first 0-(m-1) rows will be passed to the first worker, the m-(2m-1) rows to the second worker, etc. ``RayShardingMode.FIXED`` is set automatically when using a distributed data source that assigns actors to specific data shards on initialization and then keeps these fixed. 
""" INTERLEAVED = 1 BATCH = 2 FIXED = 3 @DeveloperAPI class RayDataIter(DataIter): def __init__( self, data: List[Data], label: List[Optional[Data]], missing: Optional[float], weight: List[Optional[Data]], feature_weights: List[Optional[Data]], qid: List[Optional[Data]], base_margin: List[Optional[Data]], label_lower_bound: List[Optional[Data]], label_upper_bound: List[Optional[Data]], feature_names: Optional[List[str]], feature_types: Optional[List[np.dtype]], enable_categorical: Optional[bool], ): super(RayDataIter, self).__init__() assert cp is not None self._data = data self._label = label self._missing = missing self._weight = weight self._feature_weights = feature_weights self._qid = qid self._base_margin = base_margin self._label_lower_bound = label_lower_bound self._label_upper_bound = label_upper_bound self._feature_names = feature_names self._feature_types = feature_types self.enable_categorical = enable_categorical self._iter = 0 def __len__(self): return sum(len(shard) for shard in self._data) def reset(self): self._iter = 0 def _prop(self, ref): if ref is None: return None item = ref[self._iter] if item is None: return None if not isinstance(item, cp.ndarray): item = cp.array(item.values) return item def next(self, input_data: Callable): if self._iter >= len(self._data): return 0 input_data( data=self._prop(self._data), label=self._prop(self._label), weight=self._prop(self._weight), feature_weights=self._prop(self._feature_weights), qid=self._prop(self._qid), group=None, label_lower_bound=self._prop(self._label_lower_bound), label_upper_bound=self._prop(self._label_upper_bound), feature_names=self._feature_names, feature_types=self._feature_types, enable_categorical=self._enable_categorical, ) self._iter += 1 return 1 class _RayDMatrixLoader: def __init__( self, data: Data, label: Optional[Data] = None, missing: Optional[float] = None, weight: Optional[Data] = None, feature_weights: Optional[Data] = None, base_margin: Optional[Data] = None, 
label_lower_bound: Optional[Data] = None, label_upper_bound: Optional[Data] = None, feature_names: Optional[List[str]] = None, feature_types: Optional[List[np.dtype]] = None, qid: Optional[Data] = None, enable_categorical: Optional[bool] = None, filetype: Optional[RayFileType] = None, ignore: Optional[List[str]] = None, **kwargs, ): self.data = data self.label = label self.missing = missing self.weight = weight self.feature_weights = feature_weights self.base_margin = base_margin self.label_lower_bound = label_lower_bound self.label_upper_bound = label_upper_bound self.feature_names = feature_names self.feature_types = feature_types self.qid = qid self.enable_categorical = enable_categorical self.data_source = None self.actor_shards = None self.filetype = filetype self.ignore = ignore self.kwargs = kwargs self._cached_n = None check = None if isinstance(data, str): check = data elif isinstance(data, Sequence) and isinstance(data[0], str): check = data[0] if check is not None: if not self.filetype: # Try to guess filetype for data_source in data_sources: self.filetype = data_source.get_filetype(check) if self.filetype: break if not self.filetype: raise ValueError( f"File or stream ({check}) specified as data source, " "but filetype could not be detected. " "\nFIX THIS by passing " "the `filetype` parameter to the RayDMatrix. Use the " "`RayFileType` enum for this." ) def get_data_source(self) -> Type[DataSource]: raise NotImplementedError def assert_enough_shards_for_actors(self, num_actors: int): """Assert that we have enough shards to split across actors.""" # Pass per default pass def update_matrix_properties(self, matrix: "xgb.DMatrix"): data_source = self.get_data_source() data_source.update_feature_names(matrix, self.feature_names) def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool: """Assign data shards to actors. Returns True if shards were assigned to actors. 
In that case, the sharding mode should be adjusted to ``RayShardingMode.FIXED``. Returns False otherwise. """ return False def _split_dataframe( self, local_data: pd.DataFrame, data_source: Type[DataSource] ) -> Tuple[ pd.DataFrame, Optional[pd.Series], Optional[pd.Series], Optional[pd.Series], Optional[pd.Series], Optional[pd.Series], ]: """ Split dataframe into `features`, `labels`, `weight`, `feature_weights`, `base_margin`, `label_lower_bound`, `label_upper_bound` """ # sort dataframe by qid if exists (required by DMatrix) if self.qid is not None: _qid, local_data = ensure_sorted_by_qid(local_data, self.qid) if not isinstance(self.qid, str): self.qid = _qid exclude_cols: Set[str] = set() # Exclude these columns from `x` label, exclude = data_source.get_column(local_data, self.label) if exclude: if isinstance(exclude, List): exclude_cols.update(exclude) else: exclude_cols.add(exclude) weight, exclude = data_source.get_column(local_data, self.weight) if exclude: exclude_cols.add(exclude) feature_weights, exclude = data_source.get_column( local_data, self.feature_weights ) if exclude: exclude_cols.add(exclude) qid, exclude = data_source.get_column(local_data, self.qid) if exclude: exclude_cols.add(exclude) base_margin, exclude = data_source.get_column(local_data, self.base_margin) if exclude: exclude_cols.add(exclude) label_lower_bound, exclude = data_source.get_column( local_data, self.label_lower_bound ) if exclude: exclude_cols.add(exclude) label_upper_bound, exclude = data_source.get_column( local_data, self.label_upper_bound ) if exclude: exclude_cols.add(exclude) x = local_data if exclude_cols: x = x[[col for col in x.columns if col not in exclude_cols]] return ( x, label, weight, feature_weights, base_margin, label_lower_bound, label_upper_bound, qid, ) def load_data( self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None ) -> Tuple[Dict, int]: raise NotImplementedError class _CentralRayDMatrixLoader(_RayDMatrixLoader): """Load full 
def get_data_source(self) -> Type[DataSource]:
    """Pick, cache and return the data source class for central loading.

    Method of ``_CentralRayDMatrixLoader``. The first entry of
    ``data_sources`` that supports central loading and recognises
    ``self.data`` (given ``self.filetype``) wins. The row count reported by
    the source is cached in ``self._cached_n``.

    Raises:
        ValueError: If no data source recognises the data, or if the label
            fails the compatibility check below.
    """
    if self.data_source:
        return self.data_source

    selected = None
    for candidate in data_sources:
        if candidate.supports_central_loading:
            try:
                recognised = candidate.is_data_type(self.data, self.filetype)
            except Exception as exc:
                # If checking the data throws an exception, the data source
                # is not available.
                logger.warning(
                    f"Checking data source {candidate.__name__} failed "
                    f"with exception: {exc}"
                )
                continue
            if recognised:
                selected = candidate
                break

    if not selected:
        raise ValueError(
            "Unknown data source type: {} with FileType: {}."
            "\nFIX THIS by passing a supported data type. Supported "
            "data types include pandas.DataFrame, pandas.Series, "
            "np.ndarray, and CSV/Parquet file paths. If you specify a "
            "file, path, consider passing the `filetype` argument to "
            "specify the type of the source. Use the `RayFileType` "
            "enum for that. If using Modin, Dask, or Petastorm, "
            "make sure the library is installed.".format(
                type(self.data), self.filetype
            )
        )

    label = self.label
    # The check only runs for a non-string label whose Python type equals
    # the type of the data (e.g. list of files + list of label columns).
    label_shares_data_type = (
        label is not None
        and not isinstance(label, str)
        and type(self.data) == type(label)  # noqa: E721
    )
    if label_shares_data_type:
        # A list label together with the Parquet source is treated as
        # multi-label data and is always accepted.
        multi_label_parquet = (
            isinstance(label, list) and selected.__name__ == "Parquet"
        )
        if not selected.is_data_type(label) and not multi_label_parquet:
            raise ValueError(
                "The passed `data` and `label` types are not compatible."
                "\nFIX THIS by passing the same types to the "
                "`RayDMatrix` - e.g. a `pandas.DataFrame` as `data` "
                "and `label`. The `label` can always be a string. Got "
                "{} for the main data and {} for the label.".format(
                    type(self.data), type(label)
                )
            )

    self.data_source = selected
    self._cached_n = selected.get_n(self.data)
    return self.data_source
class _DistributedRayDMatrixLoader(_RayDMatrixLoader):
    """Load each shard individually."""

    def get_data_source(self) -> Type[DataSource]:
        """Pick, cache and return the data source class for distributed loading.

        A string ``self.data`` is normalised to a list first: Petastorm
        URLs and existing files are wrapped in a list, directories are
        expanded to their sorted ``.parquet``/``.csv`` files.

        Raises:
            ValueError: If the data cannot be loaded in a distributed
                fashion, the label is neither a string nor a list, or no
                data source recognises the data.
        """
        if self.data_source:
            return self.data_source

        invalid_data = False
        if isinstance(self.data, str):
            if self.filetype == RayFileType.PETASTORM:
                self.data = [self.data]
            elif os.path.isdir(self.data):
                # NOTE(review): ``glob`` is called without ``recursive=True``,
                # so ``**`` matches exactly one directory level here.
                if self.filetype == RayFileType.PARQUET:
                    self.data = sorted(glob.glob(f"{self.data}/**/*.parquet"))
                elif self.filetype == RayFileType.CSV:
                    self.data = sorted(glob.glob(f"{self.data}/**/*.csv"))
                else:
                    invalid_data = True
            elif os.path.exists(self.data):
                self.data = [self.data]
            else:
                invalid_data = True

        # Todo (krfricke): It would be good to have a more general way to
        # check for compatibility here. Combine with test below?
        if (
            not (
                isinstance(self.data, (Iterable, RayDataset))
                or hasattr(self.data, "__partitioned__")
            )
            or invalid_data
        ):
            raise ValueError(
                f"Distributed data loading only works with already "
                f"distributed datasets. These should be specified through a "
                f"list of locations (or a single string). "
                f"Got: {type(self.data)}."
                f"\nFIX THIS by passing a list of files (e.g. on S3) to the "
                f"RayDMatrix."
            )

        if self.label is not None and not isinstance(self.label, (str, list)):
            raise ValueError(
                f"Invalid `label` value for distributed datasets: "
                f"{self.label}. Only strings are supported. "
                f"\nFIX THIS by passing a string indicating the label "
                f"column of the dataset as the `label` argument."
            )

        data_source = None
        for source in data_sources:
            if not source.supports_distributed_loading:
                continue

            try:
                if source.is_data_type(self.data, self.filetype):
                    data_source = source
                    break
            except Exception as exc:
                # If checking the data throws an exception, the data source
                # is not available.
                logger.warning(
                    f"Checking data source {source.__name__} failed "
                    f"with exception: {exc}"
                )
                continue

        if not data_source:
            raise ValueError(
                f"Invalid data source type: {type(self.data)} "
                f"with FileType: {self.filetype} for a distributed dataset."
                "\nFIX THIS by passing a supported data type. Supported "
                "data types for distributed datasets are a list of "
                "CSV or Parquet sources. If using "
                "Modin, Dask, or Petastorm, make sure the library is "
                "installed."
            )

        self.data_source = data_source
        self._cached_n = data_source.get_n(self.data)
        return self.data_source

    def assert_enough_shards_for_actors(self, num_actors: int):
        """Raise if there are fewer shards than actors.

        Raises:
            RuntimeError: If ``num_actors`` exceeds the shard count and the
                data source needs partitions.
        """
        data_source = self.get_data_source()

        # Ray Datasets will be automatically split to match the number
        # of actors. The check has to look at the data: ``data_source`` is a
        # DataSource class and is never an instance of a Ray Dataset.
        if isinstance(self.data, RayDataset):
            return

        max_num_shards = self._cached_n or data_source.get_n(self.data)
        if num_actors > max_num_shards and data_source.needs_partitions:
            raise RuntimeError(
                f"Trying to shard data for {num_actors} actors, but the "
                f"maximum number of shards is {max_num_shards}. If you "
                f"want to shard the dataset by rows, consider "
                f"centralized loading by passing `distributed=False` to "
                f"the `RayDMatrix`. Otherwise consider using fewer actors "
                f"or re-partitioning your data."
            )

    def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool:
        """Ask the data source for a fixed shard-to-actor assignment.

        Returns True if an assignment exists (either from this call or an
        earlier one), False if the label is not a column name or the data
        source returned no assignment.
        """
        if not isinstance(self.label, str):
            # Currently we only support fixed data sharding for datasets
            # that contain both the label and the data.
            return False

        if self.actor_shards:
            # Only assign once
            return True

        data_source = self.get_data_source()
        data, actor_shards = data_source.get_actor_shards(self.data, actors)
        if not actor_shards:
            return False

        self.data = data
        self.actor_shards = actor_shards
        return True

    def load_data(
        self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None
    ) -> Tuple[Dict, int]:
        """Load the shard of actor ``rank`` and put it into the object store.

        Args:
            num_actors: Total number of actors the data is sharded for.
            sharding: Sharding mode used when no fixed assignment exists.
            rank: Rank of the calling actor; required.

        Returns:
            ``({rank: {name: ObjectRef}}, n)`` with ``n`` the number of rows
            loaded for this rank.

        Raises:
            ValueError: If no rank is given or Ray is not initialized.
        """
        # ``ray.is_initialized`` must be called; the bare function object is
        # always truthy, which silently disabled this part of the check.
        if rank is None or not ray.is_initialized():
            raise ValueError(
                "Distributed loading should be done by the actors, not by the "
                "driver program. "
                "\nFIX THIS by refraining from calling `RayDMatrix.load()` "
                "manually for distributed datasets. Hint: You can check if "
                "`RayDMatrix.distributed` is set to True or False."
            )

        if "OMP_NUM_THREADS" in os.environ:
            del os.environ["OMP_NUM_THREADS"]

        data_source = self.get_data_source()

        if self.actor_shards:
            # Fixed assignment: always load whatever was assigned.
            indices = self.actor_shards[rank]
            has_shard = True
        else:
            total = self._cached_n or data_source.get_n(self.data)
            indices = _get_sharding_indices(sharding, rank, num_actors, total)
            # A rank may end up with nothing when actors outnumber shards.
            has_shard = bool(indices)

        if has_shard:
            local_df = data_source.load_data(
                self.data, ignore=self.ignore, indices=indices, **self.kwargs
            )
            x, y, w, fw, b, ll, lu, qid = self._split_dataframe(
                local_df, data_source=data_source
            )
            n = sum(len(a) for a in x) if isinstance(x, list) else len(x)
        else:
            x = y = w = fw = b = ll = lu = qid = None
            n = 0

        refs = {
            rank: {
                "data": ray.put(x),
                "label": ray.put(y),
                "weight": ray.put(w),
                "feature_weights": ray.put(fw),
                "base_margin": ray.put(b),
                "label_lower_bound": ray.put(ll),
                "label_upper_bound": ray.put(lu),
                "qid": ray.put(qid),
            }
        }

        return refs, n
@PublicAPI(stability="beta") class RayDMatrix: """XGBoost on Ray DMatrix class. This is the data object that the training and prediction functions expect. This wrapper manages distributed data by sharding the data for the workers and storing the shards in the object store. If this class is called without the ``num_actors`` argument, it will be lazy loaded. Thus, it will return immediately and only load the data and store it in the Ray object store after ``load_data(num_actors)`` or ``get_data(rank, num_actors)`` is called at the beginning of training. If this class is instantiated with the ``num_actors`` argument, it will directly load the data and store them in the object store. If this should be deferred, pass ``lazy=True`` as an argument. Loading the data will store it in the Ray object store. This object then stores references to the data shards in the Ray object store. Actors can request these shards with the ``get_data(rank)`` method, returning dataframes according to the actor rank. The total number of actors has to remain constant and cannot be changed once it has been set. Args: data: Data object. Can be a pandas dataframe, pandas series, numpy array, modin dataframe, string pointing to a csv or parquet file, or list of strings pointing to csv or parquet files. label: Optional label object. Can be a pandas series, numpy array, modin series, string pointing to a csv or parquet file, or a string indicating the column of the data dataframe that contains the label. If this is not a string it must be of the same type as the data argument. num_actors: Number of actors to shard this data for. If this is not None, data will be loaded and stored into the object store after initialization. If this is None, it will be set by the ``xgboost_ray.train()`` function, and it will be loaded and stored in the object store then. Defaults to None. filetype: Type of data to read. This is disregarded if a data object like a pandas dataframe is passed as the ``data`` argument. 
For filenames, the filetype is automatically detected via the file name (e.g. ``.csv`` will be detected as ``RayFileType.CSV``). Passing this argument will override the detected filetype. If the filetype cannot be determined from the ``data`` object, passing this is mandatory. Defaults to ``None`` (auto detection). ignore: Exclude these columns from the dataframe after loading the data. distributed: If True, use distributed loading (each worker loads a share of the dataset). If False, use central loading (the head node loads the whole dataset and distributes it). If None, auto-detect and default to distributed loading, if possible. sharding: How to shard the data for different workers. ``RayShardingMode.INTERLEAVED`` will divide the data per row, i.e. every i-th row will be passed to the first worker, every (i+1)th row to the second worker, etc. ``RayShardingMode.BATCH`` will divide the data in batches, i.e. the first 0-(m-1) rows will be passed to the first worker, the m-(2m-1) rows to the second worker, etc. Defaults to ``RayShardingMode.INTERLEAVED``. If using distributed data loading, sharding happens on a per-file basis, and not on a per-row basis, i.e. for interleaved sharding every i-th *file* will be passed into the first worker, etc. lazy: If ``num_actors`` is passed, setting this to ``True`` will defer data loading and storing until ``load_data()`` or ``get_data()`` is called. Defaults to ``False``. **kwargs: Keyword arguments will be passed to the data loading function. For instance, with ``RayFileType.PARQUET``, these arguments will be passed to ``pandas.read_parquet()``. .. 
code-block:: python from xgboost_ray import RayDMatrix, RayFileType files = ["data_one.parquet", "data_two.parquet"] columns = ["feature_1", "feature_2", "label_column"] dtrain = RayDMatrix( files, num_actors=4, # Will shard the data for four workers label="label_column", # Will select this column as the label columns=columns, # Will be passed to ``pandas.read_parquet()`` filetype=RayFileType.PARQUET) """ def __init__( self, data: Data, label: Optional[Data] = None, weight: Optional[Data] = None, feature_weights: Optional[Data] = None, base_margin: Optional[Data] = None, missing: Optional[float] = None, label_lower_bound: Optional[Data] = None, label_upper_bound: Optional[Data] = None, feature_names: Optional[List[str]] = None, feature_types: Optional[List[np.dtype]] = None, qid: Optional[Data] = None, enable_categorical: Optional[bool] = None, num_actors: Optional[int] = None, filetype: Optional[RayFileType] = None, ignore: Optional[List[str]] = None, distributed: Optional[bool] = None, sharding: RayShardingMode = RayShardingMode.INTERLEAVED, lazy: bool = False, **kwargs, ): if kwargs.get("group", None) is not None: raise ValueError( "`group` parameter is not supported. " "If you are using XGBoost-Ray, use `qid` parameter instead. " "If you are using LightGBM-Ray, ranking is not yet supported." ) if qid is not None and weight is not None: raise NotImplementedError("per-group weight is not implemented.") self._uid = uuid.uuid4().int self.feature_names = feature_names self.feature_types = feature_types self.qid = qid self.enable_categorical = enable_categorical self.missing = missing self.num_actors = num_actors self.sharding = sharding if distributed is None: distributed = _detect_distributed(data) else: if distributed and not _can_load_distributed(data): raise ValueError( f"You passed `distributed=True` to the `RayDMatrix` but " f"the specified data source of type {type(data)} cannot " f"be loaded in a distributed fashion. 
" f"\nFIX THIS by passing a list of sources (e.g. parquet " f"files stored in a network location) instead." ) self.distributed = distributed if self.distributed: self.loader = _DistributedRayDMatrixLoader( data=data, label=label, missing=missing, weight=weight, feature_weights=feature_weights, base_margin=base_margin, label_lower_bound=label_lower_bound, label_upper_bound=label_upper_bound, feature_names=feature_names, feature_types=feature_types, enable_categorical=enable_categorical, filetype=filetype, ignore=ignore, qid=qid, **kwargs, ) else: self.loader = _CentralRayDMatrixLoader( data=data, label=label, missing=missing, weight=weight, feature_weights=feature_weights, base_margin=base_margin, label_lower_bound=label_lower_bound, label_upper_bound=label_upper_bound, feature_names=feature_names, feature_types=feature_types, enable_categorical=enable_categorical, filetype=filetype, ignore=ignore, qid=qid, **kwargs, ) self.refs: Dict[int, Dict[str, ray.ObjectRef]] = {} self.n = None self.loaded = False if not distributed and num_actors is not None and not lazy: self.load_data(num_actors) @property def has_label(self): return self.loader.label is not None def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool: success = self.loader.assign_shards_to_actors(actors) if success: self.sharding = RayShardingMode.FIXED return success def assert_enough_shards_for_actors(self, num_actors: int): self.loader.assert_enough_shards_for_actors(num_actors=num_actors) def load_data(self, num_actors: Optional[int] = None, rank: Optional[int] = None): """Load data, putting it into the Ray object store. If a rank is given, only data for this rank is loaded (for distributed data sources only). """ if not self.loaded: if num_actors is not None: if self.num_actors is not None and num_actors != self.num_actors: raise ValueError( f"The `RayDMatrix` was initialized or `load_data()`" f"has been called with a different numbers of" f"`actors`. 
class RayQuantileDMatrix(RayDMatrix):
    """Currently just a thin wrapper for type detection"""

    pass


class RayDeviceQuantileDMatrix(RayDMatrix):
    """Currently just a thin wrapper for type detection

    Requires cupy and rejects ``label_lower_bound``/``label_upper_bound``.
    """

    def __init__(
        self,
        data: Data,
        label: Optional[Data] = None,
        weight: Optional[Data] = None,
        base_margin: Optional[Data] = None,
        missing: Optional[float] = None,
        label_lower_bound: Optional[Data] = None,
        label_upper_bound: Optional[Data] = None,
        feature_names: Optional[List[str]] = None,
        feature_types: Optional[List[np.dtype]] = None,
        qid: Optional[Data] = None,
        enable_categorical: Optional[bool] = None,
        *args,
        **kwargs,
    ):
        """Validate the environment and arguments, then defer to ``RayDMatrix``.

        Raises:
            RuntimeError: If cupy is not installed, or if either label bound
                is not ``None``.
        """
        if cp is None:
            raise RuntimeError(
                "RayDeviceQuantileDMatrix requires cupy to be installed."
                "\nFIX THIS by installing cupy: `pip install cupy-cudaXYZ` "
                "where XYZ is your local CUDA version."
            )
        # Compare against None explicitly: taking the truth value of a numpy
        # array or pandas object raises a ValueError instead of reaching the
        # intended error below.
        if label_lower_bound is not None or label_upper_bound is not None:
            raise RuntimeError(
                "RayDeviceQuantileDMatrix does not support "
                "`label_lower_bound` and `label_upper_bound` (just as the "
                "xgboost.DeviceQuantileDMatrix). Please pass None instead."
            )

        super(RayDeviceQuantileDMatrix, self).__init__(
            data=data,
            label=label,
            weight=weight,
            base_margin=base_margin,
            missing=missing,
            label_lower_bound=None,
            label_upper_bound=None,
            feature_names=feature_names,
            feature_types=feature_types,
            qid=qid,
            enable_categorical=enable_categorical,
            *args,
            **kwargs,
        )

    def get_data(
        self, rank: int, num_actors: Optional[int] = None
    ) -> Dict[str, Union[None, pd.DataFrame, List[Optional[pd.DataFrame]]]]:
        """Return the data of ``rank`` without the label-bound entries."""
        data_dict = super(RayDeviceQuantileDMatrix, self).get_data(
            rank=rank, num_actors=num_actors
        )
        # Remove some dict keys here that are generated automatically
        data_dict.pop("label_lower_bound", None)
        data_dict.pop("label_upper_bound", None)
        return data_dict
def _detect_distributed(source: Data) -> bool:
    """Decide whether distributed data loading should be attempted.

    Returns ``False`` whenever ``_can_load_distributed`` rules the source
    out. Modin data and ``RayDataset`` objects are always loaded distributed.
    Plain strings and non-iterables default to ``True``. Any other iterable
    is only loaded distributed when it is a sequence whose first element is a
    string; this default can be overridden by passing ``distributed=True``
    to the RayDMatrix object.
    """
    from xgboost_ray.data_sources.modin import Modin

    if not _can_load_distributed(source):
        return False

    if Modin.is_data_type(source) or isinstance(source, RayDataset):
        return True

    # Non-iterables and plain strings: assume distributed loading is possible.
    if not isinstance(source, Iterable) or isinstance(source, str):
        return True

    # An iterable that is not a sequence of strings (e.g. a pandas dataframe,
    # series, or numpy array) is not loaded distributed per default.
    is_string_sequence = isinstance(source, Sequence) and isinstance(source[0], str)
    return is_string_sequence
@DeveloperAPI
def combine_data(sharding: RayShardingMode, data: Iterable) -> np.ndarray:
    """Merge per-actor result arrays back into a single array.

    Args:
        sharding: Sharding mode that was used to distribute the rows.
            ``BATCH`` shards are stacked one after another, ``INTERLEAVED``
            shards are woven together row by row.
        data: Iterable of arrays, one per actor, in rank order. Arrays may be
            1-dimensional or n-dimensional (e.g. for
            ``objective="multi:softprob"``).

    Returns:
        The combined array. If there is nothing to combine (no shards, or
        only empty shards), an empty array is returned instead of failing.

    Raises:
        ValueError: If ``sharding`` is not a supported ``RayShardingMode``.
    """
    if sharding not in (RayShardingMode.BATCH, RayShardingMode.INTERLEAVED):
        raise ValueError(
            f"Invalid value for `sharding` parameter: "
            f"{sharding}"
            f"\nFIX THIS by passing any item of the "
            f"`RayShardingMode` enum, for instance "
            f"`RayShardingMode.BATCH`."
        )

    shards = list(data)
    # discard empty arrays that show up with BATCH
    non_empty = [d for d in shards if len(d)]
    if not non_empty:
        # Nothing to merge: hand back an empty result rather than indexing
        # into an empty list below.
        return np.asarray(shards[0]) if shards else np.array([])

    one_dimensional = non_empty[0].ndim == 1  # most common case

    if sharding == RayShardingMode.BATCH:
        if one_dimensional:
            return np.concatenate(non_empty)
        # objective="multi:softprob" returns n-dimensional arrays that
        # need to be handled differently
        return np.vstack(non_empty)

    # RayShardingMode.INTERLEAVED
    # Sometimes the lengths are off by 1 for uneven divisions
    min_len = min(len(d) for d in non_empty)
    heads = [d[0:min_len] for d in non_empty]
    if one_dimensional:
        res = np.ravel(np.column_stack(heads))
    else:
        # the number of classes will be constant, though
        class_len = non_empty[0].shape[1]
        res = np.hstack(heads).reshape(len(heads) * min_len, class_len)

    # Append the leftover rows of the longer shards here
    return np.concatenate(
        [res] + [d[min_len:] for d in non_empty if len(d) > min_len]
    )
@DeveloperAPI
def get_session() -> RayXGBoostSession:
    """Return the module-level ``RayXGBoostSession``.

    Raises:
        ValueError: If no session is set or the stored object is not a
            ``RayXGBoostSession``.
    """
    session = _session
    if session and isinstance(session, RayXGBoostSession):
        return session

    raise ValueError(
        "Trying to access RayXGBoostSession from outside an XGBoost run."
        "\nFIX THIS by calling function in `session.py` like "
        "`get_actor_rank()` only from within an XGBoost actor session."
    )
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # File based on: # https://github.com/dmlc/xgboost/blob/c6a0bdbb5a68232cd59ea556c981c633cc0646ca/python-package/xgboost/sklearn.py # License: # https://github.com/dmlc/xgboost/blob/c6a0bdbb5a68232cd59ea556c981c633cc0646ca/LICENSE import functools import inspect import warnings from inspect import _finddoc from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np from packaging.version import Version from ray.util.annotations import DeveloperAPI, PublicAPI from xgboost import Booster from xgboost import __version__ as xgboost_version from xgboost.sklearn import ( XGBClassifier, XGBModel, XGBRanker, XGBRegressor, XGBRFClassifier, XGBRFRegressor, _objective_decorator, ) from xgboost_ray.main import LEGACY_WARNING, XGBOOST_VERSION, RayParams, predict, train from xgboost_ray.matrix import RayDMatrix # avoiding exception in xgboost==0.9.0 try: from xgboost.sklearn import _deprecate_positional_args except ImportError: def _deprecate_positional_args(f): """Dummy decorator, does nothing""" @functools.wraps(f) def inner_f(*args, **kwargs): return f(*args, **kwargs) return inner_f # If _wrap_evaluation_matrices has new arguments added in xgboost, update # RayXGBMixin._ray_get_wrap_evaluation_matrices_compat_kwargs try: from xgboost.sklearn import _wrap_evaluation_matrices except ImportError: # copied from the file in the top comment def _wrap_evaluation_matrices( missing: float, X: Any, y: Any, group: Optional[Any], qid: Optional[Any], sample_weight: Optional[Any], base_margin: Optional[Any], feature_weights: Optional[Any], eval_set: 
Optional[List[Tuple[Any, Any]]], sample_weight_eval_set: Optional[List[Any]], base_margin_eval_set: Optional[List[Any]], eval_group: Optional[List[Any]], eval_qid: Optional[List[Any]], create_dmatrix: Callable, label_transform: Callable = lambda x: x, ) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]: """Convert array_like evaluation matrices into DMatrix. Perform validation on the way. """ train_dmatrix = create_dmatrix( data=X, label=label_transform(y), group=group, qid=qid, weight=sample_weight, base_margin=base_margin, feature_weights=feature_weights, missing=missing, ) n_validation = 0 if eval_set is None else len(eval_set) def validate_or_none(meta: Optional[List], name: str) -> List: if meta is None: return [None] * n_validation if len(meta) != n_validation: raise ValueError( f"{name}'s length does not eqaul to `eval_set`, " + f"expecting {n_validation}, got {len(meta)}" ) return meta if eval_set is not None: sample_weight_eval_set = validate_or_none( sample_weight_eval_set, "sample_weight_eval_set" ) base_margin_eval_set = validate_or_none( base_margin_eval_set, "base_margin_eval_set" ) eval_group = validate_or_none(eval_group, "eval_group") eval_qid = validate_or_none(eval_qid, "eval_qid") evals = [] for i, (valid_X, valid_y) in enumerate(eval_set): # Skip the duplicated entry. 
if all( ( valid_X is X, valid_y is y, sample_weight_eval_set[i] is sample_weight, base_margin_eval_set[i] is base_margin, eval_group[i] is group, eval_qid[i] is qid, ) ): evals.append(train_dmatrix) else: m = create_dmatrix( data=valid_X, label=label_transform(valid_y), weight=sample_weight_eval_set[i], group=eval_group[i], qid=eval_qid[i], base_margin=base_margin_eval_set[i], missing=missing, ) evals.append(m) nevals = len(evals) eval_names = ["validation_{}".format(i) for i in range(nevals)] evals = list(zip(evals, eval_names)) else: if any( meta is not None for meta in [ sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, ] ): raise ValueError( "`eval_set` is not set but one of the other evaluation " "meta info is not None." ) evals = [] return train_dmatrix, evals try: from xgboost.sklearn import _cls_predict_proba except ImportError: # copied from the file in the top comment def _cls_predict_proba(n_classes: int, prediction, vstack: Callable): assert len(prediction.shape) <= 2 if len(prediction.shape) == 2 and prediction.shape[1] == n_classes: return prediction # binary logistic function classone_probs = prediction classzero_probs = 1.0 - classone_probs return vstack((classzero_probs, classone_probs)).transpose() try: from xgboost.sklearn import _is_cudf_df, _is_cudf_ser, _is_cupy_array except ImportError: _is_cudf_df = None _is_cudf_ser = None _is_cupy_array = None _RAY_PARAMS_DOC = """ray_params : None or RayParams or Dict Parameters to configure Ray-specific behavior. See :class:`RayParams` for a list of valid configuration parameters. Will override ``n_jobs`` attribute with own ``num_actors`` parameter. _remote : bool Whether to run the driver process in a remote function. This is enabled by default in Ray client mode. ray_dmatrix_params : dict Dict of parameters (such as sharding mode) passed to the internal RayDMatrix initialization. """ _N_JOBS_DOC_REPLACE = ( """ n_jobs : int Number of parallel threads used to run xgboost. 
When used with other Scikit-Learn algorithms like grid search, you may choose which algorithm to parallelize and balance the threads. Creating thread contention will significantly slow down both algorithms.""", # noqa: E501, W291 """ n_jobs : int Number of Ray actors used to run xgboost in parallel. In order to set number of threads per actor, pass a :class:`RayParams` object to the relevant method as a ``ray_params`` argument. Will be overriden by the ``num_actors`` parameter of ``ray_params`` argument should it be passed to a method.""", # noqa: E501, W291 ) def _get_doc(object: Any) -> Optional[str]: """Same as ``inspect.getdoc``, but without ``cleandoc`` applied.""" try: doc = object.__doc__ except AttributeError: return None if doc is None: try: doc = _finddoc(object) except (AttributeError, TypeError): return None if not isinstance(doc, str): return None return doc def _treat_estimator_doc(doc: Optional[str]) -> Optional[str]: """Helper function to make nececssary changes in estimator docstrings""" if doc: doc = ( doc.replace(*_N_JOBS_DOC_REPLACE) .replace( "scikit-learn API for XGBoost", "scikit-learn API for Ray-distributed XGBoost", ) .replace(":doc:`tree method\n `", "tree method") ) return doc def _treat_X_doc(doc: Optional[str]) -> Optional[str]: if doc: doc = doc.replace( "Data to predict with.", "Data to predict with. Can also be a ``RayDMatrix``.", ) doc = doc.replace( "Feature matrix.", "Feature matrix. Can also be a ``RayDMatrix``." ) doc = doc.replace( "Feature matrix", "Feature matrix. Can also be a ``RayDMatrix``." 
) return doc def _xgboost_version_warn(f): """Decorator to warn when xgboost version is < 1.4.0""" @functools.wraps(f) def inner_f(*args, **kwargs): if XGBOOST_VERSION < Version("1.4.0"): warnings.warn(LEGACY_WARNING) return f(*args, **kwargs) return inner_f def _check_if_params_are_ray_dmatrix( X, sample_weight, base_margin, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_qid=None, ): train_dmatrix = None evals = () eval_set = eval_set or () if isinstance(X, RayDMatrix): params_to_warn_about = ["y"] if sample_weight is not None: params_to_warn_about.append("sample_weight") if base_margin is not None: params_to_warn_about.append("base_margin") warnings.warn( f"X is a RayDMatrix, {', '.join(params_to_warn_about)}" " will be ignored!" ) train_dmatrix = X if eval_set: if any( not isinstance(eval_data, RayDMatrix) or not isinstance(eval_name, str) for eval_data, eval_name in eval_set ): raise ValueError( "If X is a RayDMatrix, all elements of " "`eval_set` must be (RayDMatrix, str) " "tuples." ) params_to_warn_about = [] if sample_weight_eval_set is not None: params_to_warn_about.append("sample_weight_eval_set") if base_margin_eval_set is not None: params_to_warn_about.append("base_margin_eval_set") if eval_qid is not None: params_to_warn_about.append("eval_qid") if params_to_warn_about: warnings.warn( "`eval_set` is composed of RayDMatrix tuples, " f"{', '.join(params_to_warn_about)} will be ignored!" ) evals = eval_set or () elif any( isinstance(eval_x, RayDMatrix) or isinstance(eval_y, RayDMatrix) for eval_x, eval_y in eval_set ): raise ValueError( "If X is not a RayDMatrix, all `eval_set` " "elements must be (array_like, array_like)" " tuples." 
@DeveloperAPI
class RayXGBMixin:
    """Mixin class to provide xgboost-ray functionality

    The helpers below are shared by the ``RayXGB*`` estimators in this
    module: resolving ``ray_params``/``n_jobs``, running distributed
    prediction, and compatibility shims for differing xgboost versions.
    """

    def _ray_set_ray_params_n_jobs(
        self, ray_params: Optional[Union[RayParams, dict]], n_jobs: Optional[int]
    ) -> Union[RayParams, dict]:
        """Helper function to set num_actors in ray_params if not set by the user

        When ``ray_params`` is ``None`` a new ``RayParams`` is created with
        ``num_actors=n_jobs`` (``n_jobs`` falls back to 1 if it is falsy or
        smaller than 1). Otherwise ``ray_params`` is returned unchanged and a
        warning is emitted if ``n_jobs`` is not ``None``.
        """
        if ray_params is None:
            if not n_jobs or n_jobs < 1:
                n_jobs = 1
            ray_params = RayParams(num_actors=n_jobs)
        elif n_jobs is not None:
            warnings.warn(
                "`ray_params` is not `None` and will override "
                "the `n_jobs` attribute."
            )
        return ray_params

    def _ray_predict(
        self: "XGBModel",
        X,
        output_margin=False,
        validate_features=True,
        base_margin=None,
        iteration_range=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ):
        """Distributed predict via Ray

        ``X`` is wrapped in a ``RayDMatrix`` (using ``base_margin``,
        ``self.missing`` and ``ray_dmatrix_params``) unless it already is
        one, in which case ``base_margin`` is ignored with a warning.
        """
        # NOTE(review): `iteration_range` is accepted but not forwarded to
        # `predict()` below - confirm whether that is intentional.
        ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs)
        ray_dmatrix_params = ray_dmatrix_params or {}

        if not isinstance(X, RayDMatrix):
            test = RayDMatrix(
                X, base_margin=base_margin, missing=self.missing, **ray_dmatrix_params
            )
        else:
            test = X
            if base_margin is not None:
                warnings.warn("X is a RayDMatrix, base_margin will be ignored!")

        return predict(
            self.get_booster(),
            data=test,
            output_margin=output_margin,
            validate_features=validate_features,
            ray_params=ray_params,
            _remote=_remote,
            **kwargs,
        )

    def _ray_get_wrap_evaluation_matrices_compat_kwargs(
        self, label_transform=None
    ) -> dict:
        """Build the optional kwargs for ``_wrap_evaluation_matrices``.

        Only keyword arguments that the available
        ``_wrap_evaluation_matrices`` signature actually declares are
        included, so the call works with differing signatures.
        """
        ret = {}
        wrap_evaluation_matrices_parameters = inspect.signature(
            _wrap_evaluation_matrices
        ).parameters
        if "label_transform" in wrap_evaluation_matrices_parameters:
            # XGBoost < 1.6.0
            identity_func = lambda x: x  # noqa
            ret["label_transform"] = label_transform or identity_func
        if (
            hasattr(self, "enable_categorical")
            and "enable_categorical" in wrap_evaluation_matrices_parameters
        ):
            ret["enable_categorical"] = self.enable_categorical
        if (
            hasattr(self, "feature_types")
            and "feature_types" in wrap_evaluation_matrices_parameters
        ):
            ret["feature_types"] = self.feature_types
        return ret

    # copied from the file in the top comment
    # provided here for compatibility with legacy xgboost versions
    # will be overwritten by vanilla xgboost if possible
    def _configure_fit(
        self,
        booster: Optional[Union[Booster, "XGBModel", str]],
        eval_metric: Optional[Union[Callable, str, List[str]]],
        params: Dict[str, Any],
    ) -> Tuple[Optional[Union[Booster, str]], Optional[Callable], Dict[str, Any]]:
        """Split fit arguments into ``(model, feval, params)``.

        A sklearn model passed as ``booster`` is unwrapped to its
        ``_Booster``. A callable ``eval_metric`` is returned as ``feval``;
        any other non-``None`` ``eval_metric`` is stored in ``params``
        (which is updated in place).
        """
        # pylint: disable=protected-access, no-self-use
        if isinstance(booster, XGBModel):
            # Handle the case when xgb_model is a sklearn model object
            model: Optional[Union[Booster, str]] = booster._Booster
        else:
            model = booster

        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None:
            if callable(eval_metric):
                eval_metric = None
            else:
                params.update({"eval_metric": eval_metric})
        return model, feval, params

    # copied from the file in the top comment
    # provided here for compatibility with legacy xgboost versions
    # will be overwritten by vanilla xgboost if possible
    def _set_evaluation_result(self, evals_result) -> None:
        """Store a non-empty ``evals_result`` as ``self.evals_result_``."""
        # NOTE(review): the original indentation is not visible in this view;
        # the attribute assignment is placed inside the `if`, which is
        # presumably how the upstream xgboost implementation reads - confirm.
        if evals_result:
            for val in evals_result.items():
                # The loop re-assigns the first metric entry of every
                # evaluation set to itself.
                evals_result_key = list(val[1].keys())[0]
                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result_ = evals_result
eval_set, sample_weight_eval_set, base_margin_eval_set, ) if train_dmatrix is None: train_dmatrix, evals = _wrap_evaluation_matrices( missing=self.missing, X=X, y=y, group=None, qid=None, sample_weight=sample_weight, base_margin=base_margin, feature_weights=feature_weights, eval_set=eval_set, sample_weight_eval_set=sample_weight_eval_set, base_margin_eval_set=base_margin_eval_set, eval_group=None, eval_qid=None, # changed in xgboost-ray: create_dmatrix=lambda **kwargs: RayDMatrix( **{**kwargs, **ray_dmatrix_params} ), **self._ray_get_wrap_evaluation_matrices_compat_kwargs(), ) params = self.get_xgb_params() if callable(self.objective): obj = _objective_decorator(self.objective) params["objective"] = "reg:squarederror" else: obj = None try: model, feval, params = self._configure_fit(xgb_model, eval_metric, params) except TypeError: # XGBoost >= 1.6.0 ( model, feval, params, early_stopping_rounds, callbacks, ) = self._configure_fit( xgb_model, eval_metric, params, early_stopping_rounds, callbacks ) # remove those as they will be set in RayXGBoostActor params.pop("n_jobs", None) params.pop("nthread", None) ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs) additional_results = {} self._Booster = train( params, train_dmatrix, self.get_num_boosting_rounds(), evals=evals, early_stopping_rounds=early_stopping_rounds, evals_result=evals_result, obj=obj, feval=feval, verbose_eval=verbose, xgb_model=model, callbacks=callbacks, # changed in xgboost-ray: additional_results=additional_results, ray_params=ray_params, _remote=_remote, ) self.additional_results_ = additional_results self._set_evaluation_result(evals_result) return self fit.__doc__ = _treat_X_doc(_get_doc(XGBRegressor.fit)) + _RAY_PARAMS_DOC def _can_use_inplace_predict(self) -> bool: return False def predict( self, X, output_margin=False, validate_features=True, base_margin=None, iteration_range=None, ray_params: Union[None, RayParams, Dict] = None, _remote: Optional[bool] = None, 
# Random-forest flavour of ``RayXGBRegressor``: all trees are trained in a
# single boosting round (see ``get_xgb_params``/``get_num_boosting_rounds``).
# A plain comment is used here because ``__doc__`` is assigned right after
# the class definition.
class RayXGBRFRegressor(RayXGBRegressor):
    # too much work to make this compatible with 0.90
    if xgboost_version == "0.90":

        def __init__(self, *args, **kwargs):
            # Fail at construction time on the unsupported xgboost version.
            raise ValueError("RayXGBRFRegressor not available with xgboost<1.0.0")

    else:

        @_deprecate_positional_args
        @_xgboost_version_warn
        def __init__(
            self,
            *,
            learning_rate=1,
            subsample=0.8,
            colsample_bynode=0.8,
            reg_lambda=1e-5,
            **kwargs,
        ):
            # Only the defaults differ from the parent; everything is
            # forwarded unchanged.
            super().__init__(
                learning_rate=learning_rate,
                subsample=subsample,
                colsample_bynode=colsample_bynode,
                reg_lambda=reg_lambda,
                **kwargs,
            )

    def get_xgb_params(self):
        """Return the parent's params with ``num_parallel_tree`` set."""
        params = super().get_xgb_params()
        # All ``n_estimators`` trees are grown in parallel ...
        params["num_parallel_tree"] = self.n_estimators
        return params

    def get_num_boosting_rounds(self):
        """Return the number of boosting rounds, which is always 1 here."""
        # ... within a single boosting round.
        return 1
evals = _check_if_params_are_ray_dmatrix( X, sample_weight, base_margin, eval_set, sample_weight_eval_set, base_margin_eval_set, ) if train_dmatrix is not None: if "num_class" not in params: raise ValueError( "`num_class` must be set during initalization if X" " is a RayDMatrix" ) if XGBOOST_VERSION < Version("2.0.0"): self.classes_ = list(range(0, params["num_class"])) self.n_classes_ = params["num_class"] if self.n_classes_ <= 2: params.pop("num_class") label_transform = lambda x: x # noqa: E731 else: if len(X.shape) != 2: # Simply raise an error here since there might be many # different ways of reshaping raise ValueError( "Please reshape the input data X into 2-dimensional " "matrix." ) label_transform = lambda x: x # noqa: E731 if XGBOOST_VERSION < Version("2.0.0"): self.classes_ = np.unique(y) self.n_classes_ = len(np.unique(y)) if callable(self.objective): obj = _objective_decorator(self.objective) # Use default value. Is it really not used ? params["objective"] = "binary:logistic" else: obj = None if self.n_classes_ > 2: # Switch to using a multiclass objective in the underlying # XGB instance params["objective"] = "multi:softprob" params["num_class"] = self.n_classes_ try: model, feval, params = self._configure_fit(xgb_model, eval_metric, params) except TypeError: # XGBoost >= 1.6.0 ( model, feval, params, early_stopping_rounds, callbacks, ) = self._configure_fit( xgb_model, eval_metric, params, early_stopping_rounds, callbacks ) if train_dmatrix is None: train_dmatrix, evals = _wrap_evaluation_matrices( missing=self.missing, X=X, y=y, group=None, qid=None, sample_weight=sample_weight, base_margin=base_margin, feature_weights=feature_weights, eval_set=eval_set, sample_weight_eval_set=sample_weight_eval_set, base_margin_eval_set=base_margin_eval_set, eval_group=None, eval_qid=None, # changed in xgboost-ray: create_dmatrix=lambda **kwargs: RayDMatrix( **{**kwargs, **ray_dmatrix_params} ), **self._ray_get_wrap_evaluation_matrices_compat_kwargs( 
label_transform=label_transform ), ) # remove those as they will be set in RayXGBoostActor params.pop("n_jobs", None) params.pop("nthread", None) ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs) additional_results = {} self._Booster = train( params, train_dmatrix, self.get_num_boosting_rounds(), evals=evals, early_stopping_rounds=early_stopping_rounds, evals_result=evals_result, obj=obj, feval=feval, verbose_eval=verbose, xgb_model=model, callbacks=callbacks, # changed in xgboost-ray: additional_results=additional_results, ray_params=ray_params, _remote=_remote, ) if not callable(self.objective): self.objective = params["objective"] self.additional_results_ = additional_results self._set_evaluation_result(evals_result) return self fit.__doc__ = _treat_X_doc(_get_doc(XGBClassifier.fit)) + _RAY_PARAMS_DOC def _can_use_inplace_predict(self) -> bool: return False def predict( self, X, output_margin=False, validate_features=True, base_margin=None, iteration_range: Optional[Tuple[int, int]] = None, ray_params: Union[None, RayParams, Dict] = None, _remote: Optional[bool] = None, ray_dmatrix_params: Optional[Dict] = None, **kwargs, ): class_probs = self._ray_predict( X=X, output_margin=output_margin, validate_features=validate_features, base_margin=base_margin, iteration_range=iteration_range, ray_params=ray_params, _remote=_remote, ray_dmatrix_params=ray_dmatrix_params, **kwargs, ) if output_margin: # If output_margin is active, simply return the scores return class_probs if len(class_probs.shape) > 1: # turns softprob into softmax column_indexes = np.argmax(class_probs, axis=1) else: # turns soft logit into class label column_indexes = np.repeat(0, class_probs.shape[0]) column_indexes[class_probs > 0.5] = 1 if hasattr(self, "_le"): return self._le.inverse_transform(column_indexes) return column_indexes predict.__doc__ = _treat_X_doc(_get_doc(XGBModel.predict)) + _RAY_PARAMS_DOC def predict_proba( self, X, validate_features=False, base_margin=None, 
# Ray-distributed counterpart of ``XGBRanker``. Comments are used instead of
# docstrings on the class, ``fit`` and ``predict`` because their ``__doc__``
# attributes are assigned explicitly below / after the class definition.
@PublicAPI(stability="beta")
class RayXGBRanker(XGBRanker, RayXGBMixin):
    __init__ = _xgboost_version_warn(XGBRanker.__init__)

    @_deprecate_positional_args
    def fit(
        self,
        X,
        y,
        *,
        group=None,
        qid=None,
        sample_weight=None,
        base_margin=None,
        eval_set=None,
        eval_group=None,
        eval_qid=None,
        eval_metric=None,
        early_stopping_rounds=None,
        verbose=False,
        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
        sample_weight_eval_set=None,
        base_margin_eval_set=None,
        feature_weights=None,
        callbacks=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
    ):
        # Only query ids are supported for grouping: `group`/`eval_group`
        # must stay None, `qid` is mandatory, and `eval_qid` is mandatory as
        # soon as an `eval_set` is given.
        if not (group is None and eval_group is None):
            raise ValueError("Use `qid` instead of `group` for RayXGBRanker.")
        if qid is None:
            raise ValueError("`qid` is required for ranking.")

        if eval_set is not None:
            if eval_qid is None:
                raise ValueError("`eval_qid `is required if" " `eval_set` is not None")

        evals_result = {}
        ray_dmatrix_params = ray_dmatrix_params or {}

        params = self.get_xgb_params()

        # If X already is a RayDMatrix it is used directly (train_dmatrix is
        # not None); otherwise the raw inputs are wrapped below.
        train_dmatrix, evals = _check_if_params_are_ray_dmatrix(
            X,
            sample_weight,
            base_margin,
            eval_set,
            sample_weight_eval_set,
            base_margin_eval_set,
            eval_qid,
        )

        if train_dmatrix is None:
            train_dmatrix, evals = _wrap_evaluation_matrices(
                missing=self.missing,
                X=X,
                y=y,
                group=group,
                qid=qid,
                sample_weight=sample_weight,
                base_margin=base_margin,
                feature_weights=feature_weights,
                eval_set=eval_set,
                sample_weight_eval_set=sample_weight_eval_set,
                base_margin_eval_set=base_margin_eval_set,
                eval_group=eval_group,
                eval_qid=eval_qid,
                # changed in xgboost-ray:
                create_dmatrix=lambda **kwargs: RayDMatrix(
                    **{**kwargs, **ray_dmatrix_params}
                ),
                **self._ray_get_wrap_evaluation_matrices_compat_kwargs(),
            )

        # Try the three-argument `_configure_fit` first; a TypeError means
        # the five-argument variant is in use, which also returns
        # `early_stopping_rounds` and `callbacks`.
        try:
            model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
        except TypeError:
            # XGBoost >= 1.6.0
            (
                model,
                feval,
                params,
                early_stopping_rounds,
                callbacks,
            ) = self._configure_fit(
                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
            )
        if callable(feval):
            raise ValueError(
                "Custom evaluation metric is not yet supported for XGBRanker."
            )

        # remove those as they will be set in RayXGBoostActor
        params.pop("n_jobs", None)
        params.pop("nthread", None)

        ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs)

        additional_results = {}

        # NOTE(review): the number of boosting rounds is taken from
        # `self.n_estimators` here, whereas RayXGBRegressor and
        # RayXGBClassifier call `self.get_num_boosting_rounds()` - confirm
        # this difference is intended.
        self._Booster = train(
            params,
            train_dmatrix,
            self.n_estimators,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            evals_result=evals_result,
            feval=feval,
            verbose_eval=verbose,
            xgb_model=model,
            callbacks=callbacks,
            # changed in xgboost-ray:
            additional_results=additional_results,
            ray_params=ray_params,
            _remote=_remote,
        )

        self.objective = params["objective"]

        self.additional_results_ = additional_results

        self._set_evaluation_result(evals_result)
        return self

    fit.__doc__ = _treat_X_doc(_get_doc(XGBRanker.fit)) + _RAY_PARAMS_DOC

    def _can_use_inplace_predict(self) -> bool:
        """Always ``False``."""
        return False

    def predict(
        self,
        X,
        output_margin=False,
        validate_features=True,
        base_margin=None,
        iteration_range=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ):
        # Thin wrapper: all arguments are handed to the mixin's
        # `_ray_predict`.
        return self._ray_predict(
            X,
            output_margin=output_margin,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
            ray_params=ray_params,
            _remote=_remote,
            ray_dmatrix_params=ray_dmatrix_params,
            **kwargs,
        )

    predict.__doc__ = _treat_X_doc(_get_doc(XGBRanker.predict)) + _RAY_PARAMS_DOC

    def load_model(self, fname):
        """Load a model from ``fname``, creating ``_Booster`` first if unset."""
        if not hasattr(self, "_Booster"):
            self._Booster = Booster()
        return super().load_model(fname)
@contextmanager
def _ray_start_cluster(**kwargs):
    """Context manager that starts a local test ``Cluster`` and tears it down.

    Args:
        **kwargs: Overrides for the default node arguments returned by
            ``get_default_fixture_ray_kwargs()``. Two keys are consumed here
            and not forwarded to the nodes:

            * ``num_nodes`` (default 0): number of nodes to add.
            * ``do_init``: whether to call ``ray.init`` against the cluster
              once the first node exists. Defaults to ``True`` when
              ``num_nodes > 0`` and ``False`` otherwise.

    Yields:
        The started ``Cluster``.
    """
    init_kwargs = get_default_fixture_ray_kwargs()

    # num_nodes & do_init are not arguments for ray.init, so remove them.
    num_nodes = kwargs.pop("num_nodes", 0)
    if "do_init" in kwargs:
        do_init = kwargs.pop("do_init")
    else:
        do_init = num_nodes > 0
    init_kwargs.update(kwargs)

    cluster = Cluster()
    # try/finally guarantees that Ray and the cluster are shut down even if
    # node setup or the body of the `with` block raises; otherwise a failing
    # test would leak the cluster into subsequent tests.
    try:
        remote_nodes = []
        for i in range(num_nodes):
            # Only the first node receives `_system_config`.
            if i > 0 and "_system_config" in init_kwargs:
                del init_kwargs["_system_config"]
            remote_nodes.append(cluster.add_node(**init_kwargs))
            # We assume driver will connect to the head (first node),
            # so ray init will be invoked if do_init is true
            if len(remote_nodes) == 1 and do_init:
                ray.init(address=cluster.address)
        yield cluster
    finally:
        # Teardown code.
        ray.shutdown()
        cluster.shutdown()
@ray.remote(num_cpus=0)
class FaultToleranceManager:
    """Ray actor that scripts actor failures and delayed returns for tests.

    It tracks a global boosting round, the ranks scheduled to be killed at
    given rounds, and per-rank windows of rounds during which an actor
    should keep waiting after data loading.
    """

    def __init__(self, start_boost_round: int = 0):
        self.global_boost_round = start_boost_round
        # Dict from boost_round -> actor ranks to die
        self.scheduled_kill: Dict[int, Set[int]] = defaultdict(set)
        # Dict from actor rank -> starts/ends of boost rounds to sleep
        self.delayed_return: Dict[int, Set[Tuple[int, int]]] = defaultdict(set)
        # Dict from actor rank -> list of tuples
        # (global_boost_round, actor_boost_round) to log actor iterations
        self.training_logs = defaultdict(list)

    def schedule_kill(self, rank: int, boost_round: int):
        """Kill an actor when reaching this global boost round"""
        self.scheduled_kill[boost_round].add(rank)

    def delay_return(self, rank: int, start_boost_round: int, end_boost_round: int):
        """Do not allow an actor to finish data loading between these rounds"""
        self.delayed_return[rank].add((start_boost_round, end_boost_round))

    def inc_boost_round(self, rank: int):
        """Increase global boosting round (only honoured for rank 0)"""
        if rank == 0:
            self.global_boost_round += 1

    def log_iteration(self, rank: int, boost_round: int):
        """Log iteration"""
        self.training_logs[rank].append((self.global_boost_round, boost_round))

    def should_die(self, rank: int) -> bool:
        """Returns True if the actor should terminate the training job now.

        Every kill scheduled for ``rank`` at a round between 0 and the
        current global round is consumed, so each kill fires only once.
        """
        die = False
        # Look at all rounds until now to deal with race conditions. Only
        # existing entries are inspected, so polling does not insert an
        # empty set per round into the defaultdict.
        for boost_round, ranks in self.scheduled_kill.items():
            if 0 <= boost_round <= self.global_boost_round and rank in ranks:
                ranks.remove(rank)
                die = True
        return die

    def should_sleep(self, rank: int) -> bool:
        """Returns True if the actor should not finish data loading, yet."""
        # ``.get`` avoids creating an empty entry for ranks without delays.
        windows = self.delayed_return.get(rank, ())
        return any(start <= self.global_boost_round < end for start, end in windows)

    def get_logs(self):
        """Return the per-rank iteration logs."""
        return self.training_logs
def train_ray(
    path,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    smoke_test=False,
    ray_params=None,
    xgboost_params=None,
    **kwargs,
):
    """Train an XGBoost model on parquet data with xgboost_ray and time it.

    Args:
        path: Parquet path passed to the Ray matrix. When ``num_files`` is
            set it is treated as a directory that is globbed for
            ``**/*.parquet`` files.
        num_workers: Number of training actors.
        num_boost_rounds: Number of boosting rounds.
        num_files: If non-zero, train on exactly this many files; the sorted
            file list is repeated as often as needed to reach that count.
        regression: Use the regression objective instead of binary
            classification.
        use_gpu: Train with ``gpu_hist`` and one GPU per actor. A device
            quantile matrix is used if ``cupy`` can be imported.
        smoke_test: Use 1 instead of 4 CPUs per actor in the default
            ``RayParams``.
        ray_params: Optional ``RayParams`` overriding the defaults.
        xgboost_params: Optional base XGBoost parameters. The dict is copied
            and never modified.
        **kwargs: Passed through to ``train``.

    Returns:
        Tuple ``(booster, seconds_taken)``.

    Raises:
        ValueError: If ``num_files`` is set but no parquet file is found.
    """
    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        if not files:
            # Doubling an empty list never grows it; fail instead of
            # spinning forever in the loop below.
            raise ValueError(f"No parquet files found matching {path}/**/*.parquet")
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    # Both matrix types take the same arguments here.
    matrix_cls = RayDeviceQuantileDMatrix if use_device_matrix else RayDMatrix
    dtrain = matrix_cls(
        path,
        num_actors=num_workers,
        label="labels",
        ignore=["partition"],
        filetype=RayFileType.PARQUET,
    )

    # Copy so that the caller's parameter dict is not mutated by the updates.
    if xgboost_params:
        config = dict(xgboost_params)
    else:
        config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
        return_metric = "error"
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )
        return_metric = "rmse"

    start = time.time()
    evals_result = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params
        or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=4 if not smoke_test else 1,
            gpus_per_actor=0 if not use_gpu else 1,
        ),
        evals=[(dtrain, "train")],
        **kwargs,
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    # Report the metric that was actually configured; "error" is not
    # evaluated in regression mode.
    print(
        "Final training error: {:.4f}".format(evals_result["train"][return_metric][-1])
    )
    return bst, taken
f"Benchmarking data not found: {path}." f"\nFIX THIS by running `python create_test_data.py` first." ) init_start = time.time() if args.smoke_test: ray.init(num_cpus=num_workers) else: ray.init(address="auto") init_taken = time.time() - init_start full_start = time.time() bst, train_taken = train_ray( path=path, num_workers=num_workers, num_boost_rounds=num_boost_rounds, num_files=num_files, regression=args.regression, use_gpu=use_gpu, smoke_test=args.smoke_test, ) full_taken = time.time() - full_start print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " f"({init_taken:.2f} for init)") if args.smoke_test: shutil.rmtree(temp_dir, ignore_errors=True) else: with open("res.csv", "at") as fp: fp.writelines( [ ",".join( [ str(e) for e in [ num_workers, num_files, int(use_gpu), num_boost_rounds, init_taken, full_taken, train_taken, ] ] ) + "\n" ] ) ================================================ FILE: xgboost_ray/tests/release/benchmark_ft.py ================================================ import argparse import glob import os from typing import Dict, List import numpy as np import ray from ray import tune from ray.tune import CLIReporter from xgboost_ray import ( RayDeviceQuantileDMatrix, RayDMatrix, RayFileType, RayParams, RayShardingMode, train, ) from xgboost_ray.callback import EnvironmentCallback from xgboost_ray.matrix import _get_sharding_indices from xgboost_ray.tests.fault_tolerance import ( DelayedLoadingCallback, DieCallback, FaultToleranceManager, ) from xgboost_ray.tests.utils import create_parquet_in_tempdir if "OMP_NUM_THREADS" in os.environ: del os.environ["OMP_NUM_THREADS"] def train_ray( train_files, eval_files, num_workers, num_boost_round, regression=False, use_gpu=False, ray_params=None, xgboost_params=None, ft_manager=None, aws=None, **kwargs, ): use_device_matrix = False if use_gpu: try: import cupy # noqa: F401 use_device_matrix = True except ImportError: use_device_matrix = False if use_gpu and use_device_matrix: dtrain = 
RayDeviceQuantileDMatrix( train_files, num_actors=num_workers, label="labels", ignore=["partition"], filetype=RayFileType.PARQUET, ) deval = RayDeviceQuantileDMatrix( eval_files, num_actors=num_workers, label="labels", ignore=["partition"], filetype=RayFileType.PARQUET, ) else: dtrain = RayDMatrix( train_files, num_actors=num_workers, label="labels", ignore=["partition"], filetype=RayFileType.PARQUET, ) deval = RayDMatrix( eval_files, num_actors=num_workers, label="labels", ignore=["partition"], filetype=RayFileType.PARQUET, ) config = xgboost_params or {"tree_method": "hist"} if use_gpu: config.update({"tree_method": "gpu_hist"}) if not regression: # Classification config.update( { "objective": "binary:logistic", "eval_metric": ["logloss", "error"], } ) return_metric = "error" else: # Regression config.update( { "objective": "reg:squarederror", "eval_metric": ["logloss", "rmse"], } ) return_metric = "rmse" xgboost_callbacks = [] distributed_callbacks = [] if ft_manager: delay_callback = DelayedLoadingCallback( ft_manager, reload_data=True, sleep_time=0.1 ) distributed_callbacks.append(delay_callback) die_callback = DieCallback(ft_manager, training_delay=0.1) xgboost_callbacks.append(die_callback) if aws: aws_callback = EnvironmentCallback(aws) distributed_callbacks.append(aws_callback) os.environ.update(aws) ray_params = ray_params or RayParams() ray_params.num_actors = num_workers ray_params.gpus_per_actor = 0 if not use_gpu else 1 ray_params.distributed_callbacks = distributed_callbacks evals_result = {} additional_results = {} bst = train( config, dtrain, evals_result=evals_result, additional_results=additional_results, num_boost_round=num_boost_round, ray_params=ray_params, evals=[(dtrain, "train"), (deval, "eval")], callbacks=xgboost_callbacks, **kwargs, ) bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")) print( "Final training error: {:.4f}".format(evals_result["train"][return_metric][-1]) ) results = { "train-logloss": 
def ft_setup(
    workers: List[int],
    num_rounds: int,
    die_round_factor: float = 0.25,
    comeback_round_factor: float = 0.75,
):
    """Setup fault tolerance manager, schedule kills and comebacks

    Args:
        workers: Ranks of the actors to kill, or ``None`` to disable fault
            injection.
        num_rounds: Total number of boosting rounds of the training run.
        die_round_factor: Fraction of ``num_rounds`` after which the workers
            are killed.
        comeback_round_factor: Fraction of ``num_rounds`` after which the
            workers are allowed to return.

    Returns:
        Handle of the created ``FaultToleranceManager`` actor, or ``None``
        if ``workers`` is ``None``.
    """
    if workers is None:
        return None

    ft_manager = FaultToleranceManager.remote()

    # Choose some nodes to kill
    die_round = int(die_round_factor * num_rounds)
    comeback_round = int(comeback_round_factor * num_rounds)

    for worker in workers:
        ft_manager.schedule_kill.remote(rank=worker, boost_round=die_round)
        # Delay the return of the worker that was actually killed, not a
        # fixed rank.
        ft_manager.delay_return.remote(
            rank=worker,
            start_boost_round=die_round - 2,
            end_boost_round=comeback_round - 1,
        )

    print(
        f"Scheduled workers {list(workers)} to die at round {die_round} "
        f"and to come back at round {comeback_round} "
        f"(total {num_rounds} training rounds)"
    )

    return ft_manager
np.random.seed(seed) # Re-seed because of conditional evaluation # Dataset to train on sharding_mode = RayShardingMode.INTERLEAVED if condition == "calibrate": final_files = train_files final_workers = num_workers ray_params.num_actors = final_workers bst, results = train_ray( train_files=final_files, eval_files=eval_files, num_workers=final_workers, num_boost_round=num_boost_round, regression=regression, use_gpu=use_gpu, ray_params=ray_params, xgboost_params=xgboost_params, ft_manager=None, aws=aws, early_stopping_rounds=10, ) return results if condition == "fewer_workers": # Sanity check: Just train with fewer workers remove_shards = [] if affected_workers is not None: for rank in affected_workers: remove_shards += _get_sharding_indices( sharding=sharding_mode, rank=rank, num_actors=num_workers, n=len(train_files), ) mask = np.ones(len(train_files), dtype=bool) mask[remove_shards] = False final_files = np.array(train_files)[mask].tolist() final_workers = num_workers - len(affected_workers) else: final_files = train_files final_workers = num_workers ray_params.num_actors = final_workers bst, results = train_ray( train_files=final_files, eval_files=eval_files, num_workers=final_workers, num_boost_round=num_boost_round, regression=regression, use_gpu=use_gpu, ray_params=ray_params, xgboost_params=xgboost_params, ft_manager=None, aws=aws, ) return results if num_affected_workers == 0: # No duplicate baseline runs return {metric: float("inf")} if condition == "non_elastic": # Non-elastic training: Actors die after 50% and come back ray_params.elastic_training = False ray_params.max_failed_actors = len(affected_workers) ray_params.max_actor_restarts = 1 ft_manager = ft_setup( workers=affected_workers, num_rounds=num_boost_round, die_round_factor=0.5, comeback_round_factor=0.0, ) elif condition == "elastic_no_comeback": # Elastic training: Actors die after 50% and don't come back ray_params.elastic_training = True ray_params.max_failed_actors = len(affected_workers) 
if __name__ == "__main__":
    # Command line entry point of the fault tolerance benchmark: collect the
    # parquet files, connect to Ray and run every experiment condition as a
    # Ray Tune trial.
    parser = argparse.ArgumentParser(description="Process some integers.")

    parser.add_argument("num_workers", type=int, help="num workers")
    parser.add_argument("num_rounds", type=int, help="num boost rounds")
    parser.add_argument("num_files", type=int, help="num files")

    parser.add_argument("--cpu", default=0, type=int, help="num cpus per worker")

    parser.add_argument(
        "--file", default="/data/parted.parquet", type=str, help="data file"
    )

    parser.add_argument(
        "--regression", action="store_true", default=False, help="regression"
    )

    parser.add_argument("--gpu", action="store_true", default=False, help="gpu")

    parser.add_argument(
        "--calibrate", action="store_true", default=False, help="calibrate boost rounds"
    )

    parser.add_argument(
        "--smoke-test", action="store_true", default=False, help="smoke test"
    )

    args = parser.parse_args()

    num_workers = args.num_workers
    num_boost_round = args.num_rounds
    num_files = args.num_files
    use_gpu = args.gpu

    aws = None

    temp_dir = None
    if args.smoke_test:
        # NOTE(review): the CPU/GPU benchmark unpacks this helper's second
        # return value as a single path; confirm it is usable as a list of
        # files here.
        temp_dir, files = create_parquet_in_tempdir(
            filename="smoketest.parquet",
            num_rows=args.num_workers * 500,
            num_features=4,
            num_classes=2,
            num_partitions=args.num_workers * 10,
        )
        # Smoke tests always run on CPU.
        use_gpu = False
    else:
        path = args.file
        if path.startswith("s3://"):
            # Expected form: s3://bucket/prefix#<number of partitions>
            base, num_partitions = path.split("#", maxsplit=1)
            num_partitions = int(num_partitions)
            files = [
                f"{base}/partition={i}/part_{i}.parquet" for i in range(num_partitions)
            ]
            print(
                f"Using S3 dataset with base {base} and "
                f"{num_partitions} partitions."
            )
            try:
                aws = {
                    "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
                    "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
                    "AWS_SESSION_TOKEN": os.environ["AWS_SESSION_TOKEN"],
                }
            except KeyError as e:
                raise ValueError(
                    "Trying to access AWS S3, but credentials are not set "
                    "in the environment. Did you forget to set your "
                    "credentials?"
                ) from e
        elif not os.path.exists(path):
            raise ValueError(
                f"Benchmarking data not found: {path}."
                f"\nFIX THIS by running `python create_test_data.py` first."
            )
        else:
            files = sorted(glob.glob(f"{path}/**/*.parquet"))
            print(
                f"Using local dataset with base {path} and "
                f"{len(files)} partitions."
            )

    if num_files:
        if not files:
            # Doubling an empty list never grows it; fail instead of
            # looping forever below.
            raise ValueError("No parquet files found to run the benchmark on.")
        while num_files > len(files):
            files = files + files
        files = files[0:num_files]

    if args.smoke_test:
        ray.init(num_cpus=num_workers)
    else:
        ray.init(address="auto")

    ray_params = RayParams(
        num_actors=num_workers,
        cpus_per_actor=args.cpu,
        checkpoint_frequency=1,
    )

    config = {
        "num_workers": num_workers,
        "num_boost_round": num_boost_round,
        "seed": 1000,
        "condition": tune.grid_search(
            [
                "fewer_workers",
                "non_elastic",
                "elastic_no_comeback",
                "elastic_comeback",
            ]
        ),
        "affected_workers": tune.grid_search([0, 1, 2, 3]),
        "regression": args.regression,
        # Use the effective value so that the smoke-test override applies.
        "use_gpu": use_gpu,
        "xgboost_params": {},
        "ray_params": ray_params,
    }

    if args.calibrate:
        config["condition"] = "calibrate"
        config["affected_workers"] = 0

    metric = "eval-error" if not args.regression else "eval-rmse"
    train_metric = "train-error" if not args.regression else "train-rmse"

    reporter = CLIReporter(
        parameter_columns=["condition", "affected_workers"],
        metric_columns=[
            metric,
            "eval-logloss",
            train_metric,
            "total_n",
            "time_total_s",
        ],
        print_intermediate_tables=True,
    )

    analysis = tune.run(
        tune.with_parameters(run_experiments, files=files, aws=aws),
        config=config,
        metric=metric,
        mode="min",
        resources_per_trial=ray_params.get_tune_resources(),
        reuse_actors=True,
        progress_reporter=reporter,
        log_to_file=True,
        verbose=2,
    )

    print(f"Best config: {analysis.best_config} " f"with result {analysis.best_result}")
resources: {"CPU": 4} min_workers: {{env["NUM_WORKERS"] | default(0)}} max_workers: {{env["NUM_WORKERS"] | default(0)}} auth: ssh_user: ubuntu head_node_type: cpu_4_ondemand worker_default_node_type: cpu_4_ondemand file_mounts_sync_continuously: false setup_commands: - pip install -U {{env["RAY_WHEEL"] | default("ray")}} - pip install dask pytest - pip install -U {{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}} ================================================ FILE: xgboost_ray/tests/release/cluster_ft.yaml ================================================ cluster_name: xgboost_ray_release_tests_ft_cluster max_workers: 9 upscaling_speed: 32 idle_timeout_minutes: 15 docker: image: anyscale/ray:nightly container_name: ray_container pull_before_run: true provider: type: aws region: us-west-2 availability_zone: us-west-2a cache_stopped_nodes: false available_node_types: cpu_16_ondemand: node_config: InstanceType: m5.4xlarge resources: {"CPU": 16} min_workers: 9 max_workers: 9 file_mounts: { "/release_tests": "./" } auth: ssh_user: ubuntu head_node_type: cpu_16_ondemand worker_default_node_type: cpu_16_ondemand setup_commands: - pip install -U awscli fsspec petastorm s3fs botocore - pip install -U {{env["RAY_WHEEL"] | default("ray")}} - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh file_mounts_sync_continuously: false ================================================ FILE: xgboost_ray/tests/release/cluster_gpu.yaml ================================================ cluster_name: xgboost_ray_release_tests_gpu_{{env["NUM_WORKERS"] | default(0)}} max_workers: {{env["NUM_WORKERS"] | default(0)}} upscaling_speed: 9999 idle_timeout_minutes: 15 docker: image: anyscale/ray:nightly-gpu container_name: ray_container pull_before_run: true run_options: - --privileged provider: type: aws region: us-west-2 availability_zone: us-west-2a cache_stopped_nodes: false available_node_types: 
gpu_4_ondemand: node_config: InstanceType: p2.xlarge resources: {"CPU": 4, "GPU": 1} min_workers: {{env["NUM_WORKERS"] | default(0)}} max_workers: {{env["NUM_WORKERS"] | default(0)}} auth: ssh_user: ubuntu head_node_type: gpu_4_ondemand worker_default_node_type: gpu_4_ondemand file_mounts: { "~/xgboost_tests": "." } file_mounts_sync_continuously: false setup_commands: - pip install -U pyarrow cupy-cuda101 - pip install -U {{env["RAY_WHEEL"] | default("ray")}} - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh ================================================ FILE: xgboost_ray/tests/release/create_learnable_data.py ================================================ import argparse import os import numpy as np import pandas as pd from sklearn.datasets import make_classification, make_regression if __name__ == "__main__": if "OMP_NUM_THREADS" in os.environ: del os.environ["OMP_NUM_THREADS"] parser = argparse.ArgumentParser(description="Create fake data.") parser.add_argument("filename", type=str, default="/data/parted.parquet/") parser.add_argument( "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" ) parser.add_argument( "-p", "--num-partitions", required=False, type=int, default=100, help="num partitions", ) parser.add_argument( "-c", "--num-cols", required=False, type=int, default=4, help="num columns (features)", ) parser.add_argument( "-C", "--num-classes", required=False, type=int, default=2, help="num classes" ) parser.add_argument( "-s", "--seed", required=False, type=int, default=1234, help="random seed" ) parser.add_argument( "-T", "--target", required=False, type=float, default=0.8, help="target accuracy", ) args = parser.parse_args() seed = int(args.seed) np.random.seed(seed) num_rows = int(args.num_rows) num_cols = int(args.num_cols) num_classes = int(args.num_classes) target = float(args.target) if num_classes > 0: x, y = make_classification( 
n_samples=num_rows, n_features=num_cols, n_informative=num_cols // 2, n_redundant=num_cols // 10, n_repeated=0, n_classes=num_classes, n_clusters_per_class=2, flip_y=1 - target, random_state=seed, ) else: x, y = make_regression( n_samples=num_rows, n_features=num_cols, n_informative=num_cols // 2, n_targets=1, noise=0.1, random_state=seed, ) filename = args.filename num_partitions = args.num_partitions data = pd.DataFrame(x, columns=[f"feature_{i}" for i in range(num_cols)]) rows_per_partition = np.floor(len(data) / num_partitions) partition_arr = np.repeat(np.arange(num_partitions), repeats=rows_per_partition) if len(partition_arr) < len(data): # If this was not evenly divided, append missing = len(data) - len(partition_arr) partition_arr = np.append(partition_arr, np.arange(missing)) partition = pd.Series(partition_arr, copy=False, dtype=np.int32) data["labels"] = y data["partition"] = partition os.makedirs(filename, 0o755, exist_ok=True) # Write partition-wise to avoid OOM errors for i in range(num_partitions): part = data[partition_arr == i] part.to_parquet( filename, partition_cols=["partition"], engine="pyarrow", partition_filename_cb=lambda key: f"part_{key[0]}.parquet", ) ================================================ FILE: xgboost_ray/tests/release/create_test_data.py ================================================ import argparse import os import numpy as np from xgboost_ray.tests.utils import create_parquet if __name__ == "__main__": if "OMP_NUM_THREADS" in os.environ: del os.environ["OMP_NUM_THREADS"] parser = argparse.ArgumentParser(description="Create fake data.") parser.add_argument( "filename", type=str, default="/data/parted.parquet/", help="ray/dask" ) parser.add_argument( "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" ) parser.add_argument( "-p", "--num-partitions", required=False, type=int, default=100, help="num partitions", ) parser.add_argument( "-c", "--num-cols", required=False, type=int, default=4, help="num 
class XGBoostDistributedAPITest(XGBoostAPITest):
    """XGBoost API tests with Ray initialized via ``address="auto"``."""

    def _init_ray(self):
        """Initialize Ray with ``address="auto"`` unless it already is."""
        if ray.is_initialized():
            return
        ray.init(address="auto")
#!/bin/bash
# Start the CPU release-test cluster described in cluster_cpu.yaml.
#
# Environment:
#   XGBOOST_RAY_PACKAGE  package spec to install (default: xgboost_ray)
#   NUM_WORKERS          number of worker nodes (default: 3)
#   SESSION_NAME         session name (default: timestamped name)

# The script must be run from an initialized Anyscale project directory.
if [ ! -f "./.anyscale.yaml" ]; then
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
fi

# Exported so that the cluster config template can read them.
export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}"
export NUM_WORKERS="${NUM_WORKERS:-3}"

SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_cpu_$(date +%s)}

echo "Starting CPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)"
echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}"

CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_cpu.yaml ${SESSION_NAME}"
echo "Running: ${CMD}"
# Intentionally unquoted: the command string is split into words here.
${CMD}
#!/bin/bash
# Submit the CPU/GPU benchmark to an Anyscale session.
#
# All script arguments are appended to the benchmark_cpu_gpu.py invocation.
#
# Environment:
#   SESSION_NAME  if set, passed as --session-name
#   NO_TMUX       set to 1 to drop the --tmux flag

# The script must be run from an initialized Anyscale project directory.
if [ ! -f "./.anyscale.yaml" ]; then
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
fi

# $* joins all script arguments into the command string.
ANYSCALE_CMD="python ~/xgboost_tests/benchmark_cpu_gpu.py $*"

# Only add the session flag when a session name was provided.
SESSION_STR=""
if [ -n "${SESSION_NAME}" ]; then
  SESSION_STR="--session-name ${SESSION_NAME}"
fi

# --tmux is passed by default; NO_TMUX=1 removes it.
TMUX="--tmux"
if [ "${NO_TMUX}" = "1" ]; then
  TMUX=""
fi

CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}"
echo "Running: ${CMD}"
# Intentionally unquoted: the command string is split into words here.
${CMD}
Please run 'anyscale init'" exit 1 fi ANYSCALE_CMD="python ~/xgboost_tests/benchmark_ft.py $*" SESSION_STR="" if [ -n "${SESSION_NAME}" ]; then SESSION_STR="--session-name ${SESSION_NAME}" fi TMUX="--tmux" if [ "${NO_TMUX}" = "1" ]; then TMUX="" fi CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}" echo "Running: ${CMD}" ${CMD} ================================================ FILE: xgboost_ray/tests/release/tune_cluster.yaml ================================================ cluster_name: xgboost_ray_release_tests_tune min_workers: 4 max_workers: 4 initial_workers: 4 autoscaling_mode: default docker: image: "rayproject/ray:latest" container_name: ray_container pull_before_run: false run_options: - --privileged target_utilization_fraction: 0.8 idle_timeout_minutes: 5 provider: type: aws region: us-west-2 availability_zone: us-west-2a cache_stopped_nodes: true auth: ssh_user: ubuntu head_node: InstanceType: m5.xlarge ImageId: ami-05ac7a76b4c679a79 worker_nodes: InstanceType: m5.xlarge ImageId: ami-05ac7a76b4c679a79 InstanceMarketOptions: MarketType: spot file_mounts: { "/release_tests": "./" } cluster_synced_files: [] file_mounts_sync_continuously: true initialization_commands: [] setup_commands: - pip install -U ray - pip install -U git+https://github.com/ray-project/xgboost_ray#egg=xgboost-ray - pip install -U git+https://github.com/amogkam/xgboost_ray.git@colocation#egg=xgboost-ray - mkdir -p /data - rm -rf /data/tune_test.parquet || true - python /release_tests/create_test_data.py /data/tune_test.parquet --seed 1234 --num-rows 2000 --num-cols 4 --num-partitions 40 --num-classes 2 head_setup_commands: [] worker_setup_commands: [] head_start_ray_commands: - ray stop - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"actor_cpus\": 0}'" worker_start_ray_commands: - ray stop - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 
def tune_test(
    path,
    num_trials,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    fake_data=False,
    smoke_test=False,
):
    """Run Ray Tune trials and check that no node hosts more than one trial.

    Every trial trains via ``train_ray`` with a ``PlacementCallback`` that
    reports the node IP of each training actor. The IPs are stored in the
    trial checkpoint and compared across trials once ``tune.run`` returns.

    Args:
        path: Parquet data to train on. Ignored when ``fake_data`` or
            ``smoke_test`` is set.
        num_trials: Number of Tune samples to run.
        num_workers: Number of training actors per trial.
        num_boost_rounds: Number of boosting rounds, forwarded to
            ``train_ray``.
        num_files: Forwarded to ``train_ray``.
        regression: Forwarded to ``train_ray``.
        use_gpu: Request one GPU per actor and use the ``gpu_hist`` tree
            method.
        fake_data: Create a temporary parquet dataset inside each trial.
        smoke_test: Same data handling as ``fake_data``; also forwarded to
            ``train_ray``.

    Raises:
        ValueError: If actors of different trials reported the same node IP.
    """
    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=0,
        num_actors=num_workers,
        cpus_per_actor=1,
        gpus_per_actor=0 if not use_gpu else 1,
    )

    def local_train(config):
        temp_dir = None
        if fake_data or smoke_test:
            temp_dir = "/tmp/release_test_data"
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

            os.makedirs(temp_dir, 0o755)
            local_path = os.path.join(temp_dir, "smoketest.parquet")

            # Size the fake data from the ``num_workers`` argument rather than
            # from the script-level ``args`` namespace, so this function also
            # works when ``tune_test`` is not invoked through ``__main__``.
            create_parquet(
                filename=local_path,
                num_rows=num_workers * 500,
                num_features=4,
                num_classes=2,
                num_partitions=num_workers * 10,
            )
        else:
            if not os.path.exists(path):
                raise ValueError(
                    f"Benchmarking data not found: {path}."
                    f"\nFIX THIS by running `python create_test_data.py` "
                    f"on all nodes first."
                )
            local_path = path

        xgboost_params = {
            "tree_method": "hist" if not use_gpu else "gpu_hist",
        }

        xgboost_params.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )

        # Sampled hyperparameters take precedence over the defaults above.
        xgboost_params.update(config)

        additional_results = {}

        bst, time_taken = train_ray(
            path=local_path,
            num_workers=num_workers,
            num_boost_rounds=num_boost_rounds,
            num_files=num_files,
            regression=regression,
            use_gpu=use_gpu,
            smoke_test=smoke_test,
            ray_params=ray_params,
            xgboost_params=xgboost_params,
            # kwargs
            additional_results=additional_results,
            callbacks=[PlacementCallback()],
        )

        bst.save_model("tuned.xgb")

        # Flatten the per-actor callback returns into one list of node IPs.
        trial_ips = [
            ip for ips in additional_results["callback_returns"] for ip in ips
        ]

        tune_trial = get_trial_id()
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            with open(
                os.path.join(temp_checkpoint_dir, "callback_returns.json"), "wt"
            ) as f:
                json.dump({tune_trial: trial_ips}, f)
            ray.train.report(
                {}, checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir)
            )

        if temp_dir:
            shutil.rmtree(temp_dir)

    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }

    analysis = tune.run(
        local_train,
        config=search_space,
        num_samples=num_trials,
        sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer),
        resources_per_trial=ray_params.get_tune_resources(),
    )

    # In our PACK scheduling, we expect that each IP hosts only workers
    # for one Ray Tune trial.
    ip_to_trials = defaultdict(list)
    for trial in analysis.trials:
        with open(
            os.path.join(trial.checkpoint.value, "callback_returns.json"), "rt"
        ) as f:
            trial_to_ips = json.load(f)

        for tune_trial, ips in trial_to_ips.items():
            for node_ip in ips:
                ip_to_trials[node_ip].append(tune_trial)

    fail = False
    for ip, trial_ids in ip_to_trials.items():
        print(f"For IP {ip} got trial IDs {trial_ids}")
        fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids)

    if fail:
        raise ValueError("Different trial IDs found on same node.")
    else:
        print("Success.")
@pytest.mark.skipif(os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests")
def test_simple_tune(start_client_server_4_cpus):
    """Run the Ray Tune example while connected through the Ray client.

    Skipped unless the ``TUNE`` environment variable is set to ``"1"``.
    """
    assert ray.util.client.ray.is_connected()
    from xgboost_ray.examples.simple_tune import main

    main(cpus_per_actor=1, num_actors=1, num_samples=4)
class _MockQueueActor(_QueueActor):
    """Queue actor that can additionally report which node it runs on."""

    def get_node_id(self):
        """Return the node ID taken from this actor's Ray runtime context."""
        runtime_context = ray.get_runtime_context()
        return runtime_context.get_node_id()
"tree_method": "hist", "nthread": 1, "max_depth": 2, "objective": "binary:logistic", "seed": 1000, } self.kwargs = {} self.tmpdir = str(tempfile.mkdtemp()) self.die_lock_file = "/tmp/died_worker.lock" if os.path.exists(self.die_lock_file): os.remove(self.die_lock_file) def tearDown(self) -> None: if os.path.exists(self.tmpdir): shutil.rmtree(self.tmpdir) ray.shutdown() @patch("ray.util.queue._QueueActor", _MockQueueActor) @patch("xgboost_ray.util._EventActor", _MockEventActor) def test_communication_colocation(self): """Checks that Queue and Event actors are colocated with the driver.""" os.environ["RXGB_COMMUNICATION_SOFT_PLACEMENT"] = "0" with self.ray_start_cluster() as cluster: cluster.add_node(num_cpus=3) cluster.add_node(num_cpus=3) cluster.wait_for_nodes() ray.init(address=cluster.address) local_node = ray.get_runtime_context().get_node_id() # Note that these will have the same IP in the test cluster assert len(ray.nodes()) == 2 assert local_node in [node["NodeID"] for node in ray.nodes()] def _mock_train(*args, _training_state, **kwargs): assert ( ray.get(_training_state.queue.actor.get_node_id.remote()) == local_node ) assert ( ray.get(_training_state.stop_event.actor.get_node_id.remote()) == local_node ) return _train(*args, _training_state=_training_state, **kwargs) with patch("xgboost_ray.main._train") as mocked: mocked.side_effect = _mock_train train( self.params, RayDMatrix(self.x, self.y), num_boost_round=2, ray_params=RayParams(max_actor_restarts=1, num_actors=6), ) os.environ.pop("RXGB_COMMUNICATION_SOFT_PLACEMENT", None) def test_no_tune_spread(self): """Tests whether workers are spread when not using Tune.""" with self.ray_start_cluster() as cluster: cluster.add_node(num_cpus=2) cluster.add_node(num_cpus=2) cluster.wait_for_nodes() ray.init(address=cluster.address) ray_params = RayParams(max_actor_restarts=1, num_actors=2, cpus_per_actor=2) def _mock_train(*args, _training_state, **kwargs): try: results = _train(*args, 
_training_state=_training_state, **kwargs) return results except Exception: raise finally: assert len(_training_state.actors) == 2 if not any(a is None for a in _training_state.actors): actor_infos = ray.state.actors() actor_nodes = [] for a in _training_state.actors: actor_info = actor_infos.get(a._actor_id.hex()) actor_node = actor_info["Address"]["NodeID"] actor_nodes.append(actor_node) assert actor_nodes[0] != actor_nodes[1] with patch("xgboost_ray.main._train", _mock_train): train( self.params, RayDMatrix(self.x, self.y), num_boost_round=4, ray_params=ray_params, ) def test_tune_pack(self): """Tests whether workers are packed when using Tune.""" try: from ray import tune except ImportError: self.skipTest("Tune is not installed.") return with self.ray_start_cluster() as cluster: num_actors = 2 cluster.add_node(num_cpus=3) cluster.add_node(num_cpus=3) ray.init(address=cluster.address) ray_params = RayParams( max_actor_restarts=1, num_actors=num_actors, cpus_per_actor=1 ) def _mock_train(*args, _training_state, **kwargs): try: results = _train(*args, _training_state=_training_state, **kwargs) return results except Exception: raise finally: assert len(_training_state.actors) == num_actors if not any(a is None for a in _training_state.actors): actor_infos = ray.state.actors() actor_nodes = [] for a in _training_state.actors: actor_info = actor_infos.get(a._actor_id.hex()) actor_node = actor_info["Address"]["NodeID"] actor_nodes.append(actor_node) assert actor_nodes[0] == actor_nodes[1] def train_func(params, x, y, ray_params): def inner_func(config): with patch("xgboost_ray.main._train", _mock_train): train( params, RayDMatrix(x, y), num_boost_round=4, ray_params=ray_params, ) return inner_func tune.run( train_func(self.params, self.x, self.y, ray_params), resources_per_trial=ray_params.get_tune_resources(), num_samples=1, ) def test_timeout(self): """Checks that an error occurs when placement group setup times out.""" os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = 
"5" with self.ray_start_cluster() as cluster: ray.init(address=cluster.address) with self.assertRaises(TimeoutError): train( self.params, RayDMatrix(self.x, self.y), num_boost_round=2, ray_params=RayParams( max_actor_restarts=1, num_actors=2, resources_per_actor={"invalid": 1}, ), ) if __name__ == "__main__": import sys import pytest # noqa: F811 sys.exit(pytest.main(["-v", __file__])) ================================================ FILE: xgboost_ray/tests/test_data_source.py ================================================ import unittest from typing import List, Sequence from unittest.mock import patch import numpy as np import pandas as pd import ray from ray import ObjectRef from xgboost_ray.data_sources import Dask, Modin, Partitioned from xgboost_ray.data_sources.dask import DASK_INSTALLED from xgboost_ray.data_sources.modin import MODIN_INSTALLED from xgboost_ray.main import _RemoteRayXGBoostActor class _DistributedDataSourceTest: def setUp(self): repeat = 8 # Repeat data a couple of times for stability self.x = np.repeat(range(8), 16).reshape((32, 4)) self.y = np.array([0, 1, 2, 3] * repeat) self._init_ray() def tearDown(self) -> None: if ray.is_initialized(): ray.shutdown() def _init_ray(self): if not ray.is_initialized(): ray.init(num_cpus=1) def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): raise NotImplementedError def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): raise NotImplementedError def testAssignEvenTrivial(self): """Assign actors to co-located partitions, trivial case. In this test case, partitions are already evenly distributed across nodes. 
""" part_nodes = [0, 0, 1, 1, 2, 2, 3, 3] actor_nodes = [0, 1, 2, 3] expected_actor_parts = { 0: [0, 1], 1: [2, 3], 2: [4, 5], 3: [6, 7], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignEvenRedistributeOne(self): """Assign actors to co-located partitions, non-trivial case. Here two actors are located on node0, but only one can be filled with co-located partitions. The other one has to fetch data from node1. """ part_nodes = [0, 0, 0, 1, 1, 1, 2, 2] actor_nodes = [0, 0, 1, 2] expected_actor_parts = { 0: [0, 2], 1: [1, 5], 2: [3, 4], 3: [6, 7], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignEvenRedistributeMost(self): """Assign actors to co-located partitions, redistribute case. In this test case, partitions are all on one node. """ part_nodes = [0, 0, 0, 0, 0, 0, 0, 0] actor_nodes = [0, 1, 2, 3] expected_actor_parts = { 0: [0, 1], 1: [2, 5], 2: [3, 6], 3: [4, 7], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) # This part of the test never works - Modin materializes partitions # onto different nodes while unwrapping. # self._testDataSourceAssignment(part_nodes, actor_nodes, # expected_actor_parts) def testAssignUnevenTrivial(self): """Assign actors to co-located partitions, trivial uneven case. In this test case, not all actors get the same amount of partitions. """ part_nodes = [0, 0, 0, 1, 1, 2, 2, 2] actor_nodes = [0, 1, 2] expected_actor_parts = { 0: [0, 1, 2], 1: [3, 4], 2: [5, 6, 7], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignUnevenRedistribute(self): """Assign actors to co-located partitions, redistribute uneven case. 
In this test case, not all actors get the same amount of partitions. Some actors have to fetch partitions from other nodes """ part_nodes = [0, 0, 1, 1, 1, 1, 2, 3] actor_nodes = [0, 1, 2] expected_actor_parts = { 0: [0, 1, 5], 1: [2, 3, 4], 2: [6, 7], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignUnevenRedistributeColocated(self): """Assign actors to co-located partitions, redistribute uneven case. Here we have an uneven split of partitions. One actor does not get a co-located shard assigned in favor for a non-coloated actor. """ part_nodes = [0, 0, 0, 0, 0, 0, 0] actor_nodes = [0, 0, 1] expected_actor_parts = { 0: [0, 2, 4], 1: [1, 3], 2: [5, 6], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignUnevenRedistributeAll(self): """Assign actors to co-located partitions, redistribute uneven case. In this test case, not all actors get the same amount of partitions. Some actors have to fetch partitions from other nodes """ part_nodes = [1, 1, 1, 1, 0, 0, 0] actor_nodes = [1, 1, 2] expected_actor_parts = { 0: [0, 2, 4], 1: [1, 3], 2: [5, 6], } self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) @unittest.skipIf( not MODIN_INSTALLED, reason="Modin is not installed in a supported version." 
) class ModinDataSourceTest(_DistributedDataSourceTest, unittest.TestCase): """This test suite validates core RayDMatrix functionality.""" def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): partitions = [ray.put(p) for p in np.array_split(self.x, len(part_nodes))] # Dict from partition (obj ref) to node host part_to_node = dict(zip(partitions, [f"node{n}" for n in part_nodes])) node_to_part = [(ray.put(n), p) for p, n in part_to_node.items()] actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes)) actor_to_parts = self._getActorToParts(actors_to_node, node_to_part) for actor_rank, part_ids in expected_actor_parts.items(): for i, part_id in enumerate(part_ids): self.assertEqual( actor_to_parts[actor_rank][i], partitions[part_id], msg=f"Assignment failed: Actor rank {actor_rank}, " f"partition {i} is not partition with ID {part_id}.", ) def _getActorToParts(self, actors_to_node, node_to_part): def unwrap(data, *args, **kwargs): return data def actor_ranks(actors): return actors_to_node with patch( "modin.distributed.dataframe.pandas.unwrap_partitions" ) as mock_unwrap, patch( "xgboost_ray.data_sources.modin.get_actor_rank_ips" ) as mock_ranks: mock_unwrap.side_effect = unwrap mock_ranks.side_effect = actor_ranks _, actor_to_parts = Modin.get_actor_shards(data=node_to_part, actors=[]) return actor_to_parts def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]] if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1: print("Not running on cluster, skipping rest of this test.") return actor_node_ips = [node_ips[nid] for nid in actor_nodes] part_node_ips = [node_ips[nid] for nid in part_nodes] # Initialize data frames on remote nodes # This way we can control which partition is on which node @ray.remote(num_cpus=0.1) def create_remote_df(arr): return ray.put(pd.DataFrame(arr)) partitions = np.array_split(self.x, 
len(part_nodes)) node_dfs: Sequence[ObjectRef] = ray.get( [ create_remote_df.options(resources={f"node:{pip}": 0.1}).remote( partitions[pid] ) for pid, pip in enumerate(part_node_ips) ] ) node_ip_dfs = [ (ray.put(part_node_ips[pid]), node_df) for pid, node_df in enumerate(node_dfs) ] # Create modin dataframe from distributed partitions from modin.distributed.dataframe.pandas import ( from_partitions, unwrap_partitions, ) modin_df = from_partitions(node_ip_dfs, axis=0) # Sanity check unwrapped = unwrap_partitions(modin_df, axis=0, get_ip=True) ip_objs, df_objs = zip(*unwrapped) try: self.assertSequenceEqual( [df[0][0] for df in partitions], [df[0][0] for df in ray.get(list(df_objs))], msg="Modin mixed up the partition order", ) self.assertSequenceEqual( part_node_ips, ray.get(list(ip_objs)), msg="Modin moved partitions to different IPs", ) except AssertionError as exc: print(f"Modin part of the test failed: {exc}") print( "This is a stochastic test failure. Ignoring the rest " "of this test." ) return # Create ray actors actors = [ _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote( rank=rank, num_actors=len(actor_nodes) ) for rank, nip in enumerate(actor_node_ips) ] # Calculate shards _, actor_to_parts = Modin.get_actor_shards(modin_df, actors) for actor_rank, part_ids in expected_actor_parts.items(): for i, part_id in enumerate(part_ids): assigned_df = ray.get(actor_to_parts[actor_rank][i]) part_df = pd.DataFrame(partitions[part_id]) self.assertTrue( assigned_df.equals(part_df), msg=f"Assignment failed: Actor rank {actor_rank}, " f"partition {i} is not partition with ID {part_id}.", ) @unittest.skipIf( not DASK_INSTALLED, reason="Dask is not installed in a supported version." 
) class DaskDataSourceTest(_DistributedDataSourceTest, unittest.TestCase): """This test suite validates core RayDMatrix functionality.""" def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): partitions = list(range(len(part_nodes))) # Dict from partition (id) to node host part_to_node = dict( zip(range(len(partitions)), [f"node{n}" for n in part_nodes]) ) node_to_part = [(n, p) for p, n in part_to_node.items()] actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes)) actor_to_parts = self._getActorToParts(actors_to_node, node_to_part) for actor_rank, part_ids in expected_actor_parts.items(): for i, part_id in enumerate(part_ids): self.assertEqual( actor_to_parts[actor_rank][i], partitions[part_id], msg=f"Assignment failed: Actor rank {actor_rank}, " f"partition {i} is not partition with ID {part_id}.", ) def _getActorToParts(self, actors_to_node, node_to_part): def ip_to_parts(data, *args, **kwargs): from collections import defaultdict ip_to_parts_dict = defaultdict(list) for node, pid in data: ip_to_parts_dict[node].append(pid) return ip_to_parts_dict def actor_ranks(actors): return actors_to_node with patch( "xgboost_ray.data_sources.dask.get_ip_to_parts" ) as mock_parts, patch( "xgboost_ray.data_sources.dask.get_actor_rank_ips" ) as mock_ranks: mock_parts.side_effect = ip_to_parts mock_ranks.side_effect = actor_ranks _, actor_to_parts = Dask.get_actor_shards(data=node_to_part, actors=[]) return actor_to_parts def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): self.skipTest("Data-locality aware scheduling using Dask is currently broken.") import dask import dask.dataframe as dd from ray.util.dask import ray_dask_get dask.config.set(scheduler=ray_dask_get) node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]] if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1: print("Not running on cluster, skipping rest of this test.") return actor_node_ips = [node_ips[nid] 
for nid in actor_nodes] part_node_ips = [node_ips[nid] for nid in part_nodes] # Initialize data frames on remote nodes # This way we can control which partition is on which node @ray.remote(num_cpus=0.1) def create_remote_df(arr): return dd.from_array(arr) partitions = np.array_split(self.x, len(part_nodes)) node_dfs: List[dd.DataFrame] = ray.get( [ create_remote_df.options(resources={f"node:{pip}": 0.1}).remote( partitions[pid] ) for pid, pip in enumerate(part_node_ips) ] ) node_dfs_concat = dd.concat(node_dfs).persist() # Get node IPs partition_locations_df = node_dfs_concat.map_partitions( lambda df: pd.DataFrame([ray.util.get_node_ip_address()]) ).compute() partition_locations = [ partition_locations_df[0].iloc[i] for i in range(partition_locations_df.size) ] # Create dask dataframe from distributed partitions dask_df = dd.concat(node_dfs, axis=0) # Sanity check # persisted = dask_df.persist() try: self.assertSequenceEqual( [df[0][0] for df in partitions], [df[0][0] for df in dask_df.partitions.compute()], msg="Dask mixed up the partition order", ) self.assertSequenceEqual( part_node_ips, partition_locations, msg="Dask moved partitions to different IPs", ) except AssertionError as exc: print(f"Dask part of the test failed: {exc}") print( "This is a stochastic test failure. Ignoring the rest " "of this test." 
) return # Create ray actors actors = [ _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote( rank=rank, num_actors=len(actor_nodes) ) for rank, nip in enumerate(actor_node_ips) ] # Calculate shards _, actor_to_parts = Dask.get_actor_shards(dask_df, actors) for actor_rank, part_ids in expected_actor_parts.items(): for i, part_id in enumerate(part_ids): assigned_df = ray.get(actor_to_parts[actor_rank][i]) part_df = pd.DataFrame(partitions[part_id]) self.assertTrue( assigned_df.equals(part_df), msg=f"Assignment failed: Actor rank {actor_rank}, " f"partition {i} is not partition with ID {part_id}.", ) # Ray Datasets data source is not tested, as we do not make use of xgboost-ray # partition-to-actor assign logic. Furthermore, xgboost-ray with Ray Datasets # is tested in ray-project/ray. class PartitionedSourceTest(_DistributedDataSourceTest, unittest.TestCase): def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): partitions = [ ray.put(pd.DataFrame(p)) for p in np.array_split(self.x, len(part_nodes)) ] # Dict from partition (obj ref) to node host part_to_node = dict(zip(partitions, [f"node{n}" for n in part_nodes])) actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes)) actor_to_parts = self._getActorToParts( actors_to_node, partitions, part_to_node, part_nodes ) for actor_rank, part_ids in expected_actor_parts.items(): for i, part_id in enumerate(part_ids): self.assertEqual( actor_to_parts[actor_rank][i], partitions[part_id], msg=f"Assignment failed: Actor rank {actor_rank}, " f"partition {i} is not partition with ID {part_id}.", ) def _mk_partitioned(self, part_to_node, nr, nc, shapes): class Parted: """Class exposing __partitioned__""" def __init__(self, parted): self.__partitioned__ = parted num_parts = len(part_to_node) data = { "shape": (nr, nc), "partition_tiling": (num_parts, 1), "get": lambda x: ray.get(x), "partitions": {}, } startx = 0 for i, pn in enumerate(part_to_node.items()): partref, node = pn 
data["partitions"][(i, 0)] = { "start": (startx, 0), "shape": shapes[partref], "data": partref, "location": [node], } startx = startx + shapes[partref][0] return Parted(data) def _getActorToParts(self, actors_to_node, partitions, part_to_node, part_nodes): def actor_ranks(actors): return actors_to_node with patch( "xgboost_ray.data_sources.partitioned.get_actor_rank_ips" ) as mock_ranks: mock_ranks.side_effect = actor_ranks nr, nc = self.x.shape data = self._mk_partitioned( part_to_node, nr, nc, {p: ray.get(p).shape for p in partitions} ) _, actor_to_parts = Partitioned.get_actor_shards(data=data, actors=[]) return actor_to_parts def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]] if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1: print("Not running on cluster, skipping rest of this test.") return actor_node_ips = [node_ips[nid] for nid in actor_nodes] part_node_ips = [node_ips[nid] for nid in part_nodes] # Initialize data frames on remote nodes # This way we can control which partition is on which node @ray.remote(num_cpus=0.1) def create_remote_df(arr): return ray.put(pd.DataFrame(arr)) partitions = np.array_split(self.x, len(part_nodes)) node_dfs, shapes = {}, {} for pid, pip in enumerate(part_node_ips): pref = ray.get( create_remote_df.options(resources={f"node:{pip}": 0.1}).remote( partitions[pid] ) ) node_dfs[pref] = pip shapes[pref] = partitions[pid].shape nr, nc = self.x.shape # Create structure with __partitioned__ from distributed partitions parted = self._mk_partitioned(node_dfs, nr, nc, shapes) # Create ray actors actors = [ _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote( rank=rank, num_actors=len(actor_nodes) ) for rank, nip in enumerate(actor_node_ips) ] # Calculate shards _, actor_to_parts = Partitioned.get_actor_shards(parted, actors) for actor_rank, part_ids in expected_actor_parts.items(): for i, 
part_id in enumerate(part_ids): assigned_df = ray.get(actor_to_parts[actor_rank][i]) part_df = pd.DataFrame(partitions[part_id]) self.assertTrue( assigned_df.equals(part_df), msg=f"Assignment failed: Actor rank {actor_rank}, " f"partition {i} is not partition with ID {part_id}.", ) if __name__ == "__main__": import sys import pytest sys.exit(pytest.main(["-v", __file__])) ================================================ FILE: xgboost_ray/tests/test_end_to_end.py ================================================ import os import shutil import tempfile import unittest import numpy as np import ray import xgboost as xgb from ray.exceptions import RayActorError, RayTaskError from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from scipy.sparse import csr_matrix from xgboost_ray import RayDMatrix, RayParams, RayShardingMode, predict, train from xgboost_ray.callback import DistributedCallback from xgboost_ray.main import RayXGBoostTrainingError from xgboost_ray.tests.utils import get_num_trees def _make_callback(tmpdir: str) -> DistributedCallback: class TestDistributedCallback(DistributedCallback): logdir = tmpdir def on_init(self, actor, *args, **kwargs): log_file = os.path.join(self.logdir, f"rank_{actor.rank}.log") actor.log_fp = open(log_file, "at") actor.log_fp.write(f"Actor {actor.rank}: Init\n") actor.log_fp.flush() def before_data_loading(self, actor, data, *args, **kwargs): actor.log_fp.write(f"Actor {actor.rank}: Before loading\n") actor.log_fp.flush() def after_data_loading(self, actor, data, *args, **kwargs): actor.log_fp.write(f"Actor {actor.rank}: After loading\n") actor.log_fp.flush() def before_train(self, actor, *args, **kwargs): actor.log_fp.write(f"Actor {actor.rank}: Before train\n") actor.log_fp.flush() def after_train(self, actor, result_dict, *args, **kwargs): actor.log_fp.write(f"Actor {actor.rank}: After train\n") actor.log_fp.flush() def before_predict(self, actor, *args, **kwargs): actor.log_fp.write(f"Actor 
{actor.rank}: Before predict\n") actor.log_fp.flush() def after_predict(self, actor, predictions, *args, **kwargs): actor.log_fp.write(f"Actor {actor.rank}: After predict\n") actor.log_fp.flush() return TestDistributedCallback() class XGBoostRayEndToEndTest(unittest.TestCase): """In this test suite we validate Ray-XGBoost multi class prediction. First, we validate that XGBoost is able to achieve 100% accuracy on a simple training task. Then we split the dataset into two halves. These halves don't have access to all relevant data, so overfit on their respective data. I.e. the first half always predicts feature 2 -> label 2, while the second half always predicts feature 2 -> label 3. We then train using Ray XGBoost. Again both halves will be trained separately, but because of Rabit's allreduce, they should end up being able to achieve 100% accuracy, again.""" def setUp(self): repeat = 8 # Repeat data a couple of times for stability self.x = np.array( [ [1, 0, 0, 0], # Feature 0 -> Label 0 [0, 1, 0, 0], # Feature 1 -> Label 1 [0, 0, 1, 1], # Feature 2+3 -> Label 2 [0, 0, 1, 0], # Feature 2+!3 -> Label 3 ] * repeat ) self.y = np.array([0, 1, 2, 3] * repeat) self.params = { "booster": "gbtree", "nthread": 1, "max_depth": 2, "objective": "multi:softmax", "num_class": 4, } def tearDown(self): if ray.is_initialized: ray.shutdown() def testSingleTraining(self): """Test that XGBoost learns to predict full matrix""" dtrain = xgb.DMatrix(self.x, self.y) bst = xgb.train(self.params, dtrain, num_boost_round=2) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(self.y), list(pred_y)) def testHalfTraining(self): """Test that XGBoost learns to predict half matrices individually""" x_first = self.x[::2] y_first = self.y[::2] x_second = self.x[1::2] y_second = self.y[1::2] # Test case: The first model only sees feature 2 --> label 2 # and the second model only sees feature 2 --> label 3 test_X = xgb.DMatrix(np.array([[0, 0, 1, 1], [0, 0, 1, 0]])) 
test_y_first = [2, 2] test_y_second = [3, 3] # First half dtrain = xgb.DMatrix(x_first, y_first) bst = xgb.train(self.params, dtrain, num_boost_round=2) x_mat = xgb.DMatrix(x_first) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(y_first), list(pred_y)) pred_test = bst.predict(test_X) self.assertSequenceEqual(test_y_first, list(pred_test)) # Second half dtrain = xgb.DMatrix(x_second, y_second) bst = xgb.train(self.params, dtrain, num_boost_round=2) x_mat = xgb.DMatrix(x_second) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(y_second), list(pred_y)) pred_test = bst.predict(test_X) self.assertSequenceEqual(test_y_second, list(pred_test)) def test_client_actor_cpus(self): ray.init(num_cpus=5, num_gpus=0) @ray.remote class DummyTrainActor: def test(self): import xgboost_ray return xgboost_ray.main._ray_get_actor_cpus() actor = DummyTrainActor.options(num_cpus=2).remote() assert ray.get(actor.test.remote()) == 2 pg = ray.util.placement_group([{"CPU": 2}]) ray.get(pg.ready()) actor2 = DummyTrainActor.options( num_cpus=2, scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg), ).remote() assert ray.get(actor2.test.remote()) == 2 def _testJointTraining(self, sharding=RayShardingMode.INTERLEAVED, softprob=False): """Train with Ray. 
The data will be split, but the trees should be combined together and find the true model.""" params = self.params.copy() if softprob: params["objective"] = "multi:softprob" bst = train( params, RayDMatrix(self.x, self.y, sharding=sharding), ray_params=RayParams(num_actors=2), ) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) if softprob: pred_y = np.argmax(pred_y, axis=1) pred_y = pred_y.astype(int) self.assertSequenceEqual(list(self.y), list(pred_y)) x_mat = RayDMatrix(self.x, sharding=sharding) pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2)) if softprob: pred_y = np.argmax(pred_y, axis=1) pred_y = pred_y.astype(int) self.assertSequenceEqual(list(self.y), list(pred_y)) # try on an odd number of rows bst = train( params, RayDMatrix(self.x[:-1], self.y[:-1], sharding=sharding), ray_params=RayParams(num_actors=2), ) x_mat = RayDMatrix(self.x[:-1], sharding=sharding) pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2)) if softprob: pred_y = np.argmax(pred_y, axis=1) pred_y = pred_y.astype(int) self.assertSequenceEqual(list(self.y[:-1]), list(pred_y)) def testJointTrainingInterleaved(self): ray.init(num_cpus=2, num_gpus=0) self._testJointTraining(sharding=RayShardingMode.INTERLEAVED) self._testJointTraining(sharding=RayShardingMode.INTERLEAVED, softprob=True) def testJointTrainingBatch(self): ray.init(num_cpus=2, num_gpus=0) self._testJointTraining(sharding=RayShardingMode.BATCH) self._testJointTraining(sharding=RayShardingMode.BATCH, softprob=True) def testTrainPredict( self, init=True, remote=None, softprob=False, **ray_param_dict ): """Train with evaluation and predict""" if init: ray.init(num_cpus=2, num_gpus=0) dtrain = RayDMatrix(self.x, self.y) params = self.params if softprob: params = params.copy() params["objective"] = "multi:softprob" evals_result = {} bst = train( params, dtrain, num_boost_round=38, ray_params=RayParams(num_actors=2, **ray_param_dict), evals=[(dtrain, "dtrain")], evals_result=evals_result, 
_remote=remote, ) self.assertEqual(get_num_trees(bst), 38) self.assertTrue("dtrain" in evals_result) x_mat = RayDMatrix(self.x) pred_y = predict( bst, x_mat, ray_params=RayParams(num_actors=2, **ray_param_dict), _remote=remote, ) if softprob: self.assertEqual(pred_y.shape[1], len(np.unique(self.y))) pred_y = np.argmax(pred_y, axis=1) self.assertSequenceEqual(list(self.y), list(pred_y)) def testTrainPredictSoftprob(self): """Train with evaluation and predict on softprob objective (which returns predictions in a 2d array) """ self.testTrainPredict(init=True, softprob=True) def testTrainPredictRemote(self): """Train with evaluation and predict in a remote call""" self.testTrainPredict(init=True, remote=True) def testTrainPredictClient(self): """Train with evaluation and predict in a client session""" if ray.__version__ <= "1.2.0": self.skipTest("Ray client mocks do not work in Ray <= 1.2.0") from ray.util.client.ray_client_helpers import ray_start_client_server ray.init(num_cpus=2, num_gpus=0) self.assertFalse(ray.util.client.ray.is_connected()) with ray_start_client_server(): self.assertTrue(ray.util.client.ray.is_connected()) self.testTrainPredict(init=False, remote=None) def testDistributedCallbacksTrainPredict(self, init=True, remote=False): """Test distributed callbacks for train/predict""" tmpdir = tempfile.mkdtemp() test_callback = _make_callback(tmpdir) self.testTrainPredict( init=init, remote=remote, distributed_callbacks=[test_callback] ) rank_0_log_file = os.path.join(tmpdir, "rank_0.log") rank_1_log_file = os.path.join(tmpdir, "rank_1.log") self.assertTrue(os.path.exists(rank_1_log_file)) rank_0_log = open(rank_0_log_file, "rt").read() self.assertEqual( rank_0_log, "Actor 0: Init\n" "Actor 0: Before loading\n" "Actor 0: After loading\n" "Actor 0: Before train\n" "Actor 0: After train\n" "Actor 0: Init\n" "Actor 0: Before loading\n" "Actor 0: After loading\n" "Actor 0: Before predict\n" "Actor 0: After predict\n", ) shutil.rmtree(tmpdir) def 
testDistributedCallbacksTrainPredictClient(self): """Test distributed callbacks for train/predict via Ray client""" if ray.__version__ <= "1.2.0": self.skipTest("Ray client mocks do not work in Ray <= 1.2.0") from ray.util.client.ray_client_helpers import ray_start_client_server ray.init(num_cpus=2, num_gpus=0) self.assertFalse(ray.util.client.ray.is_connected()) with ray_start_client_server(): self.assertTrue(ray.util.client.ray.is_connected()) self.testDistributedCallbacksTrainPredict(init=False, remote=None) def testFailPrintErrors(self): """Test that XGBoost training errors are propagated""" x = np.random.uniform(0, 1, size=(100, 4)) y = np.random.randint(0, 2, size=100) train_set = RayDMatrix(x, y) try: train( { "objective": "multi:softmax", "num_class": 2, "eval_metric": ["logloss", "error"], }, # This will error train_set, evals=[(train_set, "train")], ray_params=RayParams(num_actors=1, max_actor_restarts=0), ) except RuntimeError as exc: self.assertTrue(exc.__cause__) self.assertTrue(isinstance(exc.__cause__, RayActorError)) self.assertTrue(exc.__cause__.__cause__) self.assertTrue(isinstance(exc.__cause__.__cause__, RayTaskError)) self.assertTrue(exc.__cause__.__cause__.cause) self.assertTrue( isinstance(exc.__cause__.__cause__.cause, RayXGBoostTrainingError) ) self.assertIn( "label and prediction size not match", str(exc.__cause__.__cause__) ) def testKwargsValidation(self): x = np.random.uniform(0, 1, size=(100, 4)) y = np.random.randint(0, 1, size=100) train_set = RayDMatrix(x, y) with self.assertRaisesRegex(TypeError, "totally_invalid_kwarg"): train( { "objective": "multi:softmax", "num_class": 2, "eval_metric": ["logloss", "error"], }, train_set, evals=[(train_set, "train")], ray_params=RayParams(num_actors=1, max_actor_restarts=0), totally_invalid_kwarg="", ) def testRanking(self): Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17]) Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3]) X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4)).toarray() y = 
np.array( [ 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ] ) qid = np.array([0] * 5 + [1] * 5 + [2] * 5 + [3] * 5) dtrain = RayDMatrix(X, label=y, qid=qid) params = { "eta": 1, "objective": "rank:pairwise", "eval_metric": ["auc", "aucpr"], "max_depth": 1, } evals_result = {} train( params, dtrain, 10, evals=[(dtrain, "train")], evals_result=evals_result, ray_params=RayParams(num_actors=2, max_actor_restarts=0), ) auc_rec = evals_result["train"]["auc"] self.assertTrue(all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))) auc_rec = evals_result["train"]["aucpr"] self.assertTrue((p <= q for p, q in zip(auc_rec, auc_rec[1:]))) @unittest.skipIf( xgb.__version__ < "1.3.0", f"not supported in xgb version {xgb.__version__}" ) def testFeatureWeightsParam(self): """Test the feature_weights parameter for xgb version >= 1.3.0. Adapted from the official demo codes: http://xgboost.readthedocs.io/en/stable/python/examples/ feature_weights.html""" rng = np.random.RandomState(1994) kRows = 1000 kCols = 10 X = rng.randn(kRows, kCols) y = rng.randn(kRows) fw = np.ones(shape=(kCols,)) for i in range(kCols): fw[i] *= float(i) train_set = RayDMatrix(X, y, feature_weights=fw) evals_result = {} bst = train( { "objective": "reg:squarederror", "eval_metric": ["rmse", "error"], "colsample_bynode": 0.1, }, train_set, num_boost_round=250, evals_result=evals_result, evals=[(train_set, "train")], verbose_eval=False, ray_params=RayParams( num_actors=2, cpus_per_actor=1 # Number of remote actors ), ) feature_map = bst.get_fscore() # feature zero has 0 weight self.assertTrue(feature_map.get("f0", None) is None) self.assertTrue(max(feature_map.values()) == feature_map.get("f9")) if __name__ == "__main__": import sys import pytest sys.exit(pytest.main(["-v", __file__])) ================================================ FILE: xgboost_ray/tests/test_fault_tolerance.py ================================================ import logging import os import 
shutil import tempfile import time import unittest from unittest.mock import DEFAULT, MagicMock, patch import numpy as np import ray import xgboost as xgb from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.main import RayXGBoostActorAvailable from xgboost_ray.tests.fault_tolerance import ( DelayedLoadingCallback, DieCallback, FaultToleranceManager, ) from xgboost_ray.tests.utils import ( _checkpoint_callback, _fail_callback, _kill_callback, flatten_obj, get_num_trees, tree_obj, ) class _FakeTask(MagicMock): ready = False def is_ready(self): return self.ready class XGBoostRayFaultToleranceTest(unittest.TestCase): """In this test suite we validate fault tolerance when a Ray actor dies. For this, we set up a callback that makes one worker die exactly once. """ def setUp(self): # Set default os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0" repeat = 8 # Repeat data a couple of times for stability self.x = np.array( [ [1, 0, 0, 0], # Feature 0 -> Label 0 [0, 1, 0, 0], # Feature 1 -> Label 1 [0, 0, 1, 1], # Feature 2+3 -> Label 2 [0, 0, 1, 0], # Feature 2+!3 -> Label 3 ] * repeat ) self.y = np.array([0, 1, 2, 3] * repeat) self.params = { "booster": "gbtree", "nthread": 1, "max_depth": 2, "objective": "multi:softmax", "num_class": 4, } self.tmpdir = str(tempfile.mkdtemp()) self.die_lock_file = "/tmp/died_worker.lock" if os.path.exists(self.die_lock_file): os.remove(self.die_lock_file) self.die_lock_file_2 = "/tmp/died_worker_2.lock" if os.path.exists(self.die_lock_file_2): os.remove(self.die_lock_file_2) ray.init(num_cpus=2, num_gpus=0, log_to_driver=True) def tearDown(self) -> None: if os.path.exists(self.tmpdir): shutil.rmtree(self.tmpdir) ray.shutdown() if os.path.exists(self.die_lock_file): os.remove(self.die_lock_file) if os.path.exists(self.die_lock_file_2): os.remove(self.die_lock_file_2) def testTrainingContinuationKilled(self): """This should continue after one actor died.""" additional_results = {} keep_actors = {} def keep(actors, *args, 
**kwargs): keep_actors["actors"] = actors.copy() return DEFAULT with patch("xgboost_ray.main._shutdown") as mocked: mocked.side_effect = keep bst = train( self.params, RayDMatrix(self.x, self.y), callbacks=[_kill_callback(self.die_lock_file)], num_boost_round=20, ray_params=RayParams(max_actor_restarts=1, num_actors=2), additional_results=additional_results, ) self.assertEqual(20, get_num_trees(bst)) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(self.y), list(pred_y)) print(f"Got correct predictions: {pred_y}") actors = keep_actors["actors"] # End with two working actors self.assertTrue(actors[0]) self.assertTrue(actors[1]) # Two workers finished, so N=32 self.assertEqual(additional_results["total_n"], 32) def testTrainingContinuationElasticKilled(self): """This should continue after one actor died.""" os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1" logging.getLogger().setLevel(10) additional_results = {} keep_actors = {} def keep(actors, *args, **kwargs): keep_actors["actors"] = actors.copy() return DEFAULT with patch("xgboost_ray.main._shutdown") as mocked: mocked.side_effect = keep bst = train( self.params, RayDMatrix(self.x, self.y), callbacks=[_kill_callback(self.die_lock_file)], num_boost_round=20, ray_params=RayParams( max_actor_restarts=1, num_actors=2, elastic_training=True, max_failed_actors=1, ), additional_results=additional_results, ) self.assertEqual(20, get_num_trees(bst)) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(self.y), list(pred_y)) print(f"Got correct predictions: {pred_y}") actors = keep_actors["actors"] # First actor does not get recreated self.assertEqual(actors[0], None) self.assertTrue(actors[1]) # Only one worker finished, so n=16 self.assertEqual(additional_results["total_n"], 16) def testTrainingContinuationElasticKilledRestarted(self): """This should continue after one actor died and restart it.""" logging.getLogger().setLevel(10) ft_manager = 
FaultToleranceManager.remote() ft_manager.schedule_kill.remote(rank=0, boost_round=6) ft_manager.delay_return.remote(rank=1, start_boost_round=12, end_boost_round=21) delay_callback = DelayedLoadingCallback( ft_manager, reload_data=True, sleep_time=0.1 ) die_callback = DieCallback(ft_manager, training_delay=0.25) additional_results = {} keep_actors = {} def keep(actors, *args, **kwargs): keep_actors["actors"] = actors.copy() return DEFAULT with patch("xgboost_ray.main._shutdown") as mocked: mocked.side_effect = keep bst = train( self.params, RayDMatrix(self.x, self.y), callbacks=[die_callback], num_boost_round=20, ray_params=RayParams( max_actor_restarts=1, num_actors=2, elastic_training=True, max_failed_actors=1, distributed_callbacks=[delay_callback], ), additional_results=additional_results, ) self.assertEqual(20, get_num_trees(bst)) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(self.y), list(pred_y)) print(f"Got correct predictions: {pred_y}") actors = keep_actors["actors"] # First actor gets recreated self.assertTrue(actors[0]) self.assertTrue(actors[1]) # Both workers finished, so n=32 self.assertEqual(additional_results["total_n"], 32) def testTrainingContinuationElasticMultiKilled(self): """This should still show 20 boost rounds after two failures.""" os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1" logging.getLogger().setLevel(10) additional_results = {} bst = train( self.params, RayDMatrix(self.x, self.y), callbacks=[ _kill_callback(self.die_lock_file, fail_iteration=6, actor_rank=0), _kill_callback(self.die_lock_file_2, fail_iteration=14, actor_rank=1), ], num_boost_round=20, ray_params=RayParams( max_actor_restarts=2, num_actors=2, elastic_training=True, max_failed_actors=2, ), additional_results=additional_results, ) self.assertEqual(20, get_num_trees(bst)) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(self.y), list(pred_y)) print(f"Got correct predictions: {pred_y}") def 
testTrainingContinuationElasticFailed(self): """This should continue after one actor failed training.""" os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1" additional_results = {} keep_actors = {} def keep(actors, *args, **kwargs): keep_actors["actors"] = actors.copy() return DEFAULT with patch("xgboost_ray.main._shutdown") as mocked: mocked.side_effect = keep bst = train( self.params, RayDMatrix(self.x, self.y), callbacks=[_fail_callback(self.die_lock_file)], num_boost_round=20, ray_params=RayParams( max_actor_restarts=1, num_actors=2, elastic_training=True, max_failed_actors=1, ), additional_results=additional_results, ) self.assertEqual(20, get_num_trees(bst)) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) self.assertSequenceEqual(list(self.y), list(pred_y)) print(f"Got correct predictions: {pred_y}") actors = keep_actors["actors"] # End with two working actors since only the training failed self.assertTrue(actors[0]) self.assertTrue(actors[1]) # Two workers finished, so n=32 self.assertEqual(additional_results["total_n"], 32) def testTrainingStop(self): """This should now stop training after one actor died.""" # The `train()` function raises a RuntimeError with self.assertRaises(RuntimeError): train( self.params, RayDMatrix(self.x, self.y), callbacks=[_kill_callback(self.die_lock_file)], num_boost_round=20, ray_params=RayParams(max_actor_restarts=0, num_actors=2), ) def testTrainingStopElastic(self): """This should now stop training after one actor died.""" os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0" # The `train()` function raises a RuntimeError ft_manager = FaultToleranceManager.remote() ft_manager.schedule_kill.remote(rank=0, boost_round=3) ft_manager.schedule_kill.remote(rank=1, boost_round=6) ft_manager.delay_return.remote(rank=0, start_boost_round=4, end_boost_round=5) delay_callback = DelayedLoadingCallback( ft_manager, reload_data=True, sleep_time=0.1 ) die_callback = DieCallback(ft_manager, training_delay=0.25) with 
self.assertRaises(RuntimeError): train( self.params, RayDMatrix(self.x, self.y), callbacks=[die_callback], num_boost_round=20, ray_params=RayParams( elastic_training=True, max_failed_actors=1, max_actor_restarts=1, num_actors=2, distributed_callbacks=[delay_callback], ), ) def testCheckpointContinuationValidity(self): """Test that checkpoints are stored and loaded correctly""" # Train once, get checkpoint via callback returns res_1 = {} bst_1 = train( self.params, RayDMatrix(self.x, self.y), callbacks=[_checkpoint_callback(frequency=1, before_iteration_=False)], num_boost_round=2, ray_params=RayParams(num_actors=2), additional_results=res_1, ) last_checkpoint_1 = res_1["callback_returns"][0][-1] last_checkpoint_other_rank_1 = res_1["callback_returns"][1][-1] # Sanity check lc1 = xgb.Booster() lc1.load_model(last_checkpoint_1) self.assertEqual(last_checkpoint_1, last_checkpoint_other_rank_1) self.assertEqual(last_checkpoint_1, lc1.save_raw()) self.assertEqual(bst_1.get_dump(), lc1.get_dump()) # Start new training run, starting from existing model res_2 = {} bst_2 = train( self.params, RayDMatrix(self.x, self.y), callbacks=[ _checkpoint_callback(frequency=1, before_iteration_=True), _checkpoint_callback(frequency=1, before_iteration_=False), ], num_boost_round=4, ray_params=RayParams(num_actors=2), additional_results=res_2, xgb_model=lc1, ) first_checkpoint_2 = res_2["callback_returns"][0][0] first_checkpoint_other_actor_2 = res_2["callback_returns"][1][0] last_checkpoint_2 = res_2["callback_returns"][0][-1] last_checkpoint_other_actor_2 = res_2["callback_returns"][1][-1] fcp_bst = xgb.Booster() fcp_bst.load_model(first_checkpoint_2) lcp_bst = xgb.Booster() lcp_bst.load_model(last_checkpoint_2) # Sanity check self.assertEqual(first_checkpoint_2, first_checkpoint_other_actor_2) self.assertEqual(last_checkpoint_2, last_checkpoint_other_actor_2) self.assertEqual(bst_2.get_dump(), lcp_bst.get_dump()) # Training should not have proceeded for the first checkpoint, # so 
trees should be equal self.assertEqual(lc1.get_dump(), fcp_bst.get_dump()) # Training should have proceeded for the last checkpoint, # so trees should not be equal self.assertNotEqual(fcp_bst.get_dump(), lcp_bst.get_dump()) def testSameResultWithAndWithoutError(self): """Get the same model with and without errors during training.""" # Run training bst_noerror = train( self.params, RayDMatrix(self.x, self.y), num_boost_round=10, ray_params=RayParams(max_actor_restarts=0, num_actors=2), ) bst_2part_1 = train( self.params, RayDMatrix(self.x, self.y), num_boost_round=5, ray_params=RayParams(max_actor_restarts=0, num_actors=2), ) bst_2part_2 = train( self.params, RayDMatrix(self.x, self.y), num_boost_round=5, ray_params=RayParams(max_actor_restarts=0, num_actors=2), xgb_model=bst_2part_1, ) res_error = {} bst_error = train( self.params, RayDMatrix(self.x, self.y), callbacks=[_fail_callback(self.die_lock_file, fail_iteration=7)], num_boost_round=10, ray_params=RayParams( max_actor_restarts=1, num_actors=2, checkpoint_frequency=5 ), additional_results=res_error, ) flat_noerror = flatten_obj({"tree": tree_obj(bst_noerror)}) flat_error = flatten_obj({"tree": tree_obj(bst_error)}) flat_2part = flatten_obj({"tree": tree_obj(bst_2part_2)}) for key in flat_noerror: self.assertAlmostEqual(flat_noerror[key], flat_error[key]) self.assertAlmostEqual(flat_noerror[key], flat_2part[key]) # We fail at iteration 7, but checkpoints are saved at iteration 5 # Thus we have two additional returns here. print("Callback returns:", res_error["callback_returns"][0]) self.assertEqual(len(res_error["callback_returns"][0]), 10 + 2) @patch("xgboost_ray.main._PrepareActorTask", _FakeTask) @patch("xgboost_ray.elastic._PrepareActorTask", _FakeTask) @patch("xgboost_ray.main._RemoteRayXGBoostActor", MagicMock) def testMaybeScheduleNewActors(self): """Test scheduling of new actors if resources become available. Context: We are training with num_actors=8, of which 3 actors are dead. 
The cluster has resources to restart 2 of these actors. In this test, we walk through the `_maybe_schedule_new_actors` and `_update_scheduled_actor_states` methods, checking their state after each call. """ from xgboost_ray.elastic import ( _maybe_schedule_new_actors, _update_scheduled_actor_states, ) from xgboost_ray.main import _TrainingState os.environ["RXGB_ELASTIC_RESTART_GRACE_PERIOD_S"] = "30" # Three actors are dead actors = [ MagicMock(), None, MagicMock(), MagicMock(), None, MagicMock(), None, MagicMock(), ] # Mock training state state = _TrainingState( actors=actors, queue=MagicMock(), stop_event=MagicMock(), checkpoint=MagicMock(), additional_results={}, failed_actor_ranks=set(), ) created_actors = [] def fake_create_actor(rank, *args, **kwargs): created_actors.append(rank) return MagicMock() with patch("xgboost_ray.elastic._create_actor") as create_actor: create_actor.side_effect = fake_create_actor _maybe_schedule_new_actors( training_state=state, num_cpus_per_actor=8, num_gpus_per_actor=0, resources_per_actor={"custom": 1.0}, load_data=[], ray_params=RayParams( num_actors=8, elastic_training=True, max_failed_actors=1, max_actor_restarts=2, ), ) # 3 new actors should have been created self.assertEqual(len(created_actors), 3) self.assertEqual(len(state.pending_actors), 3) # The number of created actors shouldn't change even # if we run this function again. _maybe_schedule_new_actors( training_state=state, num_cpus_per_actor=8, num_gpus_per_actor=0, resources_per_actor={"custom": 1.0}, load_data=[], ray_params=RayParams( num_actors=8, elastic_training=True, max_failed_actors=1, max_actor_restarts=2, ), ) self.assertEqual(len(created_actors), 3) self.assertEqual(len(state.pending_actors), 3) # The actors have not yet been promoted because the # loading task has not finished. 
self.assertFalse(actors[1]) self.assertFalse(actors[4]) self.assertFalse(actors[6]) # Update status, nothing should change _update_scheduled_actor_states(training_state=state) self.assertFalse(actors[1]) self.assertFalse(actors[4]) self.assertFalse(actors[6]) # Set loading task status to finished, but only for first actor for _, (_, task) in state.pending_actors.items(): task.ready = True break # Update status. This shouldn't raise RayXGBoostActorAvailable # because we still have a grace period to wait for the second # actor. _update_scheduled_actor_states(training_state=state) # Grace period is set through ENV.ELASTIC_RESTART_GRACE_PERIOD_S # Allow for some slack in test execution self.assertGreaterEqual(state.restart_training_at, time.time() + 22) # The first actor should have been promoted to full actor self.assertTrue(actors[1]) self.assertFalse(actors[4]) self.assertFalse(actors[6]) # Set loading task status to finished for all actors for _, (_, task) in state.pending_actors.items(): task.ready = True # Update status. This should now raise RayXGBoostActorAvailable # immediately as there are no pending actors left to wait for. 
with self.assertRaises(RayXGBoostActorAvailable): _update_scheduled_actor_states(training_state=state) # All restarted actors should have been promoted to full actors self.assertTrue(actors[1]) self.assertTrue(actors[4]) self.assertTrue(actors[6]) def testFaultToleranceManager(self): ft_manager = FaultToleranceManager.remote() ft_manager.schedule_kill.remote(rank=1, boost_round=16) ft_manager.delay_return.remote(rank=1, start_boost_round=14, end_boost_round=68) delay_callback = DelayedLoadingCallback( ft_manager, reload_data=True, sleep_time=0.1 ) die_callback = DieCallback(ft_manager, training_delay=0.25) res_1 = {} train( self.params, RayDMatrix(self.x, self.y), callbacks=[die_callback], num_boost_round=100, ray_params=RayParams( num_actors=2, checkpoint_frequency=1, elastic_training=True, max_failed_actors=1, max_actor_restarts=1, distributed_callbacks=[delay_callback], ), additional_results=res_1, ) logs = ray.get(ft_manager.get_logs.remote()) print(logs) self.assertSequenceEqual([g for g, _ in logs[0][0:99]], range(99)) # Which steps exactly are executed is stochastic. The rank 1 actor # will die at iteration 16, so at least 15 will be logged (though 16 # might be logged as well). Iterations 17 to 67 should never be # logged. It comes back some time after iteration 68, so this might # be iter 68, 69, or later. We just make sure it comes back at all # (iter 70+). 
global_steps = [g for g, _ in logs[1]] self.assertTrue(global_steps) self.assertIn(15, global_steps) self.assertNotIn(17, global_steps) self.assertNotIn(67, global_steps) self.assertIn(75, global_steps) if __name__ == "__main__": import sys import pytest sys.exit(pytest.main(["-v", __file__])) ================================================ FILE: xgboost_ray/tests/test_matrix.py ================================================ import inspect import os import tempfile import unittest import numpy as np import pandas as pd import ray import xgboost as xgb try: import ray.data as ray_data except (ImportError, ModuleNotFoundError): ray_data = None from xgboost_ray import RayDMatrix from xgboost_ray.matrix import RayShardingMode, _get_sharding_indices, concat_dataframes class XGBoostRayDMatrixTest(unittest.TestCase): """This test suite validates core RayDMatrix functionality.""" def setUp(self): repeat = 8 # Repeat data a couple of times for stability self.x = np.array( [ [1, 0, 0, 0], # Feature 0 -> Label 0 [0, 1, 0, 0], # Feature 1 -> Label 1 [0, 0, 1, 1], # Feature 2+3 -> Label 2 [0, 0, 1, 0], # Feature 2+!3 -> Label 3 ] * repeat ) self.y = np.array([0, 1, 2, 3] * repeat) self.multi_y = np.array( [ [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1], [0, 0, 1, 0], ] * repeat ) @classmethod def setUpClass(cls): ray.init() @classmethod def tearDownClass(cls): ray.shutdown() def testSameObject(self): """Test that matrices are recognized as the same in an actor task.""" @ray.remote def same(one, two): return one == two data = RayDMatrix(self.x, self.y) self.assertTrue(ray.get(same.remote(data, data))) def testColumnOrdering(self): """When excluding cols, the remaining col order should be preserved.""" cols = [str(i) for i in range(50)] df = pd.DataFrame(np.random.randn(1, len(cols)), columns=cols) matrix = RayDMatrix(df, label=cols[-1], num_actors=1) data = matrix.get_data(0)["data"] assert data.columns.tolist() == cols[:-1] def _testMatrixCreation(self, in_x, in_y, 
multi_label=False, **kwargs): if "sharding" not in kwargs: kwargs["sharding"] = RayShardingMode.BATCH mat = RayDMatrix(in_x, in_y, **kwargs) def _load_data(params): x = params["data"] y = params["label"] if isinstance(x, list): x = concat_dataframes(x) if isinstance(y, list): y = concat_dataframes(y) return x, y params = mat.get_data(rank=0, num_actors=1) x, y = _load_data(params) self.assertTrue(np.allclose(self.x, x)) if multi_label: self.assertTrue(np.allclose(self.multi_y, y)) else: self.assertTrue(np.allclose(self.y, y)) # Multi actor check mat = RayDMatrix(in_x, in_y, **kwargs) params = mat.get_data(rank=0, num_actors=2) x1, y1 = _load_data(params) mat.unload_data() params = mat.get_data(rank=1, num_actors=2) x2, y2 = _load_data(params) self.assertTrue(np.allclose(self.x, concat_dataframes([x1, x2]))) if multi_label: self.assertTrue(np.allclose(self.multi_y, concat_dataframes([y1, y2]))) else: self.assertTrue(np.allclose(self.y, concat_dataframes([y1, y2]))) def testFromNumpy(self): in_x = self.x in_y = self.y self._testMatrixCreation(in_x, in_y) def testFromPandasDfDf(self): in_x = pd.DataFrame(self.x) in_y = pd.DataFrame(self.y) self._testMatrixCreation(in_x, in_y) def testFromPandasDfSeries(self): in_x = pd.DataFrame(self.x) in_y = pd.Series(self.y) self._testMatrixCreation(in_x, in_y) def testFromPandasDfString(self): in_df = pd.DataFrame(self.x) in_df["label"] = self.y self._testMatrixCreation(in_df, "label") def testFromModinDfDf(self): from xgboost_ray.data_sources.modin import MODIN_INSTALLED if not MODIN_INSTALLED: self.skipTest("Modin not installed.") return from modin.pandas import DataFrame in_x = DataFrame(self.x) in_y = DataFrame(self.y) self._testMatrixCreation(in_x, in_y, distributed=False) def testFromModinDfSeries(self): from xgboost_ray.data_sources.modin import MODIN_INSTALLED if not MODIN_INSTALLED: self.skipTest("Modin not installed.") return from modin.pandas import DataFrame, Series in_x = DataFrame(self.x) in_y = Series(self.y) 
def testFromPetastormMultiParquetString(self):
    """Create a RayDMatrix from a list of two ``file://`` parquet URIs.

    The fixture data is split in half, each half is written to its own
    parquet file, and the matrix is built from both URIs in non-distributed
    and distributed mode.

    The test is skipped when ``petastorm`` cannot be imported, matching the
    guard used by ``testFromPetastormParquetString``.
    """
    try:
        import petastorm  # noqa: F401
    except ImportError:
        self.skipTest("Petastorm not installed.")
        return

    # ``tmp_dir`` instead of ``dir`` so the builtin is not shadowed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        data_file_1 = os.path.join(tmp_dir, "data_1.parquet")
        data_file_2 = os.path.join(tmp_dir, "data_2.parquet")

        data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
        data_df["label"] = pd.Series(self.y)

        half = len(data_df) // 2
        data_df[0:half].to_parquet(data_file_1)
        data_df[half:].to_parquet(data_file_2)

        uris = [f"file://{data_file_1}", f"file://{data_file_2}"]
        for distributed in (False, True):
            # Pass a fresh list each time, as the original did.
            self._testMatrixCreation(list(uris), "label", distributed=distributed)
def testFromMultiParquetString(self):
    """Create a RayDMatrix from a list of two plain parquet file paths.

    The fixture data is split at its midpoint into two parquet files; the
    matrix is then built from both paths, first non-distributed and then
    distributed.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        frame = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
        frame["label"] = pd.Series(self.y)

        midpoint = len(frame) // 2
        # Maps each target path to the slice of rows written to it.
        shards = {
            os.path.join(tmp_dir, "data_1.parquet"): frame[0:midpoint],
            os.path.join(tmp_dir, "data_2.parquet"): frame[midpoint:],
        }
        for path, shard in shards.items():
            shard.to_parquet(path)

        for distributed in (False, True):
            self._testMatrixCreation(list(shards), "label", distributed=distributed)
def testLegacyParams(self):
    """Test if all params can be set regardless of xgb version"""
    in_x = self.x
    in_y = self.y
    num_rows = len(in_y)

    weight = np.array([1] * num_rows)
    # One query id per row. The previous ``len(in_y - 1)`` subtracted 1 from
    # every label and then took the length, yielding ``num_rows + 1`` ids.
    qid = np.array([0] + [1] * (num_rows - 1))
    base_margin = np.array([1] * num_rows)
    label_lower_bound = np.array([0.1] * num_rows)
    label_upper_bound = np.array([1] * num_rows)

    # ``weight`` and ``qid`` are exercised in separate calls.
    self._testMatrixCreation(
        in_x,
        in_y,
        weight=weight,
        base_margin=base_margin,
        label_lower_bound=label_lower_bound,
        label_upper_bound=label_upper_bound,
    )

    self._testMatrixCreation(
        in_x,
        in_y,
        qid=qid,
        base_margin=base_margin,
        label_lower_bound=label_lower_bound,
        label_upper_bound=label_upper_bound,
    )
@unittest.skipIf(
    "qid" not in inspect.signature(xgb.DMatrix).parameters,
    f"not supported in xgb version {xgb.__version__}",
)
def testQidSortedParquet(self):
    """Load two parquet files with unsorted ``group`` ids into one DMatrix.

    Each file carries the fixture features, the labels and an interleaved
    (unsorted) query-id column. The data returned by ``RayDMatrix.get_data``
    for a single actor is passed to ``xgboost.DMatrix``; the test passes if
    that construction does not raise.
    """
    from xgboost import DMatrix

    feature_names = ["a", "b", "c", "d"]
    # File name -> deliberately unsorted query ids stored in that file.
    qids_by_file = {
        "file1.parquet": np.array([2, 4] * 16),
        "file2.parquet": np.array([1, 3] * 16),
    }

    with tempfile.TemporaryDirectory() as tmp_dir:
        parquet_paths = []
        for file_name, unsorted_qid in qids_by_file.items():
            frame = pd.DataFrame(self.x, columns=feature_names)
            frame["label"] = pd.Series(self.y)
            frame["group"] = pd.Series(unsorted_qid)

            path = os.path.join(tmp_dir, file_name)
            frame.to_parquet(path)
            parquet_paths.append(path)

        mat = RayDMatrix(
            parquet_paths,
            columns=feature_names + ["label", "group"],
            label="label",
            qid="group",
        )
        loaded = mat.get_data(rank=0, num_actors=1)
        DMatrix(**loaded)
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing (and the
    result from becoming NaN) for large inputs.
    """
    x = np.asarray(x)
    if x.size:
        x = x - np.max(x)
    e = np.exp(x)
    return e / np.sum(e)


def softprob_obj(classes):
    """Build a custom multi-class objective for ``classes`` classes.

    Returns:
        A callable ``objective(labels, predt)`` that returns ``(grad, hess)``,
        each reshaped to ``(rows * classes, 1)``. ``predt`` is indexed as
        ``predt[row, class]`` and ``labels`` as ``labels[row]``.

    Raises:
        AssertionError: from the returned callable, if a label is outside
            ``[0, classes)``.
    """

    def objective(labels, predt):
        rows = labels.shape[0]
        grad = np.zeros((rows, classes), dtype=float)
        hess = np.zeros((rows, classes), dtype=float)
        eps = 1e-6  # lower bound for the hessian entries
        for r in range(predt.shape[0]):
            target = labels[r]
            # Validate once per row. The previous ``>= 0 or <= classes``
            # check was true for every value.
            assert 0 <= target < classes
            p = softmax(predt[r, :])
            for c in range(predt.shape[1]):
                g = p[c] - 1.0 if c == target else p[c]
                h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
                grad[r, c] = g
                hess[r, c] = h

        grad = grad.reshape((rows * classes, 1))
        hess = hess.reshape((rows * classes, 1))
        return grad, hess

    return objective
def test_multiclass_classification(self):
    """Fit RayXGBClassifier on iris and check error rate and output shapes.

    For each of two shuffled folds the classifier is fitted and its
    predictions (default, ``output_margin=True``, ``output_margin=False``)
    must have an error rate below 0.4. Afterwards ``predict_proba`` shapes
    are checked for the default objective and for ``softprob_obj(3)``.
    """
    self._init_ray()
    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        # With ``output_margin`` each prediction is a per-class vector, so
        # the predicted class is its argmax; otherwise it is the class itself.
        if output_margin:
            wrong = sum(
                1 for i, pred in enumerate(preds) if pred.argmax() != labels[i]
            )
        else:
            wrong = sum(1 for i, pred in enumerate(preds) if pred != labels[i])
        err = wrong / float(len(preds))
        assert err < 0.4

    def check_proba_shape(model):
        proba = model.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == model.n_classes_

    iris = load_iris()
    X, y = iris["data"], iris["target"]

    folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
    for train_index, test_index in folds.split(X, y):
        xgb_model = RayXGBClassifier().fit(X[train_index], y[train_index])

        booster = xgb_model.get_booster()
        if hasattr(booster, "num_boosted_rounds"):
            assert (
                booster.num_boosted_rounds() == xgb_model.get_num_boosting_rounds()
            )

        held_out = X[test_index]
        preds = xgb_model.predict(held_out)
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(held_out, output_margin=True)
        preds3 = xgb_model.predict(held_out, output_margin=False)

        labels = y[test_index]
        for candidate, is_margin in ((preds, False), (preds2, True), (preds3, False)):
            check_pred(candidate, labels, output_margin=is_margin)

    cls = RayXGBClassifier(n_estimators=4).fit(X, y)
    assert cls.n_classes_ == 3
    check_proba_shape(cls)

    # custom objective, the default is multi:softprob
    # so no transformation is required.
    cls = RayXGBClassifier(n_estimators=4, objective=softprob_obj(3)).fit(X, y)
    check_proba_shape(cls)
def test_california_housing_regression(self):
    """Fit RayXGBRegressor on California housing with 2-fold CV.

    For every fold the mean squared error of the default predictions must
    be below 25; predictions made with ``output_margin=True`` and
    ``output_margin=False`` must each be below 350.
    """
    self._init_ray()
    from sklearn.datasets import fetch_california_housing
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    dataset = fetch_california_housing()
    X, y = dataset["data"], dataset["target"]

    folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
    for train_index, test_index in folds.split(X, y):
        xgb_model = RayXGBRegressor().fit(X[train_index], y[train_index])

        held_out = X[test_index]
        preds = xgb_model.predict(held_out)
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(held_out, output_margin=True)
        preds3 = xgb_model.predict(held_out, output_margin=False)

        labels = y[test_index]
        for candidate, limit in ((preds, 25), (preds2, 350), (preds3, 350)):
            assert mean_squared_error(candidate, labels) < limit
def test_parameter_tuning(self):
    """Grid-search ``max_depth`` / ``n_estimators`` for RayXGBRegressor.

    Runs a 3-fold ``GridSearchCV`` on California housing and asserts both
    the upper bound on the best score and the exact best parameter set.
    """
    self._init_ray()
    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import GridSearchCV

    dataset = fetch_california_housing()
    X, y = dataset["data"], dataset["target"]

    param_grid = {"max_depth": [2, 4, 6], "n_estimators": [50, 100, 200]}
    search = GridSearchCV(
        RayXGBRegressor(learning_rate=0.1),
        param_grid,
        cv=3,
        verbose=1,
    )
    search.fit(X, y)

    assert search.best_score_ < 0.7
    assert search.best_params_ == {"n_estimators": 200, "max_depth": 4}
def test_classification_with_custom_objective(self):
    """Train RayXGBClassifier with a user-supplied objective function.

    First, a sigmoid-based objective is used on the two-class digits data
    and the per-fold error rate must stay below 0.1. Second, an objective
    that always raises is used to show the custom function is really
    called: ``fit`` is expected to fail with ``RuntimeError``.
    """
    self._init_ray()
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    def logregobj(y_true, y_pred):
        # Squash the raw score through a sigmoid, then return
        # (gradient, hessian) as ``p - y`` and ``p * (1 - p)``.
        prob = 1.0 / (1.0 + np.exp(-y_pred))
        return prob - y_true, prob * (1.0 - prob)

    digits = load_digits(n_class=2)
    X, y = digits["data"], digits["target"]

    folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
    for train_index, test_index in folds.split(X, y):
        xgb_model = RayXGBClassifier(objective=logregobj)
        xgb_model.fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        mistakes = sum(
            1 for i, pred in enumerate(preds) if int(pred > 0.5) != labels[i]
        )
        err = mistakes / float(len(preds))
        assert err < 0.1

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_preds):
        raise XGBCustomObjectiveException()

    failing_model = RayXGBClassifier(objective=dummy_objective)
    # TODO figure out how to assertRaises XGBCustomObjectiveException
    with self.assertRaises(RuntimeError):
        failing_model.fit(X, y)
def test_sklearn_api_gblinear(self):
    """Fit RayXGBClassifier with the ``gblinear`` booster on iris.

    120 samples are used for training; the error rate on the remaining
    samples must be below 0.5.
    """
    self._init_ray()
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    train_x, test_x, train_y, test_y = train_test_split(
        iris.data, iris.target, train_size=120
    )

    classifier = RayXGBClassifier(
        booster="gblinear", n_estimators=100, random_state=self.seed
    )
    classifier.fit(train_x, train_y)

    preds = classifier.predict(test_x)
    mismatches = sum(1 for pred, label in zip(preds, test_y) if pred != label)
    err = mismatches * 1.0 / len(test_y)
    assert err < 0.5
def test_kwargs_error(self):
    """Passing ``n_jobs`` twice to the constructor must raise ``TypeError``.

    ``n_jobs`` is given both as an explicit keyword and inside ``**params``,
    so Python rejects the call itself.
    """
    self._init_ray()
    params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
    with self.assertRaises(TypeError):
        # The call raises here, so nothing placed after it in this block
        # would run; the former ``isinstance`` assertion was dead code.
        RayXGBClassifier(n_jobs=1000, **params)
@unittest.skipIf(
    XGBOOST_VERSION < Version("1.0.0"),
    f"not supported in xgb version {xgb.__version__}",
)
def test_sklearn_get_default_params(self):
    """Check the ``base_score`` default before and after fitting.

    Before ``fit`` the estimator reports ``base_score`` as ``None``; after
    fitting on four samples the booster configuration must report 0.5.
    """
    self._init_ray()
    from sklearn.datasets import load_digits

    digits_2class = load_digits(n_class=2)
    features = digits_2class["data"]
    targets = digits_2class["target"]

    cls = RayXGBClassifier()
    assert cls.get_params()["base_score"] is None

    # Four rows are enough; only the resulting config is inspected.
    cls.fit(features[:4, ...], targets[:4, ...])
    np.testing.assert_equal(get_basescore(cls), 0.5)
def test_validation_weights_xgbclassifier(self):
    """Check that ``sample_weight_eval_set`` changes the validation logloss.

    The classifier is fitted twice on the same training data and training
    weights: once without and once with weights for the evaluation set. The
    recorded ``validation_0`` logloss must differ in both boosting rounds.
    """
    self._init_ray()
    from sklearn.datasets import make_hastie_10_2

    # prepare training and test data
    split = 1600
    features, targets = make_hastie_10_2(n_samples=2000, random_state=42)
    _, targets = np.unique(targets, return_inverse=True)
    X_train, X_test = features[:split], features[split:]
    y_train, y_test = targets[:split], targets[split:]

    # instantiate model
    clf = RayXGBClassifier(
        objective="binary:logistic", n_estimators=2, random_state=123
    )

    # instance weights for the training set, shared by both fits
    weights_train = np.random.choice([1, 2], len(X_train))

    def validation_logloss(**fit_kwargs):
        # Fit on the training data and return the logloss history that was
        # recorded for the single evaluation set.
        clf.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            eval_metric="logloss",
            verbose=False,
            **fit_kwargs,
        )
        return clf.evals_result()["validation_0"]["logloss"]

    # evaluate logloss metric on test set *without* using weights
    logloss_without_weights = validation_logloss()

    # now use weights for the test set
    np.random.seed(0)
    weights_test = np.random.choice([1, 2], len(X_test))
    logloss_with_weights = validation_logloss(sample_weight_eval_set=[weights_test])

    # check that the logloss in the test set is actually different
    # when using weights than when not using them
    assert all(
        logloss_with_weights[i] != logloss_without_weights[i] for i in (0, 1)
    )
y = digits["target"] X = digits["data"] booster = xgb.train( {"tree_method": "hist", "objective": "binary:logistic"}, dtrain=xgb.DMatrix(X, y), num_boost_round=4, ) predt_0 = booster.predict(xgb.DMatrix(X)) booster.save_model(model_path) cls = RayXGBClassifier() cls.load_model(model_path) proba = cls.predict_proba(X) assert proba.shape[0] == X.shape[0] assert proba.shape[1] == 2 # binary predt_1 = cls.predict_proba(X)[:, 1] assert np.allclose(predt_0, predt_1) cls = xgb.XGBModel() cls.load_model(model_path) predt_1 = cls.predict(X) assert np.allclose(predt_0, predt_1) # # forcing it to be last as it's the longest test by far # def test_zzzzzzz_RFECV(self): # self._init_ray() # from sklearn.datasets import fetch_california_housing # from sklearn.datasets import load_breast_cancer # from sklearn.datasets import load_iris # from sklearn.feature_selection import RFECV # # Regression # X, y = fetch_california_housing(return_X_y=True) # bst = RayXGBRegressor( # booster="gblinear", # learning_rate=0.1, # n_estimators=10, # objective="reg:squarederror", # random_state=0, # verbosity=0, # ) # rfecv = RFECV( # estimator=bst, step=1, cv=3, scoring="neg_mean_squared_error") # rfecv.fit(X, y) # # Binary classification # X, y = load_breast_cancer(return_X_y=True) # bst = RayXGBClassifier( # booster="gblinear", # learning_rate=0.1, # n_estimators=10, # objective="binary:logistic", # random_state=0, # verbosity=0, # use_label_encoder=False, # ) # rfecv = RFECV(estimator=bst, step=1, cv=3, scoring="roc_auc") # rfecv.fit(X, y) # # Multi-class classification # X, y = load_iris(return_X_y=True) # bst = RayXGBClassifier( # base_score=0.4, # booster="gblinear", # learning_rate=0.1, # n_estimators=10, # objective="multi:softprob", # random_state=0, # reg_alpha=0.001, # reg_lambda=0.01, # scale_pos_weight=0.5, # verbosity=0, # use_label_encoder=False, # ) # rfecv = RFECV(estimator=bst, step=1, cv=3, scoring="neg_log_loss") # rfecv.fit(X, y) # X[0:4, :] = np.nan # verify scikit_learn 
@unittest.skipIf(
    XGBOOST_VERSION < Version("1.0.0"),
    f"not supported in xgb version {xgb.__version__}",
)
def test_constraint_parameters(self):
    """Check that ``interaction_constraints`` ends up in the booster config.

    The location of the value inside the saved config differs between
    XGBoost versions, so the lookup path is chosen from ``XGBOOST_VERSION``.
    """
    self._init_ray()

    constraints = "[[0, 1], [2, 3, 4]]"
    regressor = RayXGBRegressor(interaction_constraints=constraints)
    features = np.random.randn(10, 10)
    targets = np.random.randn(10)
    regressor.fit(features, targets)

    booster_config = json.loads(regressor.get_booster().save_config())

    # Keys to follow below config["learner"]["gradient_booster"].
    if XGBOOST_VERSION >= Version("2.0.0"):
        key_path = ("tree_train_param",)
    elif XGBOOST_VERSION >= Version("1.6.0"):
        key_path = ("updater", "grow_histmaker", "train_param")
    else:
        key_path = ("updater", "prune", "train_param")

    section = booster_config["learner"]["gradient_booster"]
    for key in key_path:
        section = section[key]

    assert section["interaction_constraints"] == constraints
def test_pandas_input(self):
    """Fit on a pandas DataFrame and calibrate the prefit classifier.

    Builds a random binary frame whose ``status`` column is the label, fits
    a ``RayXGBClassifier`` on the remaining columns, wraps it in a prefit
    isotonic ``CalibratedClassifierCV`` and checks the wrapped estimator
    type and the exposed classes.
    """
    self._init_ray()
    import pandas as pd
    from sklearn.calibration import CalibratedClassifierCV

    n_rows, n_cols = 100, 6
    random_state = np.random.RandomState(self.seed)

    # Binary matrix: first column becomes the label, the others features.
    values = random_state.randint(low=0, high=2, size=n_rows * n_cols)
    values = values.reshape(n_rows, n_cols)

    frame = pd.DataFrame(values)
    frame.columns = ["status"] + ["k" + str(idx) for idx in range(1, n_cols)]

    labels = frame["status"]
    features = frame.drop(columns=["status"])

    classifier = RayXGBClassifier()
    classifier.fit(features, labels)

    calibrator = CalibratedClassifierCV(classifier, cv="prefit", method="isotonic")
    calibrator.fit(features, labels)

    calibrated = calibrator.calibrated_classifiers_[0]
    try:
        wrapped = calibrated.base_estimator
    except AttributeError:
        # sklearn>=1.2
        wrapped = calibrated.estimator
    assert isinstance(
        wrapped,
        RayXGBClassifier,
    )

    expected_classes = np.array([0, 1])
    self.assertTrue(np.allclose(np.array(calibrator.classes_), expected_classes))
def run_boost_from_prediction(self, tree_method):
    """Check that boosting from a base margin matches training in one go.

    Trains 4 rounds, feeds the resulting margin as ``base_margin`` into a
    second 4-round model, and asserts its predictions equal those of a
    single model trained for 8 rounds with the same settings.

    Args:
        tree_method: Value forwarded as ``tree_method`` to every classifier.
    """
    from sklearn.datasets import load_breast_cancer

    def make_classifier(rounds):
        # All three models share every hyperparameter except the round count.
        return RayXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=rounds,
            tree_method=tree_method,
        )

    X, y = load_breast_cancer(return_X_y=True)

    first_stage = make_classifier(4)
    first_stage.fit(X=X, y=y)
    margin = first_stage.predict(X, output_margin=True)

    second_stage = make_classifier(4)
    second_stage.fit(X=X, y=y, base_margin=margin)
    staged_predictions = second_stage.predict(X, base_margin=margin)

    single_run = make_classifier(8)
    single_run.fit(X=X, y=y)
    direct_predictions = single_run.predict(X)

    assert np.all(staged_predictions == direct_predictions)
@unittest.skipIf(
    XGBOOST_VERSION < Version("1.4.0"),
    f"not supported in xgb version {xgb.__version__}",
)
def test_estimator_type(self):
    """Check ``_estimator_type`` tags and type enforcement on model loading.

    A model saved by a classifier must raise ``TypeError`` when loaded into
    a regressor, and load without error into a fresh classifier.
    """
    self._init_ray()

    expected_kinds = (
        (RayXGBClassifier, "classifier"),
        (RayXGBRFClassifier, "classifier"),
        (RayXGBRegressor, "regressor"),
        (RayXGBRFRegressor, "regressor"),
        (RayXGBRanker, "ranker"),
    )
    for estimator_cls, kind in expected_kinds:
        assert estimator_cls._estimator_type == kind

    from sklearn.datasets import load_digits

    data, target = load_digits(n_class=2, return_X_y=True)
    fitted = RayXGBClassifier(n_estimators=2).fit(data, target)

    with tempfile.TemporaryDirectory() as workdir:
        saved_path = os.path.join(workdir, "cls.json")
        fitted.save_model(saved_path)

        # Loading a classifier file into a regressor must be rejected.
        regressor = RayXGBRegressor()
        with self.assertRaises(TypeError):
            regressor.load_model(saved_path)

        fresh_classifier = RayXGBClassifier()
        fresh_classifier.load_model(saved_path)  # no error
class XGBoostRaySklearnMatrixTest(unittest.TestCase):
    """Tests for passing ``RayDMatrix`` objects to the sklearn-style estimators."""

    def setUp(self):
        """Create the seeded RNG and the estimator parameters shared by the tests."""
        self.seed = 1994
        self.rng = np.random.RandomState(self.seed)
        self.params = {"n_estimators": 10}

    def tearDown(self) -> None:
        """Shut Ray down if a test initialized it."""
        if ray.is_initialized():
            ray.shutdown()

    def _init_ray(self):
        """Start Ray with 4 CPUs unless it is already running."""
        if not ray.is_initialized():
            ray.init(num_cpus=4)

    @unittest.skipIf(
        has_label_encoder, f"not supported in xgb version {xgb.__version__}"
    )
    def testClassifierNoLabelEncoder(self, n_class=2):
        """Fit and predict a classifier directly on ``RayDMatrix`` inputs.

        Args:
            n_class: Number of digit classes to load and to pass as
                ``num_class`` to the classifier.
        """
        self._init_ray()

        from sklearn.datasets import load_digits

        digits = load_digits(n_class=n_class)
        y = digits["target"]
        X = digits["data"]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

        train_matrix = RayDMatrix(X_train, y_train)
        test_matrix = RayDMatrix(X_test, y_test)

        # Fitting on a RayDMatrix without num_class is expected to fail with
        # a message mentioning "num_class".
        with self.assertRaisesRegex(Exception, "num_class"):
            RayXGBClassifier(**self.params).fit(train_matrix, None)

        # Matrix training data combined with an array eval_set is rejected.
        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
            RayXGBClassifier(**self.params).fit(
                train_matrix, None, eval_set=[(X_test, y_test)]
            )

        # Array training data combined with a matrix eval_set is rejected.
        with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"):
            RayXGBClassifier(**self.params).fit(
                X_train, y_train, eval_set=[(test_matrix, "eval")]
            )

        # With num_class given, fitting on matrices works with and without
        # an eval_set.
        RayXGBClassifier(num_class=n_class, **self.params).fit(train_matrix, None)

        clf = RayXGBClassifier(num_class=n_class, **self.params).fit(
            train_matrix, None, eval_set=[(test_matrix, "eval")]
        )

        clf.predict(test_matrix)
        clf.predict_proba(test_matrix)

    @unittest.skipIf(
        has_label_encoder, f"not supported in xgb version {xgb.__version__}"
    )
    def testClassifierMulticlassNoLabelEncoder(self):
        """Run the classifier matrix test with three classes."""
        self.testClassifierNoLabelEncoder(n_class=3)

    def testRegressor(self):
        """Fit and predict a regressor directly on ``RayDMatrix`` inputs."""
        self._init_ray()

        from sklearn.datasets import fetch_california_housing

        ds = fetch_california_housing()
        y = ds["target"]
        X = ds["data"]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

        train_matrix = RayDMatrix(X_train, y_train)
        test_matrix = RayDMatrix(X_test, y_test)

        # Mixing matrix and array inputs between training data and eval_set
        # is rejected in both directions.
        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
            RayXGBRegressor(**self.params).fit(
                train_matrix, None, eval_set=[(X_test, y_test)]
            )

        with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"):
            RayXGBRegressor(**self.params).fit(
                X_train, y_train, eval_set=[(test_matrix, "eval")]
            )

        RayXGBRegressor(**self.params).fit(train_matrix, None)

        reg = RayXGBRegressor(**self.params).fit(
            train_matrix, None, eval_set=[(test_matrix, "eval")]
        )

        reg.predict(test_matrix)
class XGBoostRayTuneTest(unittest.TestCase):
    """Tests for running ``xgboost_ray.train`` inside Ray Tune trials."""

    def setUp(self):
        """Start Ray and build the dataset, search space and trainable factory."""
        ray.init(num_cpus=4)
        repeat = 8  # Repeat data a couple of times for stability
        x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 2
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 3
            ]
            * repeat
        )
        y = np.array([0, 1, 2, 3] * repeat)

        self.params = {
            "xgb": {
                "booster": "gbtree",
                "nthread": 1,
                "max_depth": 2,
                "objective": "multi:softmax",
                "num_class": 4,
                "eval_metric": ["mlogloss", "merror"],
            },
            "num_boost_round": tune.choice([1, 3]),
        }

        def train_func(
            ray_params, callbacks=None, check_for_spread_strategy=False, **kwargs
        ):
            """Return a Tune trainable that trains on the fixture data."""

            def _inner_train(config):
                if check_for_spread_strategy:
                    assert (
                        ray.train.get_context().get_trial_resources().strategy
                        == "SPREAD"
                    )
                train_set = RayDMatrix(x, y)
                train(
                    config["xgb"],
                    dtrain=train_set,
                    ray_params=ray_params,
                    num_boost_round=config["num_boost_round"],
                    evals=[(train_set, "train")],
                    callbacks=callbacks,
                    **kwargs
                )

            return _inner_train

        self.train_func = train_func
        self.experiment_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Stop Ray and remove the temporary experiment directory."""
        ray.shutdown()
        shutil.rmtree(self.experiment_dir)

    # noinspection PyTypeChecker
    @patch.dict(os.environ, {"TUNE_RESULT_DELIM": "/"})
    def testNumIters(self):
        """Test that the number of reported tune results is correct"""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        # Shallow copy: only the top-level "num_boost_round" entry is replaced,
        # so the shared fixture in self.params is left untouched.
        params = self.params.copy()
        params["num_boost_round"] = tune.grid_search([1, 3])

        # TODO(justinvyu): Remove this once the xgboost integration
        # has been updated on the Ray side.
        try:
            callback = TuneReportCheckpointCallback(
                frequency=1, checkpoint_at_end=False
            )
        except TypeError:
            callback = TuneReportCheckpointCallback(frequency=1)

        analysis = tune.run(
            self.train_func(ray_params, callbacks=[callback]),
            # Use the grid-search config built above. Passing self.params here
            # would silently ignore the grid and sample a single round count.
            config=params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
        )

        self.assertSequenceEqual(
            list(analysis.results_df["training_iteration"]),
            list(analysis.results_df["config/num_boost_round"]),
        )

    def testNumItersClient(self):
        """Test ray client mode"""
        # NOTE(review): this is a lexicographic string comparison, not a
        # version comparison (e.g. "1.10.0" <= "1.2.0" is True). Left as-is
        # because this file does not import a version parser.
        if ray.__version__ <= "1.2.0":
            self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")

        from ray.util.client.ray_client_helpers import ray_start_client_server

        self.assertFalse(ray.util.client.ray.is_connected())
        with ray_start_client_server():
            self.assertTrue(ray.util.client.ray.is_connected())
            self.testNumIters()

    def testPlacementOptions(self):
        """Check that a SPREAD placement strategy reaches the trial resources."""
        ray_params = RayParams(
            cpus_per_actor=1, num_actors=1, placement_options={"strategy": "SPREAD"}
        )
        tune.run(
            self.train_func(ray_params, check_for_spread_strategy=True),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
        )

    def testElasticFails(self):
        """Test if error is thrown when using Tune with elastic training."""
        ray_params = RayParams(cpus_per_actor=1, num_actors=1, elastic_training=True)
        with self.assertRaises(TuneError):
            tune.run(
                self.train_func(ray_params),
                config=self.params,
                resources_per_trial=ray_params.get_tune_resources(),
                num_samples=1,
            )

    def testReplaceTuneCheckpoints(self):
        """Test if ray.tune.integration.xgboost callbacks are replaced"""
        # Report and checkpointing callback
        in_cp = [OrigTuneReportCheckpointCallback(metrics="met")]
        in_dict = {"callbacks": in_cp}

        # Patch the train context so _try_add_tune_callback sees a non-None
        # value for the trial resources.
        with patch("ray.train.get_context") as mocked:
            mocked.return_value = MagicMock(return_value=True)
            _try_add_tune_callback(in_dict)

        replaced = in_dict["callbacks"][0]
        self.assertTrue(isinstance(replaced, TuneReportCheckpointCallback))
        self.assertSequenceEqual(replaced._metrics, ["met"])

    def testEndToEndCheckpointing(self):
        """Run Tune with the xgboost_ray callback and expect a best checkpoint."""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        analysis = tune.run(
            self.train_func(
                ray_params, callbacks=[TuneReportCheckpointCallback(frequency=1)]
            ),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
            metric="train-mlogloss",
            mode="min",
            log_to_file=True,
            local_dir=self.experiment_dir,
        )

        self.assertTrue(os.path.exists(analysis.best_checkpoint.path))

    def testEndToEndCheckpointingOrigTune(self):
        """Run Tune with the original Ray callback and expect a best checkpoint."""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        analysis = tune.run(
            self.train_func(
                ray_params, callbacks=[OrigTuneReportCheckpointCallback(frequency=1)]
            ),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
            metric="train-mlogloss",
            mode="min",
            local_dir=self.experiment_dir,
        )

        self.assertTrue(os.path.exists(analysis.best_checkpoint.path))
class XGBoostAPITest(unittest.TestCase):
    """This test suite validates core XGBoost API functionality."""

    def setUp(self):
        """Build the small binary dataset and the training parameters."""
        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 0
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 1
            ]
            * repeat
        )
        self.y = np.array([0, 1, 0, 1] * repeat)

        self.params = {
            "booster": "gbtree",
            "tree_method": "hist",
            "nthread": 1,
            "max_depth": 2,
            "objective": "binary:logistic",
            "seed": 1000,
        }

        # Extra keyword arguments forwarded to every xgboost_ray train() call.
        self.kwargs = {}

    def tearDown(self) -> None:
        """Shut Ray down if a test initialized it."""
        if ray.is_initialized():
            ray.shutdown()

    def _init_ray(self):
        """Start Ray with 4 CPUs unless it is already running."""
        if not ray.is_initialized():
            ray.init(num_cpus=4)

    def testCustomObjectiveFunction(self):
        """Ensure that custom objective functions work.

        Runs a custom objective function with pure XGBoost and
        XGBoost on Ray and compares the prediction outputs."""
        self._init_ray()

        params = self.params.copy()
        # The built-in objective is dropped because `obj=squared_log` is
        # passed to both train calls below.
        params.pop("objective", None)

        bst_xgb = xgb.train(params, xgb.DMatrix(self.x, self.y), obj=squared_log)

        bst_ray = train(
            params,
            RayDMatrix(self.x, self.y),
            ray_params=RayParams(num_actors=2),
            obj=squared_log,
            **self.kwargs,
        )

        x_mat = xgb.DMatrix(self.x)
        pred_y_xgb = np.round(bst_xgb.predict(x_mat))
        pred_y_ray = np.round(bst_ray.predict(x_mat))

        self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray))
        self.assertSequenceEqual(list(self.y), list(pred_y_ray))

    def testCustomMetricFunction(self):
        """Ensure that custom metric functions work.

        Runs a custom objective and a custom evaluation metric with pure
        XGBoost and XGBoost on Ray and compares the prediction outputs
        and the reported metric values."""
        self._init_ray()

        params = self.params.copy()
        params.pop("objective", None)
        params["disable_default_eval_metric"] = 1

        dtrain_xgb = xgb.DMatrix(self.x, self.y)
        evals_result_xgb = {}
        bst_xgb = xgb.train(
            params,
            dtrain_xgb,
            obj=squared_log,
            feval=rmsle,
            evals=[(dtrain_xgb, "dtrain")],
            evals_result=evals_result_xgb,
        )

        dtrain_ray = RayDMatrix(self.x, self.y)
        evals_result_ray = {}
        bst_ray = train(
            params,
            dtrain_ray,
            ray_params=RayParams(num_actors=2),
            obj=squared_log,
            feval=rmsle,
            evals=[(dtrain_ray, "dtrain")],
            evals_result=evals_result_ray,
            **self.kwargs,
        )

        x_mat = xgb.DMatrix(self.x)
        pred_y_xgb = np.round(bst_xgb.predict(x_mat))
        pred_y_ray = np.round(bst_ray.predict(x_mat))

        self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray))
        self.assertSequenceEqual(list(self.y), list(pred_y_ray))

        # "PyRMSLE" is the metric name returned by the rmsle feval; the two
        # histories only need to agree within an absolute tolerance of 0.1.
        self.assertTrue(
            np.allclose(
                evals_result_xgb["dtrain"]["PyRMSLE"],
                evals_result_ray["dtrain"]["PyRMSLE"],
                atol=0.1,
            )
        )

    def testCallbacks(self):
        """Check that callback return values are collected per actor rank."""

        class _Callback(TrainingCallback):
            def after_iteration(self, model, epoch, evals_log):
                print(f"My rank: {get_actor_rank()}")
                put_queue(("rank", get_actor_rank()))

        callback = _Callback()

        additional_results = {}
        train(
            self.params,
            RayDMatrix(self.x, self.y),
            ray_params=RayParams(num_actors=2),
            callbacks=[callback],
            additional_results=additional_results,
            **self.kwargs,
        )

        # One list of queued items per actor; every item in list i must carry
        # rank i.
        self.assertEqual(len(additional_results["callback_returns"]), 2)
        self.assertTrue(
            all(rank == 0 for (_, rank) in additional_results["callback_returns"][0])
        )
        self.assertTrue(
            all(rank == 1 for (_, rank) in additional_results["callback_returns"][1])
        )
def flatten_obj(obj: Union[List, Dict], keys=None, base=None):
    """Flatten nested lists/dicts into a single dict keyed by "/"-joined paths.

    List positions and dict keys are converted with ``str`` and joined with
    "/" to form the key of each leaf value.

    Args:
        obj: Nested structure of lists and dicts (any other value is a leaf).
        keys: Path components leading to ``obj``; defaults to an empty path.
        base: Dict that receives the flattened entries. When given, the same
            object is filled and returned, even if it is empty.

    Returns:
        The dict holding the flattened entries.
    """
    path = [] if not keys else keys
    # Identity check rather than truthiness: an empty dict supplied by the
    # caller must be filled in place, not replaced.
    flat = {} if base is None else base

    if isinstance(obj, list):
        children = enumerate(obj)
    elif isinstance(obj, dict):
        children = obj.items()
    else:
        # Leaf value: store it under the accumulated path.
        flat["/".join(path)] = obj
        return flat

    for label, child in children:
        flatten_obj(child, path + [str(label)], flat)
    return flat
def _checkpoint_callback(frequency: int = 1, before_iteration_=False):
    """Build a callback that puts raw model checkpoints on the queue.

    Args:
        frequency: Checkpoint interval in epochs; a checkpoint is queued
            whenever ``epoch % frequency == 0``.
        before_iteration_: When True, the same checkpoint logic is also
            installed as the ``before_iteration`` hook.

    Returns:
        An instance of the generated ``TrainingCallback`` subclass.
    """

    class _CheckpointCallback(TrainingCallback):
        def after_iteration(self, model, epoch, evals_log):
            # Skip epochs that do not fall on the checkpoint interval.
            if epoch % frequency != 0:
                return
            put_queue(model.save_raw())

    if not before_iteration_:
        return _CheckpointCallback()

    def _checkpoint_before(self, model, epoch, evals_log):
        # Reuse the after-iteration logic at the start of the iteration.
        self.after_iteration(model, epoch, evals_log)

    _CheckpointCallback.before_iteration = _checkpoint_before
    return _CheckpointCallback()
class TuneReportCheckpointCallback(OrigTuneReportCheckpointCallback):
    """Tune report/checkpoint callback that defers the parent hooks via the queue.

    Instead of invoking the parent ``after_iteration`` / ``after_training``
    directly, each hook wraps the parent call in a zero-argument lambda and
    passes it to ``put_queue``. Only the worker whose ``get_rabit_rank()``
    is 0 enqueues anything.
    """

    def after_iteration(self, model, epoch: int, evals_log: Dict):
        """Record ``evals_log`` and, on rank 0, enqueue the parent hook call."""
        # NOTE: We need to update `evals_log` here (even though the super method
        # already does it) because the actual callback method gets run
        # in a different process, so *this* instance of the callback will not have
        # access to the `evals_log` dict in `after_training`.
        self._evals_log = evals_log

        if get_rabit_rank() == 0:
            # The explicit two-argument super() is used because the call is
            # made from inside a lambda rather than directly in the method.
            put_queue(
                lambda: super(TuneReportCheckpointCallback, self).after_iteration(
                    model=model, epoch=epoch, evals_log=evals_log
                )
            )

    def after_training(self, model):
        """On rank 0, enqueue the parent hook call; always return ``model``."""
        if get_rabit_rank() == 0:
            put_queue(
                lambda: super(TuneReportCheckpointCallback, self).after_training(
                    model=model
                )
            )
        return model
def _get_tune_resources(
    num_actors: int,
    cpus_per_actor: int,
    gpus_per_actor: int,
    resources_per_actor: Optional[Dict],
    placement_options: Optional[Dict],
):
    """Returns object to use for ``resources_per_trial`` with Ray Tune.

    Args:
        num_actors: Number of training actors; one resource bundle is
            created per actor.
        cpus_per_actor: Value for the ``"CPU"`` entry of each actor bundle.
        gpus_per_actor: Value for the ``"GPU"`` entry of each actor bundle.
        resources_per_actor: Optional extra resources merged into each actor
            bundle (overriding ``"CPU"``/``"GPU"`` on key collision).
        placement_options: Optional keyword arguments forwarded to
            ``PlacementGroupFactory``. ``"strategy"`` defaults to ``"PACK"``
            when not given. The passed dict is not modified.

    Returns:
        A ``PlacementGroupFactory`` with an empty head bundle followed by
        ``num_actors`` actor bundles.
    """
    from ray.tune import PlacementGroupFactory

    head_bundle = {}
    child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor}
    child_bundle_extra = {} if resources_per_actor is None else resources_per_actor
    child_bundles = [{**child_bundle, **child_bundle_extra} for _ in range(num_actors)]
    bundles = [head_bundle] + child_bundles

    # Build a fresh dict so the default strategy is never written back into
    # the caller's placement_options.
    options = {"strategy": "PACK", **(placement_options or {})}
    placement_group_factory = PlacementGroupFactory(bundles, **options)

    return placement_group_factory
@DeveloperAPI
class Unavailable:
    """No object should be instance of this class"""

    def __init__(self):
        # Instantiation is always an error.
        raise RuntimeError("This class should never be instantiated.")


class _EventActor:
    """Thin wrapper around an ``asyncio.Event``.

    ``Event`` turns this class into a Ray actor via ``ray.remote``.
    """

    def __init__(self):
        self._event = asyncio.Event()

    def set(self):
        """Set the wrapped event."""
        self._event.set()

    def clear(self):
        """Clear the wrapped event."""
        self._event.clear()

    def is_set(self):
        """Return whether the wrapped event is currently set."""
        return self._event.is_set()


@DeveloperAPI
class Event:
    """Event flag backed by a remote ``_EventActor``.

    Args:
        actor_options: Optional keyword arguments passed to ``.options()``
            when the backing actor is created.
    """

    def __init__(self, actor_options: Optional[Dict] = None):
        options = actor_options or {}
        self.actor = ray.remote(_EventActor).options(**options).remote()

    def set(self):
        """Submit a ``set`` call to the actor without waiting for it."""
        self.actor.set.remote()

    def clear(self):
        """Submit a ``clear`` call to the actor without waiting for it."""
        self.actor.clear.remote()

    def is_set(self):
        """Ask the actor for the flag state and block until it answers."""
        return ray.get(self.actor.is_set.remote())

    def shutdown(self):
        """Kill the backing actor (if any) and drop the handle."""
        if self.actor:
            ray.kill(self.actor)
        self.actor = None


@DeveloperAPI
class MultiActorTask:
    """Utility class to hold multiple futures.

    The `is_ready()` method will return True once all futures are ready.

    Args:
        pending_futures: List of object references (futures) that should
            be tracked. The list is copied, so the caller's list is never
            modified by this class.
    """

    def __init__(self, pending_futures: Optional[List[ray.ObjectRef]] = None):
        # Copy: ``is_ready()`` shrinks this list, which must not leak into
        # the list object owned by the caller.
        self._pending_futures = list(pending_futures or [])
        self._ready_futures = []

    def is_ready(self):
        """Move finished futures to the ready list without blocking.

        Returns:
            ``True`` if no tracked future is pending anymore, else ``False``.
        """
        # Poll with ``timeout=0`` until a call reports nothing new or
        # nothing is left to wait for.
        while self._pending_futures:
            ready, not_ready = ray.wait(self._pending_futures, timeout=0)
            if not ready:
                break
            self._ready_futures.extend(ready)
            # ``ray.wait`` already returns the remaining references.
            self._pending_futures = not_ready

        return not self._pending_futures


@DeveloperAPI
def get_current_node_resource_key() -> str:
    """Get the Ray resource key for current node.

    It can be used for actor placement.

    If using Ray Client, this will return the resource key for the node that
    is running the client server.

    Returns:
        The first key starting with ``"node:"`` in the ``"Resources"`` of a
        ``ray.nodes()`` entry whose ``"NodeID"`` equals the current node id.

    Raises:
        ValueError: If no such key is found.
    """
    current_node_id = ray.get_runtime_context().get_node_id()
    for node in ray.nodes():
        if node["NodeID"] != current_node_id:
            continue
        # Found the node.
        for key in node["Resources"]:
            if key.startswith("node:"):
                return key
    raise ValueError("Cannot found the node dictionary for current node.")


@DeveloperAPI
def force_on_current_node(task_or_actor):
    """Given a task or actor, place it on the current node.

    If the task or actor that is passed in already has custom resource
    requirements, then they will be overridden.

    If using Ray Client, the current node is the client server node.

    Args:
        task_or_actor: Object exposing ``.options(**kwargs)``.

    Returns:
        The result of ``task_or_actor.options(...)`` requesting ``0.01`` of
        the current node's resource key.
    """
    node_resource_key = get_current_node_resource_key()
    return task_or_actor.options(resources={node_resource_key: 0.01})