Repository: ray-project/xgboost_ray
Branch: master
Commit: e9049256575e
Files: 85
Total size: 505.3 KB

Directory structure:
gitextract_x7_6mxkx/

├── .flake8
├── .github/
│   └── workflows/
│       ├── gpu.yaml
│       └── test.yaml
├── .gitignore
├── LICENSE
├── README.md
├── format.sh
├── requirements/
│   ├── lint-requirements.txt
│   └── test-requirements.txt
├── run_ci_examples.sh
├── run_ci_tests.sh
├── setup.py
└── xgboost_ray/
    ├── __init__.py
    ├── callback.py
    ├── compat/
    │   ├── __init__.py
    │   └── tracker.py
    ├── data_sources/
    │   ├── __init__.py
    │   ├── _distributed.py
    │   ├── csv.py
    │   ├── dask.py
    │   ├── data_source.py
    │   ├── modin.py
    │   ├── numpy.py
    │   ├── object_store.py
    │   ├── pandas.py
    │   ├── parquet.py
    │   ├── partitioned.py
    │   ├── petastorm.py
    │   └── ray_dataset.py
    ├── elastic.py
    ├── examples/
    │   ├── __init__.py
    │   ├── create_test_data.py
    │   ├── higgs.py
    │   ├── higgs_parquet.py
    │   ├── readme.py
    │   ├── readme_sklearn_api.py
    │   ├── simple.py
    │   ├── simple_dask.py
    │   ├── simple_modin.py
    │   ├── simple_objectstore.py
    │   ├── simple_partitioned.py
    │   ├── simple_predict.py
    │   ├── simple_ray_dataset.py
    │   ├── simple_tune.py
    │   ├── train_on_test_data.py
    │   └── train_with_ml_dataset.py
    ├── main.py
    ├── matrix.py
    ├── session.py
    ├── sklearn.py
    ├── tests/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── env_info.sh
    │   ├── fault_tolerance.py
    │   ├── release/
    │   │   ├── benchmark_cpu_gpu.py
    │   │   ├── benchmark_ft.py
    │   │   ├── cluster_cpu.yaml
    │   │   ├── cluster_ft.yaml
    │   │   ├── cluster_gpu.yaml
    │   │   ├── create_learnable_data.py
    │   │   ├── create_test_data.py
    │   │   ├── custom_objective_metric.py
    │   │   ├── run_e2e_gpu.sh
    │   │   ├── setup_xgboost.sh
    │   │   ├── start_cpu_cluster.sh
    │   │   ├── start_ft_cluster.sh
    │   │   ├── start_gpu_cluster.sh
    │   │   ├── submit_cpu_gpu_benchmark.sh
    │   │   ├── submit_ft_benchmark.sh
    │   │   ├── tune_cluster.yaml
    │   │   └── tune_placement.py
    │   ├── test_client.py
    │   ├── test_colocation.py
    │   ├── test_data_source.py
    │   ├── test_end_to_end.py
    │   ├── test_fault_tolerance.py
    │   ├── test_matrix.py
    │   ├── test_sklearn.py
    │   ├── test_sklearn_matrix.py
    │   ├── test_tune.py
    │   ├── test_xgboost_api.py
    │   └── utils.py
    ├── tune.py
    ├── util.py
    └── xgb.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .flake8
================================================
[flake8]
max-line-length = 88
inline-quotes = "
ignore =
  C408
  C417
  E121
  E123
  E126
  E203
  E226
  E24
  E704
  W503
  W504
  W605
  I
  N
  B001
  B002
  B003
  B004
  B005
  B007
  B008
  B009
  B010
  B011
  B012
  B013
  B014
  B015
  B016
  B017
avoid-escape = no
# Error E731 is ignored because of the migration from YAPF to Black.
# See https://github.com/ray-project/ray/issues/21315 for more information.
per-file-ignores =
    rllib/evaluation/worker_set.py:E731
    rllib/evaluation/sampler.py:E731


================================================
FILE: .github/workflows/gpu.yaml
================================================
name: GPU on manual trigger

on:
  workflow_dispatch

jobs:
  test_gpu:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.8
      uses: actions/setup-python@v3
      with:
        python-version: 3.8
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install -U anyscale pyyaml
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Set anyscale project
      env:
        ANYSCALE_PROJECT: ${{ secrets.ANYSCALE_PROJECT }}
      run: |
        echo "project_id: ${ANYSCALE_PROJECT}" > ./xgboost_ray/tests/release/.anyscale.yaml
    - name: Run end to end GPU test
      env:
        ANYSCALE_CLI_TOKEN: ${{ secrets.ANYSCALE_CLI_TOKEN }}
      run: |
        pushd ./xgboost_ray/tests/release
        ./run_e2e_gpu.sh
        popd || true


================================================
FILE: .github/workflows/test.yaml
================================================
name: pytest on push

on:
  push:
  pull_request:
  schedule:
    - cron: "0 5 * * *"

jobs:
  test_lint:
    runs-on: ubuntu-latest
    timeout-minutes: 3
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.8
      uses: actions/setup-python@v3
      with:
        python-version: 3.8
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install codecov
        if [ -f requirements/lint-requirements.txt ]; then python -m pip install -r requirements/lint-requirements.txt; fi
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Run format script
      run: |
        ls -alp
        ./format.sh --all

  test_linux_ray_master:
    runs-on: ubuntu-latest
    timeout-minutes: 160
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
        include:
          - python-version: "3.8"
            ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
          - python-version: "3.9"
            ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl
          - python-version: "3.10"
            ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install codecov
        python -m pip install -U ${{ matrix.ray-wheel }}
        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
    - name: Install package
      run: |
        python -m pip install -e .
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Run tests
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 45
        max_attempts: 3
        command: bash ./run_ci_tests.sh
    - name: Run examples
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: bash ./run_ci_examples.sh

  test_linux_ray_release:
    runs-on: ubuntu-latest
    timeout-minutes: 160
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install codecov
        python -m pip install -U ray
        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
    - name: Install package
      run: |
        python -m pip install -e .
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Run tests
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 45
        max_attempts: 3
        command: bash ./run_ci_tests.sh
    - name: Run examples
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: bash ./run_ci_examples.sh

  test_linux_compat:
    # Test compatibility when some optional libraries are missing
    # Test runs on latest ray release
    runs-on: ubuntu-latest
    timeout-minutes: 160
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install codecov
        python -m pip install -U ray
        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
    - name: Uninstall unavailable dependencies
      # Disables modin and Ray Tune (via tabulate)
      run: |
        python -m pip uninstall -y modin
        python -m pip uninstall -y tabulate
    - name: Install package
      run: |
        python -m pip install -e .
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Run tests
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 45
        max_attempts: 3
        command: bash ./run_ci_tests.sh --no-tune
    - name: Run examples
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: bash ./run_ci_examples.sh --no-tune

  test_linux_cutting_edge:
    # Tests on cutting edge, i.e. latest Ray master, latest XGBoost master
    runs-on: ubuntu-latest
    timeout-minutes: 160
    strategy:
      matrix:
        # Python 3.6 is not in the matrix: no new xgboost versions are published for it
        python-version: ["3.8", "3.9", "3.10"]
        include:
          - python-version: "3.8"
            ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
          - python-version: "3.9"
            ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl
          - python-version: "3.10"
            ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install codecov
        python -m pip install -U ${{ matrix.ray-wheel }}
        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
    - name: Install Ubuntu system dependencies
      run: |
        sudo apt-get install -y --no-install-recommends ninja-build
    - name: Install package
      run: |
        python -m pip install -e .
    - name: Clone XGBoost repo
      uses: actions/checkout@v3
      with:
        repository: dmlc/xgboost
        path: xgboost
        submodules: true
    - name: Install XGBoost from source
      shell: bash -l {0}
      run: |
        pushd ${GITHUB_WORKSPACE}/xgboost/python-package
        python --version
        python setup.py sdist
        pip install -v ./dist/xgboost-*.tar.gz
        popd
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Run tests
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 45
        max_attempts: 3
        command: bash ./run_ci_tests.sh
    - name: Run examples
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: bash ./run_ci_examples.sh

  test_linux_xgboost_legacy:
    # Tests on XGBoost 0.90 and latest Ray release
    runs-on: ubuntu-latest
    timeout-minutes: 160
    strategy:
      matrix:
        python-version: [3.8]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install codecov
        python -m pip install -U ray
        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
    - name: Install package
      run: |
        python -m pip install -e .
    - name: Install legacy XGBoost
      run: |
        python -m pip install xgboost==0.90
    - name: Print environment info
      run: |
        ./xgboost_ray/tests/env_info.sh
    - name: Run tests
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 45
        max_attempts: 3
        command: bash ./run_ci_tests.sh
    - name: Run examples
      uses: nick-invision/retry@v2
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: bash ./run_ci_examples.sh


================================================
FILE: .gitignore
================================================
# Python byte code files
*.pyc
python/.eggs

# Backup files
*.bak

# Emacs temporary files
*~
*#

# Debug symbols
*.pdb

# Visual Studio files
/packages
*.suo
*.user
*.VC.db
*.VC.opendb

# Protobuf-generated files
*_pb2.py
*.pb.h
*.pb.cc

# Ray cluster configuration
scripts/nodes.txt

# OS X folder attributes
.DS_Store

# Debug files
*.dSYM/
*.su

# Python setup files
*.egg-info

# Compressed files
*.gz

# Datasets from examples
**/MNIST_data/
**/cifar-10-batches-bin/

# Generated documentation files
/doc/_build
/doc/source/_static/thumbs
/doc/source/tune/generated_guides/

# User-specific stuff:
.idea/

# Pytest Cache
**/.pytest_cache
**/.cache
.benchmarks
python-driver-*

# Vscode
.vscode/

*.iml

# python virtual env
venv

# pyenv version file
.python-version

# Vim
.*.swp
*.swp
tags

# Emacs
.#*

# tools
tools/prometheus*

# ray project files
project-id
.mypy_cache/

# XGBoost models from examples
*.xgb

# Downloaded test data
*.csv
*.csv.gz
*.parquet

# Byte-compiled files
__pycache__/

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

--------------------------------------------------------------------------------

Code in python/ray/rllib/{evolution_strategies, dqn} adapted from
https://github.com/openai (MIT License)

Copyright (c) 2016 OpenAI (http://openai.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------

Code in python/ray/rllib/impala/vtrace.py from
https://github.com/deepmind/scalable_agent

Copyright 2018 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
Code in python/ray/rllib/ars is adapted from https://github.com/modestyachts/ARS

Copyright (c) 2018, ARS contributors (Horia Mania, Aurelia Guy, Benjamin Recht)
All rights reserved.

Redistribution and use of ARS in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

------------------
Code in python/ray/prometheus_exporter.py is adapted from https://github.com/census-instrumentation/opencensus-python/blob/master/contrib/opencensus-ext-prometheus/opencensus/ext/prometheus/stats_exporter/__init__.py

# Copyright 2018, OpenCensus Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: README.md
================================================
<!--$UNCOMMENT(xgboost-ray)=-->

# Distributed XGBoost on Ray
<!--$REMOVE-->
![Build Status](https://github.com/ray-project/xgboost_ray/workflows/pytest%20on%20push/badge.svg)
[![docs.ray.io](https://img.shields.io/badge/docs-ray.io-blue)](https://docs.ray.io/en/master/xgboost-ray.html)
<!--$END_REMOVE-->
XGBoost-Ray is a distributed backend for
[XGBoost](https://xgboost.readthedocs.io/en/latest/), built
on top of the
[distributed computing framework Ray](https://ray.io).

XGBoost-Ray

- enables [multi-node](#usage) and [multi-GPU](#multi-gpu-training) training
- integrates seamlessly with the distributed [hyperparameter optimization](#hyperparameter-tuning) library [Ray Tune](http://tune.io)
- comes with advanced [fault tolerance handling](#fault-tolerance) mechanisms, and
- supports [distributed dataframes and distributed data loading](#distributed-data-loading)

All releases are tested on large clusters and workloads.

## Installation

You can install the latest XGBoost-Ray release from PyPI with `pip`:

```bash
pip install "xgboost_ray"
```

If you'd like to install the latest master, use this command instead:

```bash
pip install "git+https://github.com/ray-project/xgboost_ray.git#egg=xgboost_ray"
```

## Usage

XGBoost-Ray provides a drop-in replacement for XGBoost's `train`
function. To pass data, instead of using `xgb.DMatrix` you will
have to use `xgboost_ray.RayDMatrix`. You can also use a scikit-learn
interface - see next section.


Just as in the original `xgb.train()` function, the
[training parameters](https://xgboost.readthedocs.io/en/stable/parameter.html)
are passed as the `params` dictionary.

Ray-specific distributed training parameters are configured with a
`xgboost_ray.RayParams` object. For instance, you can set
the `num_actors` property to specify how many distributed actors
you would like to use.

Here is a simplified example (which requires `sklearn`):

**Training:**

```python
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer

train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)

evals_result = {}
bst = train(
    {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    train_set,
    evals_result=evals_result,
    evals=[(train_set, "train")],
    verbose_eval=False,
    ray_params=RayParams(
        num_actors=2,  # Number of remote actors
        cpus_per_actor=1))

bst.save_model("model.xgb")
print("Final training error: {:.4f}".format(
    evals_result["train"]["error"][-1]))
```

**Prediction:**

```python
from xgboost_ray import RayDMatrix, RayParams, predict
from sklearn.datasets import load_breast_cancer
import xgboost as xgb

data, labels = load_breast_cancer(return_X_y=True)

dpred = RayDMatrix(data, labels)

bst = xgb.Booster(model_file="model.xgb")
pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2))

print(pred_ray)
```

### scikit-learn API

XGBoost-Ray also features a scikit-learn API fully mirroring pure
XGBoost scikit-learn API, providing a completely drop-in
replacement. The following estimators are available:

- `RayXGBClassifier`
- `RayXGBRegressor`
- `RayXGBRFClassifier`
- `RayXGBRFRegressor`
- `RayXGBRanker`

Example usage of `RayXGBClassifier`:

```python
from xgboost_ray import RayXGBClassifier, RayParams
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

seed = 42

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.25, random_state=42
)

clf = RayXGBClassifier(
    n_jobs=4,  # In XGBoost-Ray, n_jobs sets the number of actors
    random_state=seed
)

# scikit-learn API will automatically convert the data
# to RayDMatrix format as needed.
# You can also pass X as a RayDMatrix, in which case
# y will be ignored.

clf.fit(X_train, y_train)

pred_ray = clf.predict(X_test)
print(pred_ray)

pred_proba_ray = clf.predict_proba(X_test)
print(pred_proba_ray)

# It is also possible to pass a RayParams object
# to fit/predict/predict_proba methods - will override
# n_jobs set during initialization

clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2))

pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2))
print(pred_ray)
```

Things to keep in mind:

- `n_jobs` parameter controls the number of actors spawned.
  You can pass a `RayParams` object to the
  `fit`/`predict`/`predict_proba` methods as the `ray_params` argument
  for greater control over resource allocation. Doing
  so will override the value of `n_jobs` with the value of
  `ray_params.num_actors` attribute. For more information, refer
  to the [Resources](#resources) section below.
- By default `n_jobs` is set to `1`, which means the training
  will **not** be distributed. Make sure to either set `n_jobs`
  to a higher value or pass a `RayParams` object as outlined above
  in order to take advantage of XGBoost-Ray's functionality.
- After calling `fit`, additional evaluation results (e.g. training time,
  number of rows, callback results) will be available under
  `additional_results_` attribute.
- XGBoost-Ray's scikit-learn API is based on XGBoost 1.4.
  While we try to support older XGBoost versions, please note that
  this library is only fully tested and supported for XGBoost >= 1.4.

For more information on the scikit-learn API, refer to the [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

## Data loading

Data is passed to XGBoost-Ray via a `RayDMatrix` object.

The `RayDMatrix` lazy loads data and stores it sharded in the
Ray object store. The Ray XGBoost actors then access these
shards to run their training on.

A `RayDMatrix` supports various data and file types, like
Pandas DataFrames, Numpy Arrays, CSV files and Parquet files.

Example loading multiple parquet files:

```python
import glob
from xgboost_ray import RayDMatrix, RayFileType

# We can also pass a list of files
path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet")))

# This argument will be passed to `pd.read_parquet()`
columns = [
    "passenger_count",
    "trip_distance", "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude",
    "fare_amount", "extra", "mta_tax", "tip_amount",
    "tolls_amount", "total_amount"
]

dtrain = RayDMatrix(
    path,
    label="passenger_count",  # Will select this column as the label
    columns=columns,
    # ignore=["total_amount"],  # Optional list of columns to ignore
    filetype=RayFileType.PARQUET)
```

<!--$UNCOMMENT(xgboost-ray-tuning)=-->

## Hyperparameter Tuning

XGBoost-Ray integrates with <!--$UNCOMMENT{ref}`Ray Tune <tune-main>`--><!--$REMOVE-->[Ray Tune](https://tune.io)<!--$END_REMOVE--> to provide distributed hyperparameter tuning for your
distributed XGBoost models. You can run multiple XGBoost-Ray training runs in parallel, each with a different
hyperparameter configuration, and each training run parallelized by itself. All you have to do is move your training
code to a function, and pass the function to `tune.run`. Internally, `train` will detect if `tune` is being used and will
automatically report results to tune.

Example using XGBoost-Ray with Ray Tune:

```python
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer

num_actors = 4
num_cpus_per_actor = 1

ray_params = RayParams(
    num_actors=num_actors,
    cpus_per_actor=num_cpus_per_actor)

def train_model(config):
    train_x, train_y = load_breast_cancer(return_X_y=True)
    train_set = RayDMatrix(train_x, train_y)

    evals_result = {}
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        evals=[(train_set, "train")],
        verbose_eval=False,
        ray_params=ray_params)
    bst.save_model("model.xgb")

from ray import tune

# Specify the hyperparameter search space.
config = {
    "tree_method": "approx",
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "eta": tune.loguniform(1e-4, 1e-1),
    "subsample": tune.uniform(0.5, 1.0),
    "max_depth": tune.randint(1, 9)
}

# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
analysis = tune.run(
    train_model,
    config=config,
    metric="train-error",
    mode="min",
    num_samples=4,
    resources_per_trial=ray_params.get_tune_resources())
print("Best hyperparameters", analysis.best_config)
```

Also see examples/simple_tune.py for another example.

## Fault tolerance

XGBoost-Ray leverages the stateful Ray actor model to
enable fault tolerant training. There are currently
two modes implemented.

### Non-elastic training (warm restart)

When an actor or node dies, XGBoost-Ray will retain the
state of the remaining actors. In non-elastic training,
the failed actors will be replaced as soon as resources
are available again. Only these actors will reload their
parts of the data. Training will resume once all actors
are ready for training again.

You can set this mode in the `RayParams`:

```python
from xgboost_ray import RayParams

ray_params = RayParams(
    elastic_training=False,  # Use non-elastic training
    max_actor_restarts=2,    # How often are actors allowed to fail
)
```

### Elastic training

In elastic training, XGBoost-Ray will continue training
with fewer actors (and on fewer data) when a node or actor
dies. The missing actors are staged in the background,
and are reintegrated into training once they are back and
have loaded their data.

This mode will train on fewer data for a period of time,
which can impact accuracy. In practice, we found these
effects to be minor, especially for large shuffled datasets.
The immediate benefit is that training time is reduced
significantly to almost the same level as if no actors died.
Thus, especially when data loading takes a large part of
the total training time, this setting can dramatically speed
up training times for large distributed jobs.

You can configure this mode in the `RayParams`:

```python
from xgboost_ray import RayParams

ray_params = RayParams(
    elastic_training=True,  # Use elastic training
    max_failed_actors=3,    # Only allow at most 3 actors to die at the same time
    max_actor_restarts=2,   # How often are actors allowed to fail
)
```

## Resources

By default, XGBoost-Ray tries to determine the number of CPUs
available and distributes them evenly across actors.

In the case of very large clusters or clusters with many different
machine sizes, it makes sense to limit the number of CPUs per actor
by setting the `cpus_per_actor` argument. Consider always
setting this explicitly.

The number of XGBoost actors always has to be set manually with
the `num_actors` argument.

### Multi GPU training

XGBoost-Ray enables multi GPU training. The XGBoost core backend
will automatically leverage NCCL2 for cross-device communication.
All you have to do is to start one actor per GPU and set XGBoost's
`tree_method` to a GPU-compatible option, e.g. `gpu_hist` (see the XGBoost
documentation for more details).

For instance, if you have 2 machines with 4 GPUs each, you will want
to start 8 remote actors, and set `gpus_per_actor=1`. There is usually
no benefit in allocating less (e.g. 0.5) or more than one GPU per actor.

You should divide the CPUs evenly across actors per machine, so if your
machines have 16 CPUs in addition to the 4 GPUs, each actor should have
4 CPUs to use.

```python
from xgboost_ray import RayParams

ray_params = RayParams(
    num_actors=8,
    gpus_per_actor=1,
    cpus_per_actor=4,   # Divide evenly across actors per machine
)
```

### How many remote actors should I use?

This depends on your workload and your cluster setup.
Generally there is no inherent benefit of running more than
one remote actor per node for CPU-only training. This is because
XGBoost core can already leverage multiple CPUs via threading.

However, there are some cases when you should consider starting
more than one actor per node:

- For [multi GPU training](#multi-gpu-training), each GPU should have a separate
  remote actor. Thus, if your machine has 24 CPUs and 4 GPUs,
  you will want to start 4 remote actors with 6 CPUs and 1 GPU
  each
- In a **heterogeneous cluster**, you might want to find the
  [greatest common divisor](https://en.wikipedia.org/wiki/Greatest_common_divisor)
  for the number of CPUs.
  E.g. for a cluster with three nodes of 4, 8, and 12 CPUs, respectively,
  you should set the number of actors to 6 and the CPUs per
  actor to 4.

## Distributed data loading

XGBoost-Ray can leverage both centralized and distributed data loading.

In **centralized data loading**, the data is partitioned by the head node
and stored in the object store. Each remote actor then retrieves their
partitions by querying the Ray object store. Centralized loading is used
when you pass centralized in-memory dataframes, such as Pandas dataframes
or Numpy arrays, or when you pass a single source file, such as a single CSV
or Parquet file.

```python
from xgboost_ray import RayDMatrix

# This will use centralized data loading, as only one source file is specified
# `label_col` is a column in the CSV, used as the target label
dtrain = RayDMatrix("./source_file.csv", label="label_col")
```

In **distributed data loading**, each remote actor loads their data directly from
the source (e.g. local hard disk, NFS, HDFS, S3),
without a central bottleneck. The data is still stored in the
object store, but locally to each actor. This mode is used automatically
when loading data from multiple CSV or Parquet files. Please note that
we do not check or enforce partition sizes in this case - it is your job
to make sure the data is evenly distributed across the source files.

```python
from xgboost_ray import RayDMatrix

# This will use distributed data loading, as four source files are specified
# Please note that you cannot schedule more than four actors in this case.
# `label_col` is a column in the Parquet files, used as the target label
dtrain = RayDMatrix([
    "hdfs:///tmp/part1.parquet",
    "hdfs:///tmp/part2.parquet",
    "hdfs:///tmp/part3.parquet",
    "hdfs:///tmp/part4.parquet",
], label="label_col")
```

Lastly, XGBoost-Ray supports **distributed dataframe** representations, such
as <!--$UNCOMMENT{ref}`Ray Datasets <datasets>`--><!--$REMOVE-->[Ray Datasets](https://docs.ray.io/en/latest/data/dataset.html)<!--$END_REMOVE-->,
[Modin](https://modin.readthedocs.io/en/latest/) and
[Dask dataframes](https://docs.dask.org/en/latest/dataframe.html)
(used with <!--$UNCOMMENT{ref}`Dask on Ray <dask-on-ray>`--><!--$REMOVE-->[Dask on Ray](https://docs.ray.io/en/master/dask-on-ray.html)<!--$END_REMOVE-->).
Here, XGBoost-Ray will check on which nodes the distributed partitions
are currently located, and will assign partitions to actors in order to
minimize cross-node data transfer. Please note that we also assume here
that partition sizes are uniform.

```python
from xgboost_ray import RayDMatrix

# This will try to allocate the existing Modin partitions
# to co-located Ray actors. If this is not possible, data will
# be transferred across nodes
dtrain = RayDMatrix(existing_modin_df)
```

### Data sources

The following data sources can be used with a `RayDMatrix` object.

| Type                                                             | Centralized loading | Distributed loading |
|------------------------------------------------------------------|---------------------|---------------------|
| Numpy array                                                      | Yes                 | No                  |
| Pandas dataframe                                                 | Yes                 | No                  |
| Single CSV                                                       | Yes                 | No                  |
| Multi CSV                                                        | Yes                 | Yes                 |
| Single Parquet                                                   | Yes                 | No                  |
| Multi Parquet                                                    | Yes                 | Yes                 |
| [Ray Dataset](https://docs.ray.io/en/latest/data/dataset.html)   | Yes                 | Yes                 |
| [Petastorm](https://github.com/uber/petastorm)                   | Yes                 | Yes                 |
| [Dask dataframe](https://docs.dask.org/en/latest/dataframe.html) | Yes                 | Yes                 |
| [Modin dataframe](https://modin.readthedocs.io/en/latest/)       | Yes                 | Yes                 |

## Memory usage

XGBoost uses a compute-optimized datastructure, the `DMatrix`,
to hold training data. When converting a dataset to a `DMatrix`,
XGBoost creates intermediate copies and ends up
holding a complete copy of the full data. The data will be converted
into the local dataformat (on a 64 bit system these are 64 bit floats.)
Depending on the system and original dataset dtype, this matrix can
thus occupy more memory than the original dataset.

The **peak memory usage** for CPU-based training is at least
**3x** the dataset size (assuming dtype `float32` on a 64bit system)
plus about **400,000 KiB** for other resources,
like operating system requirements and storing of intermediate
results.

**Example**

- Machine type: AWS m5.xlarge (4 vCPUs, 16 GiB RAM)
- Usable RAM: ~15,350,000 KiB
- Dataset: 1,250,000 rows with 1024 features, dtype float32.
  Total size: 5,000,000 KiB
- XGBoost DMatrix size: ~10,000,000 KiB

This dataset will fit exactly on this node for training.

Note that the DMatrix size might be lower on a 32 bit system.

**GPUs**

Generally, the same memory requirements exist for GPU-based
training. Additionally, the GPU must have enough memory
to hold the dataset.

In the example above, the GPU must have at least
10,000,000 KiB (about 9.5 GiB) memory. However,
empirically we found that using a `DeviceQuantileDMatrix`
seems to show more peak GPU memory usage, possibly
for intermediate storage when loading data (about 10%).

**Best practices**

In order to reduce peak memory usage, consider the following
suggestions:

- Store data as `float32` or less. More precision is often
  not needed, and keeping data in a smaller format will
  help reduce peak memory usage for initial data loading.
- Pass the `dtype` when loading data from CSV. Otherwise,
  floating point values will be loaded as `np.float64`
  per default, increasing peak memory usage by 33%.

## Placement Strategies

XGBoost-Ray leverages Ray's Placement Group API (<https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html>)
to implement placement strategies for better fault tolerance.

By default, a SPREAD strategy is used for training, which attempts to spread all of the training workers
across the nodes in a cluster on a best-effort basis. This improves fault tolerance since it minimizes the
number of worker failures when a node goes down, but comes at a cost of increased inter-node communication.
To disable this strategy, set the `RXGB_USE_SPREAD_STRATEGY` environment variable to 0. If disabled, no
particular placement strategy will be used.

Note that this strategy is used only when `elastic_training` is not used. If `elastic_training` is set to `True`,
no placement strategy is used.

When XGBoost-Ray is used with Ray Tune for hyperparameter tuning, a PACK strategy is used. This strategy
attempts to place all workers for each trial on the same node on a best-effort basis. This means that if a node
goes down, it will be less likely to impact multiple trials.

When placement strategies are used, XGBoost-Ray will wait for 100 seconds for the required resources
to become available, and will fail if the required resources cannot be reserved and the cluster cannot autoscale
to increase the number of resources. You can change the `RXGB_PLACEMENT_GROUP_TIMEOUT_S` environment variable to modify
how long this timeout should be.

## More examples

For complete end to end examples, please have a look at
the [examples folder](https://github.com/ray-project/xgboost_ray/tree/master/xgboost_ray/examples/):

- [Simple sklearn breastcancer dataset example](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/simple.py) (requires `sklearn`)
- [HIGGS classification example](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/higgs.py)
  ([download dataset (2.6 GB)](https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz))
- [HIGGS classification example with Parquet](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/higgs_parquet.py) (uses the same dataset)
- [Test data classification](https://github.com/ray-project/xgboost_ray/blob/master/xgboost_ray/examples/train_on_test_data.py) (uses a self-generated dataset)
<!--$REMOVE-->
## Resources

* [XGBoost-Ray documentation](https://xgboost.readthedocs.io/en/stable/tutorials/ray.html)
* [Ray community slack](https://forms.gle/9TSdDYUgxYs8SA9e8)
<!--$END_REMOVE-->
<!--$UNCOMMENT## API reference

```{eval-rst}
.. autoclass:: xgboost_ray.RayParams
    :members:
```

```{eval-rst}
.. autoclass:: xgboost_ray.RayDMatrix
    :members:
```

```{eval-rst}
.. autofunction:: xgboost_ray.train
```

```{eval-rst}
.. autofunction:: xgboost_ray.predict
```

### scikit-learn API

```{eval-rst}
.. autoclass:: xgboost_ray.RayXGBClassifier
    :members:
```

```{eval-rst}
.. autoclass:: xgboost_ray.RayXGBRegressor
    :members:
```

```{eval-rst}
.. autoclass:: xgboost_ray.RayXGBRFClassifier
    :members:
```

```{eval-rst}
.. autoclass:: xgboost_ray.RayXGBRFRegressor
    :members:
```-->


================================================
FILE: format.sh
================================================
#!/usr/bin/env bash
# Black + Clang formatter (if installed). This script formats all changed files from the last mergebase.
# You are encouraged to run this locally before pushing changes for review.

# Strict mode: exit on any failing command (-e), on expansion of an unset
# variable (-u), and when any command in a pipeline fails (pipefail).
set -euo pipefail

# Tool versions this script expects. A mismatch with the installed version
# only prints a warning (see tool_version_check below); it does not abort.
FLAKE8_VERSION_REQUIRED="3.9.1"
BLACK_VERSION_REQUIRED="22.10.0"
SHELLCHECK_VERSION_REQUIRED="0.7.1"
ISORT_VERSION_REQUIRED="5.10.1"

# Verify that a required Python lint tool is available on PATH.
#
# Arguments: $1 - tool name; must be one of black, flake8 or isort.
# Globals:   sets VERSION to the pinned version of the requested tool.
# Exits with status 1 (after printing a message) when the tool name is not
# recognised or when no executable of that name can be found.
check_python_command_exist() {
    local tool="$1"

    VERSION=""
    if [ "$tool" = "black" ]; then
        VERSION=$BLACK_VERSION_REQUIRED
    elif [ "$tool" = "flake8" ]; then
        VERSION=$FLAKE8_VERSION_REQUIRED
    elif [ "$tool" = "isort" ]; then
        VERSION=$ISORT_VERSION_REQUIRED
    else
        echo "$tool is not a required dependency"
        exit 1
    fi

    # The lookup stays inside the test so that a failing `command -v` is
    # evaluated as part of the condition rather than tripping `set -e`.
    if [ ! -x "$(command -v "$tool")" ]; then
        echo "$tool not installed. Install the python package with: pip install $tool==$VERSION"
        exit 1
    fi
}

# Reject docstrings that annotate argument types, e.g. "    name (str): ...".
#
# Scans every git-tracked *.py file for indented "name (Type): " lines; lines
# containing "str(" or "noqa" are not reported.
# Prints the offending lines and exits with status 1 when any are found,
# otherwise returns 0.
check_docstyle() {
    echo "Checking docstyle..."
    # Files are selected with a git pathspec rather than `grep '.py$'`: in that
    # regex the unescaped dot matched any character, so tracked files whose
    # names merely end in "<char>py" (e.g. "happy") were scanned as well.
    # NUL-delimited names keep paths containing whitespace intact.
    # `|| true`: grep exits non-zero when nothing matches, which is the good case.
    violations=$(git ls-files -z -- '*.py' | xargs -0 grep -E '^[ ]+[a-z_]+ ?\([a-zA-Z]+\): ' | grep -v 'str(' | grep -v noqa || true)
    if [[ -n "$violations" ]]; then
        echo
        echo "=== Found Ray docstyle violations ==="
        echo "$violations"
        echo
        echo "Per the Google pydoc style, omit types from pydoc args as they are redundant: https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#code-style "
        echo "If this is a false positive, you can add a '# noqa' comment to the line to ignore."
        exit 1
    fi
    return 0
}

# Abort early if any of the mandatory Python tools is missing.
check_python_command_exist black
check_python_command_exist flake8
check_python_command_exist isort

# this stops git rev-parse from failing if we run this from the .git directory
builtin cd "$(dirname "${BASH_SOURCE:-$0}")"

# All later git commands and relative paths are resolved from the repository root.
ROOT="$(git rev-parse --show-toplevel)"
builtin cd "$ROOT" || exit 1

# NOTE(edoakes): black version differs based on installation method:
#   Option 1) 'black, 21.12b0 (compiled: no)'
#   Option 2) 'black, version 21.12b0'
#   For newer versions (at least 22.10.0), a second line is printed which must be dropped:
#
#     black, 22.10.0 (compiled: yes)
#     Python (CPython) 3.9.13
BLACK_VERSION_STR=$(black --version)
if [[ "$BLACK_VERSION_STR" == *"compiled"* ]]
then
    # Option 1 layout: the version is the second whitespace-separated field.
    BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $2}')
else
    # Option 2 layout: the version is the third field (after "version").
    BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $3}')
fi
# flake8: first field of the first output line.
FLAKE8_VERSION=$(flake8 --version | head -n 1 | awk '{print $1}')
# isort: second field of the output line containing "VERSION".
ISORT_VERSION=$(isort --version | grep VERSION | awk '{print $2}')

# Print a warning when an installed tool version differs from the required one.
# Never fails; a mismatch is only reported.
# params: $1 tool name, $2 installed version, $3 required version
tool_version_check() {
    local tool="$1" installed="$2" required="$3"

    if [ "$installed" = "$required" ]; then
        return 0
    fi
    echo "WARNING: Ray uses $tool $required, You currently are using $installed. This might generate different results."
}

# Warn (without failing) about version mismatches of the mandatory tools.
tool_version_check "flake8" "$FLAKE8_VERSION" "$FLAKE8_VERSION_REQUIRED"
tool_version_check "black" "$BLACK_VERSION" "$BLACK_VERSION_REQUIRED"
tool_version_check "isort" "$ISORT_VERSION" "$ISORT_VERSION_REQUIRED"

# shellcheck is optional: check its version when present, otherwise just inform.
if command -v shellcheck >/dev/null; then
    SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}')
    tool_version_check "shellcheck" "$SHELLCHECK_VERSION" "$SHELLCHECK_VERSION_REQUIRED"
else
    echo "INFO: Ray uses shellcheck for shell scripts, which is not installed. You may install shellcheck=$SHELLCHECK_VERSION_REQUIRED with your system package manager."
fi

# clang-format is optional as well; it is only used by format_changed for
# '*.cc' / '*.h' files.
if command -v clang-format >/dev/null; then
  CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}')
  tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.0"
else
    echo "WARNING: clang-format is not installed!"
fi

# The plugin checks below look for the plugin name in `flake8 --version` output.
if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then
    echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. Install with: pip install flake8-quotes"
fi

if [[ $(flake8 --version) != *"flake8-bugbear"* ]]; then
    echo "WARNING: Ray uses flake8 with flake8-bugbear. Might error without it. Install with: pip install flake8-bugbear"
fi

# Extra arguments passed to every shellcheck invocation (see shellcheck_scripts).
SHELLCHECK_FLAGS=(
  --exclude=1090  # "Can't follow non-constant source. Use a directive to specify location."
  --exclude=1091  # "Not following {file} due to some error"
  --exclude=2207  # "Prefer mapfile or read -a to split command output (or quote to avoid splitting)." -- these aren't compatible with macOS's old Bash
)


# Arguments passed to black: '--force-exclude' followed by ONE regex argument.
# Each empty `...` command substitution that spans a line break expands to
# nothing, so the adjacent quoted fragments concatenate into a single word.
# NOTE(review): these paths (and the ones in GIT_LS_EXCLUDES) look inherited
# from the Ray repository layout (python/ray/...) - confirm whether any of them
# still applies to this repository.
BLACK_EXCLUDES=(
    '--force-exclude'
    'python/ray/cloudpickle/*|'`
    `'python/build/*|'`
    `'python/ray/core/src/ray/gcs/*|'`
    `'python/ray/thirdparty_files/*|'`
    `'python/ray/_private/thirdparty/*|'`
    `'python/ray/serve/tests/test_config_files/syntax_error\.py'
)

# Git pathspecs appended to `git ls-files` to exclude paths from formatting.
GIT_LS_EXCLUDES=(
  ':(exclude)python/ray/cloudpickle/'
  ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py'
)

# TODO(barakmich): This should be cleaned up. I've at least excised the copies
# of these arguments to this location, but the long-term answer is to actually
# make a flake8 config file
# Extra flake8 argument used by format_changed for Cython sources (*.pyx, *.pxd, *.pxi).
FLAKE8_PYX_IGNORES="--ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605"

# Run shellcheck with the repository-wide SHELLCHECK_FLAGS followed by the
# caller's arguments (extra options and/or file names). The exit status is
# that of shellcheck.
shellcheck_scripts() {
  local invocation=("${SHELLCHECK_FLAGS[@]}" "$@")
  shellcheck "${invocation[@]}"
}

# Format specified files
#
# Arguments: the paths of the files to format.
# Each file is classified by its suffix or, for files without a suffix, by the
# interpreter named in its shebang line:
#   - Python files are passed to isort and then black;
#   - shell files are passed to shellcheck, whose diff output is applied with
#     `patch -p1` (only when shellcheck is installed and supports --format=diff);
#   - Bazel files are recognised but no formatter is run on them.
# Returns 1 (after printing an error to stderr) on the first file whose type
# cannot be determined; no formatter has run at that point.
format_files() {
    local shell_files=() python_files=() bazel_files=()

    local name
    for name in "$@"; do
      local base="${name%.*}"
      local suffix="${name#"${base}"}"

      # Reduce the first line to the bare interpreter name, e.g.
      # "#!/usr/bin/env python3 -u" -> "python3" and "#!/bin/bash" -> "bash".
      local shebang=""
      read -r shebang < "${name}" || true
      case "${shebang}" in
        '#!'*)
          # Drop the leading "#!" (and one optional space) first. Previously it
          # was left in place, so the "/usr/bin/env " prefix never matched and
          # env-style shebangs resolved to the interpreter name "env".
          shebang="${shebang:2}"
          shebang="${shebang# }"
          shebang="${shebang#/usr/bin/env }"
          shebang="${shebang%% *}"
          shebang="${shebang##*/}"
          ;;
        *)
          # Not a shebang: do not mistake ordinary first-line text for an
          # interpreter name.
          shebang=""
          ;;
      esac

      if [ "${base}" = "WORKSPACE" ] || [ "${base}" = "BUILD" ] || [ "${suffix}" = ".BUILD" ] || [ "${suffix}" = ".bazel" ] || [ "${suffix}" = ".bzl" ]; then
        bazel_files+=("${name}")
      elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang#python}" ] || [ "${suffix}" != "${suffix#.py}" ]; then
        # No suffix and a python* interpreter, or a suffix starting with ".py".
        python_files+=("${name}")
      elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang%sh}" ] || [ "${suffix}" != "${suffix#.sh}" ]; then
        # No suffix and an interpreter ending in "sh", or a suffix starting with ".sh".
        shell_files+=("${name}")
      else
        echo "error: failed to determine file type: ${name}" 1>&2
        return 1
      fi
    done

    if [ 0 -lt "${#python_files[@]}" ]; then
      isort "${python_files[@]}"
      black "${python_files[@]}"
    fi

    if command -v shellcheck >/dev/null; then
      # Probe whether this shellcheck build understands --format=diff.
      if shellcheck --shell=sh --format=diff - < /dev/null; then
        if [ 0 -lt "${#shell_files[@]}" ]; then
          local difference
          # The appended "-" protects trailing newlines of the diff from being
          # stripped by the command substitution; it is removed again below.
          difference="$(shellcheck_scripts --format=diff "${shell_files[@]}" || true && printf "-")"
          difference="${difference%-}"
          printf "%s" "${difference}" | patch -p1
        fi
      else
        echo "error: this version of shellcheck does not support diffs"
      fi
    fi
}

# Format and lint every tracked Python file, then shellcheck every shell script.
#
# Arguments passed to this function are ignored.
# Python files are taken from `git ls-files '*.py'` minus GIT_LS_EXCLUDES.
# Shell scripts are all tracked '*.sh' files plus any other tracked file whose
# shebang line names sh or bash.
format_all_scripts() {
    # Run isort before black to fix imports and let black deal with file format.
    echo "$(date)" "isort...."
    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
      isort
    echo "$(date)" "Black...."
    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
      black "${BLACK_EXCLUDES[@]}"
    # Test for flake8 directly in the condition. The previous form stored `$?`
    # and evaluated `[ $HAS_FLAKE8 ]`, which is true for any non-empty string
    # (both "0" and "1"); the bare `command -v` would also have aborted the
    # script under `set -e` when flake8 was missing.
    if command -v flake8 &> /dev/null; then
      echo "$(date)" "Flake8...."
      git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \
        flake8 --config=.flake8
    fi

    if command -v shellcheck >/dev/null; then
      local shell_files non_shell_files
      non_shell_files=($(git ls-files -- ':(exclude)*.sh'))
      shell_files=($(git ls-files -- '*.sh'))
      if [ 0 -lt "${#non_shell_files[@]}" ]; then
        # Also pick up scripts without a .sh suffix by their sh/bash shebang.
        # `|| true`: git grep exits non-zero when no file matches.
        shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true))
      fi
      if [ 0 -lt "${#shell_files[@]}" ]; then
        echo "$(date)" "shellcheck scripts...."
        shellcheck_scripts "${shell_files[@]}"
      fi
    fi
}

# Format files that differ from main branch. Ignores dirs that are not slated
# for autoformat yet.
#
# "Differ" means: added, copied, renamed or modified (ACRM) relative to the
# merge base of upstream/master and HEAD. Sets the global MERGEBASE.
# Optional tools are detected with `command -v`, matching the rest of this
# script, instead of the external `which` utility.
format_changed() {
    # The `if` guard ensures that the list of filenames is not empty, which
    # could cause the formatter to receive 0 positional arguments, making
    # Black error.
    #
    # `diff-filter=ACRM` and $MERGEBASE is to ensure we only format files that
    # exist on both branches.
    MERGEBASE="$(git merge-base upstream/master HEAD)"

    # Python: isort first, then black, then (if available) flake8. The guard is
    # re-evaluated before black because isort may already have rewritten files.
    if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then
        git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
            isort
    fi

    if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then
        git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
            black "${BLACK_EXCLUDES[@]}"
        if command -v flake8 >/dev/null; then
            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
                 flake8 --config=.flake8
        fi
    fi

    # Cython sources are only linted, with the relaxed ignore list.
    if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then
        if command -v flake8 >/dev/null; then
            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \
                 flake8 --config=.flake8 "$FLAKE8_PYX_IGNORES"
        fi
    fi

    # C++ sources are reformatted in place when clang-format is installed.
    if command -v clang-format >/dev/null; then
        if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.cc' '*.h' &>/dev/null; then
            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.cc' '*.h' | xargs -P 5 \
                 clang-format -i
        fi
    fi

    # Shell: changed '*.sh' files plus changed files carrying an sh/bash shebang.
    if command -v shellcheck >/dev/null; then
        local shell_files non_shell_files
        non_shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- ':(exclude)*.sh'))
        shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.sh'))
        if [ 0 -lt "${#non_shell_files[@]}" ]; then
            # `|| true`: git grep exits non-zero when no file matches.
            shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true))
        fi
        if [ 0 -lt "${#shell_files[@]}" ]; then
            shellcheck_scripts "${shell_files[@]}"
        fi
    fi
}

# This flag formats individual files. --files *must* be the first command line
# arg to use this option.
if [ "${1-}" == '--files' ]; then
    format_files "${@:2}"
# If `--all-scripts` or `--all` is passed, any further arguments are ignored.
# Both flags run format_all_scripts over every tracked Python file and shell
# script, and print the resulting diff when FORMAT_SH_PRINT_DIFF is non-empty.
elif [ "${1-}" == '--all-scripts' ]; then
    format_all_scripts "${@}"
    if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi
# Same behavior as `--all-scripts`.
elif [ "${1-}" == '--all' ]; then
    format_all_scripts "${@}"
    if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi
else
    # Add the upstream remote if it doesn't exist
    if ! git remote -v | grep -q upstream; then
        git remote add 'upstream' 'https://github.com/ray-project/xgboost_ray.git'
    fi

    # Only fetch master since that's the branch we're diffing against.
    # A failed fetch is tolerated (`|| true`).
    git fetch upstream master || true

    # Format only the files that changed relative to the merge base with
    # upstream/master.
    format_changed
fi

# The docstyle check runs in every mode and exits with status 1 on violations.
check_docstyle

# Exit non-zero when the working tree has unstaged changes, so that a run
# which reformatted files is reported as a failure.
if ! git diff --quiet &>/dev/null; then
    echo 'Reformatted changed files. Please review and stage the changes.'
    echo 'Files updated:'
    echo

    git --no-pager diff --name-only

    exit 1
fi


================================================
FILE: requirements/lint-requirements.txt
================================================
flake8==3.9.1
flake8-comprehensions==3.10.1
flake8-quotes==2.0.0
flake8-bugbear==21.9.2
black==22.10.0
isort==5.10.1
importlib-metadata==4.13.0


================================================
FILE: requirements/test-requirements.txt
================================================
packaging
petastorm
pytest
pyarrow<15.0.0
ray[tune, data, default]
scikit-learn
# modin==0.23.1.post0 is not compatible with xgboost_ray py38
modin<=0.23.1; python_version == '3.8'
# modin==0.26.0 is not compatible with xgboost_ray py39+
modin<0.26.0; python_version > '3.8'
dask

#workaround for now
protobuf<4.0.0
tensorboardX==2.2


================================================
FILE: run_ci_examples.sh
================================================
#!/bin/bash
# Runs every example script as a smoke test and then the Ray Client tests.
# Usage: run_ci_examples.sh [--no-tune]
#   --no-tune  skip the Ray Tune example
# `set -e` aborts the script on the first failing command.
set -e

TUNE=1

for i in "$@"
do
echo "$i"
case "$i" in
    --no-tune)
    TUNE=0
    ;;
    *)
    echo "unknown arg, $i"
    exit 1
    ;;
esac
done

pushd xgboost_ray/examples/ || exit 1
# Stop any Ray instance that is still running; it is fine if there is none.
ray stop || true
echo "================"
echo "Running examples"
echo "================"
echo "running readme.py" && python readme.py
echo "running readme_sklearn_api.py" && python readme_sklearn_api.py
echo "running simple.py" && python simple.py --smoke-test
echo "running simple_predict.py" && python simple_predict.py
echo "running simple_dask.py" && python simple_dask.py --smoke-test
echo "running simple_modin.py" && python simple_modin.py --smoke-test
echo "running simple_objectstore.py" && python simple_objectstore.py --smoke-test
# Run the Ray dataset example itself (this line used to run
# simple_objectstore.py a second time, so the example was never exercised).
echo "running simple_ray_dataset.py" && python simple_ray_dataset.py --smoke-test
echo "running simple_partitioned.py" && python simple_partitioned.py --smoke-test

if [ "$TUNE" = "1" ]; then
  echo "running simple_tune.py" && python simple_tune.py --smoke-test
else
  echo "skipping tune example"
fi

echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test
popd

pushd xgboost_ray/tests
echo "running examples with Ray Client"
python -m pytest -v --durations=0 -x test_client.py
popd || exit 1


================================================
FILE: run_ci_tests.sh
================================================
#!/bin/bash
# Runs the test suite file by file. A failing file does not stop the run;
# the failure is remembered and reported through the exit code at the end.
# Usage: run_ci_tests.sh [--no-tune]
TUNE=1

for arg in "$@"; do
  echo "$arg"
  case "$arg" in
    --no-tune)
      TUNE=0
      ;;
    *)
      echo "unknown arg, $arg"
      exit 1
      ;;
  esac
done

pushd xgboost_ray/tests || exit 1
echo "============="
echo "Running tests"
echo "============="
END_STATUS=0

# Invoke pytest with the given arguments and record a failure in END_STATUS.
run_pytest() {
  python -m pytest "$@" || END_STATUS=1
}

# The colocation test runs with extra verbosity and debug logging.
run_pytest -vv -s --log-cli-level=DEBUG --durations=0 -x "test_colocation.py"

for test_file in \
  "test_matrix.py" \
  "test_data_source.py" \
  "test_xgboost_api.py" \
  "test_fault_tolerance.py" \
  "test_end_to_end.py" \
  "test_sklearn.py" \
  "test_sklearn_matrix.py"
do
  run_pytest -v --durations=0 -x "$test_file"
done

if [ "$TUNE" = "1" ]; then
  run_pytest -v --durations=0 -x "test_tune.py"
else
  echo "skipping tune tests"
fi

echo "running smoke test on benchmark_cpu_gpu.py"
python release/benchmark_cpu_gpu.py 2 10 20 --smoke-test || END_STATUS=1
popd || exit 1

if [ "$END_STATUS" = "1" ]; then
  echo "At least one test has failed, exiting with code 1"
fi
exit "$END_STATUS"

================================================
FILE: setup.py
================================================
from setuptools import find_packages, setup

# Package metadata and runtime dependencies for the xgboost_ray distribution.
setup(
    name="xgboost_ray",
    # ``include`` expects an iterable of glob patterns. A bare string would be
    # iterated character by character, and its "*" character would then match
    # every package found under ``where``.
    packages=find_packages(where=".", include=["xgboost_ray*"]),
    version="0.1.20",
    author="Ray Team",
    description="A Ray backend for distributed XGBoost",
    license="Apache 2.0",
    long_description="A distributed backend for XGBoost built on top of "
    "distributed computing framework Ray.",
    url="https://github.com/ray-project/xgboost_ray",
    install_requires=[
        "ray>=2.7",
        "numpy>=1.16",
        "pandas",
        "wrapt>=1.12.1",
        "xgboost>=0.90",
        "packaging",
    ],
)


================================================
FILE: xgboost_ray/__init__.py
================================================
from xgboost_ray.main import RayParams, predict, train
from xgboost_ray.matrix import (
    Data,
    RayDeviceQuantileDMatrix,
    RayDMatrix,
    RayFileType,
    RayShardingMode,
    combine_data,
)

# Workaround for legacy xgboost (e.g. 0.90, the minimum supported version):
# importing the sklearn API wrappers may fail there. In that case the rest of
# the package stays usable and the wrappers are simply not exported.
try:
    from xgboost_ray.sklearn import (
        RayXGBClassifier,
        RayXGBRanker,
        RayXGBRegressor,
        RayXGBRFClassifier,
        RayXGBRFRegressor,
    )
except ImportError:
    _SKLEARN_API_AVAILABLE = False
else:
    _SKLEARN_API_AVAILABLE = True

__version__ = "0.1.20"

__all__ = [
    "__version__",
    "RayParams",
    "RayDMatrix",
    "RayDeviceQuantileDMatrix",
    "RayFileType",
    "RayShardingMode",
    "Data",
    "combine_data",
    "train",
    "predict",
]

# Only advertise the sklearn wrappers when they were actually imported.
# Listing undefined names in ``__all__`` would make
# ``from xgboost_ray import *`` raise an AttributeError.
if _SKLEARN_API_AVAILABLE:
    __all__ += [
        "RayXGBClassifier",
        "RayXGBRegressor",
        "RayXGBRFClassifier",
        "RayXGBRFRegressor",
        "RayXGBRanker",
    ]


================================================
FILE: xgboost_ray/callback.py
================================================
import os
from abc import ABC
from typing import TYPE_CHECKING, Any, Dict, Sequence, Union

import pandas as pd
from ray.util.annotations import DeveloperAPI, PublicAPI

if TYPE_CHECKING:
    from xgboost_ray.main import RayXGBoostActor
    from xgboost_ray.matrix import RayDMatrix


@PublicAPI(stability="beta")
class DistributedCallback(ABC):
    """Distributed callbacks for RayXGBoostActors.

    The hooks of these callbacks are executed on the remote Ray actors
    at different points in time. They can be used to set environment
    variables or to prepare the training/prediction environment in other
    ways. Distributed callback objects are de-serialized on each actor
    and are then independent of each other - changing the state of one
    callback will not alter the state of the other copies on different actors.

    Callbacks can be passed to xgboost_ray via
    :class:`RayParams <xgboost_ray.main.RayParams>` using the
    ``distributed_callbacks`` parameter.

    Every hook is a no-op by default; subclasses override the ones they need.
    """

    def on_init(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Hook for the ``on_init`` stage. Does nothing by default."""

    def before_data_loading(
        self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs
    ):
        """Hook for the ``before_data_loading`` stage. Does nothing by default."""

    def after_data_loading(
        self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs
    ):
        """Hook for the ``after_data_loading`` stage. Does nothing by default."""

    def before_train(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Hook for the ``before_train`` stage. Does nothing by default."""

    def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs):
        """Hook for the ``after_train`` stage. Does nothing by default."""

    def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Hook for the ``before_predict`` stage. Does nothing by default."""

    def after_predict(
        self,
        actor: "RayXGBoostActor",
        predictions: Union[pd.Series, pd.DataFrame],
        *args,
        **kwargs
    ):
        """Hook for the ``after_predict`` stage. Does nothing by default."""


@DeveloperAPI
class DistributedCallbackContainer:
    """Fan-out wrapper around a sequence of distributed callbacks.

    Each method forwards its arguments to the hook of the same name on every
    contained callback, in the order the callbacks were given.
    """

    def __init__(self, callbacks: Sequence[DistributedCallback]):
        # A falsy value (e.g. ``None``) is replaced by an empty list.
        self.callbacks = callbacks or []

    def on_init(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Call ``on_init`` on every callback."""
        for cb in self.callbacks:
            cb.on_init(actor, *args, **kwargs)

    def before_data_loading(
        self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs
    ):
        """Call ``before_data_loading`` on every callback."""
        for cb in self.callbacks:
            cb.before_data_loading(actor, data, *args, **kwargs)

    def after_data_loading(
        self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs
    ):
        """Call ``after_data_loading`` on every callback."""
        for cb in self.callbacks:
            cb.after_data_loading(actor, data, *args, **kwargs)

    def before_train(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Call ``before_train`` on every callback."""
        for cb in self.callbacks:
            cb.before_train(actor, *args, **kwargs)

    def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs):
        """Call ``after_train`` on every callback."""
        for cb in self.callbacks:
            cb.after_train(actor, result_dict, *args, **kwargs)

    def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs):
        """Call ``before_predict`` on every callback."""
        for cb in self.callbacks:
            cb.before_predict(actor, *args, **kwargs)

    def after_predict(
        self,
        actor: "RayXGBoostActor",
        predictions: Union[pd.Series, pd.DataFrame],
        *args,
        **kwargs
    ):
        """Call ``after_predict`` on every callback."""
        for cb in self.callbacks:
            cb.after_predict(actor, predictions, *args, **kwargs)


class EnvironmentCallback(DistributedCallback):
    """Distributed callback that exports environment variables in ``on_init``.

    Args:
        env_dict: Mapping of environment variable names to values. Values
            are converted with ``str()`` when they are exported, because
            ``os.environ`` only accepts strings (e.g. ``{"OMP_NUM_THREADS": 4}``
            would otherwise raise a ``TypeError``).
    """

    def __init__(self, env_dict: Dict[str, Any]):
        self.env_dict = env_dict

    def on_init(self, actor, *args, **kwargs):
        """Update ``os.environ`` of the current process from ``env_dict``."""
        os.environ.update({name: str(value) for name, value in self.env_dict.items()})


================================================
FILE: xgboost_ray/compat/__init__.py
================================================
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from xgboost_ray.xgb import xgboost as xgb

# Prefer the ``TrainingCallback`` base class shipped with xgboost. If it cannot
# be imported, fall back to the minimal stand-in defined below and record that
# fact in ``LEGACY_CALLBACK``.
try:
    from xgboost.callback import TrainingCallback

    LEGACY_CALLBACK = False
except ImportError:

    class TrainingCallback:
        """Stand-in used when ``xgboost.callback.TrainingCallback`` is missing.

        Instances are callable with a callback environment object and forward
        its ``model``, ``iteration`` and ``evaluation_result_list`` attributes
        to the ``before_iteration`` / ``after_iteration`` methods, if the
        (sub)class defines them.
        """

        def __init__(self):
            if hasattr(self, "before_iteration"):
                # XGBoost < 1.0 is looking up __dict__ to see if a
                # callback should be called before or after an iteration.
                # So here we move this to self._before_iteration and
                # overwrite the dict.
                self._before_iteration = getattr(self, "before_iteration")
                self.__dict__["before_iteration"] = True

        def __call__(self, callback_env: "xgb.core.CallbackEnv"):
            # ``_before_iteration`` only exists if __init__ found a
            # ``before_iteration`` method on this object.
            if hasattr(self, "_before_iteration"):
                self._before_iteration(
                    model=callback_env.model,
                    epoch=callback_env.iteration,
                    evals_log=callback_env.evaluation_result_list,
                )

            if hasattr(self, "after_iteration"):
                self.after_iteration(
                    model=callback_env.model,
                    epoch=callback_env.iteration,
                    evals_log=callback_env.evaluation_result_list,
                )

        def before_training(self, model):
            """No-op hook; subclasses may override."""
            pass

        def after_training(self, model):
            """No-op hook; subclasses may override."""
            pass

    LEGACY_CALLBACK = True

try:
    from xgboost import RabitTracker
except ImportError:
    from xgboost_ray.compat.tracker import RabitTracker

__all__ = ["TrainingCallback", "RabitTracker"]


================================================
FILE: xgboost_ray/compat/tracker.py
================================================
# flake8: noqa

# Copyright 2021 by XGBoost Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File copied from:
# https://github.com/dmlc/xgboost/blob/8760ec48277b345aaaa895b82570c25566fc0503/python-package/xgboost/tracker.py

import logging

# License:
# https://github.com/dmlc/xgboost/blob/8760ec48277b345aaaa895b82570c25566fc0503/LICENSE
import socket
import struct
import time
from threading import Thread


class ExSocket(object):
    """
    Extension of socket to handle recv and send of special data

    Integers are exchanged as native-endian C ints (``struct`` format ``@i``);
    strings are exchanged as an int byte count followed by the encoded bytes.
    """

    def __init__(self, sock):
        self.sock = sock

    def recvall(self, nbytes):
        """Receive exactly ``nbytes`` bytes and return them.

        Raises:
            ConnectionError: if the peer closes the connection before
                ``nbytes`` bytes have arrived.
        """
        res = []
        nread = 0
        while nread < nbytes:
            chunk = self.sock.recv(min(nbytes - nread, 1024))
            if not chunk:
                # recv() returning b"" means the peer closed the connection.
                # Without this check the loop would spin forever.
                raise ConnectionError(
                    "connection closed after %d of %d expected bytes" % (nread, nbytes)
                )
            nread += len(chunk)
            res.append(chunk)
        return b"".join(res)

    def recvint(self):
        """Receive one native int."""
        return struct.unpack("@i", self.recvall(4))[0]

    def sendint(self, n):
        """Send one native int."""
        self.sock.sendall(struct.pack("@i", n))

    def sendstr(self, s):
        """Send a string as ``<byte count><encoded bytes>``."""
        # The length prefix must count encoded bytes, not characters,
        # otherwise non-ASCII strings desynchronise the stream.
        data = s.encode()
        self.sendint(len(data))
        self.sock.sendall(data)

    def recvstr(self):
        """Receive a string sent in the ``sendstr`` format."""
        slen = self.recvint()
        return self.recvall(slen).decode()


# magic number used to verify existence of data
kMagic = 0xFF99


def get_some_ip(host):
    """Return the address string of the first ``getaddrinfo`` result for ``host``."""
    first_result = socket.getaddrinfo(host, None)[0]
    sockaddr = first_result[4]
    return sockaddr[0]


def get_host_ip(hostIP=None):
    """Resolve the address this host should advertise.

    Args:
        hostIP: ``None`` or ``"auto"`` (treated like ``"ip"``), ``"dns"`` to
            return the fully qualified domain name, ``"ip"`` to resolve an
            IP address, or any other value, which is returned unchanged.

    Returns:
        The resolved host name / IP address, or ``hostIP`` itself.
    """
    if hostIP is None or hostIP == "auto":
        hostIP = "ip"

    if hostIP == "dns":
        hostIP = socket.getfqdn()
    elif hostIP == "ip":
        try:
            hostIP = socket.gethostbyname(socket.getfqdn())
        except socket.gaierror:
            logging.debug(
                "gethostbyname(socket.getfqdn()) failed... trying on hostname()"
            )
            hostIP = socket.gethostbyname(socket.gethostname())
        if hostIP.startswith("127."):
            # Loopback result: ask the OS which local address it would use
            # for an outgoing connection. The socket is closed again right
            # away (it used to be leaked).
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                # doesn't have to be reachable
                s.connect(("10.255.255.255", 1))
                hostIP = s.getsockname()[0]
    return hostIP


def get_family(addr):
    """Return the address family of the first ``getaddrinfo`` result for ``addr``."""
    first_result = socket.getaddrinfo(addr, None)[0]
    family = first_result[0]
    return family


class SlaveEntry(object):
    """Tracker-side state for one accepted worker connection.

    The constructor performs the initial handshake on the accepted socket and
    stores what the worker reported (rank, world size, job id, command).
    """

    def __init__(self, sock, s_addr):
        slave = ExSocket(sock)
        self.sock = slave
        self.host = get_some_ip(s_addr[0])
        # Handshake: the peer must send kMagic first, and we echo it back.
        magic = slave.recvint()
        assert magic == kMagic, "invalid magic number=%d from %s" % (magic, self.host)
        slave.sendint(kMagic)
        # The peer then reports, in this order: rank, world size, job id
        # and the command it wants to run.
        self.rank = slave.recvint()
        self.world_size = slave.recvint()
        self.jobid = slave.recvstr()
        self.cmd = slave.recvstr()
        # Both are filled in by assign_rank().
        self.wait_accept = 0
        self.port = None

    def decide_rank(self, job_map):
        """Return the rank for this worker, or -1 if none is known yet.

        A non-negative rank reported by the worker wins; otherwise the job id
        is looked up in ``job_map`` (unless it is the string ``"NULL"``).
        """
        if self.rank >= 0:
            return self.rank
        if self.jobid != "NULL" and self.jobid in job_map:
            return job_map[self.jobid]
        return -1

    def assign_rank(self, rank, wait_conn, tree_map, parent_map, ring_map):
        """Send ``rank`` and its link information to the worker.

        Args:
            rank: Rank to assign to this worker.
            wait_conn: Dict of rank -> entry for workers that still wait for
                incoming connections. Updated in place.
            tree_map: Dict of rank -> list of neighbouring ranks in the tree.
            parent_map: Dict of rank -> parent rank.
            ring_map: Dict of rank -> (previous rank, next rank) in the ring.

        Returns:
            List of ranks that were removed from ``wait_conn`` because all
            connections they were waiting for have been made.
        """
        self.rank = rank
        nnset = set(tree_map[rank])
        rprev, rnext = ring_map[rank]
        self.sock.sendint(rank)
        # send parent rank
        self.sock.sendint(parent_map[rank])
        # send world size
        self.sock.sendint(len(tree_map))
        self.sock.sendint(len(nnset))
        # send the tree neighbours
        for r in nnset:
            self.sock.sendint(r)
        # send prev link
        if rprev not in (-1, rank):
            nnset.add(rprev)
            self.sock.sendint(rprev)
        else:
            self.sock.sendint(-1)
        # send next link
        if rnext not in (-1, rank):
            nnset.add(rnext)
            self.sock.sendint(rnext)
        else:
            self.sock.sendint(-1)
        # Repeat the exchange until the worker reports zero errors.
        while True:
            # The worker reports which of its links are already set up.
            ngood = self.sock.recvint()
            goodset = set([])
            for _ in range(ngood):
                goodset.add(self.sock.recvint())
            assert goodset.issubset(nnset)
            badset = nnset - goodset
            # Missing links whose peer is already waiting for connections.
            conset = []
            for r in badset:
                if r in wait_conn:
                    conset.append(r)
            # Send the number of peers listed below, then the number of
            # remaining missing links.
            self.sock.sendint(len(conset))
            self.sock.sendint(len(badset) - len(conset))
            for r in conset:
                self.sock.sendstr(wait_conn[r].host)
                self.sock.sendint(wait_conn[r].port)
                self.sock.sendint(r)
            nerr = self.sock.recvint()
            if nerr != 0:
                continue
            self.port = self.sock.recvint()
            rmset = []
            # all connections were successfully set up
            for r in conset:
                wait_conn[r].wait_accept -= 1
                if wait_conn[r].wait_accept == 0:
                    rmset.append(r)
            for r in rmset:
                wait_conn.pop(r, None)
            self.wait_accept = len(badset) - len(conset)
            return rmset


class RabitTracker(object):
    """
    tracker for rabit

    Listens on a TCP port, accepts worker connections and assigns each worker
    a rank plus its tree/ring links.
    """

    def __init__(self, hostIP, nslave, port=9091, port_end=9999):
        """Bind a listening socket on the first free port in the given range.

        Args:
            hostIP: Address to bind to.
            nslave: Number of workers, stored as ``self.nslave``.
            port: First port to try.
            port_end: End of the port range (exclusive).

        Raises:
            OSError: if binding fails with an unexpected error, or if no
                port in ``range(port, port_end)`` could be bound.
        """
        sock = socket.socket(get_family(hostIP), socket.SOCK_STREAM)
        try:
            bound_port = None
            for _port in range(port, port_end):
                try:
                    sock.bind((hostIP, _port))
                except socket.error as e:
                    # Only these two errno values are treated as "try the
                    # next port"; anything else is re-raised.
                    if e.errno in [98, 48]:
                        continue
                    raise
                bound_port = _port
                break
            if bound_port is None:
                # Previously this fell through and failed later with an
                # AttributeError on ``self.port``.
                raise socket.error(
                    "could not bind %s to any port in range [%d, %d)"
                    % (hostIP, port, port_end)
                )
            sock.listen(256)
        except Exception:
            # Do not leak the socket if setup fails.
            sock.close()
            raise
        self.port = bound_port
        self.sock = sock
        self.hostIP = hostIP
        self.thread = None
        self.start_time = None
        self.end_time = None
        self.nslave = nslave
        logging.info("start listen on %s:%d", hostIP, self.port)

    def __del__(self):
        # ``sock`` is missing if __init__ raised before assigning it.
        sock = getattr(self, "sock", None)
        if sock is not None:
            sock.close()

    @staticmethod
    def get_neighbor(rank, nslave):
        """Return the tree neighbours (parent first, then children) of ``rank``."""
        rank = rank + 1
        ret = []
        if rank > 1:
            ret.append(rank // 2 - 1)
        if rank * 2 - 1 < nslave:
            ret.append(rank * 2 - 1)
        if rank * 2 < nslave:
            ret.append(rank * 2)
        return ret

    def slave_envs(self):
        """
        get environment variables for slaves
        can be passed in as args or envs
        """
        return {"DMLC_TRACKER_URI": self.hostIP, "DMLC_TRACKER_PORT": self.port}

    def get_tree(self, nslave):
        """Return ``(tree_map, parent_map)`` for ``nslave`` ranks.

        ``tree_map`` maps a rank to its neighbours, ``parent_map`` maps a rank
        to its parent (-1 for rank 0).
        """
        tree_map = {}
        parent_map = {}
        for r in range(nslave):
            tree_map[r] = self.get_neighbor(r, nslave)
            parent_map[r] = (r + 1) // 2 - 1
        return tree_map, parent_map

    def find_share_ring(self, tree_map, parent_map, r):
        """
        get a ring structure that tends to share nodes with the tree
        return a list starting from r
        """
        nset = set(tree_map[r])
        # children of r: all neighbours except the parent
        cset = nset - {parent_map[r]}
        if not cset:
            return [r]
        rlst = [r]
        cnt = 0
        for v in cset:
            vlst = self.find_share_ring(tree_map, parent_map, v)
            cnt += 1
            # the sub-list of the last child is appended in reverse order
            if cnt == len(cset):
                vlst.reverse()
            rlst += vlst
        return rlst

    def get_ring(self, tree_map, parent_map):
        """
        get a ring connection used to recover local data
        """
        assert parent_map[0] == -1
        rlst = self.find_share_ring(tree_map, parent_map, 0)
        assert len(rlst) == len(tree_map)
        ring_map = {}
        nslave = len(tree_map)
        for r in range(nslave):
            rprev = (r + nslave - 1) % nslave
            rnext = (r + 1) % nslave
            ring_map[rlst[r]] = (rlst[rprev], rlst[rnext])
        return ring_map

    def get_link_map(self, nslave):
        """
        get the link map, this is a bit hacky, call for better algorithm
        to place similar nodes together
        """
        tree_map, parent_map = self.get_tree(nslave)
        ring_map = self.get_ring(tree_map, parent_map)
        # Renumber the ranks so that they follow the ring order from rank 0.
        rmap = {0: 0}
        k = 0
        for i in range(nslave - 1):
            k = ring_map[k][1]
            rmap[k] = i + 1

        ring_map_ = {}
        tree_map_ = {}
        parent_map_ = {}
        for k, v in ring_map.items():
            ring_map_[rmap[k]] = (rmap[v[0]], rmap[v[1]])
        for k, v in tree_map.items():
            tree_map_[rmap[k]] = [rmap[x] for x in v]
        for k, v in parent_map.items():
            if k != 0:
                parent_map_[rmap[k]] = rmap[v]
            else:
                parent_map_[rmap[k]] = -1
        return tree_map_, parent_map_, ring_map_

    def accept_slaves(self, nslave):
        """Accept and serve worker connections until all have shut down.

        Blocks until ``nslave`` workers have sent the ``shutdown`` command.
        ``nslave`` is replaced by the world size reported by the first
        ``start`` command if that is positive.
        """
        # set of nodes that finished the job
        shutdown = {}
        # set of nodes that are waiting for connections
        wait_conn = {}
        # maps job id to rank
        job_map = {}
        # list of workers that are pending to be assigned a rank
        pending = []
        # lazy initialize tree_map
        tree_map = None

        while len(shutdown) != nslave:
            fd, s_addr = self.sock.accept()
            s = SlaveEntry(fd, s_addr)
            if s.cmd == "print":
                msg = s.sock.recvstr()
                print(msg.strip(), flush=True)
                continue
            if s.cmd == "shutdown":
                assert s.rank >= 0 and s.rank not in shutdown
                assert s.rank not in wait_conn
                shutdown[s.rank] = s
                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                continue
            assert s.cmd == "start" or s.cmd == "recover"
            # lazily initialize the slaves
            if tree_map is None:
                assert s.cmd == "start"
                if s.world_size > 0:
                    nslave = s.world_size
                tree_map, parent_map, ring_map = self.get_link_map(nslave)
                # set of nodes that is pending for getting up
                todo_nodes = list(range(nslave))
            else:
                assert s.world_size == -1 or s.world_size == nslave
            if s.cmd == "recover":
                assert s.rank >= 0

            rank = s.decide_rank(job_map)
            # batch assignment of ranks
            if rank == -1:
                assert todo_nodes
                pending.append(s)
                if len(pending) == len(todo_nodes):
                    pending.sort(key=lambda x: x.host)
                    for s in pending:
                        rank = todo_nodes.pop(0)
                        if s.jobid != "NULL":
                            job_map[s.jobid] = rank
                        s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
                        if s.wait_accept > 0:
                            wait_conn[rank] = s
                        logging.debug(
                            "Received %s signal from %s; assign rank %d",
                            s.cmd,
                            s.host,
                            s.rank,
                        )
                if not todo_nodes:
                    logging.info("@tracker All of %d nodes getting started", nslave)
                    self.start_time = time.time()
            else:
                s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                if s.wait_accept > 0:
                    wait_conn[rank] = s
        logging.info("@tracker All nodes finishes job")
        self.end_time = time.time()
        # start_time is only set by the batch assignment above; skip the
        # duration log instead of failing on ``float - None``.
        if self.start_time is not None:
            logging.info(
                "@tracker %s secs between node start and job finish",
                str(self.end_time - self.start_time),
            )

    def start(self, nslave):
        """Run ``accept_slaves(nslave)`` in a daemon thread."""

        def run():
            self.accept_slaves(nslave)

        self.thread = Thread(target=run, args=(), daemon=True)
        self.thread.start()

    def join(self):
        """Wait for the tracker thread to finish; no-op if never started."""
        if self.thread is None:
            return
        while self.thread.is_alive():
            self.thread.join(100)

    def alive(self):
        """Return whether the tracker thread is running."""
        return self.thread is not None and self.thread.is_alive()


================================================
FILE: xgboost_ray/data_sources/__init__.py
================================================
from xgboost_ray.data_sources.csv import CSV
from xgboost_ray.data_sources.dask import Dask
from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.modin import Modin
from xgboost_ray.data_sources.numpy import Numpy
from xgboost_ray.data_sources.object_store import ObjectStore
from xgboost_ray.data_sources.pandas import Pandas
from xgboost_ray.data_sources.parquet import Parquet
from xgboost_ray.data_sources.partitioned import Partitioned
from xgboost_ray.data_sources.petastorm import Petastorm
from xgboost_ray.data_sources.ray_dataset import RayDataset

# Registry of the data source implementations imported above.
# NOTE(review): presumably consumers probe these in list order to find a
# matching source - confirm before reordering.
data_sources = [
    Numpy,
    Pandas,
    Partitioned,
    Modin,
    Dask,
    Petastorm,
    CSV,
    Parquet,
    ObjectStore,
    RayDataset,
]

__all__ = [
    "DataSource",
    "RayFileType",
    "Numpy",
    "Pandas",
    "Modin",
    "Dask",
    "Petastorm",
    "CSV",
    "Parquet",
    "ObjectStore",
    "RayDataset",
    "Partitioned",
]


================================================
FILE: xgboost_ray/data_sources/_distributed.py
================================================
import itertools
import math
from collections import defaultdict
from typing import Any, Dict, Sequence

import ray
from ray.actor import ActorHandle


def get_actor_rank_ips(actors: Sequence[ActorHandle]) -> Dict[int, str]:
    """Map each actor's rank (its index in ``actors``) to the IP it reports.

    ``None`` entries in ``actors`` are kept; their rank maps to ``None``.
    """
    # Shared object ref that resolves to None, used for missing actors.
    none_ref = ray.put(None)

    ip_refs = []
    for actor in actors:
        if actor is None:
            ip_refs.append(none_ref)
        else:
            ip_refs.append(actor.ip.remote())

    # Resolve all refs in one call, then index them by position.
    resolved_ips = ray.get(ip_refs)
    return {rank: ip for rank, ip in enumerate(resolved_ips)}


def assign_partitions_to_actors(
    ip_to_parts: Dict[str, Any], actor_rank_ips: Dict[int, str]
) -> Dict[int, Sequence[Any]]:
    """Assign partitions from a distributed dataframe to actors.

    This function collects distributed partitions and evenly distributes
    them to actors, trying to minimize data transfer by respecting
    co-locality.

    This function currently does _not_ take partition sizes into account
    for distributing data. It assumes that all partitions have (more or less)
    the same length.

    Instead, partitions are evenly distributed. E.g. for 8 partitions and 3
    actors, each actor gets assigned 2 or 3 partitions. Which partitions are
    assigned depends on the data locality.

    The algorithm is as follows: For any number of data partitions, get the
    Ray object references to the shards and the IP addresses where they
    currently live.

    Calculate the minimum and maximum amount of partitions per actor. These
    numbers should differ by at most 1. Also calculate how many actors will
    get more partitions assigned than the other actors.

    First, each actor gets assigned up to ``max_parts_per_actor`` co-located
    partitions. Only up to ``num_actors_with_max_parts`` actors get the
    maximum number of partitions, the rest try to fill the minimum.

    The rest of the partitions (all of which cannot be assigned to a
    co-located actor) are assigned to actors until there are none left.

    Args:
        ip_to_parts: Maps an IP address to the partitions located there.
            The mapping and its lists are not modified.
        actor_rank_ips: Maps an actor rank to the IP address of that actor.

    Returns:
        Dict mapping every actor rank to the list of partitions assigned
        to it.

    Raises:
        RuntimeError: if partitions are left over that no actor has
            capacity for.
    """
    # Work on copies: partitions are popped as they are assigned, and the
    # caller's lists should not be consumed by that. Using ``.get`` below
    # also makes plain dicts work when an actor's IP holds no partitions.
    remaining = {ip: list(parts) for ip, parts in ip_to_parts.items()}

    num_partitions = sum(len(parts) for parts in remaining.values())
    num_actors = len(actor_rank_ips)
    min_parts_per_actor = max(0, math.floor(num_partitions / num_actors))
    max_parts_per_actor = max(1, math.ceil(num_partitions / num_actors))
    num_actors_with_max_parts = num_partitions % num_actors

    # This is our result dict that maps actor ranks to a list of partitions
    actor_to_partitions = defaultdict(list)

    # First we loop through the actors and assign them partitions from their
    # own IPs. Do this until no actor can take another co-located partition.
    partition_assigned = True
    while partition_assigned:
        partition_assigned = False

        # Loop through each actor once, assigning at most one partition
        for rank, actor_ip in actor_rank_ips.items():
            parts_on_ip = remaining.get(actor_ip)
            # Also creates the result entry for this rank.
            num_actor_parts = len(actor_to_partitions[rank])

            if parts_on_ip and num_actor_parts < max_parts_per_actor:
                if num_actor_parts >= min_parts_per_actor:
                    # Only allow up to `num_actors_with_max_parts` actors to
                    # have the maximum number of partitions assigned.
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                actor_to_partitions[rank].append(parts_on_ip.pop(0))
                partition_assigned = True

    # The rest of the partitions, no matter where they are located, could not
    # be assigned to co-located actors. Thus, we assign them
    # to actors who still need partitions.
    rest_parts = list(itertools.chain(*remaining.values()))
    partition_assigned = True
    while len(rest_parts) > 0 and partition_assigned:
        partition_assigned = False
        for rank in actor_rank_ips:
            num_actor_parts = len(actor_to_partitions[rank])
            if num_actor_parts < max_parts_per_actor:
                if num_actor_parts >= min_parts_per_actor:
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                actor_to_partitions[rank].append(rest_parts.pop(0))
                partition_assigned = True
            if len(rest_parts) <= 0:
                break

    if len(rest_parts) != 0:
        raise RuntimeError(
            "There are still partitions left to assign, but no actor "
            "has capacity for more. This is probably a bug. Please go "
            "to https://github.com/ray-project/xgboost_ray to report it."
        )

    return actor_to_partitions


================================================
FILE: xgboost_ray/data_sources/csv.py
================================================
from typing import Any, Iterable, Optional, Sequence, Union

import pandas as pd

from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.pandas import Pandas


class CSV(DataSource):
    """Read one or many CSV files."""

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``filetype`` is ``RayFileType.CSV``."""
        return filetype == RayFileType.CSV

    @staticmethod
    def get_filetype(data: Any) -> Optional[RayFileType]:
        """Return ``RayFileType.CSV`` for names ending in ``.csv``/``csv.gz``."""
        if data.endswith((".csv", "csv.gz")):
            return RayFileType.CSV
        return None

    @staticmethod
    def load_data(
        data: Union[str, Sequence[str]],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ):
        """Read one CSV file, or several and concatenate them.

        Args:
            data: A single path, or an iterable of paths.
            ignore: Passed on to ``Pandas.load_data`` for every file.
            indices: If non-empty, only the files at these positions of
                ``data`` are read. Only used when ``data`` is an iterable.
            **kwargs: Passed on to ``pd.read_csv``.
        """
        # A single file: strings are iterable, so they are ruled out first.
        if isinstance(data, str) or not isinstance(data, Iterable):
            return Pandas.load_data(pd.read_csv(data, **kwargs), ignore=ignore)

        frames = [
            Pandas.load_data(pd.read_csv(path, **kwargs), ignore=ignore)
            for position, path in enumerate(data)
            if not indices or position in indices
        ]
        return pd.concat(frames, copy=False)

    @staticmethod
    def get_n(data: Any):
        """Return the number of items obtained by iterating ``data``."""
        items = list(data)
        return len(items)


================================================
FILE: xgboost_ray/data_sources/dask.py
================================================
from collections import defaultdict
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import pandas as pd
import ray
import wrapt
from ray.actor import ActorHandle

from xgboost_ray.data_sources._distributed import (
    assign_partitions_to_actors,
    get_actor_rank_ips,
)
from xgboost_ray.data_sources.data_source import DataSource, RayFileType

try:
    import dask  # noqa: F401
    from ray.util.dask import ray_dask_get

    DASK_INSTALLED = True
except ImportError:
    DASK_INSTALLED = False


def _assert_dask_installed():
    """Raise a ``RuntimeError`` if ``DASK_INSTALLED`` is false."""
    if DASK_INSTALLED:
        return

    message = (
        "Tried to use Dask as a data source, but dask is not "
        "installed. This function shouldn't have been called. "
        "\nFIX THIS by installing dask: `pip install dask`. "
        "\nPlease also raise an issue on our GitHub: "
        "https://github.com/ray-project/xgboost_ray as this part of "
        "the code should not have been reached."
    )
    raise RuntimeError(message)


@wrapt.decorator
def ensure_ray_dask_initialized(
    func: Any, instance: Any, args: List[Any], kwargs: Any
) -> Any:
    """Decorator that sets the dask scheduler to ``ray_dask_get`` before the call.

    Raises a ``RuntimeError`` (via ``_assert_dask_installed``) if dask is
    not installed.
    """
    _assert_dask_installed()
    dask.config.set(scheduler=ray_dask_get)
    result = func(*args, **kwargs)
    return result


class Dask(DataSource):
    """Read from distributed Dask dataframe.

    A `Dask dataframe <https://docs.dask.org/en/latest/dataframe.html>`_
    is a distributed drop-in replacement for pandas.

    Dask dataframes are stored on multiple actors, making them
    suitable for distributed loading.
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a Dask dataframe or series.

        Always returns False when Dask is not installed. ``filetype`` is
        not used by this data source.
        """
        if not DASK_INSTALLED:
            return False
        from dask.dataframe import DataFrame as DaskDataFrame
        from dask.dataframe import Series as DaskSeries

        return isinstance(data, (DaskDataFrame, DaskSeries))

    @ensure_ray_dask_initialized
    @staticmethod
    def load_data(
        data: Any,  # dask.dataframe.DataFrame
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Union[Sequence[int], Sequence[Tuple[int]]]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Materialize (part of) a Dask dataframe as a pandas dataframe.

        Args:
            data: Dask dataframe to load.
            ignore: Column names to drop from the result.
            indices: Either row positions (integers) or partition IDs
                wrapped in one-element tuples, as produced by
                ``get_ip_to_parts()``. Tuples select whole Dask partitions.

        Returns:
            Pandas DataFrame with the selected rows/partitions, without
            the ignored columns.
        """
        _assert_dask_installed()

        import dask.dataframe as dd

        if indices is not None and len(indices) > 0 and isinstance(indices[0], tuple):
            # We got a list of partition IDs belonging to Dask partitions
            local_df = dd.concat([data.partitions[i] for (i,) in indices]).compute()
        else:
            # Dask does not support iloc() for row selection, so we have to
            # compute a local pandas dataframe first
            local_df = data.compute()

            if indices:
                local_df = local_df.iloc[indices]

        # Drop ignored columns for both selection modes, so that partition
        # based (distributed) loading yields the same columns as row based
        # loading.
        if ignore:
            local_df = local_df[local_df.columns.difference(ignore)]

        return local_df

    @ensure_ray_dask_initialized
    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Convert Dask dataframes, series and arrays to a pandas series.

        Other inputs are delegated to ``DataSource.convert_to_series()``.
        """
        _assert_dask_installed()
        from dask.array import Array as DaskArray
        from dask.dataframe import DataFrame as DaskDataFrame
        from dask.dataframe import Series as DaskSeries

        if isinstance(data, DaskDataFrame):
            return pd.Series(data.compute().squeeze())
        elif isinstance(data, DaskSeries):
            return data.compute()
        elif isinstance(data, DaskArray):
            return pd.Series(data.compute())

        return DataSource.convert_to_series(data)

    @ensure_ray_dask_initialized
    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]  # dask.dataframe.DataFrame
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Assign Dask partitions to actors.

        Args:
            data: Dask dataframe to shard.
            actors: Actors the partitions are distributed to.

        Returns:
            Tuple of the unchanged ``data`` object and the result of
            ``assign_partitions_to_actors()`` for the partitions found
            per IP.
        """
        _assert_dask_installed()

        actor_rank_ips = get_actor_rank_ips(actors)

        # Get IPs and partitions
        ip_to_parts = get_ip_to_parts(data)

        return data, assign_partitions_to_actors(ip_to_parts, actor_rank_ips)

    @ensure_ray_dask_initialized
    @staticmethod
    def get_n(data: Any):
        """
        For naive distributed loading we just return the number of rows
        here. Loading by shard is achieved via `get_actor_shards()`
        """
        return len(data)


def get_ip_to_parts(data: Any) -> Dict[str, Sequence[Any]]:
    """Group the partitions of a Dask collection by the IP that holds them.

    Args:
        data: Dask collection (e.g. dataframe) to inspect. It is persisted
            using the Ray Dask scheduler.

    Returns:
        Dict mapping an IP address string (or ``"_no_ip"`` if no location
        could be determined) to a list of one-element tuples ``(pid,)``
        holding partition IDs.
    """
    persisted = data.persist(scheduler=ray_dask_get)
    name = persisted._name

    node_ids_to_node = {node["NodeID"]: node for node in ray.state.nodes()}

    # This is a hacky way to get the partition node IDs, and it's not
    # 100% accurate as the map task could get scheduled on a different node
    # (though Ray tries to keep locality). We need to use that until
    # ray.state.objects() or something like it is available again.
    partition_locations_df = persisted.map_partitions(
        lambda df: pd.DataFrame([ray.get_runtime_context().get_node_id()])
    ).compute()
    # Every partition contributed one row with a single column (label 0),
    # so that column lists the node ID per partition.
    partition_locations = list(partition_locations_df[0])

    ip_to_parts = defaultdict(list)
    for (obj_name, pid), obj_ref in dask.base.collections_to_dsk([persisted]).items():
        assert obj_name == name

        if isinstance(obj_ref, ray.ObjectRef):
            node_id = partition_locations[pid]
            node = node_ids_to_node.get(node_id, {})
            ip = node.get("NodeManagerAddress", "_no_ip")
        else:
            ip = "_no_ip"

        # Pass tuples here (integers can be misinterpreted as row numbers)
        ip_to_parts[ip].append((pid,))

    return ip_to_parts


================================================
FILE: xgboost_ray/data_sources/data_source.py
================================================
from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union

import pandas as pd
from ray.actor import ActorHandle
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
    from xgboost_ray.xgb import xgboost as xgb


@PublicAPI(stability="beta")
class RayFileType(Enum):
    """Enum for different file types (used for overrides).

    Data sources compare the ``filetype`` argument of ``is_data_type()``
    against these members to decide whether they handle the data.
    """

    # Presumably matched by the CSV data source - confirm in csv.py.
    CSV = 1
    # Matched by the Parquet data source.
    PARQUET = 2
    # Matched by the Petastorm data source.
    PETASTORM = 3


@PublicAPI(stability="beta")
class DataSource:
    """Base interface for everything xgboost_ray can read data from.

    Supported sources include files (e.g. CSV, Parquet) and distributed
    datasets (e.g. Modin). To add a new source, subclass ``DataSource``
    and override the methods below.

    Subclasses are never instantiated; only their static and class
    methods are called.
    """

    supports_central_loading = True
    supports_distributed_loading = False
    needs_partitions = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Tell whether this data source can handle ``data``.

        Args:
            data: Dataset.
            filetype: RayFileType of the provided dataset. Some
                implementations require this to be set explicitly (e.g. if
                multiple sources can read CSV files).

        Returns:
            True if the data belongs to/is compatible with this data
            source. The base implementation always returns False.
        """
        return False

    @staticmethod
    def get_filetype(data: Any) -> Optional[RayFileType]:
        """Try to infer the file type of ``data``.

        Args:
            data: Data set (usually a filename).

        Returns:
            The RayFileType if this data source covers the data, otherwise
            None. The base implementation always returns None.
        """
        return None

    @staticmethod
    def load_data(
        data: Any,
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[Any]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Load data into a pandas dataframe.

        Implementations drop the ignored columns and optionally restrict
        the result to the given indices.

        Args:
            data: Input data
            ignore: Column names to ignore
            indices: Indices to select. What an index indicates depends on
                the data source.

        Returns:
            Pandas DataFrame.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    @staticmethod
    def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]):
        """Hook to adjust feature names before training/prediction.

        The base implementation does nothing.

        Args:
            matrix: xgboost DMatrix object.
            feature_names: Feature names manually passed to the
                ``RayDMatrix`` object.
        """
        pass

    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Turn ``data`` into a pandas series.

        A series is returned unchanged, a dataframe is squeezed first and
        anything else is handed to the ``pd.Series`` constructor.
        """
        if isinstance(data, pd.Series):
            return data

        if isinstance(data, pd.DataFrame):
            return pd.Series(data.squeeze())

        return pd.Series(data)

    @classmethod
    def get_column(
        cls, data: pd.DataFrame, column: Any
    ) -> Tuple[pd.Series, Optional[Union[str, List]]]:
        """Resolve a column specification against ``data``.

        This method should usually not be overwritten.

        Args:
            data: Dataframe to select from.
            column: Column name, list of column names, a data object that
                is converted with ``convert_to_series()``, or None.

        Returns:
            Tuple of the resolved data and the column name(s) taken from
            ``data``. The second element is None unless ``column`` was a
            name or list of names.
        """
        if isinstance(column, (str, list)):
            return data[column], column

        if column is None:
            return column, None

        return cls.convert_to_series(column), None

    @staticmethod
    def get_n(data: Any):
        """Get length of data source partitions for sharding."""
        return len(data)

    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Get a dict mapping actor ranks to shards.

        Args:
            data: Data to shard.
            actors: Actors the shards are meant for (unused in this base
                implementation).

        Returns:
            A tuple. The first element is the new data object that will
                overwrite the existing data object in the RayDMatrix (e.g.
                when the object is not serializable). The second element
                is a dict mapping actor ranks to shards. These objects are
                usually passed to the ``load_data()`` method for
                distributed loading, so that method needs to be able to
                deal with the respective data. The base implementation
                returns ``(data, None)``.
        """
        return data, None


================================================
FILE: xgboost_ray/data_sources/modin.py
================================================
from collections import defaultdict
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import pandas as pd
import ray
from ray import ObjectRef
from ray.actor import ActorHandle

from xgboost_ray.data_sources._distributed import (
    assign_partitions_to_actors,
    get_actor_rank_ips,
)
from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.object_store import ObjectStore

# Modin is an optional dependency. It only counts as installed if it can be
# imported, is at least version 0.9.0, and querying its engine works; an
# ImportError or AttributeError anywhere in this block marks it as missing.
try:
    import modin  # noqa: F401
    from modin.config.envvars import Engine
    from modin.distributed.dataframe.pandas import unwrap_partitions  # noqa: F401
    from modin.pandas import DataFrame as ModinDataFrame  # noqa: F401
    from modin.pandas import Series as ModinSeries  # noqa: F401
    from packaging.version import Version

    MODIN_INSTALLED = Version(modin.__version__) >= Version("0.9.0")

    # Check if importing the Ray engine leads to errors
    Engine().get()

except (ImportError, AttributeError):
    MODIN_INSTALLED = False


def _assert_modin_installed():
    """Raise a ``RuntimeError`` unless a usable Modin was found at import."""
    if MODIN_INSTALLED:
        return

    raise RuntimeError(
        "Tried to use Modin as a data source, but modin is not "
        "installed or it conflicts with the pandas version. "
        "This function shouldn't have been called. "
        "\nFIX THIS by installing modin: `pip install modin` "
        "and making sure that the installed pandas version is "
        "supported by modin."
        "\nPlease also raise an issue on our GitHub: "
        "https://github.com/ray-project/xgboost_ray as this part of "
        "the code should not have been reached."
    )


class Modin(DataSource):
    """Data source for distributed Modin dataframes.

    `Modin <https://github.com/modin-project/modin>`_ is a distributed
    drop-in replacement for pandas supporting Ray as a backend.

    Modin dataframes are stored on multiple actors, making them
    suitable for distributed loading.
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a Modin dataframe or series.

        Always False when no usable Modin installation was detected.
        """
        if MODIN_INSTALLED:
            # Has to be imported again.
            from modin.pandas import DataFrame as ModinDataFrame  # noqa: F811
            from modin.pandas import Series as ModinSeries  # noqa: F811

            return isinstance(data, (ModinDataFrame, ModinSeries))

        return False

    @staticmethod
    def load_data(
        data: Any,  # modin.pandas.DataFrame
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Union[Sequence[int], Sequence[ObjectRef]]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Load a Modin dataframe, or some of its partitions, into pandas.

        Args:
            data: Modin dataframe. Not used when ``indices`` holds
                object references.
            ignore: Column names to drop.
            indices: Either row positions or ``ObjectRef`` objects of Modin
                partitions; the latter are loaded via ``ObjectStore``.

        Returns:
            Pandas DataFrame.
        """
        _assert_modin_installed()

        got_partition_refs = (
            indices is not None
            and len(indices) > 0
            and isinstance(indices[0], ObjectRef)
        )
        if got_partition_refs:
            # The "indices" are the partitions themselves, stored in the
            # object store.
            return ObjectStore.load_data(data=indices, indices=None, ignore=ignore)

        selected = data.iloc[indices] if indices else data
        pandas_df = selected._to_pandas()

        if not ignore:
            return pandas_df

        return pandas_df[pandas_df.columns.difference(ignore)]

    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Convert Modin dataframes/series (or other data) to a pandas series."""
        _assert_modin_installed()
        # Has to be imported again.
        from modin.pandas import DataFrame as ModinDataFrame  # noqa: F811
        from modin.pandas import Series as ModinSeries  # noqa: F811

        if isinstance(data, ModinDataFrame):
            squeezed = data._to_pandas().squeeze()
            return pd.Series(squeezed)

        if isinstance(data, ModinSeries):
            return data._to_pandas()

        return DataSource.convert_to_series(data)

    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]  # modin.pandas.DataFrame
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Assign the row partitions of a Modin dataframe to actors.

        Args:
            data: Modin dataframe to shard.
            actors: Actors the partitions are distributed to.

        Returns:
            Tuple ``(None, assignment)`` where ``assignment`` is the result
            of ``assign_partitions_to_actors()``.
        """
        _assert_modin_installed()

        # Has to be imported again.
        from modin.distributed.dataframe.pandas import unwrap_partitions  # noqa: F811

        actor_rank_ips = get_actor_rank_ips(actors)

        # Each unwrapped entry is an (IP reference, partition reference) pair
        ip_refs, partition_refs = zip(*unwrap_partitions(data, axis=0, get_ip=True))
        resolved_ips = ray.get(list(ip_refs))

        # Group the partition references by the IP they are located on
        ip_to_parts = defaultdict(list)
        for ip, partition_ref in zip(resolved_ips, partition_refs):
            ip_to_parts[ip].append(partition_ref)

        # Modin dataframes are not serializable, so the first return value
        # is None
        assignment = assign_partitions_to_actors(ip_to_parts, actor_rank_ips)
        return None, assignment

    @staticmethod
    def get_n(data: Any):
        """
        For naive distributed loading we just return the number of rows
        here. Loading by shard is achieved via `get_actor_shards()`
        """
        return len(data)


================================================
FILE: xgboost_ray/data_sources/numpy.py
================================================
from typing import TYPE_CHECKING, Any, List, Optional, Sequence

import numpy as np
import pandas as pd

from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.pandas import Pandas

if TYPE_CHECKING:
    from xgboost_ray.xgb import xgboost as xgb


class Numpy(DataSource):
    """Read from numpy arrays."""

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a numpy ``ndarray``."""
        return isinstance(data, np.ndarray)

    @staticmethod
    def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]):
        """Overwrite the feature names of ``matrix`` with ``feature_names``."""
        # Potentially unset feature names
        matrix.feature_names = feature_names

    @staticmethod
    def load_data(
        data: np.ndarray,
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """Wrap a numpy array in a pandas dataframe.

        Columns are named ``f0``, ``f1``, ... by position.

        Args:
            data: 1-D or 2-D array. A 1-D array is treated as a single
                column named ``f0``.
            ignore: Column names to drop.
            indices: Row positions to select.

        Returns:
            Pandas DataFrame.
        """
        # 1-D arrays have no second axis; treat them as one column instead
        # of failing on ``shape[1]``.
        num_columns = data.shape[1] if len(data.shape) > 1 else 1
        local_df = pd.DataFrame(data, columns=[f"f{i}" for i in range(num_columns)])
        return Pandas.load_data(local_df, ignore=ignore, indices=indices)


================================================
FILE: xgboost_ray/data_sources/object_store.py
================================================
from typing import Any, Optional, Sequence

import pandas as pd
import ray
from ray import ObjectRef

from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.pandas import Pandas


class ObjectStore(DataSource):
    """Read pandas dataframes and series from ray object store."""

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True for an ``ObjectRef`` or a sequence of ``ObjectRef``."""
        if isinstance(data, Sequence):
            return all(isinstance(d, ObjectRef) for d in data)
        return isinstance(data, ObjectRef)

    @staticmethod
    def load_data(
        data: Sequence[ObjectRef],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Fetch objects from the object store and concatenate them.

        Args:
            data: Object references to pandas objects. A single
                ``ObjectRef`` is accepted as well and treated like a
                one-element list, matching ``is_data_type()``.
            ignore: Column names to drop.
            indices: Positions in ``data`` of the references to load.

        Returns:
            Pandas DataFrame.
        """
        if isinstance(data, ObjectRef):
            # ``ray.get()`` on a bare reference returns the object itself,
            # which ``pd.concat()`` would reject - wrap it in a list.
            data = [data]

        if indices is not None:
            data = [data[i] for i in indices]

        local_df = ray.get(data)

        return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore)

    @staticmethod
    def convert_to_series(data: Any) -> pd.Series:
        """Fetch one or several referenced objects and return them as a series."""
        if isinstance(data, ObjectRef):
            data = ray.get(data)
        else:
            data = pd.concat(ray.get(data), copy=False)
        return DataSource.convert_to_series(data)


================================================
FILE: xgboost_ray/data_sources/pandas.py
================================================
from typing import Any, Optional, Sequence

import pandas as pd

from xgboost_ray.data_sources.data_source import DataSource, RayFileType


class Pandas(DataSource):
    """Data source for in-memory pandas dataframes and series."""

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a pandas ``DataFrame`` or ``Series``."""
        pandas_types = (pd.DataFrame, pd.Series)
        return isinstance(data, pandas_types)

    @staticmethod
    def load_data(
        data: Any,
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Drop ignored columns and select rows by position.

        Args:
            data: Pandas object to read from.
            ignore: Column names to drop. Nothing is dropped if empty.
            indices: Row positions to keep. All rows are kept if empty.

        Returns:
            The resulting pandas object.
        """
        frame = data[data.columns.difference(ignore)] if ignore else data
        return frame.iloc[indices] if indices else frame


================================================
FILE: xgboost_ray/data_sources/parquet.py
================================================
from typing import Any, Iterable, Optional, Sequence, Union

import pandas as pd

from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.pandas import Pandas


class Parquet(DataSource):
    """Data source for a single Parquet file or a collection of them."""

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if the file type was given/detected as Parquet."""
        return filetype == RayFileType.PARQUET

    @staticmethod
    def get_filetype(data: Any) -> Optional[RayFileType]:
        """Return ``RayFileType.PARQUET`` for names ending in ``.parquet``."""
        return RayFileType.PARQUET if data.endswith(".parquet") else None

    @staticmethod
    def load_data(
        data: Union[str, Sequence[str]],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Read Parquet data into a pandas dataframe.

        Args:
            data: A single path or an iterable of paths.
            ignore: Column names to drop.
            indices: Positions of the files to read when ``data`` is an
                iterable of paths. Not used for a single path.
            **kwargs: Passed on to ``pd.read_parquet()``.

        Returns:
            Pandas DataFrame.
        """
        if isinstance(data, str) or not isinstance(data, Iterable):
            # A single file: read it as a whole
            single_df = pd.read_parquet(data, **kwargs)
            return Pandas.load_data(single_df, ignore=ignore)

        frames = []
        for position, path in enumerate(data):
            if not indices or position in indices:
                frame = pd.read_parquet(path, **kwargs)
                frames.append(Pandas.load_data(frame, ignore=ignore))

        return pd.concat(frames, copy=False)

    @staticmethod
    def get_n(data: Any):
        """Return the number of entries obtained by iterating over ``data``."""
        entries = list(data)
        return len(entries)


================================================
FILE: xgboost_ray/data_sources/partitioned.py
================================================
from collections import defaultdict
from typing import Any, Dict, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
from ray import ObjectRef
from ray.actor import ActorHandle

from xgboost_ray.data_sources._distributed import (
    assign_partitions_to_actors,
    get_actor_rank_ips,
)
from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.numpy import Numpy
from xgboost_ray.data_sources.pandas import Pandas


class Partitioned(DataSource):
    """Read from distributed data structure implementing __partitioned__.

    __partitioned__ provides meta data about how the data is partitioned and
    distributed across several compute nodes, making objects that support it
    suitable for distributed loading.

    Also see the __partitioned__ spec:
    https://github.com/IntelPython/DPPY-Spec/blob/draft/partitioned/Partitioned.md
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` exposes a ``__partitioned__`` attribute."""
        return hasattr(data, "__partitioned__")

    @staticmethod
    def load_data(
        data: Any,  # __partitioned__ dict
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[ObjectRef]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Fetch partitions and combine them into one pandas dataframe.

        Args:
            data: The ``__partitioned__`` dict of the data structure.
            ignore: Column names to drop.
            indices: Partition handles to fetch. If None or empty, all
                partitions along the first axis are fetched.

        Returns:
            Pandas DataFrame.
        """
        assert isinstance(data, dict), "Expected __partitioned__ dict"
        fetch = data["get"]

        if indices is not None and len(indices) > 0:
            # We were handed the partition handles directly
            chunks = fetch(indices)
        else:
            tiling = data["partition_tiling"]
            # Partitions are keyed by position tuples; pad the row position
            # with zeros for the remaining dimensions
            trailing_zeros = (0,) * (len(tiling) - 1)
            partitions = data["partitions"]
            # Fetch the full data, i.e. every partition in row order
            chunks = [
                fetch(partitions[(row,) + trailing_zeros]["data"])
                for row in range(tiling[0])
            ]

        if isinstance(chunks[0], pd.DataFrame):
            return Pandas.load_data(pd.concat(chunks, copy=False), ignore=ignore)

        return Numpy.load_data(np.concatenate(chunks), ignore=ignore)

    @staticmethod
    def get_actor_shards(
        data: Any, actors: Sequence[ActorHandle]  # partitioned.pandas.DataFrame
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Assign the row partitions of ``data`` to actors.

        Args:
            data: Object exposing ``__partitioned__``.
            actors: Actors the partitions are distributed to.

        Returns:
            Tuple of the ``__partitioned__`` dict and the result of
            ``assign_partitions_to_actors()``.

        Raises:
            RuntimeError: If the data is not a row-wise partitioned 1d/2d
                structure.
        """
        assert hasattr(data, "__partitioned__")

        actor_rank_ips = get_actor_rank_ips(actors)

        # Get meta data and partitions
        parted = data.__partitioned__
        partitions = parted["partitions"]
        tiling = parted["partition_tiling"]
        ndims = len(tiling)

        is_row_wise = 1 <= ndims <= 2 and all(extent == 1 for extent in tiling[1:])
        if not is_row_wise:
            raise RuntimeError(
                "Only row-wise partitionings of 1d/2d structures supported."
            )

        # Group the partition data handles by the first location of each
        # partition
        ip_to_parts = defaultdict(list)
        # Position tuples are needed to access partitions in the right order;
        # this works for 1d and 2d
        trailing_zeros = (0,) * (ndims - 1)
        for row in range(tiling[0]):
            partition = partitions[(row,) + trailing_zeros]
            ip_to_parts[partition["location"][0]].append(partition["data"])

        # __partitioned__ is serializable, so it is passed on as the first
        # return value
        return parted, assign_partitions_to_actors(ip_to_parts, actor_rank_ips)

    @staticmethod
    def get_n(data: Any):
        """Get length of data source partitions for sharding."""
        shape = data.__partitioned__["shape"]
        return shape[0]


================================================
FILE: xgboost_ray/data_sources/petastorm.py
================================================
from typing import Any, List, Optional, Sequence, Union

import pandas as pd

from xgboost_ray.data_sources.data_source import DataSource, RayFileType

# Petastorm is an optional dependency: record whether it could be imported so
# the rest of this module can check ``PETASTORM_INSTALLED`` before using it.
try:
    import petastorm

    PETASTORM_INSTALLED = True
except ImportError:
    PETASTORM_INSTALLED = False


def _assert_petastorm_installed():
    """Raise a ``RuntimeError`` unless the Petastorm import at load worked."""
    if PETASTORM_INSTALLED:
        return

    raise RuntimeError(
        "Tried to use Petastorm as a data source, but petastorm is not "
        "installed. This function shouldn't have been called. "
        "\nFIX THIS by installing petastorm: `pip install petastorm`. "
        "\nPlease also raise an issue on our GitHub: "
        "https://github.com/ray-project/xgboost_ray as this part of "
        "the code should not have been reached."
    )


class Petastorm(DataSource):
    """Read with Petastorm.

    `Petastorm <https://github.com/uber/petastorm>`_ is a machine learning
    training and evaluation library.

    This class accesses Petastorm's dataset loading interface for efficient
    loading of large datasets.
    """

    supports_central_loading = True
    supports_distributed_loading = True

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if Petastorm is installed and the file type matches."""
        return PETASTORM_INSTALLED and filetype == RayFileType.PETASTORM

    @staticmethod
    def get_filetype(data: Any) -> Optional[RayFileType]:
        """Detect URLs that are read with Petastorm.

        Args:
            data: A URL or a list of URLs.

        Returns:
            ``RayFileType.PETASTORM`` if Petastorm is installed and every
            URL ends in ``.parquet`` and starts with one of ``s3://``,
            ``gs://``, ``hdfs://`` or ``file://``; otherwise None.
        """
        if not PETASTORM_INSTALLED:
            return None

        urls = data if isinstance(data, list) else [data]
        supported_schemes = ("s3://", "gs://", "hdfs://", "file://")

        all_compatible = all(
            url.endswith(".parquet") and url.startswith(supported_schemes)
            for url in urls
        )
        return RayFileType.PETASTORM if all_compatible else None

    @staticmethod
    def load_data(
        data: Union[str, Sequence[str]],
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[Sequence[int]] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Read batches with a Petastorm batch reader into one dataframe.

        Args:
            data: URL(s) handed to ``petastorm.make_batch_reader()``.
            ignore: Column names to drop.
            indices: Positions of the batches to keep. All batches are
                kept if empty.

        Returns:
            Pandas DataFrame.
        """
        _assert_petastorm_installed()

        shards = []
        with petastorm.make_batch_reader(data) as reader:
            for position, batch in enumerate(reader):
                if indices and position not in indices:
                    continue
                shards.append(pd.DataFrame(batch._asdict()))

        combined = pd.concat(shards, copy=False)

        if not ignore:
            return combined

        return combined[combined.columns.difference(ignore)]

    @staticmethod
    def get_n(data: Any):
        """Return the number of entries obtained by iterating over ``data``."""
        entries = list(data)
        return len(entries)


================================================
FILE: xgboost_ray/data_sources/ray_dataset.py
================================================
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import pandas as pd
import ray
from ray.actor import ActorHandle

from xgboost_ray.data_sources.data_source import DataSource, RayFileType
from xgboost_ray.data_sources.pandas import Pandas

# Record whether the installed Ray provides ``ray.data.dataset`` so the rest
# of this module can check ``RAY_DATASET_AVAILABLE`` before using it.
try:
    import ray.data.dataset  # noqa: F401

    RAY_DATASET_AVAILABLE = True
except (ImportError, AttributeError):
    RAY_DATASET_AVAILABLE = False

# Value passed as ``limit`` to ``Dataset.to_pandas()`` in this module;
# infinity presumably means "no row limit" - confirm against the Ray API.
DATASET_TO_PANDAS_LIMIT = float("inf")


def _assert_ray_data_available():
    """Raise a ``RuntimeError`` unless ``ray.data.dataset`` was importable."""
    if RAY_DATASET_AVAILABLE:
        return

    raise RuntimeError(
        "Tried to use Ray datasets as a data source, but your version "
        "of Ray does not support it. "
        "\nFIX THIS by upgrading Ray: `pip install -U ray`. "
        "\nPlease also raise an issue on our GitHub: "
        "https://github.com/ray-project/xgboost_ray as this part of "
        "the code should not have been reached."
    )


class RayDataset(DataSource):
    """Data source for distributed Ray datasets."""

    supports_central_loading = True
    supports_distributed_loading = True
    needs_partitions = False

    @staticmethod
    def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:
        """Return True if ``data`` is a Ray ``Dataset``.

        Always False when Ray datasets are not available.
        """
        if RAY_DATASET_AVAILABLE:
            return isinstance(data, ray.data.dataset.Dataset)

        return False

    @staticmethod
    def load_data(
        data: "ray.data.dataset.Dataset",
        ignore: Optional[Sequence[str]] = None,
        indices: Optional[
            Union[Sequence[int], Sequence["ray.data.dataset.Dataset"]]
        ] = None,
        **kwargs
    ) -> pd.DataFrame:
        """Convert a dataset, or a selection of datasets, to pandas.

        Args:
            data: Ray dataset (or indexable collection of datasets).
            ignore: Column names to drop.
            indices: Either datasets to load directly, or positions used
                to index into ``data``.

        Returns:
            Pandas DataFrame.
        """
        _assert_ray_data_available()

        if indices is not None:
            got_datasets = len(indices) > 0 and isinstance(
                indices[0], ray.data.dataset.Dataset
            )
            # Datasets passed as "indices" are the splits of one partition
            # and are loaded as they are; otherwise select by position.
            data = indices if got_datasets else [data[i] for i in indices]

        if isinstance(data, ray.data.dataset.Dataset):
            pandas_df = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT)
        else:
            frames = [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data]
            pandas_df = pd.concat(frames, copy=False)

        return Pandas.load_data(pandas_df, ignore=ignore)

    @staticmethod
    def convert_to_series(
        data: Union["ray.data.dataset.Dataset", Sequence["ray.data.dataset.Dataset"]]
    ) -> pd.Series:
        """Convert one dataset or a sequence of datasets to a pandas series."""
        _assert_ray_data_available()

        if isinstance(data, ray.data.dataset.Dataset):
            converted = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT)
        else:
            frames = [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data]
            converted = pd.concat(frames, copy=False)

        return DataSource.convert_to_series(converted)

    @staticmethod
    def get_actor_shards(
        data: "ray.data.dataset.Dataset", actors: Sequence[ActorHandle]
    ) -> Tuple[Any, Optional[Dict[int, Any]]]:
        """Split the dataset into one shard per actor.

        Args:
            data: Ray dataset to split.
            actors: Actors; also passed as locality hints to the split.

        Returns:
            Tuple ``(None, shards)`` where ``shards`` maps the position of
            each split to a one-element list holding that split.
        """
        _assert_ray_data_available()

        # assign_partitions_to_actors is not used here, as the assignment of
        # splits to actors is handled by the locality_hints argument.
        splits = data.split(
            len(actors),
            equal=True,
            locality_hints=actors,
        )

        shards = {rank: [split] for rank, split in enumerate(splits)}
        return None, shards

    @staticmethod
    def get_n(data: "ray.data.dataset.Dataset"):
        """
        Return number of distributed blocks.
        """
        plan = data._plan
        return plan.initial_num_blocks()


================================================
FILE: xgboost_ray/elastic.py
================================================
import time
from typing import Callable, Dict, List, Optional, Tuple

import ray

from xgboost_ray.main import (
    ENV,
    ActorHandle,
    RayParams,
    RayXGBoostActorAvailable,
    _create_actor,
    _PrepareActorTask,
    _TrainingState,
    logger,
)
from xgboost_ray.matrix import RayDMatrix


def _maybe_schedule_new_actors(
    training_state: _TrainingState,
    num_cpus_per_actor: int,
    num_gpus_per_actor: int,
    resources_per_actor: Optional[Dict],
    ray_params: RayParams,
    load_data: List[RayDMatrix],
) -> bool:
    """Try to replace missing actors during elastic training.

    For every rank without a live or pending actor a new actor is created
    together with a preparation task (which gets ``load_data``), and both
    are registered in ``training_state.pending_actors``. The attempt is
    skipped if the last one happened less than
    ``ENV.ELASTIC_RESTART_RESOURCE_CHECK_S`` seconds ago.

    Returns:
        True if at least one actor was scheduled, False otherwise.
    """
    # Rescheduling only applies to elastic training.
    if not ray_params.elastic_training:
        return False

    missing_actor_ranks = []
    for rank, actor in enumerate(training_state.actors):
        if actor is None and rank not in training_state.pending_actors:
            missing_actor_ranks.append(rank)

    # Nothing to do while every rank is alive or already pending.
    if len(missing_actor_ranks) == 0:
        return False

    # Only try again once the check interval has passed.
    now = time.time()
    next_check_at = (
        training_state.last_resource_check_at + ENV.ELASTIC_RESTART_RESOURCE_CHECK_S
    )
    if now < next_check_at:
        return False

    training_state.last_resource_check_at = now

    new_pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = {}
    for rank in missing_actor_ranks:
        # Never schedule two actors for the same rank
        already_pending = (
            rank in training_state.pending_actors or rank in new_pending_actors
        )
        if already_pending:
            continue

        # Try to schedule this actor
        actor = _create_actor(
            rank=rank,
            num_actors=ray_params.num_actors,
            num_cpus_per_actor=num_cpus_per_actor,
            num_gpus_per_actor=num_gpus_per_actor,
            resources_per_actor=resources_per_actor,
            placement_group=training_state.placement_group,
            queue=training_state.queue,
            checkpoint_frequency=ray_params.checkpoint_frequency,
            distributed_callbacks=ray_params.distributed_callbacks,
        )
        prepare_task = _PrepareActorTask(
            actor,
            queue=training_state.queue,
            stop_event=training_state.stop_event,
            load_data=load_data,
        )
        new_pending_actors[rank] = (actor, prepare_task)

        logger.debug(
            f"Re-scheduled actor with rank {rank}. Waiting for "
            f"placement and data loading before promoting it "
            f"to training."
        )

    if not new_pending_actors:
        return False

    training_state.pending_actors.update(new_pending_actors)
    logger.info(
        f"Re-scheduled {len(new_pending_actors)} actors for "
        f"training. Once data loading finished, they will be "
        f"integrated into training again."
    )
    return True


def _update_scheduled_actor_states(training_state: _TrainingState):
    """Promote prepared actors and trigger a training restart when due.

    Every pending actor whose preparation task reports ready is moved from
    ``training_state.pending_actors`` into ``training_state.actors``.

    If at least one actor was promoted, a restart time is scheduled:
    right away when no other actor is pending, otherwise after the
    configured grace period (unless a restart time is already set).

    Raises:
        RayXGBoostActorAvailable: If a restart time is set and has passed.
            The restart time is cleared before the exception is raised.
    """
    now = time.time()
    num_promoted = 0

    # Iterate over a snapshot, as entries are deleted from the dict below.
    for rank, (actor, task) in list(training_state.pending_actors.items()):
        if not task.is_ready():
            continue
        # The preparation task finished: this is a proper actor now.
        training_state.actors[rank] = actor
        del training_state.pending_actors[rank]
        num_promoted += 1

    if num_promoted:
        if not training_state.pending_actors:
            # Nothing else to wait for, so make the restart due immediately.
            training_state.restart_training_at = now - 1.0

        # Other actors may still be pending. Give them a chance to become
        # ready too (e.g. if a large node came up) before restarting.
        grace_period = ENV.ELASTIC_RESTART_GRACE_PERIOD_S
        if training_state.restart_training_at is None:
            logger.debug(
                f"A RayXGBoostActor became ready for training. Waiting "
                f"{grace_period} seconds before triggering training restart."
            )
            training_state.restart_training_at = now + grace_period

    restart_at = training_state.restart_training_at
    if restart_at is not None and now > restart_at:
        training_state.restart_training_at = None
        raise RayXGBoostActorAvailable(
            "A new RayXGBoostActor became available for training. "
            "Triggering restart."
        )


def _get_actor_alive_status(
    actors: List[ActorHandle], callback: Callable[[ActorHandle], None]
):
    """Count alive and dead actors, invoking ``callback`` on failed ones.

    Entries that are ``None`` count as dead without invoking the callback.
    Every other actor is asked for its PID; if fetching the result raises,
    the actor counts as dead and ``callback`` is called with its handle.

    Returns:
        Tuple ``(alive, dead)`` with the respective actor counts.
    """
    rank_by_ref = {}
    num_alive = 0
    num_dead = 0

    for rank, actor in enumerate(actors):
        if actor is not None:
            rank_by_ref[actor.pid.remote()] = rank
        else:
            num_dead += 1

    outstanding = list(rank_by_ref)
    while outstanding:
        finished, outstanding = ray.wait(outstanding, timeout=0)

        for ref in finished:
            rank = rank_by_ref[ref]
            try:
                pid = ray.get(ref)
                logger.debug(f"Actor {actors[rank]} with PID {pid} is alive.")
                num_alive += 1
            except Exception:
                logger.debug(f"Actor {actors[rank]} is _not_ alive.")
                num_dead += 1
                callback(actors[rank])

    total = num_alive + num_dead
    logger.info(f"Actor status: {num_alive} alive, {num_dead} dead ({total} total)")

    return num_alive, num_dead


================================================
FILE: xgboost_ray/examples/__init__.py
================================================


================================================
FILE: xgboost_ray/examples/create_test_data.py
================================================
from xgboost_ray.tests.utils import create_parquet


def main():
    """Write the example dataset to ``example.parquet`` via `create_parquet`."""
    dataset_spec = {
        "num_rows": 1_000_000,
        "num_partitions": 100,
        "num_features": 8,
        "num_classes": 2,
    }
    create_parquet("example.parquet", **dataset_spec)


# Create the example dataset when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: xgboost_ray/examples/higgs.py
================================================
import os
import time

from xgboost_ray import RayDMatrix, RayParams, train

# Local file name of the gzipped HIGGS CSV; `main` downloads to this path
# if it does not exist yet and then loads it.
FILENAME_CSV = "HIGGS.csv.gz"


def download_higgs(target_file):
    """Download the gzipped HIGGS CSV dataset to ``target_file``.

    The data is first written to ``<target_file>.part`` and only moved to
    ``target_file`` once the download completed. This way an interrupted
    download never leaves a truncated file at the final location, which
    callers checking ``os.path.exists(target_file)`` would mistake for a
    complete dataset.

    Args:
        target_file: Path the dataset should be stored at.

    Returns:
        True if ``target_file`` exists after the download.

    Raises:
        ValueError: If ``urllib.request`` cannot be imported.
    """
    url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00280/HIGGS.csv.gz"
    )

    try:
        import urllib.request
    except ImportError as e:
        raise ValueError(
            f"Automatic downloading of the HIGGS dataset requires `urllib`."
            f"\nFIX THIS by running `pip install urllib` or manually "
            f"downloading the dataset from {url}."
        ) from e

    print(f"Downloading HIGGS dataset to {target_file}")
    partial_file = f"{target_file}.part"
    try:
        urllib.request.urlretrieve(url, partial_file)
        # Only publish the file under its final name once it is complete.
        os.replace(partial_file, target_file)
    finally:
        # After a successful move this is a no-op; after a failure it
        # discards the incomplete download.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    return os.path.exists(target_file)


def main():
    """Train an XGBoost model on the HIGGS CSV dataset with one Ray actor.

    Downloads the dataset if it is not present locally, trains for 100
    boosting rounds while evaluating on the training data, prints the
    training time, saves the model to ``higgs.xgb`` and prints the final
    training error.

    Raises:
        AssertionError: If the dataset file does not exist after the
            download attempt.
    """
    # Example adapted from this blog post:
    # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7
    # This uses the HIGGS dataset. Download here:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz

    if not os.path.exists(FILENAME_CSV):
        # Explicit check instead of an `assert` statement: assertions are
        # removed under `python -O`, which would skip the download itself.
        if not download_higgs(FILENAME_CSV):
            raise AssertionError("Downloading of HIGGS dataset failed.")
        print("HIGGS dataset downloaded.")
    else:
        print("HIGGS dataset found locally.")

    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]

    dtrain = RayDMatrix(os.path.abspath(FILENAME_CSV), label="label", names=colnames)

    config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }

    evals_result = {}

    start = time.time()
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        ray_params=RayParams(max_actor_restarts=1, num_actors=1),
        num_boost_round=100,
        evals=[(dtrain, "train")],
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("higgs.xgb")
    print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1]))


if __name__ == "__main__":
    import ray

    # Initialize Ray with default arguments before any training starts.
    ray.init()

    # Measure the complete run, including dataset download and loading.
    t0 = time.time()
    main()
    taken = time.time() - t0
    print(f"TOTAL TIME TAKEN: {taken:.2f} seconds")


================================================
FILE: xgboost_ray/examples/higgs_parquet.py
================================================
import os
import time

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from higgs import download_higgs

from xgboost_ray import RayDMatrix, RayParams, train

# Local file name of the gzipped HIGGS CSV (source of the conversion).
FILENAME_CSV = "HIGGS.csv.gz"
# Local file name of the converted Parquet dataset that `main` trains on.
FILENAME_PARQUET = "HIGGS.parquet"


def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs):
    """Convert a CSV file to a snappy-compressed Parquet file, chunk by chunk.

    The Parquet schema is inferred from the first chunk and applied to all
    following chunks.

    Args:
        in_file: Path of the CSV file to read.
        out_file: Path of the Parquet file to write.
        chunksize: Number of CSV rows to read and write per chunk.
        **csv_kwargs: Additional keyword arguments for ``pd.read_csv``.

    Returns:
        True if a Parquet file was written. False if ``out_file`` already
        existed (nothing is converted then) or if the CSV stream yielded
        no chunks.
    """
    if os.path.exists(out_file):
        return False

    print(f"Converting CSV {in_file} to PARQUET {out_file}")
    csv_stream = pd.read_csv(
        in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs
    )

    parquet_schema = None
    parquet_writer = None
    finished = False
    try:
        for i, chunk in enumerate(csv_stream):
            print("Chunk", i)
            if parquet_writer is None:
                # Guess the schema of the CSV file from the first chunk
                parquet_schema = pa.Table.from_pandas(df=chunk).schema
                # Open a Parquet file for writing
                parquet_writer = pq.ParquetWriter(
                    out_file, parquet_schema, compression="snappy"
                )
            # Write CSV chunk to the parquet file
            table = pa.Table.from_pandas(chunk, schema=parquet_schema)
            parquet_writer.write_table(table)
        finished = True
    finally:
        # Always release the writer, also when a chunk failed to convert.
        # It is still None if the stream did not yield any chunk.
        if parquet_writer is not None:
            parquet_writer.close()
        # A partial file would make later calls return early (see the
        # existence check above) and treat it as a complete conversion.
        if not finished and os.path.exists(out_file):
            os.remove(out_file)

    return parquet_writer is not None


def main():
    """Train an XGBoost model on the HIGGS Parquet dataset with one Ray actor.

    If the Parquet file is missing, it is created from the CSV file, which
    is downloaded first if it is missing as well. The model is trained for
    100 boosting rounds, saved to ``higgs.xgb``, and the final training
    error is printed.
    """
    # Example adapted from this blog post:
    # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7
    # This uses the HIGGS dataset. Download here:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz

    # "label" followed by "feature-01" ... "feature-28". The same names are
    # used for reading the header-less CSV and for selecting Parquet columns.
    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]

    if not os.path.exists(FILENAME_PARQUET):
        if not os.path.exists(FILENAME_CSV):
            download_higgs(FILENAME_CSV)
            print("Downloaded HIGGS csv dataset")
        print("Converting HIGGS csv dataset to parquet")
        csv_to_parquet(FILENAME_CSV, FILENAME_PARQUET, names=colnames)

    # Here we load the Parquet file
    parquet_path = os.path.abspath(FILENAME_PARQUET)
    dtrain = RayDMatrix(parquet_path, label="label", columns=colnames)

    config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    evals_result = {}
    ray_params = RayParams(max_actor_restarts=1, num_actors=1)

    train_started = time.time()
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        ray_params=ray_params,
        num_boost_round=100,
        evals=[(dtrain, "train")],
    )
    taken = time.time() - train_started
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("higgs.xgb")
    final_error = evals_result["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))


if __name__ == "__main__":
    import ray

    # Initialize Ray with default arguments before any training starts.
    ray.init()

    # Measure the complete run, including download and conversion.
    t0 = time.time()
    main()
    taken = time.time() - t0
    print(f"TOTAL TIME TAKEN: {taken:.2f} seconds")


================================================
FILE: xgboost_ray/examples/readme.py
================================================
# flake8: noqa E501


def readme_simple():
    """README example: train on the breast cancer dataset with two actors.

    Saves the trained model to ``model.xgb`` and prints the final
    training error.
    """
    from sklearn.datasets import load_breast_cancer

    from xgboost_ray import RayDMatrix, RayParams, train

    features, targets = load_breast_cancer(return_X_y=True)
    train_set = RayDMatrix(features, targets)

    xgboost_params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    evals_result = {}

    bst = train(
        xgboost_params,
        train_set,
        evals_result=evals_result,
        evals=[(train_set, "train")],
        verbose_eval=False,
        ray_params=RayParams(num_actors=2, cpus_per_actor=1),
    )

    bst.save_model("model.xgb")
    final_error = evals_result["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))


def readme_predict():
    """README example: predict with the model stored in ``model.xgb``.

    Loads the booster from file, predicts on the breast cancer dataset
    using two actors, and prints the predictions.
    """
    import xgboost as xgb
    from sklearn.datasets import load_breast_cancer

    from xgboost_ray import RayDMatrix, RayParams, predict

    features, targets = load_breast_cancer(return_X_y=True)
    dpred = RayDMatrix(features, targets)

    booster = xgb.Booster(model_file="model.xgb")
    predictions = predict(booster, dpred, ray_params=RayParams(num_actors=2))

    print(predictions)


def readme_tune():
    """README example: hyperparameter search with Ray Tune.

    Runs four samples of the search space, each training on the breast
    cancer dataset with four single-CPU actors, and prints the best
    configuration with respect to the ``train-error`` metric.
    """
    from sklearn.datasets import load_breast_cancer

    from xgboost_ray import RayDMatrix, RayParams, train

    ray_params = RayParams(num_actors=4, cpus_per_actor=1)

    def train_model(config):
        # Trainable executed by Tune once per sampled configuration.
        features, targets = load_breast_cancer(return_X_y=True)
        train_set = RayDMatrix(features, targets)

        evals_result = {}
        booster = train(
            params=config,
            dtrain=train_set,
            evals_result=evals_result,
            evals=[(train_set, "train")],
            verbose_eval=False,
            ray_params=ray_params,
        )
        booster.save_model("model.xgb")

    from ray import tune

    # Fixed XGBoost settings plus the hyperparameter search space.
    search_space = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }

    # Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
    trial_resources = ray_params.get_tune_resources()
    analysis = tune.run(
        train_model,
        config=search_space,
        metric="train-error",
        mode="min",
        num_samples=4,
        resources_per_trial=trial_resources,
    )
    print("Best hyperparameters", analysis.best_config)


if __name__ == "__main__":
    import ray

    ray.init(num_cpus=5)

    print("Readme: Simple example")
    # Prediction loads the model file written by the training example.
    for example in (readme_simple, readme_predict):
        example()

    # The Tune example is optional: skip it if the import fails.
    try:
        print("Readme: Ray Tune example")
        readme_tune()
    except ImportError:
        print("Ray Tune not installed.")


================================================
FILE: xgboost_ray/examples/readme_sklearn_api.py
================================================
def readme_sklearn_api():
    """README example for the scikit-learn compatible API.

    Fits a `RayXGBClassifier` on a split of the breast cancer dataset and
    prints class and probability predictions. Then repeats fitting and
    prediction with an explicit `RayParams` object.
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from xgboost_ray import RayParams, RayXGBClassifier

    seed = 42

    features, targets = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, train_size=0.25, random_state=seed
    )

    # In XGBoost-Ray, n_jobs sets the number of actors
    clf = RayXGBClassifier(n_jobs=4, random_state=seed)

    # The scikit-learn API automatically converts the data to the
    # RayDMatrix format as needed. X can also be passed as a RayDMatrix,
    # in which case y will be ignored.
    clf.fit(X_train, y_train)

    class_predictions = clf.predict(X_test)
    print(class_predictions)

    proba_predictions = clf.predict_proba(X_test)
    print(proba_predictions)

    # A RayParams object can also be passed to the
    # fit/predict/predict_proba methods. It overrides the
    # n_jobs value set during initialization.
    clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2))

    class_predictions = clf.predict(X_test, ray_params=RayParams(num_actors=2))
    print(class_predictions)


if __name__ == "__main__":
    import ray

    # Local Ray instance limited to five CPUs for the example run.
    ray.init(num_cpus=5)

    print("Readme: scikit-learn API example")
    readme_sklearn_api()


================================================
FILE: xgboost_ray/examples/simple.py
================================================
import argparse

import ray
from sklearn import datasets
from sklearn.model_selection import train_test_split

from xgboost_ray import RayDMatrix, RayParams, train


def main(cpus_per_actor, num_actors):
    """Train a binary classifier on the breast cancer dataset.

    Trains for 10 boosting rounds on 75% of the data, evaluates on the
    remaining 25%, saves the model to ``simple.xgb`` and prints the final
    validation error.

    Args:
        cpus_per_actor: Number of CPUs to reserve per training actor.
        num_actors: Number of training actors.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Hold out a quarter of the data for evaluation
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.25
    )

    train_set = RayDMatrix(train_x, train_y)
    test_set = RayDMatrix(test_x, test_y)

    # XGBoost configuration
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    ray_params = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )
    evals_result = {}

    booster = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("simple.xgb")
    final_error = evals_result["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))


if __name__ == "__main__":
    # Command line entry point: parse the options, set up Ray accordingly
    # and run the example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    # The help text used to read "gpu", which does not describe this flag.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Run on a small local Ray instance for quick testing.",
    )

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=args.num_actors)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)


================================================
FILE: xgboost_ray/examples/simple_dask.py
================================================
import argparse

import numpy as np
import pandas as pd
import ray

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.data_sources.dask import DASK_INSTALLED


def main(cpus_per_actor, num_actors):
    """Train a binary classifier on a small generated Dask dataframe.

    Returns early with a message if Dask is not installed. Otherwise
    trains for 10 boosting rounds, saves the model to ``dask.xgb`` and
    prints the final training error.

    Args:
        cpus_per_actor: Number of CPUs to reserve per training actor.
        num_actors: Number of training actors.
    """
    if not DASK_INSTALLED:
        print("Dask is not installed. Install with `pip install dask`")
        return

    # Imported locally so that the installation check above runs first
    import dask
    import dask.dataframe as dd
    from ray.util.dask import ray_dask_get

    dask.config.set(scheduler=ray_dask_get)

    # 32 rows with 4 identical feature values each
    features = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    labels = np.tile(np.repeat(range(2), 4), 4)

    # Invert six random labels to reduce the maximum accuracy
    flip_idx = np.random.choice(32, size=6, replace=False)
    labels[flip_idx] = 1 - labels[flip_idx]

    frame = pd.DataFrame(features)
    frame["label"] = labels

    # Distribute the rows over 4 partitions
    dask_df = dd.from_pandas(frame, npartitions=4)
    train_set = RayDMatrix(dask_df, "label")

    # XGBoost configuration
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    ray_params = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )
    evals_result = {}

    booster = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("dask.xgb")
    final_error = evals_result["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))


if __name__ == "__main__":
    # Command line entry point: parse the options, set up Ray accordingly
    # and run the example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    # The help text used to read "gpu", which does not describe this flag.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Run on a small local Ray instance for quick testing.",
    )

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)


================================================
FILE: xgboost_ray/examples/simple_modin.py
================================================
import argparse

import numpy as np
import pandas as pd
import ray

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.data_sources.modin import MODIN_INSTALLED


def main(cpus_per_actor, num_actors):
    """Train a binary classifier on a small generated Modin dataframe.

    Returns early with a message if no compatible Modin installation is
    available. Otherwise trains for 10 boosting rounds, saves the model to
    ``modin.xgb`` and prints the final training error.

    Args:
        cpus_per_actor: Number of CPUs to reserve per training actor.
        num_actors: Number of training actors.
    """
    if not MODIN_INSTALLED:
        print(
            "Modin is not installed or installed in a version that is not "
            "compatible with xgboost_ray (< 0.9.0)."
        )
        return

    # Import modin after initializing Ray
    from modin.distributed.dataframe.pandas import from_partitions

    # 32 rows with 4 identical feature values each
    features = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    labels = np.tile(np.repeat(range(2), 4), 4)

    # Invert six random labels to reduce the maximum accuracy
    flip_idx = np.random.choice(32, size=6, replace=False)
    labels[flip_idx] = 1 - labels[flip_idx]

    frame = pd.DataFrame(features)
    frame["label"] = labels

    # Put 4 equally sized row blocks into the object store
    partitions = [ray.put(block) for block in np.split(frame, 4)]

    # Assemble the Modin dataframe from the stored row blocks
    modin_df = from_partitions(partitions, axis=0)
    train_set = RayDMatrix(modin_df, "label")

    # XGBoost configuration
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    ray_params = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )
    evals_result = {}

    booster = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("modin.xgb")
    final_error = evals_result["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))


if __name__ == "__main__":
    # Command line entry point: parse the options, set up Ray accordingly
    # and run the example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    # The help text used to read "gpu", which does not describe this flag.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Run on a small local Ray instance for quick testing.",
    )

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)


================================================
FILE: xgboost_ray/examples/simple_objectstore.py
================================================
import argparse

import numpy as np
import pandas as pd
import ray

from xgboost_ray import RayDMatrix, RayParams, train


def main(cpus_per_actor, num_actors):
    """Train a binary classifier on dataframes stored in the Ray object store.

    Generates a small dataset, puts it into the object store in 4 row
    blocks, trains for 10 boosting rounds on the resulting object
    references, saves the model to ``objectstore.xgb`` and prints the
    final training error.

    Args:
        cpus_per_actor: Number of CPUs to reserve per training actor.
        num_actors: Number of training actors.
    """
    # Generate dataset
    x = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    y = np.tile(np.repeat(range(2), 4), 4)

    # Flip some bits to reduce max accuracy
    bits_to_flip = np.random.choice(32, size=6, replace=False)
    y[bits_to_flip] = 1 - y[bits_to_flip]

    data = pd.DataFrame(x)
    data["label"] = y

    # Split into 4 partitions
    partitions = [ray.put(part) for part in np.split(data, 4)]

    train_set = RayDMatrix(partitions, "label")

    evals_result = {}
    # Set XGBoost config.
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    # Train the classifier
    bst = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=RayParams(
            max_actor_restarts=0,
            gpus_per_actor=0,
            cpus_per_actor=cpus_per_actor,
            num_actors=num_actors,
        ),
        verbose_eval=False,
        num_boost_round=10,
    )

    # Use a file name of its own. This used to be "modin.xgb", which
    # overwrote the model written by the Modin example.
    model_path = "objectstore.xgb"
    bst.save_model(model_path)
    print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1]))


if __name__ == "__main__":
    # Command line entry point: parse the options, set up Ray accordingly
    # and run the example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    # The help text used to read "gpu", which does not describe this flag.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Run on a small local Ray instance for quick testing.",
    )

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)


================================================
FILE: xgboost_ray/examples/simple_partitioned.py
================================================
import argparse

import numpy as np
import ray
from sklearn import datasets
from sklearn.model_selection import train_test_split

from xgboost_ray import RayDMatrix, RayParams, train

# Number of columns per generated partition. `AnActor.genData` asserts that
# the stacked feature and label array has this many columns; `main` uses the
# last one (index `nc - 1`) as the label column.
nc = 31


@ray.remote
class AnActor:
    """Actor producing one piece of a dataset.

    Several of these actors together mimic a distributed dataframe: each
    of them creates the rows of one partition.
    """

    @ray.method(num_returns=2)
    def genData(self, rank, nranks, nrows):
        """Build the full training array and return this actor's slice.

        In a real application each actor would create its local data
        directly instead of cutting it out of the global dataset.

        Args:
            rank: Index of the slice to return.
            nranks: Total number of slices.
            nrows: Number of rows to distribute over all slices.

        Returns:
            Two values: the row slice (features with the label appended
            as last column), and the IP address reported by
            ``ray.util.get_node_ip_address()``.
        """
        features, targets = datasets.load_breast_cancer(return_X_y=True)
        # Only the training part of the split is used here
        feat_train, _, tgt_train, _ = train_test_split(
            features, targets, test_size=0.25
        )
        # Turn the labels into a column vector and append it to the features
        tgt_column = tgt_train.reshape((tgt_train.shape[0], 1))
        combined = np.hstack([feat_train, tgt_column])
        assert nrows <= combined.shape[0]
        assert nc == combined.shape[1]

        rows_per_rank = nrows // nranks
        first_row = rows_per_rank * rank
        local_rows = combined[first_row : first_row + rows_per_rank]
        return local_rows, ray.util.get_node_ip_address()


class Parted:
    """Minimal container that only exposes a ``__partitioned__`` attribute."""

    def __init__(self, parted):
        # Store the given partition description as-is (no copy).
        self.__partitioned__ = parted


def main(cpus_per_actor, num_actors):
    """Train a binary classifier on a ``__partitioned__`` data structure.

    One `AnActor` per training actor generates a row slice of the
    dataset. The slices are described in a `Parted` object, on which the
    model is trained for 10 boosting rounds. The model is saved to
    ``partitioned.xgb`` and the final training error is printed.

    Args:
        cpus_per_actor: Number of CPUs to reserve per training actor.
        num_actors: Number of training actors, also the number of
            data-generating actors and partitions.
    """
    requested_rows = 424
    generators = [AnActor.remote() for _ in range(num_actors)]
    parts = [
        generator.genData.remote(rank, num_actors, requested_rows)
        for rank, generator in enumerate(generators)
    ]
    rowsperpart = requested_rows // num_actors
    # Rows lost to the integer division are not part of the dataset
    total_rows = rowsperpart * num_actors

    partitions = {}
    for i in range(num_actors):
        data_ref, location_ref = parts[i]
        partitions[(i, 0)] = {
            "start": (i * rowsperpart, 0),
            "shape": (rowsperpart, nc),
            "data": data_ref,
            "location": [ray.get(location_ref)],
        }

    parted = Parted(
        {
            "shape": (total_rows, nc),
            "partition_tiling": (num_actors, 1),
            "get": lambda x: ray.get(x),
            "partitions": partitions,
        }
    )

    # Let's create DMatrix from our __partitioned__ structure,
    # using the last column as the label
    train_set = RayDMatrix(parted, f"f{nc - 1}")

    # XGBoost configuration
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    ray_params = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )
    evals_result = {}

    booster = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("partitioned.xgb")
    final_error = evals_result["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))


if __name__ == "__main__":
    # Command line entry point: parse the options, set up Ray accordingly
    # and run the example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    # The help text used to read "gpu", which does not describe this flag.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Run on a small local Ray instance for quick testing.",
    )

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    # Only set up Ray if it has not been initialized already.
    if not ray.is_initialized():
        if args.smoke_test:
            ray.init(num_cpus=args.num_actors + 1)
        elif args.server_address:
            ray.util.connect(args.server_address)
        else:
            ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)


================================================
FILE: xgboost_ray/examples/simple_predict.py
================================================
import os

import numpy as np
import xgboost as xgb
from sklearn import datasets

from xgboost_ray import RayDMatrix, RayParams, predict


def main():
    """Compare distributed prediction with plain XGBoost prediction.

    Loads the model from ``simple.xgb``, predicts on the breast cancer
    dataset both with XGBoost directly and with two Ray actors, asserts
    that both results are equal and prints the predictions.

    Raises:
        ValueError: If ``simple.xgb`` does not exist.
    """
    if not os.path.exists("simple.xgb"):
        # The message used to contain a stray backtick in front of
        # `simple.py`, garbling the suggested command.
        raise ValueError(
            "Model file not found: `simple.xgb`"
            "\nFIX THIS by running `python simple.py` first to "
            "train the model."
        )

    # Load dataset
    data, labels = datasets.load_breast_cancer(return_X_y=True)

    dmat_xgb = xgb.DMatrix(data, labels)
    dmat_ray = RayDMatrix(data, labels)

    bst = xgb.Booster(model_file="simple.xgb")

    pred_xgb = bst.predict(dmat_xgb)
    pred_ray = predict(bst, dmat_ray, ray_params=RayParams(num_actors=2))

    np.testing.assert_array_equal(pred_xgb, pred_ray)
    print(pred_ray)


# Run the prediction comparison when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: xgboost_ray/examples/simple_ray_dataset.py
================================================
import argparse

import numpy as np
import pandas as pd
import ray
from xgboost import DMatrix

from xgboost_ray import RayDMatrix, RayParams, train


def main(cpus_per_actor, num_actors):
    """Train a binary classifier on a small generated Ray Dataset.

    Trains for 10 boosting rounds, saves the model to
    ``ray_datasets.xgb``, prints the final training error, and finally
    prints predictions computed with ``map_batches`` on the dataset.

    Args:
        cpus_per_actor: Number of CPUs to reserve per training actor.
        num_actors: Number of training actors.
    """
    np.random.seed(1234)
    # 32 rows with 4 identical feature values each
    features = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    labels = np.tile(np.repeat(range(2), 4), 4)

    # Invert six random labels to reduce the maximum accuracy
    flip_idx = np.random.choice(32, size=6, replace=False)
    labels[flip_idx] = 1 - labels[flip_idx]

    frame = pd.DataFrame(features)
    # Ray Datasets require all columns to be string
    frame.columns = [str(column) for column in frame.columns]
    frame["label"] = labels

    ray_ds = ray.data.from_pandas(frame)
    train_set = RayDMatrix(ray_ds, "label")

    # XGBoost configuration
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    ray_params = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )
    evals_result = {}

    bst = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    bst.save_model("ray_datasets.xgb")
    final_error = evals_result["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))

    # Distributed prediction on the feature columns only
    unlabeled_ds = ray_ds.drop_columns(["label"])
    scored = unlabeled_ds.map_batches(
        lambda batch: {"pred": bst.predict(DMatrix(batch))}, batch_format="pandas"
    )
    print(scored.to_pandas())


if __name__ == "__main__":
    # Command line entry point: parse the options, set up Ray accordingly
    # and run the example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per xgboost training worker.",
    )
    parser.add_argument(
        "--num-actors",
        type=int,
        default=4,
        help="Sets number of xgboost workers to use.",
    )
    # The help text used to read "gpu", which does not describe this flag.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Run on a small local Ray instance for quick testing.",
    )

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=args.num_actors + 1)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        ray.init(address=args.address)

    main(args.cpus_per_actor, args.num_actors)


================================================
FILE: xgboost_ray/examples/simple_tune.py
================================================
import argparse
import os

import ray
from ray import tune
from sklearn import datasets
from sklearn.model_selection import train_test_split

import xgboost_ray
from xgboost_ray import RayDMatrix, RayParams, train


def train_breast_cancer(config, ray_params):
    """Train one XGBoost model on the scikit-learn breast cancer data.

    Args:
        config: XGBoost parameters, passed to ``train`` as ``params``.
        ray_params: ``RayParams`` forwarded to ``train``.

    The fitted booster is written to ``tuned.xgb`` in the current working
    directory and the last validation error is printed.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Hold out a quarter of the rows for validation.
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25
    )

    eval_history = {}
    booster = train(
        params=config,
        dtrain=RayDMatrix(x_train, y_train),
        evals=[(RayDMatrix(x_test, y_test), "eval")],
        evals_result=eval_history,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    booster.save_model("tuned.xgb")
    final_error = eval_history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))


def main(cpus_per_actor, num_actors, num_samples):
    """Run a Ray Tune search over XGBoost hyperparameters.

    Args:
        cpus_per_actor: CPUs reserved for each training actor.
        num_actors: Number of training actors per trial.
        num_samples: Number of Tune samples to run.
    """
    # XGBoost settings; the `tune.*` entries are the sampled search space.
    search_space = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )

    trainable = tune.with_parameters(train_breast_cancer, ray_params=ray_params)
    analysis = tune.run(
        trainable,
        # `get_tune_resources` tells Tune what each trial needs.
        resources_per_trial=ray_params.get_tune_resources(),
        config=search_space,
        num_samples=num_samples,
        metric="eval-error",
        mode="min",
    )

    # Reload the model file saved by the best trial and store a copy.
    best_model_file = os.path.join(analysis.best_trial.local_path, "tuned.xgb")
    best_bst = xgboost_ray.tune.load_model(best_model_file)
    best_bst.save_model("best_model.xgb")

    accuracy = 1.0 - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")


if __name__ == "__main__":
    # Command-line entry point for the Tune example.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--address",
        required=False,
        type=str,
        help="the address to use for Ray",
    )
    cli.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.",
    )
    cli.add_argument(
        "--cpus-per-actor",
        type=int,
        default=1,
        help="Sets number of CPUs per XGBoost training worker.",
    )
    cli.add_argument(
        "--num-actors",
        type=int,
        default=1,
        help="Sets number of XGBoost workers to use.",
    )
    cli.add_argument(
        "--num-samples",
        type=int,
        default=4,
        help="Number of samples to use for Tune.",
    )
    cli.add_argument("--smoke-test", action="store_true", default=False)

    # Unrecognized arguments are accepted and dropped.
    options, _unknown = cli.parse_known_args()

    # Smoke test: start a local Ray sized to num_actors * num_samples CPUs.
    # Otherwise use Ray Client if a server address was given, else the
    # (optional) cluster address.
    if options.smoke_test:
        ray.init(num_cpus=options.num_actors * options.num_samples)
    elif options.server_address:
        ray.util.connect(options.server_address)
    else:
        ray.init(address=options.address)

    main(options.cpus_per_actor, options.num_actors, options.num_samples)


================================================
FILE: xgboost_ray/examples/train_on_test_data.py
================================================
import argparse
import os
import shutil
import time

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.tests.utils import create_parquet_in_tempdir

####
# Run `create_test_data.py` first to create a large fake data set.
# Alternatively, run with `--smoke-test` to create an ephemeral small fake
# data set.
####


def main(fname, num_actors=2):
    """Train on the parquet file at ``fname`` and report timing and error.

    Args:
        fname: Path to the parquet data; it is made absolute before loading.
        num_actors: Number of training actors.

    The booster is saved to ``test_data.xgb`` in the working directory.
    """
    train_matrix = RayDMatrix(
        os.path.abspath(fname), label="labels", ignore=["partition"]
    )
    xgb_params = {"tree_method": "hist", "eval_metric": ["logloss", "error"]}
    history = {}

    began = time.time()
    booster = train(
        xgb_params,
        train_matrix,
        evals_result=history,
        ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors),
        num_boost_round=10,
        evals=[(train_matrix, "train")],
    )
    elapsed = time.time() - began
    print(f"TRAIN TIME TAKEN: {elapsed:.2f} seconds")

    booster.save_model("test_data.xgb")
    last_error = history["train"]["error"][-1]
    print("Final training error: {:.4f}".format(last_error))


if __name__ == "__main__":
    # Command-line entry point: optionally create a small throwaway data set,
    # train on it, and always clean the throwaway data up again.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )
    args = parser.parse_args()

    temp_dir, path = None, None
    if args.smoke_test:
        temp_dir, path = create_parquet_in_tempdir(
            "smoketest.parquet",
            num_rows=1_000,
            num_features=4,
            num_classes=2,
            num_partitions=2,
        )
    else:
        path = os.path.join(os.path.dirname(__file__), "parted.parquet")

    try:
        import ray

        ray.init()

        start = time.time()
        main(path)
        taken = time.time() - start
        print(f"TOTAL TIME TAKEN: {taken:.2f} seconds")
    finally:
        # Remove the temporary data even when Ray start-up or training fails,
        # so a failed smoke test does not leave the directory behind.
        if args.smoke_test:
            shutil.rmtree(temp_dir)


================================================
FILE: xgboost_ray/examples/train_with_ml_dataset.py
================================================
import argparse
import os
import shutil
import time

from ray.util.data import read_parquet

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.tests.utils import create_parquet_in_tempdir

####
# Run `create_test_data.py` first to create a large fake data set.
# Alternatively, run with `--smoke-test` to create an ephemeral small fake
# data set.
####


def main(fname, num_actors=2):
    """Train on a parquet data set read through ``read_parquet``.

    Args:
        fname: Path to the parquet data.
        num_actors: Number of training actors; also used as the number of
            shards requested from ``read_parquet``.

    The booster is saved to ``test_data.xgb`` in the working directory.
    """
    sharded = read_parquet(fname, num_shards=num_actors)
    train_matrix = RayDMatrix(sharded, label="labels", ignore=["partition"])

    xgb_params = {"tree_method": "hist", "eval_metric": ["logloss", "error"]}
    history = {}

    began = time.time()
    booster = train(
        xgb_params,
        train_matrix,
        evals_result=history,
        ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors),
        num_boost_round=10,
        evals=[(train_matrix, "train")],
    )
    elapsed = time.time() - began
    print(f"TRAIN TIME TAKEN: {elapsed:.2f} seconds")

    booster.save_model("test_data.xgb")
    last_error = history["train"]["error"][-1]
    print("Final training error: {:.4f}".format(last_error))


if __name__ == "__main__":
    # Command-line entry point: optionally create a small throwaway data set,
    # train on it, and always clean the throwaway data up again.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing",
    )
    args = parser.parse_args()

    temp_dir, path = None, None
    if args.smoke_test:
        temp_dir, path = create_parquet_in_tempdir(
            "smoketest.parquet",
            num_rows=1_000,
            num_features=4,
            num_classes=2,
            num_partitions=2,
        )
    else:
        path = os.path.join(os.path.dirname(__file__), "parted.parquet")

    try:
        import ray

        ray.init()

        start = time.time()
        main(path)
        taken = time.time() - start
        print(f"TOTAL TIME TAKEN: {taken:.2f} seconds")
    finally:
        # Remove the temporary data even when Ray start-up or training fails,
        # so a failed smoke test does not leave the directory behind.
        if args.smoke_test:
            shutil.rmtree(temp_dir)


================================================
FILE: xgboost_ray/main.py
================================================
import functools
import inspect
import multiprocessing
import os
import pickle
import platform
import threading
import time
import warnings
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
from packaging.version import Version
from xgboost.core import XGBoostError

from xgboost_ray.xgb import xgboost as xgb

try:
    from xgboost.core import EarlyStopException
except ImportError:
    # The import is not available in every xgboost version. Define a local
    # stand-in so that `raise`/`except EarlyStopException` in this module
    # keeps working.

    class EarlyStopException(XGBoostError):
        pass


# From xgboost>=1.7.0, rabit is replaced by a collective communicator.
# Exactly one of `CommunicatorContext` / `rabit` is usable afterwards; the
# other is set to None, and HAS_COLLECTIVE records which API is present.
try:
    from xgboost.collective import CommunicatorContext

    rabit = None
    HAS_COLLECTIVE = True
except ImportError:
    from xgboost import rabit  # noqa

    CommunicatorContext = None
    HAS_COLLECTIVE = False

from xgboost_ray.callback import DistributedCallback, DistributedCallbackContainer
from xgboost_ray.compat import LEGACY_CALLBACK, RabitTracker, TrainingCallback

# Import Ray and the Ray-dependent helpers. RAY_INSTALLED records whether the
# import succeeded; `_assert_ray_support()` turns a failure into an ImportError.
try:
    import ray
    from ray import logger
    from ray.actor import ActorHandle
    from ray.exceptions import RayActorError, RayTaskError
    from ray.util import get_node_ip_address, placement_group
    from ray.util.annotations import DeveloperAPI, PublicAPI
    from ray.util.placement_group import (
        PlacementGroup,
        get_current_placement_group,
        remove_placement_group,
    )
    from ray.util.queue import Queue
    from ray.util.scheduling_strategies import (
        NodeAffinitySchedulingStrategy,
        PlacementGroupSchedulingStrategy,
    )

    from xgboost_ray.util import Event, MultiActorTask, force_on_current_node

    DEFAULT_PG = "default"

    RAY_INSTALLED = True
except ImportError:
    # Placeholders for a subset of the names imported above.
    ray = get_node_ip_address = Queue = Event = ActorHandle = logger = None

    # Pass-through replacement for Ray's API annotations.
    # NOTE(review): this fallback only accepts the decorated callable, while
    # the module also uses the keyword form `@PublicAPI(stability="beta")` --
    # confirm whether importing without Ray is meant to work.
    def PublicAPI(f):
        @functools.wraps(f)
        def inner_f(*args, **kwargs):
            return f(*args, **kwargs)

        return inner_f

    DeveloperAPI = PublicAPI
    RAY_INSTALLED = False

from xgboost_ray.matrix import (
    LEGACY_MATRIX,
    QUANTILE_AVAILABLE,
    RayDataIter,
    RayDeviceQuantileDMatrix,
    RayDMatrix,
    RayQuantileDMatrix,
    combine_data,
    concat_dataframes,
)
from xgboost_ray.session import (
    get_rabit_rank,
    init_session,
    put_queue,
    set_session_queue,
)

# Assume Ray Train/Tune are importable until the import below proves otherwise.
RAY_TUNE_INSTALLED = True

try:
    import ray.train
    import ray.tune
except (ImportError, ModuleNotFoundError):
    RAY_TUNE_INSTALLED = False

# The Tune helpers are only imported when Tune is available; otherwise the
# names exist but are None.
if RAY_TUNE_INSTALLED:
    from xgboost_ray.tune import _get_tune_resources, _try_add_tune_callback
else:
    _get_tune_resources = _try_add_tune_callback = None


def _get_environ(item: str, old_val: Any):
    """Return the ``RXGB_<item>`` environment override for a setting.

    Args:
        item: Setting name; the variable looked up is ``RXGB_`` + ``item``.
        old_val: Current value. It is returned unchanged when the variable is
            not set, and its type selects how the variable is parsed.

    Returns:
        The parsed override. For a ``bool`` setting the text is parsed as an
        integer first (``"0"`` -> False), for ``int``/``float`` settings it is
        converted with that type, and otherwise the raw string is returned.
    """
    raw = os.environ.get(f"RXGB_{item}")
    if raw is None:
        return old_val

    # bool must be tested before int because bool is a subclass of int.
    if isinstance(old_val, bool):
        return bool(int(raw))
    for caster in (int, float):
        if isinstance(old_val, caster):
            return caster(raw)
    return raw


@dataclass
class _XGBoostEnv:
    """Tunable settings; each can be overridden by an ``RXGB_<NAME>`` variable.

    Every attribute read passes through ``_get_environ``, so the environment
    is consulted on each access rather than once at import time.
    """

    # Use the SPREAD placement group strategy for training.
    USE_SPREAD_STRATEGY: bool = True

    # Seconds to wait for placement group creation before failing.
    PLACEMENT_GROUP_TIMEOUT_S: int = 100

    # Seconds between status reports, both while waiting for the initial
    # actors and during training.
    STATUS_FREQUENCY_S: int = 30

    # Set to disable restarting of failed actors.
    ELASTIC_RESTART_DISABLED: bool = False

    # Seconds between checks for newly available resources.
    ELASTIC_RESTART_RESOURCE_CHECK_S: int = 30

    # Seconds to wait before restarting the training loop once new actors
    # have become available.
    ELASTIC_RESTART_GRACE_PERIOD_S: int = 10

    # Allow soft placement of the communication processes. If True, the
    # Queue and Event actors may be scheduled on non-driver nodes.
    COMMUNICATION_SOFT_PLACEMENT: bool = True

    def __getattribute__(self, item):
        # Look up the stored value, let the environment override it, and
        # persist the override on the instance when it differs.
        stored = super().__getattribute__(item)
        overridden = _get_environ(item, stored)
        if overridden != stored:
            setattr(self, item, overridden)
        return super().__getattribute__(item)


# Module-wide settings object; attribute reads honor `RXGB_*` overrides.
ENV = _XGBoostEnv()

# Falls back to "0.0.0" when the `xgb` module object is falsy.
xgboost_version = xgb.__version__ if xgb else "0.0.0"

# Emitted by `_maybe_print_legacy_warning()`.
LEGACY_WARNING = (
    f"You are using `xgboost_ray` with a legacy XGBoost version "
    f"(version {xgboost_version}). While we try to support "
    f"older XGBoost versions, please note that this library is only "
    f"fully tested and supported for XGBoost >= 1.4. Please consider "
    f"upgrading your XGBoost version (`pip install -U xgboost`)."
)

# XGBoost version for comparisons
XGBOOST_VERSION = Version(xgboost_version)


class RayXGBoostTrainingError(RuntimeError):
    """Signals that the local ``xgb.train`` call did not complete.

    Raised from ``RayXGBoostActor.train()``.
    """


class RayXGBoostTrainingStopped(RuntimeError):
    """Signals that training was stopped on purpose.

    Raised from ``RayXGBoostActor.train()``.
    """


class RayXGBoostActorAvailable(RuntimeError):
    """Signals that new actors became available during elastic training.

    Raised from ``_update_scheduled_actor_states()``.
    """


def _assert_ray_support():
    """Raise ``ImportError`` unless Ray was importable at module load."""
    if RAY_INSTALLED:
        return
    message = (
        "Ray needs to be installed in order to use this module. "
        "Try: `pip install ray`"
    )
    raise ImportError(message)


def _in_ray_tune_session() -> bool:
    """Report whether the Ray Train context exposes trial resources.

    Always False when Ray Tune could not be imported.
    """
    if not RAY_TUNE_INSTALLED:
        return False
    trial_resources = ray.train.get_context().get_trial_resources()
    return trial_resources is not None


def _maybe_print_legacy_warning():
    """Warn once-per-call when a legacy matrix or callback API is in use."""
    uses_legacy_api = LEGACY_MATRIX or LEGACY_CALLBACK
    if not uses_legacy_api:
        return
    warnings.warn(LEGACY_WARNING)


def _is_client_connected() -> bool:
    """Return the Ray Client connection state, or False if it cannot be read.

    Any exception raised while querying the client is treated as
    "not connected".
    """
    try:
        connected = ray.util.client.ray.is_connected()
    except Exception:
        connected = False
    return connected


class _RabitTrackerCompatMixin:
    """Maps the worker-based method names onto the legacy tracker methods."""

    def accept_workers(self, n_workers: int):
        """Delegate to the legacy ``accept_slaves`` method."""
        legacy_accept = self.accept_slaves
        return legacy_accept(n_workers)

    def worker_envs(self):
        """Delegate to the legacy ``slave_envs`` method."""
        legacy_envs = self.slave_envs
        return legacy_envs()


class _RabitTracker(RabitTracker, _RabitTrackerCompatMixin):
    """
    This class overrides the xgboost-provided RabitTracker's ``start`` to
    switch from a daemon thread to a multiprocessing Process. This is so that
    we are able to terminate/kill the tracking process at will.
    """

    def start(self, nworker):
        """Start accepting ``nworker`` workers in a separate process.

        The process handle is stored in ``self.thread`` (the attribute name
        is kept even though it now holds a ``multiprocessing.Process``).
        """
        # TODO: refactor RabitTracker to support spawn process creation.
        # In python 3.8, spawn is used as default process creation on macOS.
        # But spawn doesn't work because `run` is not pickleable.
        # For now we force the start method to use fork.
        # NOTE: `force=True` changes the start method for the whole process.
        multiprocessing.set_start_method("fork", force=True)

        def run():
            self.accept_workers(nworker)

        self.thread = multiprocessing.Process(target=run, args=())
        self.thread.start()


def _start_rabit_tracker(num_workers: int):
    """Launch a Rabit tracker process for ``num_workers`` workers.

    The tracker is the process all training workers connect to in order to
    share their results. It runs in its own subprocess so that it can be
    stopped independently (see `_stop_rabit_tracker`), which is needed when
    actors die: a fresh tracker avoids stale connections from earlier
    training processes and allows restarting with a different worker count.

    Args:
        num_workers: Number of workers the tracker should accept.

    Returns:
        A tuple ``(process, env)`` with the tracker's process handle and the
        settings workers need to connect. ``env`` contains
        ``DMLC_NUM_WORKER`` plus whatever the tracker's ``worker_envs()``
        reports.
    """
    tracker = _RabitTracker(get_node_ip_address(), num_workers)

    # Tracker-reported entries take precedence over the worker count entry.
    worker_env = {"DMLC_NUM_WORKER": num_workers, **tracker.worker_envs()}

    tracker.start(num_workers)
    tracker_process = tracker.thread
    logger.debug(f"Started Rabit tracker process with PID {tracker_process.pid}")

    return tracker_process, worker_env


def _stop_rabit_tracker(rabit_process: multiprocessing.Process):
    """Stop the tracker process started by `_start_rabit_tracker`.

    Waits up to five seconds for the process to finish by itself and then
    terminates it.
    """
    tracker_pid = rabit_process.pid
    logger.debug(f"Stopping Rabit process with PID {tracker_pid}")
    grace_period_s = 5
    rabit_process.join(timeout=grace_period_s)
    rabit_process.terminate()


class _RabitContextBase:
    """This context is used by local training actors to connect to the
    Rabit tracker.

    Args:
        actor_id: Unique actor ID. May be an int or a string; it is
            formatted into the ``DMLC_TASK_ID`` entry.
        args: Arguments for Rabit initialisation. These are
            environment variables to configure Rabit clients. The dict is
            updated in place with the ``DMLC_TASK_ID`` entry and stored as
            ``self.args``.
    """

    def __init__(self, actor_id: Union[int, str], args: dict):
        # Format instead of concatenating: `"..." + actor_id` raised a
        # TypeError for the `int` IDs the signature advertises.
        args["DMLC_TASK_ID"] = f"[xgboost.ray]:{actor_id}"
        self.args = args


# From xgboost>=1.7.0, rabit is replaced by a collective communicator
if HAS_COLLECTIVE:

    class _RabitContext(_RabitContextBase, CommunicatorContext):
        """Context built on xgboost's ``CommunicatorContext``."""

else:

    class _RabitContext(_RabitContextBase):
        """Context that initializes and finalizes rabit directly."""

        def __init__(self, actor_id: int, args: dict):
            super().__init__(actor_id, args)
            # Rabit takes its settings as a list of encoded "key=value" items.
            self._list_args = [
                f"{key}={value}".encode() for key, value in self.args.items()
            ]

        def __enter__(self):
            xgb.rabit.init(self._list_args)

        def __exit__(self, *exc_info):
            xgb.rabit.finalize()


def _ray_get_actor_cpus():
    """Return the number of CPUs assigned to the current Ray worker.

    Returns:
        The assigned CPU amount, or ``1`` when no CPU resource is assigned.
        Callers convert the result with ``int(...)``, so a number is always
        returned.
    """
    # Get through resource IDs
    if Version(ray.__version__) < Version("2.0.0"):
        # Remove after 2.2?
        resource_ids = ray.worker.get_resource_ids()
        if "CPU" in resource_ids:
            return sum(cpu[1] for cpu in resource_ids["CPU"])
    else:
        resource_ids = ray.get_runtime_context().get_assigned_resources()
        for key in resource_ids.keys():
            if key.startswith("CPU"):
                return resource_ids[key]
    # Shared fallback for both branches. Previously this lived inside the
    # `else`, so the legacy branch fell through and returned None.
    return 1


def _ray_get_cluster_cpus():
    """Return the cluster's total ``CPU`` resource, or None if not reported."""
    cluster_resources = ray.cluster_resources()
    return cluster_resources.get("CPU")


def _get_min_node_cpus():
    """Return the smallest CPU count among the alive nodes.

    Nodes without a ``CPU`` resource entry count as ``0.0``.

    Returns:
        The minimum CPU count, or ``1.0`` when that minimum is not positive
        or when no node is reported as alive.
    """
    # `default` keeps `min` from raising ValueError on an empty node list.
    min_node_cpus = min(
        (
            node.get("Resources", {}).get("CPU", 0.0)
            for node in ray.nodes()
            if node.get("Alive", False)
        ),
        default=0.0,
    )
    return min_node_cpus if min_node_cpus > 0.0 else 1.0


def _set_omp_num_threads():
    """Sync ``OMP_NUM_THREADS`` with the CPUs assigned to this actor.

    When `_ray_get_actor_cpus()` returns a truthy amount, the variable is set
    to that amount truncated to an integer. Otherwise the variable is removed.

    Returns:
        The thread count now in effect, ``0`` if the variable was removed.
    """
    assigned_cpus = _ray_get_actor_cpus()
    if not assigned_cpus:
        os.environ.pop("OMP_NUM_THREADS", None)
        return 0

    thread_count = int(assigned_cpus)
    os.environ["OMP_NUM_THREADS"] = str(thread_count)
    return thread_count


def _prepare_dmatrix_params(param: Dict) -> Dict:
    """Run ``concat_dataframes`` over every data-carrying entry of ``param``.

    Args:
        param: Mapping that must contain all of the keys listed below.

    Returns:
        A new dict with exactly those keys and the concatenated values.
    """
    data_keys = (
        "data",
        "label",
        "weight",
        "feature_weights",
        "qid",
        "base_margin",
        "label_lower_bound",
        "label_upper_bound",
    )
    return {key: concat_dataframes(param[key]) for key in data_keys}


def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
    """Build the local xgboost matrix for one shard of ``data``.

    Exactly one of three matrix types is constructed:

    * ``xgb.QuantileDMatrix`` for a ``RayQuantileDMatrix`` (when available),
    * ``xgb.DeviceQuantileDMatrix`` fed from a ``RayDataIter`` for a
      ``RayDeviceQuantileDMatrix`` (non-legacy xgboost only),
    * a plain ``xgb.DMatrix`` otherwise.

    Args:
        data: The distributed matrix the shard belongs to. Its metadata
            (feature names/types, ``missing``, ``enable_categorical``) is
            applied to the result.
        param: Loaded shard data keyed by DMatrix argument name. This dict is
            modified in place.

    Returns:
        The constructed matrix, after ``data.update_matrix_properties`` has
        been applied to it.
    """
    if QUANTILE_AVAILABLE and isinstance(data, RayQuantileDMatrix):
        if isinstance(param["data"], list):
            qdm_param = _prepare_dmatrix_params(param)
            param.update(qdm_param)
        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical
        matrix = xgb.QuantileDMatrix(**param)
    # `elif`, not `if`: with a separate `if`, a quantile matrix fell into the
    # `else` branch below and was replaced by a plain DMatrix.
    elif not LEGACY_MATRIX and isinstance(data, RayDeviceQuantileDMatrix):
        # If we only got a single data shard, create a list so we can
        # iterate over it
        if not isinstance(param["data"], list):
            param["data"] = [param["data"]]

            # Wrap the per-shard companions the same way. `base_margin` is
            # checked on its own value; it used to re-check `param["data"]`,
            # which is already a list here, so it was never wrapped.
            for key in ("label", "weight", "feature_weights", "qid", "base_margin"):
                if not isinstance(param[key], list):
                    param[key] = [param[key]]

        param["label_lower_bound"] = [None]
        param["label_upper_bound"] = [None]

        dm_param = {
            "feature_names": data.feature_names,
            "feature_types": data.feature_types,
            "missing": data.missing,
        }

        if data.enable_categorical is not None:
            dm_param["enable_categorical"] = data.enable_categorical

        param.update(dm_param)
        it = RayDataIter(**param)
        matrix = xgb.DeviceQuantileDMatrix(it, **dm_param)
    else:
        if isinstance(param["data"], list):
            dm_param = _prepare_dmatrix_params(param)
            param.update(dm_param)

        # These are not passed to the constructor; they are applied through
        # `set_info` below instead.
        ll = param.pop("label_lower_bound", None)
        lu = param.pop("label_upper_bound", None)
        fw = param.pop("feature_weights", None)

        if LEGACY_MATRIX:
            param.pop("base_margin", None)

        # Drop `qid` when this xgboost's DMatrix does not accept it.
        if "qid" not in inspect.signature(xgb.DMatrix).parameters:
            param.pop("qid", None)

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.DMatrix(**param)

        if not LEGACY_MATRIX:
            matrix.set_info(
                label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw
            )

    data.update_matrix_properties(matrix)
    return matrix


@PublicAPI(stability="beta")
@dataclass
class RayParams:
    """Settings that control the Ray side of training and prediction.

    Args:
        num_actors: Number of parallel Ray actors.
        cpus_per_actor: Number of CPUs to be used per Ray actor.
        gpus_per_actor: Number of GPUs to be used per Ray actor.
        resources_per_actor: Dict of additional resources
            required per Ray actor.
        elastic_training: If True, training will continue with
            fewer actors if an actor fails. Default False.
        max_failed_actors: If `elastic_training` is True, this
            specifies the maximum number of failed actors with which
            we still continue training.
        max_actor_restarts: Number of retries when Ray actors fail.
            Defaults to 0 (no retries). Set to -1 for unlimited retries.
        checkpoint_frequency: How often to save checkpoints. Defaults
            to ``5`` (every 5th iteration).
        distributed_callbacks: Optional list of ``DistributedCallback``
            objects.
        verbose: Whether to output Ray-specific info messages
            during training/prediction.
        placement_options: Optional kwargs to pass to
            ``PlacementGroupFactory`` in ``get_tune_resources()``.
    """

    # Actor scheduling
    num_actors: int = 0
    cpus_per_actor: int = 0
    gpus_per_actor: int = -1
    resources_per_actor: Optional[Dict] = None

    # Fault tolerance
    elastic_training: bool = False
    max_failed_actors: int = 0
    max_actor_restarts: int = 0
    checkpoint_frequency: int = 5

    # Distributed callbacks
    distributed_callbacks: Optional[List[DistributedCallback]] = None

    verbose: Optional[bool] = None
    placement_options: Dict[str, Any] = None

    def get_tune_resources(self):
        """Return the resources to use for xgboost_ray training with Tune.

        Raises:
            ValueError: If ``num_actors`` or ``cpus_per_actor`` is not
                greater than 0.
        """
        has_cpus = self.cpus_per_actor > 0
        has_actors = self.num_actors > 0
        if not (has_cpus and has_actors):
            raise ValueError(
                "num_actors and cpus_per_actor both must be greater than 0."
            )

        # A negative GPU count (the "auto" default) is requested as zero.
        requested_gpus = max(0, self.gpus_per_actor)
        return _get_tune_resources(
            num_actors=self.num_actors,
            cpus_per_actor=self.cpus_per_actor,
            gpus_per_actor=requested_gpus,
            resources_per_actor=self.resources_per_actor,
            placement_options=self.placement_options,
        )


@dataclass
class _Checkpoint:
    """A model snapshot together with the iteration it belongs to."""

    # Boosting round of the snapshot.
    iteration: int = 0
    # Serialized model bytes; None while nothing has been stored.
    value: Optional[bytes] = None


def _validate_ray_params(ray_params: Union[None, RayParams, dict]) -> RayParams:
    """Normalize ``ray_params`` into a checked ``RayParams`` instance.

    Args:
        ray_params: A ``RayParams`` instance, a dict of ``RayParams`` keyword
            arguments, or None for the defaults.

    Returns:
        The ``RayParams`` instance. If ``verbose`` was left unset and Ray
        Tune is installed, it is set to False inside a Tune session and True
        otherwise.

    Raises:
        ValueError: If ``ray_params`` has an unsupported type, or if
            ``num_actors`` is not positive.
    """
    if ray_params is None:
        checked = RayParams()
    elif isinstance(ray_params, dict):
        checked = RayParams(**ray_params)
    elif isinstance(ray_params, RayParams):
        checked = ray_params
    else:
        raise ValueError(
            f"`ray_params` must be a `RayParams` instance, a dict, or None, "
            f"but it was {type(ray_params)}."
            f"\nFIX THIS preferably by passing a `RayParams` instance as "
            f"the `ray_params` parameter."
        )

    actor_count = checked.num_actors
    if actor_count <= 0:
        raise ValueError(
            "The `num_actors` parameter is set to 0. Please always specify "
            "the number of distributed actors you want to use."
            "\nFIX THIS by passing a `RayParams(num_actors=X)` argument "
            "to your call to xgboost_ray."
        )
    if actor_count < 2:
        warnings.warn(
            f"`num_actors` in `ray_params` is smaller than 2 "
            f"({actor_count}). XGBoost will NOT be distributed!"
        )

    if RAY_TUNE_INSTALLED and checked.verbose is None:
        # In Tune/Train sessions, reduce verbosity
        checked.verbose = not _in_ray_tune_session()
    return checked


@DeveloperAPI
class RayXGBoostActor:
    """Remote Ray XGBoost actor class.

    This remote actor handles local training and prediction of one data
    shard. It initializes a Rabit context, thus connecting to the Rabit
    all-reduce ring, and initializes local training, sending updates
    to other workers.

    The actor with rank 0 also checkpoints the model periodically and
    sends the checkpoint back to the driver.

    Args:
        rank: Rank of the actor. Must be ``0 <= rank < num_actors``.
        num_actors: Total number of actors.
        queue: Ray queue to communicate with main process.
        stop_event: Event that is polled during training; training is
            interrupted once it is set.
        checkpoint_frequency: How often to store checkpoints. Defaults
            to ``5``, saving checkpoints every 5 boosting rounds.
        distributed_callbacks: Callbacks invoked around initialization, data
            loading, training and prediction.

    """

    def __init__(
        self,
        rank: int,
        num_actors: int,
        queue: Optional[Queue] = None,
        stop_event: Optional[Event] = None,
        checkpoint_frequency: int = 5,
        distributed_callbacks: Optional[List[DistributedCallback]] = None,
    ):
        self.queue = queue
        init_session(rank, self.queue)

        self.rank = rank
        self.num_actors = num_actors

        self.checkpoint_frequency = checkpoint_frequency

        # Loaded shard parameters and local row counts, keyed by matrix.
        self._data: Dict[RayDMatrix, dict] = {}
        self._local_n: Dict[RayDMatrix, int] = {}

        self._stop_event = stop_event

        self._distributed_callbacks = DistributedCallbackContainer(
            distributed_callbacks
        )

        self._distributed_callbacks.on_init(self)
        _set_omp_num_threads()
        logger.debug(f"Initialized remote XGBoost actor with rank {self.rank}")

    def set_queue(self, queue: Queue):
        """Replace the queue used to communicate with the main process."""
        self.queue = queue
        set_session_queue(self.queue)

    def set_stop_event(self, stop_event: Event):
        """Replace the stop event checked during training."""
        self._stop_event = stop_event

    def _get_stop_event(self):
        return self._stop_event

    def pid(self):
        """Get process PID. Used for checking if still alive"""
        return os.getpid()

    def ip(self):
        """Get node IP address."""
        return get_node_ip_address()

    def _save_checkpoint_callback(self):
        """Send checkpoints to driver"""
        this = self

        class _SaveInternalCheckpointCallback(TrainingCallback):
            def after_iteration(self, model, epoch, evals_log):
                # Only rank 0 reports, every `checkpoint_frequency` rounds.
                if get_rabit_rank() == 0 and epoch % this.checkpoint_frequency == 0:
                    put_queue(_Checkpoint(epoch, pickle.dumps(model)))

            def after_training(self, model):
                # Iteration -1 marks the checkpoint of the finished model.
                if get_rabit_rank() == 0:
                    put_queue(_Checkpoint(-1, pickle.dumps(model)))
                return model

        return _SaveInternalCheckpointCallback()

    def _stop_callback(self):
        """Stop if event is set"""
        this = self
        # Keep track of initial stop event. Since we're training in a thread,
        # the stop event might be overwritten, which should be handled
        # as if the previous stop event was set.
        initial_stop_event = self._stop_event

        class _StopCallback(TrainingCallback):
            def after_iteration(self, model, epoch, evals_log):
                try:
                    if (
                        this._stop_event.is_set()
                        or this._get_stop_event() is not initial_stop_event
                    ):
                        if LEGACY_CALLBACK:
                            raise EarlyStopException(epoch)
                        # Returning True stops training
                        return True
                except RayActorError:
                    # The event could not be queried; stop training as well.
                    if LEGACY_CALLBACK:
                        raise EarlyStopException(epoch)
                    return True

        return _StopCallback()

    def load_data(self, data: RayDMatrix):
        """Load this actor's shard of ``data``; a no-op if already loaded."""
        if data in self._data:
            return

        self._distributed_callbacks.before_data_loading(self, data)

        param = data.get_data(self.rank, self.num_actors)
        if isinstance(param["data"], list):
            self._local_n[data] = sum(len(a) for a in param["data"])
        else:
            self._local_n[data] = len(param["data"])

        # set nthread for dmatrix conversion
        param["nthread"] = int(_ray_get_actor_cpus())
        self._data[data] = param

        self._distributed_callbacks.after_data_loading(self, data)

    def train(
        self,
        rabit_args: Dict[str, Any],
        return_bst: bool,
        params: Dict[str, Any],
        dtrain: RayDMatrix,
        evals: Sequence[Tuple[RayDMatrix, str]],
        *args,
        **kwargs,
    ) -> Dict[str, Any]:
        """Train on the local shard of ``dtrain``.

        Args:
            rabit_args: Settings passed to the Rabit context (a mapping; the
                context adds a ``DMLC_TASK_ID`` entry to it).
            return_bst: Whether to include the trained booster in the result.
            params: XGBoost parameters. A copy is used; ``nthread`` and
                ``n_jobs`` are filled in when neither is given.
            dtrain: Training data.
            evals: ``(matrix, name)`` pairs to evaluate on.
            *args: Forwarded to ``xgb.train``.
            **kwargs: Forwarded to ``xgb.train``. ``callbacks`` is extended
                with the checkpoint and stop callbacks.

        Returns:
            Dict with ``evals_result``, ``train_n`` (local row count) and,
            if ``return_bst`` is set, ``bst``.

        Raises:
            RayXGBoostTrainingStopped: If the stop event was set while
                training was running.
            RayXGBoostTrainingError: If training produced no result. The
                underlying exception, if any, is attached as the cause.
        """
        self._distributed_callbacks.before_train(self)

        num_threads = _set_omp_num_threads()

        local_params = params.copy()

        if "xgb_model" in kwargs:
            if isinstance(kwargs["xgb_model"], bytes):
                # bytearray type gets lost in remote actor call
                kwargs["xgb_model"] = bytearray(kwargs["xgb_model"])

        if "nthread" not in local_params and "n_jobs" not in local_params:
            if num_threads > 0:
                local_params["nthread"] = num_threads
                local_params["n_jobs"] = num_threads
            else:
                local_params["nthread"] = _ray_get_actor_cpus()
                local_params["n_jobs"] = local_params["nthread"]

        if dtrain not in self._data:
            self.load_data(dtrain)

        for deval, _name in evals:
            if deval not in self._data:
                self.load_data(deval)

        evals_result = dict()

        # Work on a copy so the caller's callback list is not extended with
        # the internal callbacks on every call.
        callbacks = list(kwargs.get("callbacks") or [])
        callbacks.append(self._save_checkpoint_callback())
        callbacks.append(self._stop_callback())
        kwargs["callbacks"] = callbacks

        result_dict = {}
        error_dict = {}

        # We run xgb.train in a thread to be able to react to the stop event.
        def _train():
            try:
                with _RabitContext(str(id(self)), rabit_args):
                    local_dtrain = _get_dmatrix(dtrain, self._data[dtrain])

                    if not local_dtrain.get_label().size:
                        raise RuntimeError(
                            "Training data has no label set. Please make sure "
                            "to set the `label` argument when initializing "
                            "`RayDMatrix()` for data you would like "
                            "to train on."
                        )

                    local_evals = []
                    for deval, name in evals:
                        local_evals.append(
                            (_get_dmatrix(deval, self._data[deval]), name)
                        )
                    if LEGACY_CALLBACK:
                        for xgb_callback in kwargs.get("callbacks", []):
                            if isinstance(xgb_callback, TrainingCallback):
                                xgb_callback.before_training(None)

                    bst = xgb.train(
                        local_params,
                        local_dtrain,
                        *args,
                        evals=local_evals,
                        evals_result=evals_result,
                        **kwargs,
                    )

                    if LEGACY_CALLBACK:
                        for xgb_callback in kwargs.get("callbacks", []):
                            if isinstance(xgb_callback, TrainingCallback):
                                xgb_callback.after_training(bst)

                    result_dict.update(
                        {
                            "bst": bst,
                            "evals_result": evals_result,
                            "train_n": self._local_n[dtrain],
                        }
                    )
            except EarlyStopException:
                # Usually this should be caught by XGBoost core.
                # Silent fail, will be raised as RayXGBoostTrainingStopped.
                return
            except Exception as e:
                # Record every failure, not only XGBoostError: an exception
                # escaping this thread would otherwise be lost and the caller
                # would see "Training failed." without a cause.
                error_dict.update({"exception": e})
                return

        thread = threading.Thread(target=_train)
        thread.daemon = True
        thread.start()
        while thread.is_alive():
            thread.join(timeout=0)
            if self._stop_event.is_set():
                raise RayXGBoostTrainingStopped("Training was interrupted.")
            time.sleep(0.1)

        if not result_dict:
            raise_from = error_dict.get("exception", None)
            raise RayXGBoostTrainingError("Training failed.") from raise_from

        thread.join()
        self._distributed_callbacks.after_train(self, result_dict)

        if not return_bst:
            result_dict.pop("bst", None)

        return result_dict

    def predict(self, model: xgb.Booster, data: RayDMatrix, **kwargs):
        """Predict on the local shard of ``data`` with ``model``.

        Extra keyword arguments are forwarded to ``model.predict``. The raw
        predictions are returned; the ``after_predict`` callbacks receive
        them wrapped in a pandas Series (1-D) or DataFrame.
        """
        self._distributed_callbacks.before_predict(self)

        _set_omp_num_threads()

        if data not in self._data:
            self.load_data(data)
        local_data = _get_dmatrix(data, self._data[data])

        predictions = model.predict(local_data, **kwargs)
        if predictions.ndim == 1:
            callback_predictions = pd.Series(predictions)
        else:
            callback_predictions = pd.DataFrame(predictions)
        self._distributed_callbacks.after_predict(self, callback_predictions)
        return predictions


@ray.remote
class _RemoteRayXGBoostActor(RayXGBoostActor):
    """Ray remote (actor) flavor of :class:`RayXGBoostActor`.

    Adds no behavior of its own. ``_create_actor()`` instantiates it via
    ``.options(...).remote(...)``.
    """

    pass


class _PrepareActorTask(MultiActorTask):
    """Bundle of remote calls that get a single actor ready for training.

    On construction this submits, in order: ``set_queue``,
    ``set_stop_event`` and one ``load_data`` call per matrix in
    ``load_data``. The resulting futures are handed to the
    ``MultiActorTask`` base class.
    """

    def __init__(
        self,
        actor: ActorHandle,
        queue: Queue,
        stop_event: Event,
        load_data: List[RayDMatrix],
    ):
        setup_refs = [
            actor.set_queue.remote(queue),
            actor.set_stop_event.remote(stop_event),
        ]
        setup_refs.extend(actor.load_data.remote(matrix) for matrix in load_data)

        super().__init__(setup_refs)


def _autodetect_resources(
    ray_params: RayParams, use_tree_method: bool = False
) -> Tuple[int, int]:
    """Resolve the per-actor CPU and GPU counts, filling in unset values.

    Args:
        ray_params: Source of ``cpus_per_actor``, ``gpus_per_actor`` and
            ``num_actors``.
        use_tree_method: If True, an unset GPU count (``-1``) resolves to
            one GPU per actor instead of zero.

    Returns:
        Tuple ``(cpus_per_actor, gpus_per_actor)``.
    """
    gpus = ray_params.gpus_per_actor
    cpus = ray_params.cpus_per_actor

    # -1 is the "not configured" marker for GPUs.
    if gpus == -1:
        gpus = 1 if use_tree_method else 0

    # A non-positive CPU count means "not configured": give every actor an
    # even share of the cluster CPUs, capped by the smallest node and never
    # less than one.
    if cpus <= 0:
        total_cpus = _ray_get_cluster_cpus() or 1
        smallest_node_cpus = int(_get_min_node_cpus() or 1)
        even_share = int(total_cpus // ray_params.num_actors)
        cpus = max(1, min(smallest_node_cpus, even_share))

    return cpus, gpus


def _create_actor(
    rank: int,
    num_actors: int,
    num_cpus_per_actor: int,
    num_gpus_per_actor: int,
    resources_per_actor: Optional[Dict] = None,
    placement_group: Optional[PlacementGroup] = None,
    queue: Optional[Queue] = None,
    checkpoint_frequency: int = 5,
    distributed_callbacks: Optional[Sequence[DistributedCallback]] = None,
) -> ActorHandle:
    """Start one remote XGBoost actor and return its handle.

    Args:
        rank: Rank passed to the actor.
        num_actors: Total number of actors, passed to the actor.
        num_cpus_per_actor: CPUs requested for the actor.
        num_gpus_per_actor: GPUs requested for the actor.
        resources_per_actor: Optional custom resources for the actor.
        placement_group: Placement group to schedule into. Falls back to
            ``DEFAULT_PG`` when not given.
        queue: Queue passed to the actor.
        checkpoint_frequency: Checkpoint frequency passed to the actor.
        distributed_callbacks: Distributed callbacks passed to the actor.

    Returns:
        Handle of the newly created actor.
    """
    # Fall back to DEFAULT_PG (changed in Ray >= 1.5.0) rather than `None`:
    # `None` would ignore the parent placement group and lead to errors,
    # e.g. when used within Ray Tune.
    strategy = PlacementGroupSchedulingStrategy(
        placement_group=placement_group or DEFAULT_PG,
        placement_group_capture_child_tasks=True,
    )

    scheduling_options = {
        "num_cpus": num_cpus_per_actor,
        "num_gpus": num_gpus_per_actor,
        "resources": resources_per_actor,
        "scheduling_strategy": strategy,
    }
    constructor_kwargs = {
        "rank": rank,
        "num_actors": num_actors,
        "queue": queue,
        "checkpoint_frequency": checkpoint_frequency,
        "distributed_callbacks": distributed_callbacks,
    }

    configured_cls = _RemoteRayXGBoostActor.options(**scheduling_options)
    return configured_cls.remote(**constructor_kwargs)


def _trigger_data_load(actor, dtrain, evals):
    wait_load = [actor.load_data.remote(dtrain)]
    for deval, _name in evals:
        wait_load.append(actor.load_data.remote(deval))
    return wait_load


def _handle_queue(queue: Queue, checkpoint: _Checkpoint, callback_returns: Dict):
    """Drain the remote queue and dispatch every item found on it.

    Workers put ``(actor_rank, item)`` tuples on the queue via
    ``xgboost_ray.session.put_queue()``. Each item is handled by type:

    - Callables are invoked right away without arguments.
    - ``_Checkpoint`` objects overwrite the attributes of ``checkpoint``,
      i.e. the driver's latest checkpoint is updated in place.
    - Everything else is appended to ``callback_returns[actor_rank]``,
      which ends up in the ``additional_results`` dict of
      :func:`train() <train>`.

    Args:
        queue: Queue to drain until it reports empty.
        checkpoint: Driver-side checkpoint object that is updated in place.
        callback_returns: Rank-indexed collection of result lists.
    """
    while not queue.empty():
        sender_rank, payload = queue.get()

        if isinstance(payload, Callable):
            payload()
            continue

        if isinstance(payload, _Checkpoint):
            checkpoint.__dict__.update(payload.__dict__)
            continue

        callback_returns[sender_rank].append(payload)

def _shutdown(
    actors: List[ActorHandle],
    pending_actors: Optional[Dict[int, Tuple[ActorHandle, _PrepareActorTask]]] = None,
    queue: Optional[Queue] = None,
    event: Optional[Event] = None,
    placement_group: Optional[PlacementGroup] = None,
    force: bool = False,
):
    """Tear down actors and the auxiliary Ray resources of a run.

    Args:
        actors: Actor handles (``None`` entries are skipped). The list is
            modified in place: every entry is set to ``None``.
        pending_actors: Optional mapping of rank to ``(actor, task)`` for
            actors that are still being prepared; these are stopped too.
        queue: Optional queue to shut down.
        event: Optional stop event to shut down.
        placement_group: Optional placement group to remove.
        force: If True, kill all actors immediately. Otherwise ask them to
            terminate and only kill them if they do not all finish within
            5 seconds.
    """
    alive_actors = [a for a in actors if a is not None]
    if pending_actors:
        alive_actors += [a for (a, _) in pending_actors.values()]

    if force:
        for actor in alive_actors:
            ray.kill(actor)
    elif alive_actors:
        done_refs = [a.__ray_terminate__.remote() for a in alive_actors]
        # Wait up to 5 seconds for *all* actors to die gracefully.
        # `ray.wait` defaults to `num_returns=1`, which would return as soon
        # as the first actor is done and report every other actor as not
        # done - so we explicitly wait for all of them.
        _, not_done = ray.wait(done_refs, num_returns=len(done_refs), timeout=5)
        if not_done:
            # Not all actors were able to die gracefully, so kill them.
            for actor in alive_actors:
                ray.kill(actor)

    # Clear the caller's list in place so no stale handles are kept around.
    actors[:] = [None] * len(actors)

    if queue:
        queue.shutdown()
    if event:
        event.shutdown()
    if placement_group:
        remove_placement_group(placement_group)


def _create_placement_group(
    cpus_per_actor, gpus_per_actor, resources_per_actor, num_actors, strategy
):
    """Create a placement group with one bundle per actor and wait for it.

    Args:
        cpus_per_actor: ``CPU`` amount of every bundle.
        gpus_per_actor: ``GPU`` amount of every bundle.
        resources_per_actor: Optional extra resources merged into every
            bundle (entries with the same key take precedence).
        num_actors: Number of bundles to request.
        strategy: Placement strategy passed to ``placement_group()``.

    Returns:
        The placement group, once it is ready.

    Raises:
        TimeoutError: If the placement group is not ready within
            ``ENV.PLACEMENT_GROUP_TIMEOUT_S`` seconds.
    """
    # Every actor gets an identical bundle for training worker colocation.
    bundle_template = {"CPU": cpus_per_actor, "GPU": gpus_per_actor}
    if resources_per_actor is not None:
        bundle_template.update(resources_per_actor)
    bundles = [dict(bundle_template) for _ in range(num_actors)]

    pg = placement_group(bundles, strategy=strategy)

    # Block until the placement group is created or the timeout expires.
    logger.debug("Waiting for placement group to start.")
    timeout = ENV.PLACEMENT_GROUP_TIMEOUT_S
    ready, _ = ray.wait([pg.ready()], timeout=timeout)

    if not ready:
        raise TimeoutError(
            f"Placement group creation timed out after {timeout} seconds. "
            "Make sure your cluster either has enough resources or use "
            "an autoscaling cluster. Current resources "
            f"available: {ray.available_resources()}, resources requested "
            f"by the placement group: {pg.bundle_specs}. "
            "You can change the timeout by setting the "
            "RXGB_PLACEMENT_GROUP_TIMEOUT_S environment variable."
        )

    logger.debug("Placement group has started.")
    return pg


def _create_communication_processes(added_tune_callback: bool = False):
    """Create the Queue and stop-Event actors used to talk to the workers.

    Both actors request zero CPUs. Inside a placement group they are
    scheduled into that group; otherwise they are pinned to the node of
    the calling process.

    Args:
        added_tune_callback: Whether a Tune callback was added. Inside a
            placement group this selects bundle index 0 instead of -1.

    Returns:
        Tuple ``(queue, stop_event)``.
    """
    surrounding_pg = get_current_placement_group()

    if surrounding_pg is None:
        # Not in a placement group: colocate Queue and Event actors with
        # the driver node.
        driver_node_id = ray.get_runtime_context().get_node_id()
        placement = {
            "scheduling_strategy": NodeAffinitySchedulingStrategy(
                node_id=driver_node_id,
                soft=ENV.COMMUNICATION_SOFT_PLACEMENT,
            )
        }
    else:
        # Already in a placement group, so use it. Specifically in Tune,
        # force Queue and StopEvent onto the same bundle as the Trainable.
        placement = {
            "placement_group": surrounding_pg,
            "placement_group_bundle_index": 0 if added_tune_callback else -1,
        }

    # num_cpus has to be set to 0 explicitly.
    actor_options = {"num_cpus": 0, **placement}

    queue = Queue(actor_options=actor_options)  # Queue actor
    stop_event = Event(actor_options=actor_options)  # Stop event actor
    return queue, stop_event


def _validate_kwargs_for_func(kwargs: Dict[str, Any], func: Callable, func_name: str):
    """Raise exception if kwargs are not valid for a given function."""
    sig = inspect.signature(func)
    try:
        sig.bind_partial(**kwargs)
    except TypeError as e:
        # Try to find set of invalid kwargs
        valid_keys = inspect.getfullargspec(func)[0]
        invalid_kwargs = [k for k in kwargs if k not in valid_keys]

        raise TypeError(
            f"Got invalid keyword arguments to be passed to `{func_name}`. "
            f"Please check these arguments: {invalid_kwargs}"
        ) from e


@dataclass
class _TrainingState:
    """Mutable state shared between ``train()`` and its ``_train()`` calls.

    ``train()`` builds a fresh instance for every (re)start attempt, but
    passes in the same actor list, checkpoint and results dict each time.
    """

    # One slot per actor rank. `_train()` sets a slot to None when the
    # actor at that rank is found to have failed.
    actors: List[Optional[ActorHandle]]
    # Queue actor through which workers send items to the driver.
    queue: Queue
    # Event actor that is set to tell the actors to stop training.
    stop_event: Event

    # Latest checkpoint; updated in place while the queue is handled.
    checkpoint: _Checkpoint
    # Results dict returned alongside the booster (`_train()` stores e.g.
    # "callback_returns" and "total_n" in it).
    additional_results: Dict

    # Set to `time.time()` by `_train()` right before training is triggered;
    # 0.0 means training has not been started.
    training_started_at: float = 0.0

    # Placement group the actors are created in, if any.
    placement_group: Optional[PlacementGroup] = None

    # Ranks for which `_train()` has to create an actor on its next call.
    failed_actor_ranks: set = field(default_factory=set)

    # Last time we checked resources to schedule new actors
    last_resource_check_at: float = 0
    # Rank -> (actor, prepare task) for actors that are not part of the
    # training run yet. NOTE(review): presumably maintained by the elastic
    # training helpers - confirm in `xgboost_ray.elastic`.
    pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = field(
        default_factory=dict
    )
    # Reset to None at the start of every `_train()` call ("un-schedule
    # possible scheduled restarts").
    restart_training_at: Optional[float] = None


def _train(
    params: Dict,
    dtrain: RayDMatrix,
    *args,
    evals=(),
    ray_params: RayParams,
    cpus_per_actor: int,
    gpus_per_actor: int,
    _training_state: _TrainingState,
    **kwargs,
) -> Tuple[xgb.Booster, Dict, Dict]:
    """This is the local train function wrapped by :func:`train() <train>`.

    This function can be thought of one invocation of a multi-actor xgboost
    training run. It starts the required number of actors, triggers data
    loading, collects the results, and handles (i.e. registers) actor failures
    - but it does not handle fault tolerance or general training setup.

    Generally, this function is called one or multiple times by the
    :func:`train() <train>` function. It is called exactly once if no
    errors occur. It is called more than once if errors occurred (e.g. an
    actor died) and failure handling is enabled.

    Args:
        params: XGBoost parameter dict. It is copied, the original is not
            modified.
        dtrain: Training data.
        *args: Positional arguments forwarded to the actors' ``train()``.
        evals: Sequence of ``(RayDMatrix, name)`` evaluation sets.
        ray_params: Ray-specific configuration.
        cpus_per_actor: CPUs assigned to every actor. Also used as the
            default thread count.
        gpus_per_actor: GPUs assigned to every actor.
        _training_state: Shared state (actors, queue, checkpoint, ...) that
            is updated in place.
        **kwargs: Keyword arguments forwarded to the actors' ``train()``.

    Returns:
        Tuple ``(bst, evals_result, additional_results)``.

    Raises:
        ValueError: If ``nthread``/``n_jobs`` exceed ``cpus_per_actor``.
        RuntimeError: If an actor should be created for a rank that already
            has one.
        RayActorError: If preparing the actors or the training itself failed.
            The original error is attached as the cause.
    """
    from xgboost_ray.elastic import (
        _get_actor_alive_status,
        _maybe_schedule_new_actors,
        _update_scheduled_actor_states,
    )

    # Do not modify original parameters
    params = params.copy()

    # Un-schedule possible scheduled restarts
    _training_state.restart_training_at = None

    if "nthread" in params or "n_jobs" in params:
        if ("nthread" in params and params["nthread"] > cpus_per_actor) or (
            "n_jobs" in params and params["n_jobs"] > cpus_per_actor
        ):
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`."
            )
    else:
        params["nthread"] = cpus_per_actor
        params["n_jobs"] = cpus_per_actor

    if ray_params.verbose:
        maybe_log = logger.info
        params.setdefault("verbosity", 1)
    else:
        maybe_log = logger.debug
        params.setdefault("verbosity", 0)

    # This is a callback that handles actor failures.
    # We identify the rank of the failed actor, add this to a set of
    # failed actors (which we might want to restart later), and set its
    # entry in the actor list to None.
    def handle_actor_failure(actor_id):
        rank = _training_state.actors.index(actor_id)
        _training_state.failed_actor_ranks.add(rank)
        _training_state.actors[rank] = None

    # Here we create new actors. In the first invocation of _train(), this
    # will be all actors. In future invocations, this may be less than
    # the num_actors setting, depending on the failure mode.
    newly_created = 0
    for i in list(_training_state.failed_actor_ranks):
        if _training_state.actors[i] is not None:
            raise RuntimeError(
                f"Trying to create actor with rank {i}, but it already exists."
            )
        actor = _create_actor(
            rank=i,
            num_actors=ray_params.num_actors,
            num_cpus_per_actor=cpus_per_actor,
            num_gpus_per_actor=gpus_per_actor,
            resources_per_actor=ray_params.resources_per_actor,
            placement_group=_training_state.placement_group,
            queue=_training_state.queue,
            checkpoint_frequency=ray_params.checkpoint_frequency,
            distributed_callbacks=ray_params.distributed_callbacks,
        )
        # Set actor entry in our list
        _training_state.actors[i] = actor
        # Remove from this set so it is not created again
        _training_state.failed_actor_ranks.remove(i)
        newly_created += 1

    alive_actors = sum(1 for a in _training_state.actors if a is not None)

    maybe_log(
        f"[RayXGBoost] Created {newly_created} new actors "
        f"({alive_actors} total actors). Waiting until actors "
        f"are ready for training."
    )

    # For distributed datasets (e.g. Modin), this will initialize
    # (and fix) the assignment of data shards to actor ranks
    dtrain.assert_enough_shards_for_actors(num_actors=ray_params.num_actors)
    dtrain.assign_shards_to_actors(_training_state.actors)
    for deval, _ in evals:
        deval.assert_enough_shards_for_actors(num_actors=ray_params.num_actors)
        deval.assign_shards_to_actors(_training_state.actors)

    load_data = [dtrain] + [eval[0] for eval in evals]

    prepare_actor_tasks = [
        _PrepareActorTask(
            actor,
            # Maybe we got a new Queue actor, so send it to all actors.
            queue=_training_state.queue,
            # Maybe we got a new Event actor, so send it to all actors.
            stop_event=_training_state.stop_event,
            # Trigger data loading
            load_data=load_data,
        )
        for actor in _training_state.actors
        if actor is not None
    ]

    start_wait = time.time()
    last_status = start_wait
    try:
        # Construct list before calling all() to force evaluation of
        # every task (all() on a generator would short-circuit).
        ready_states = [task.is_ready() for task in prepare_actor_tasks]
        while not all(ready_states):
            if time.time() >= last_status + ENV.STATUS_FREQUENCY_S:
                wait_time = time.time() - start_wait
                logger.info(
                    f"Waiting until actors are ready "
                    f"({wait_time:.0f} seconds passed)."
                )
                last_status = time.time()
            time.sleep(0.1)
            ready_states = [task.is_ready() for task in prepare_actor_tasks]

    except Exception as exc:
        _training_state.stop_event.set()
        _get_actor_alive_status(_training_state.actors, handle_actor_failure)
        raise RayActorError from exc

    maybe_log("[RayXGBoost] Starting XGBoost training.")

    # Load checkpoint if we have one. In that case we need to adjust the
    # number of training rounds.
    # This is done *before* the Rabit tracker is started, so that the early
    # return below does not leave a tracker process behind.
    if _training_state.checkpoint.value:
        kwargs["xgb_model"] = pickle.loads(_training_state.checkpoint.value)
        if _training_state.checkpoint.iteration == -1:
            # -1 means training already finished.
            logger.error(
                "Trying to load continue from checkpoint, but the checkpoint "
                "indicates training already finished. Returning last "
                "checkpointed model instead."
            )
            return kwargs["xgb_model"], {}, _training_state.additional_results

    # Start Rabit tracker for gradient sharing
    rabit_process, rabit_args = _start_rabit_tracker(alive_actors)

    # The callback_returns dict contains actor-rank indexed lists of
    # results obtained through the `put_queue` function, usually
    # sent via callbacks.
    callback_returns = _training_state.additional_results.get("callback_returns")
    if callback_returns is None:
        callback_returns = [list() for _ in range(len(_training_state.actors))]
        _training_state.additional_results["callback_returns"] = callback_returns

    _training_state.training_started_at = time.time()

    # Trigger the train function
    live_actors = [actor for actor in _training_state.actors if actor is not None]
    training_futures = [
        actor.train.remote(
            rabit_args, i == 0, params, dtrain, evals, *args, **kwargs  # return_bst
        )
        for i, actor in enumerate(live_actors)
    ]

    # Failure handling loop. Here we wait until all training tasks finished.
    # If a training task fails, we stop training on the remaining actors,
    # check which ones are still alive, and raise the error.
    # The train() wrapper function will then handle the error.
    start_wait = time.time()
    last_status = start_wait

    # When the number of trees/dataset size is very small,
    # training can be too fast and finish before the queue Actor
    # gets to process the calls it has received. This is a very rare edge
    # case, but it can show up in CI.
    # In order to mitigate, if the queue has not been handled before,
    # we simply wait a moment before checking it one last time.
    has_queue_been_handled = False
    try:
        not_ready = training_futures
        while not_ready:
            if _training_state.queue:
                has_queue_been_handled = True
                _handle_queue(
                    queue=_training_state.queue,
                    checkpoint=_training_state.checkpoint,
                    callback_returns=callback_returns,
                )

            if ray_params.elastic_training and not ENV.ELASTIC_RESTART_DISABLED:
                _maybe_schedule_new_actors(
                    training_state=_training_state,
                    num_cpus_per_actor=cpus_per_actor,
                    num_gpus_per_actor=gpus_per_actor,
                    resources_per_actor=ray_params.resources_per_actor,
                    ray_params=ray_params,
                    load_data=load_data,
                )

                # This may raise RayXGBoostActorAvailable
                _update_scheduled_actor_states(_training_state)

            if time.time() >= last_status + ENV.STATUS_FREQUENCY_S:
                wait_time = time.time() - start_wait
                logger.info(
                    f"Training in progress "
                    f"({wait_time:.0f} seconds since last restart)."
                )
                last_status = time.time()

            ready, not_ready = ray.wait(
                not_ready, num_returns=len(not_ready), timeout=1
            )
            # Surfaces exceptions of finished (failed) training tasks.
            ray.get(ready)

        # Get items from queue one last time
        if not has_queue_been_handled:
            time.sleep(1)
        if _training_state.queue:
            _handle_queue(
                queue=_training_state.queue,
                checkpoint=_training_state.checkpoint,
                callback_returns=callback_returns,
            )

    # The inner loop should catch all exceptions
    except Exception as exc:
        logger.debug(f"Caught exception in training loop: {exc}")

        # Stop all other actors from training
        _training_state.stop_event.set()

        # Check which actors are still alive
        _get_actor_alive_status(_training_state.actors, handle_actor_failure)

        # Todo: Try to fetch newer checkpoint, store in `_checkpoint`
        # Shut down rabit
        _stop_rabit_tracker(rabit_process)

        raise RayActorError from exc

    # Training is now complete.
    # Stop Rabit tracking process
    _stop_rabit_tracker(rabit_process)

    # Get all results from all actors.
    all_results: List[Dict[str, Any]] = ray.get(training_futures)

    # All results should be the same because of Rabit tracking. But only
    # the first one actually returns its bst object.
    bst = all_results[0]["bst"]
    evals_result = all_results[0]["evals_result"]

    if callback_returns:
        _training_state.additional_results["callback_returns"] = callback_returns

    total_n = sum(res["train_n"] or 0 for res in all_results)

    _training_state.additional_results["total_n"] = total_n

    return bst, evals_result, _training_state.additional_results


@PublicAPI(stability="beta")
def train(
    params: Dict,
    dtrain: RayDMatrix,
    num_boost_round: int = 10,
    *args,
    evals: Union[List[Tuple[RayDMatrix, str]], Tuple[RayDMatrix, str]] = (),
    evals_result: Optional[Dict] = None,
    additional_results: Optional[Dict] = None,
    ray_params: Union[None, RayParams, Dict] = None,
    _remote: Optional[bool] = None,
    **kwargs,
) -> xgb.Booster:
    """Distributed XGBoost training via Ray.

    This function will connect to a Ray cluster, create ``num_actors``
    remote actors, send data shards to them, and have them train an
    XGBoost classifier. The XGBoost parameters will be shared and combined
    via Rabit's all-reduce protocol.

    If running inside a Ray Tune session, this function will automatically
    handle results to tune for hyperparameter search.

    Failure handling:

    XGBoost on Ray supports automatic failure handling that can be configured
    with the :class:`ray_params <RayParams>` argument. If an actor or local
    training task dies, the Ray actor is marked as dead, and there are
    three options on how to proceed.

    First, if ``ray_params.elastic_training`` is ``True`` and
    the number of dead actors is below ``ray_params.max_failed_actors``,
    training will continue right away with fewer actors. No data will be
    loaded again and the latest available checkpoint will be used.
    A maximum of ``ray_params.max_actor_restarts`` restarts will be tried
    before exiting.

    Second, if ``ray_params.elastic_training`` is ``False`` and
    the number of restarts is below ``ray_params.max_actor_restarts``,
    Ray will try to schedule the dead actor again, load the data shard
    on this actor, and then continue training from the latest checkpoint.

    Third, if none of the above is the case, training is aborted.

    Args:
        params: parameter dict passed to ``xgboost.train()``
        dtrain: Data object containing the training data.
        num_boost_round: Number of boosting rounds. After a restart only
            the rounds not covered by the latest checkpoint are trained.
        evals: ``evals`` tuple passed to ``xgboost.train()``.
        evals_result: Dict to store evaluation results in.
        additional_results: Dict to store additional results.
        ray_params: Parameters to configure
            Ray-specific behavior. See :class:`RayParams` for a list of valid
            configuration parameters.
        _remote: Whether to run the driver process in a remote
            function. This is enabled by default in Ray client mode.
        **kwargs: Keyword arguments will be passed to the local
            `xgb.train()` calls.

    Returns: An ``xgboost.Booster`` object.

    Raises:
        RuntimeError: On Windows, or if an actor died and training cannot
            be continued (retries exhausted or too many dead actors).
        ImportError: If xgboost is not installed.
        TypeError: If ``kwargs`` are not valid for ``xgb.train()``.
        ValueError: If the passed data or configuration is invalid.
    """
    os.environ.setdefault("RAY_IGNORE_UNHANDLED_ERRORS", "1")

    if platform.system() == "Windows":
        raise RuntimeError("xgboost-ray training currently does not support Windows.")

    if xgb is None:
        raise ImportError(
            "xgboost package is not installed. XGBoost-Ray WILL NOT WORK. "
            'FIX THIS by running `pip install "xgboost-ray"`.'
        )

    if _remote is None:
        _remote = _is_client_connected() and not _in_ray_tune_session()

    if not ray.is_initialized():
        ray.init()

    if _remote:
        # Run this function as a remote function to support Ray client mode.
        @ray.remote(num_cpus=0)
        def _wrapped(*args, **kwargs):
            _evals_result = {}
            _additional_results = {}
            bst = train(
                *args,
                num_boost_round=num_boost_round,
                evals_result=_evals_result,
                additional_results=_additional_results,
                **kwargs,
            )
            return bst, _evals_result, _additional_results

        # Make sure that train is called on the server node.
        _wrapped = force_on_current_node(_wrapped)

        bst, train_evals_result, train_additional_results = ray.get(
            _wrapped.remote(
                params,
                dtrain,
                *args,
                evals=evals,
                ray_params=ray_params,
                _remote=False,
                **kwargs,
            )
        )
        if isinstance(evals_result, dict):
            evals_result.update(train_evals_result)
        if isinstance(additional_results, dict):
            additional_results.update(train_additional_results)
        return bst

    _maybe_print_legacy_warning()
    # may raise TypeError
    _validate_kwargs_for_func(kwargs, xgb.train, "xgb.train()")

    start_time = time.time()

    ray_params = _validate_ray_params(ray_params)

    max_actor_restarts = (
        ray_params.max_actor_restarts
        if ray_params.max_actor_restarts >= 0
        else float("inf")
    )
    _assert_ray_support()

    if not isinstance(dtrain, RayDMatrix):
        raise ValueError(
            "The `dtrain` argument passed to `train()` is not a RayDMatrix, "
            "but of type {}. "
            "\nFIX THIS by instantiating a RayDMatrix first: "
            "`dtrain = RayDMatrix(data=data, label=label)`.".format(type(dtrain))
        )

    if RAY_TUNE_INSTALLED:
        added_tune_callback = _try_add_tune_callback(kwargs)
    else:
        added_tune_callback = False

    # Tune currently does not support elastic training.
    # The opt-out flag has to be parsed explicitly: `bool("0")` is True, so
    # a plain `bool(os.getenv(...))` would treat the default as "allowed"
    # and this check would never trigger.
    allow_elastic_tune = os.getenv(
        "RXGB_ALLOW_ELASTIC_TUNE", "0"
    ).strip().lower() not in ("", "0", "false", "no", "off")
    if added_tune_callback and ray_params.elastic_training and not allow_elastic_tune:
        raise ValueError(
            "Elastic Training cannot be used with Ray Tune. "
            "Please disable elastic_training in RayParams in "
            "order to use xgboost_ray with Tune."
        )

    if added_tune_callback or get_current_placement_group():
        # Don't autodetect resources when used with Tune.
        cpus_per_actor = ray_params.cpus_per_actor
        gpus_per_actor = max(0, ray_params.gpus_per_actor)
    else:
        cpus_per_actor, gpus_per_actor = _autodetect_resources(
            ray_params=ray_params,
            use_tree_method="tree_method" in params
            and params["tree_method"] is not None
            and params["tree_method"].startswith("gpu"),
        )

    tree_method = params.get("tree_method", "auto") or "auto"

    # preemptively raise exceptions with bad params
    if tree_method == "exact":
        raise ValueError("`exact` tree method doesn't support distributed training.")

    if params.get("updater", None) == "grow_colmaker":
        raise ValueError(
            "`grow_colmaker` updater doesn't support distributed training."
        )

    if gpus_per_actor > 0 and not tree_method.startswith("gpu_"):
        warnings.warn(
            f"GPUs have been assigned to the actors, but the current XGBoost "
            f"tree method is set to `{tree_method}`. Thus, GPUs will "
            f"currently not be used. To enable GPUs usage, please set the "
            f"`tree_method` to a GPU-compatible option, "
            f"e.g. `gpu_hist`."
        )

    if gpus_per_actor == 0 and cpus_per_actor == 0:
        raise ValueError(
            "cpus_per_actor and gpus_per_actor both cannot be "
            "0. Are you sure your cluster has CPUs available?"
        )

    if ray_params.elastic_training and ray_params.max_failed_actors == 0:
        raise ValueError(
            "Elastic training enabled but the maximum number of failed "
            "actors is set to 0. This means that elastic training is "
            "effectively disabled. Please set `RayParams.max_failed_actors` "
            "to something larger than 0 to enable elastic training."
        )

    if ray_params.elastic_training and ray_params.max_actor_restarts == 0:
        raise ValueError(
            "Elastic training enabled but the maximum number of actor "
            "restarts is set to 0. This means that elastic training is "
            "effectively disabled. Please set `RayParams.max_actor_restarts` "
            "to something larger than 0 to enable elastic training."
        )

    if not dtrain.has_label:
        raise ValueError(
            "Training data has no label set. Please make sure to set "
            "the `label` argument when initializing `RayDMatrix()` "
            "for data you would like to train on."
        )

    if not dtrain.loaded and not dtrain.distributed:
        dtrain.load_data(ray_params.num_actors)

    for deval, _name in evals:
        if not deval.has_label:
            raise ValueError(
                "Evaluation data has no label set. Please make sure to set "
                "the `label` argument when initializing `RayDMatrix()` "
                "for data you would like to evaluate on."
            )
        if not deval.loaded and not deval.distributed:
            deval.load_data(ray_params.num_actors)

    bst = None
    train_evals_result = {}
    train_additional_results = {}

    tries = 0
    checkpoint = _Checkpoint()  # Keep track of latest checkpoint
    current_results = {}  # Keep track of additional results
    actors = [None] * ray_params.num_actors  # All active actors
    pending_actors = {}

    # Create the Queue and Event actors.
    queue, stop_event = _create_communication_processes(added_tune_callback)

    placement_strategy = None
    if not ray_params.elastic_training:
        if added_tune_callback or get_current_placement_group():
            # Tune is using placement groups, so the strategy has already
            # been set. Don't create an additional placement_group here.
            placement_strategy = None
        elif bool(ENV.USE_SPREAD_STRATEGY):
            placement_strategy = "SPREAD"

    if placement_strategy is not None:
        pg = _create_placement_group(
            cpus_per_actor,
            gpus_per_actor,
            ray_params.resources_per_actor,
            ray_params.num_actors,
            placement_strategy,
        )
    else:
        pg = None

    start_actor_ranks = set(range(ray_params.num_actors))  # Start these

    total_training_time = 0.0
    boost_rounds_left = num_boost_round
    last_checkpoint_value = checkpoint.value
    while tries <= max_actor_restarts:
        # Only update number of iterations if the checkpoint changed
        # If it didn't change, we already subtracted the iterations.
        if checkpoint.iteration >= 0 and checkpoint.value != last_checkpoint_value:
            boost_rounds_left -= checkpoint.iteration + 1

        last_checkpoint_value = checkpoint.value

        logger.debug(f"Boost rounds left: {boost_rounds_left}")

        training_state = _TrainingState(
            actors=actors,
            queue=queue,
            stop_event=stop_event,
            checkpoint=checkpoint,
            additional_results=current_results,
            training_started_at=0.0,
            placement_group=pg,
            failed_actor_ranks=start_actor_ranks,
            pending_actors=pending_actors,
        )

        try:
            bst, train_evals_result, train_additional_results = _train(
                params,
                dtrain,
                boost_rounds_left,
                *args,
                evals=evals,
                ray_params=ray_params,
                cpus_per_actor=cpus_per_actor,
                gpus_per_actor=gpus_per_actor,
                _training_state=training_state,
                **kwargs,
            )
            if training_state.training_started_at > 0.0:
                total_training_time += time.time() - training_state.training_started_at
            break
        except (RayActorError, RayTaskError) as exc:
            if training_state.training_started_at > 0.0:
                total_training_time += time.time() - training_state.training_started_at
            alive_actors = sum(1 for a in actors if a is not None)
            start_again = False
            if ray_params.elastic_training:
                if alive_actors < ray_params.num_actors - ray_params.max_failed_actors:
                    raise RuntimeError(
                        "A Ray actor died during training and the maximum "
                        "number of dead actors in elastic training was "
                        "reached. Shutting down training."
                    ) from exc

                # Do not start new actors before resuming training
                # (this might still restart actors during training)
                start_actor_ranks.clear()

                if exc.__cause__ and isinstance(
                    exc.__cause__, RayXGBoostActorAvailable
                ):
                    # New actor available, integrate into training loop
                    logger.info(
                        f"A new actor became available. Re-starting training "
                        f"from latest checkpoint with new actor. "
                        f"This will use {alive_actors} existing actors and "
                        f"start {len(start_actor_ranks)} new actors. "
                        f"Sleeping for 10 seconds for cleanup."
                    )
                    tries -= 1  # This is deliberate so shouldn't count
                    start_again = True

                elif tries + 1 <= max_actor_restarts:
                    if exc.__cause__ and isinstance(
                        exc.__cause__, RayXGBoostTrainingError
                    ):
                        logger.warning(f"Caught exception: {exc.__cause__}")
                    logger.warning(
                        f"A Ray actor died during training. Trying to "
                        f"continue training on the remaining actors. "
                        f"This will use {alive_actors} existing actors and "
                        f"start {len(start_actor_ranks)} new actors. "
                        f"Sleeping for 10 seconds for cleanup."
                    )
                    start_again = True

            elif tries + 1 <= max_actor_restarts:
                if exc.__cause__ and isinstance(exc.__cause__, RayXGBoostTrainingError):
                    logger.warning(f"Caught exception: {exc.__cause__}")
                logger.warning(
                    f"A Ray actor died during training. Trying to restart "
                    f"and continue training from last checkpoint "
                    f"(restart {tries + 1} of {max_actor_restarts}). "
                    f"This will use {alive_actors} existing actors and start "
                    f"{len(start_actor_ranks)} new actors. "
                    f"Sleeping for 10 seconds for cleanup."
                )
                start_again = True

            if start_again:
                # 2 x 5 seconds = the 10 seconds of cleanup time announced
                # in the log messages above.
                time.sleep(5)
                queue.shutdown()
                stop_event.shutdown()
                time.sleep(5)
                # Re-create with the same placement as the initial
                # Queue/Event actors (same bundle as the Trainable in Tune).
                queue, stop_event = _create_communication_processes(
                    added_tune_callback
                )
            else:
                raise RuntimeError(
                    f"A Ray actor died during training and the maximum number "
                    f"of retries ({max_actor_restarts}) is exhausted."
                ) from exc
            tries += 1

    total_time = time.time() - start_time

    train_additional_results["training_time_s"] = total_training_time
    train_additional_results["total_time_s"] = total_time

    if ray_params.verbose:
        maybe_log = logger.info
    else:
        maybe_log = logger.debug

    # `total_n` is missing if `_train()` returned a model straight from a
    # checkpoint that was already finished, so default it for the log only.
    maybe_log(
        "[RayXGBoost] Finished XGBoost training on training data "
        "with total N={total_n:,} in {total_time_s:.2f} seconds "
        "({training_time_s:.2f} pure XGBoost training time).".format(
            **{"total_n": 0, **train_additional_results}
        )
    )

    _shutdown(
        actors=actors,
        pending_actors=pending_actors,
        queue=queue,
        event=stop_event,
        placement_group=pg,
        force=False,
    )

    if isinstance(evals_result, dict):
        evals_result.update(train_evals_result)
    if isinstance(additional_results, dict):
        additional_results.update(train_additional_results)

    return bst


def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams, **kwargs):
    """Run one distributed prediction attempt and return the combined result.

    Spawns ``ray_params.num_actors`` remote actors, triggers data loading on
    each of them, places the model in the object store, asks every actor to
    predict on its shard and finally merges the per-actor results with
    ``combine_data`` according to ``data.sharding``.

    If waiting on the data loading or on the prediction futures raises, the
    actors are shut down with ``force=True`` and the exception is re-raised.
    """
    _assert_ray_support()

    maybe_log = logger.info if ray_params.verbose else logger.debug

    if not ray.is_initialized():
        ray.init()

    def _spawn(rank: int):
        # A negative GPU setting is passed on to the actor as zero GPUs.
        gpus = ray_params.gpus_per_actor
        return _create_actor(
            rank=rank,
            num_actors=ray_params.num_actors,
            num_cpus_per_actor=ray_params.cpus_per_actor,
            num_gpus_per_actor=gpus if gpus >= 0 else 0,
            resources_per_actor=ray_params.resources_per_actor,
            distributed_callbacks=ray_params.distributed_callbacks,
        )

    # Create remote actors
    actors = [_spawn(rank) for rank in range(ray_params.num_actors)]
    maybe_log(f"[RayXGBoost] Created {len(actors)} remote actors.")

    def _get_or_abort(object_refs):
        # Resolve the futures; on any error tear the actors down and re-raise.
        try:
            return ray.get(object_refs)
        except Exception as exc:
            logger.warning(f"Caught an error during prediction: {str(exc)}")
            _shutdown(actors=actors, force=True)
            raise

    # Split data across workers
    pending_loads = []
    for worker in actors:
        pending_loads += _trigger_data_load(worker, data, [])
    _get_or_abort(pending_loads)

    # Put model into object store
    model_ref = ray.put(model)

    maybe_log("[RayXGBoost] Starting XGBoost prediction.")

    # Predict on every actor's shard
    futures = [worker.predict.remote(model_ref, data, **kwargs) for worker in actors]
    actor_results = _get_or_abort(futures)

    _shutdown(actors=actors, force=False)

    return combine_data(data.sharding, actor_results)


@PublicAPI(stability="beta")
def predict(
    model: xgb.Booster,
    data: RayDMatrix,
    ray_params: Union[None, RayParams, Dict] = None,
    _remote: Optional[bool] = None,
    **kwargs,
) -> Optional[np.ndarray]:
    """Distributed XGBoost predict via Ray.

    This function will connect to a Ray cluster, create ``num_actors``
    remote actors, send data shards to them, and have them predict labels
    using an XGBoost booster model. The results are then combined and
    returned.

    Args:
        model: Booster object to call for prediction.
        data: Data object containing the prediction data.
        ray_params: Parameters to configure
            Ray-specific behavior. See :class:`RayParams` for a list of valid
            configuration parameters.
        _remote: Whether to run the driver process in a remote
            function. This is enabled by default in Ray client mode.
        **kwargs: Keyword arguments will be passed to the local
            `xgb.predict()` calls.

    Returns: ``np.ndarray`` containing the predicted labels.

    Raises:
        ImportError: If the ``xgboost`` package is not available.
        ValueError: If ``data`` is not a ``RayDMatrix``.
        RuntimeError: If an actor died and the configured number of restarts
            is exhausted. The triggering ``RayActorError`` is attached as
            ``__cause__``.

    """
    os.environ.setdefault("RAY_IGNORE_UNHANDLED_ERRORS", "1")

    if xgb is None:
        raise ImportError(
            "xgboost package is not installed. XGBoost-Ray WILL NOT WORK. "
            'FIX THIS by running `pip install "xgboost-ray"`.'
        )

    if _remote is None:
        _remote = _is_client_connected() and not _in_ray_tune_session()

    if not ray.is_initialized():
        ray.init()

    if _remote:
        # Re-enter this function inside a zero-CPU remote task so the driver
        # logic runs on the cluster; `_remote=False` stops the recursion.
        return ray.get(
            ray.remote(num_cpus=0)(predict).remote(
                model, data, ray_params, _remote=False, **kwargs
            )
        )

    _maybe_print_legacy_warning()

    ray_params = _validate_ray_params(ray_params)

    # A negative setting means "retry without limit".
    max_actor_restarts = (
        ray_params.max_actor_restarts
        if ray_params.max_actor_restarts >= 0
        else float("inf")
    )
    _assert_ray_support()

    if not isinstance(data, RayDMatrix):
        raise ValueError(
            "The `data` argument passed to `predict()` is not a RayDMatrix, "
            "but of type {}. "
            "\nFIX THIS by instantiating a RayDMatrix first: "
            "`data = RayDMatrix(data=data)`.".format(type(data))
        )

    tries = 0
    while tries <= max_actor_restarts:
        try:
            return _predict(model, data, ray_params=ray_params, **kwargs)
        except RayActorError as exc:
            if tries + 1 <= max_actor_restarts:
                logger.warning(
                    "A Ray actor died during prediction. Trying to restart "
                    "prediction from scratch. "
                    "Sleeping for 10 seconds for cleanup."
                )
                time.sleep(10)
            else:
                # Keep the actor failure as the cause so the root error is
                # visible in the traceback.
                raise RuntimeError(
                    "A Ray actor died during prediction and the maximum "
                    "number of retries ({}) is exhausted.".format(max_actor_restarts)
                ) from exc
            tries += 1
    return None


================================================
FILE: xgboost_ray/matrix.py
================================================
import glob
import uuid
from enum import Enum
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
)

from ray.actor import ActorHandle

try:
    import cupy as cp
except ImportError:
    cp = None

import os

import numpy as np
import pandas as pd
import ray
from ray import logger
from ray.util.annotations import DeveloperAPI, PublicAPI

from xgboost_ray.data_sources import DataSource, RayFileType, data_sources

try:
    from ray.data.dataset import Dataset as RayDataset
except ImportError:

    class RayDataset:
        pass


try:
    from xgboost.core import DataIter

    LEGACY_MATRIX = False
except ImportError:
    DataIter = object
    LEGACY_MATRIX = True

try:
    from xgboost.core import QuantileDmatrix

    QUANTILE_AVAILABLE = True
except ImportError:
    QuantileDmatrix = object
    QUANTILE_AVAILABLE = False

if TYPE_CHECKING:
    from xgboost_ray.xgb import xgboost as xgb

Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series]


def concat_dataframes(dfs: List[Optional[pd.DataFrame]]):
    """Stack all non-``None`` frames in ``dfs`` into one frame.

    ``None`` entries are dropped, and the result gets a fresh 0..n-1 index.
    """
    present = []
    for frame in dfs:
        if frame is None:
            continue
        present.append(frame)
    return pd.concat(present, ignore_index=True, copy=False)


def ensure_sorted_by_qid(
    df: pd.DataFrame, qid: Data
) -> Tuple[Union[np.array, str], pd.DataFrame]:
    """Return ``df`` ordered so that the query ids are sorted.

    Args:
        df: Dataframe holding the rows to order.
        qid: Query ids. Either the name of a column of ``df``, or the ids
            themselves as a numpy array, a pandas Series, or a single-column
            pandas DataFrame.

    Returns:
        A ``(qid, df)`` tuple. If the ids are already monotonic (increasing
        or decreasing), the ids are returned as a Series together with the
        untouched ``df``. Otherwise ``df`` is sorted by the ids; the first
        element is then the column name (if ``qid`` was a column name) or the
        sorted ids.

    Raises:
        ValueError: If ``qid`` is a DataFrame with other than exactly one
            column, or if ``qid`` is of an unsupported type.
    """
    _qid: Optional[pd.Series] = None
    if isinstance(qid, str):
        _qid = df[qid]
    elif isinstance(qid, np.ndarray):
        _qid = pd.Series(qid)
    elif isinstance(qid, pd.DataFrame):
        # Validate the qid frame itself (not the data frame): it must have
        # exactly one column, otherwise taking column 0 would silently drop
        # the remaining ones.
        if len(qid.shape) != 2 or qid.shape[1] != 1:
            raise ValueError(
                "qid argument of type pd.DataFrame is expected "
                "to contain only 1 column of data "
                f"but the qid passed in is of shape {qid.shape}."
            )
        _qid = qid.iloc[:, 0]
    elif isinstance(qid, pd.Series):
        _qid = qid

    if _qid is None:
        raise ValueError(
            "qid must be a column name, np.ndarray, pd.Series or a "
            f"single-column pd.DataFrame, got {type(qid)}."
        )

    # pandas < 2.0
    if getattr(_qid, "is_monotonic", False):
        return _qid, df
    # pandas >= 2.0
    if getattr(_qid, "is_monotonic_increasing", False) or getattr(
        _qid, "is_monotonic_decreasing", False
    ):
        return _qid, df

    if isinstance(qid, str):
        return qid, df.sort_values([qid])

    # qid is not part of df: use it as a temporary index to sort the rows.
    return _qid.sort_values(), df.set_index(_qid).sort_index().reset_index(drop=True)


@PublicAPI(stability="beta")
class RayShardingMode(Enum):
    """Strategies for distributing the rows of a dataset across workers.

    ``RayShardingMode.INTERLEAVED``: rows are dealt out one at a time, i.e.
    every i-th row goes to the first worker, every (i+1)th row to the
    second worker, and so on.

    ``RayShardingMode.BATCH``: rows are handed out in contiguous batches,
    i.e. rows 0-(m-1) go to the first worker, rows m-(2m-1) to the second
    worker, and so on.

    ``RayShardingMode.FIXED``: chosen automatically for distributed data
    sources that assign actors to specific data shards on initialization
    and keep that assignment fixed afterwards.
    """

    # Row-by-row assignment.
    INTERLEAVED = 1
    # Contiguous blocks of rows per worker.
    BATCH = 2
    # Shard-to-actor assignment fixed by the data source.
    FIXED = 3


@DeveloperAPI
class RayDataIter(DataIter):
    """Iterator handing locally held data shards to XGBoost one at a time.

    Every list-valued argument holds one entry per shard; on each call to
    :meth:`next` the entries at the current position are converted to cupy
    arrays (unless they already are) and passed to the ``input_data``
    callback.

    Args:
        data: List of feature shards.
        label: List of label shards (entries may be ``None``).
        missing: Value to treat as missing. Stored only; not forwarded by
            :meth:`next`.
        weight: List of weight shards.
        feature_weights: List of feature-weight entries, one per shard.
        qid: List of query-id shards.
        base_margin: List of base-margin shards.
        label_lower_bound: List of lower label bound shards.
        label_upper_bound: List of upper label bound shards.
        feature_names: Feature names forwarded unchanged with every batch.
        feature_types: Feature types forwarded unchanged with every batch.
        enable_categorical: Forwarded unchanged with every batch.
    """

    def __init__(
        self,
        data: List[Data],
        label: List[Optional[Data]],
        missing: Optional[float],
        weight: List[Optional[Data]],
        feature_weights: List[Optional[Data]],
        qid: List[Optional[Data]],
        base_margin: List[Optional[Data]],
        label_lower_bound: List[Optional[Data]],
        label_upper_bound: List[Optional[Data]],
        feature_names: Optional[List[str]],
        feature_types: Optional[List[np.dtype]],
        enable_categorical: Optional[bool],
    ):
        super(RayDataIter, self).__init__()

        # Batches are converted with cupy in `_prop`, so it must be importable.
        assert cp is not None, "RayDataIter requires cupy to be installed."

        self._data = data
        self._label = label
        self._missing = missing
        self._weight = weight
        self._feature_weights = feature_weights
        self._qid = qid
        self._base_margin = base_margin
        self._label_lower_bound = label_lower_bound
        self._label_upper_bound = label_upper_bound
        self._feature_names = feature_names
        self._feature_types = feature_types
        # `next()` reads the private name, matching the other attributes.
        self._enable_categorical = enable_categorical
        # Public alias kept for code that read the previous attribute name.
        self.enable_categorical = enable_categorical

        # Index of the shard that the next call to `next()` will emit.
        self._iter = 0

    def __len__(self):
        """Return the total number of rows over all data shards."""
        return sum(len(shard) for shard in self._data)

    def reset(self):
        """Rewind the iterator to the first shard."""
        self._iter = 0

    def _prop(self, ref):
        """Return the current shard of ``ref`` as a cupy array, or ``None``.

        ``None`` is returned when ``ref`` itself or its current entry is
        ``None``. Entries that are not cupy arrays are converted through
        their ``.values`` attribute.
        """
        if ref is None:
            return None
        item = ref[self._iter]
        if item is None:
            return None
        if not isinstance(item, cp.ndarray):
            item = cp.array(item.values)
        return item

    def next(self, input_data: Callable):
        """Feed the current shard to ``input_data`` and advance.

        Returns:
            ``0`` when all shards have been consumed, ``1`` after a shard was
            passed to ``input_data``.
        """
        if self._iter >= len(self._data):
            return 0
        input_data(
            data=self._prop(self._data),
            label=self._prop(self._label),
            weight=self._prop(self._weight),
            feature_weights=self._prop(self._feature_weights),
            qid=self._prop(self._qid),
            group=None,
            # Forward the stored margin shard instead of silently dropping it.
            base_margin=self._prop(self._base_margin),
            label_lower_bound=self._prop(self._label_lower_bound),
            label_upper_bound=self._prop(self._label_upper_bound),
            feature_names=self._feature_names,
            feature_types=self._feature_types,
            enable_categorical=self._enable_categorical,
        )
        self._iter += 1
        return 1


class _RayDMatrixLoader:
    """Base class holding the inputs of a matrix and the loading interface.

    Subclasses implement :meth:`get_data_source` and :meth:`load_data`.

    Args:
        data: Main data object or file location(s).
        label: Label data, or name of the label column in ``data``.
        missing: Value to treat as missing.
        weight: Row weights, or name of the weight column.
        feature_weights: Feature weights, or name of the column.
        base_margin: Base margin, or name of the column.
        label_lower_bound: Lower label bound, or name of the column.
        label_upper_bound: Upper label bound, or name of the column.
        feature_names: Names of the features.
        feature_types: Types of the features.
        qid: Query ids, or name of the query id column.
        enable_categorical: Stored as is for later use.
        filetype: Type of the file source. If ``data`` is a string or a
            non-empty sequence of strings and no filetype is given, it is
            guessed by asking every registered data source.
        ignore: Column names stored and later handed to the data source.
        **kwargs: Extra arguments stored and later handed to the data source.

    Raises:
        ValueError: If the filetype of a file source could not be detected.
    """

    def __init__(
        self,
        data: Data,
        label: Optional[Data] = None,
        missing: Optional[float] = None,
        weight: Optional[Data] = None,
        feature_weights: Optional[Data] = None,
        base_margin: Optional[Data] = None,
        label_lower_bound: Optional[Data] = None,
        label_upper_bound: Optional[Data] = None,
        feature_names: Optional[List[str]] = None,
        feature_types: Optional[List[np.dtype]] = None,
        qid: Optional[Data] = None,
        enable_categorical: Optional[bool] = None,
        filetype: Optional[RayFileType] = None,
        ignore: Optional[List[str]] = None,
        **kwargs,
    ):
        self.data = data
        self.label = label
        self.missing = missing
        self.weight = weight
        self.feature_weights = feature_weights
        self.base_margin = base_margin
        self.label_lower_bound = label_lower_bound
        self.label_upper_bound = label_upper_bound
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.qid = qid
        self.enable_categorical = enable_categorical

        self.data_source = None
        self.actor_shards = None

        self.filetype = filetype
        self.ignore = ignore
        self.kwargs = kwargs

        self._cached_n = None

        # Pick a representative path to detect the file type from. An empty
        # sequence has no first element and is simply not treated as a file
        # source here.
        check = None
        if isinstance(data, str):
            check = data
        elif (
            isinstance(data, Sequence)
            and len(data) > 0
            and isinstance(data[0], str)
        ):
            check = data[0]

        if check is not None:
            if not self.filetype:
                # Try to guess filetype
                for data_source in data_sources:
                    self.filetype = data_source.get_filetype(check)
                    if self.filetype:
                        break
                if not self.filetype:
                    raise ValueError(
                        f"File or stream ({check}) specified as data source, "
                        "but filetype could not be detected. "
                        "\nFIX THIS by passing "
                        "the `filetype` parameter to the RayDMatrix. Use the "
                        "`RayFileType` enum for this."
                    )

    def get_data_source(self) -> Type[DataSource]:
        """Return the data source class responsible for ``self.data``."""
        raise NotImplementedError

    def assert_enough_shards_for_actors(self, num_actors: int):
        """Assert that we have enough shards to split across actors."""
        # Pass per default
        pass

    def update_matrix_properties(self, matrix: "xgb.DMatrix"):
        """Let the data source update the feature names of ``matrix``."""
        data_source = self.get_data_source()
        data_source.update_feature_names(matrix, self.feature_names)

    def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool:
        """Assign data shards to actors.

        Returns True if shards were assigned to actors. In that case, the
        sharding mode should be adjusted to ``RayShardingMode.FIXED``.
        Returns False otherwise.
        """
        return False

    def _split_dataframe(
        self, local_data: pd.DataFrame, data_source: Type[DataSource]
    ) -> Tuple[
        pd.DataFrame,
        Optional[pd.Series],
        Optional[pd.Series],
        Optional[pd.Series],
        Optional[pd.Series],
        Optional[pd.Series],
        Optional[pd.Series],
        Optional[pd.Series],
    ]:
        """
        Split dataframe into

        `features`, `labels`, `weight`, `feature_weights`, `base_margin`,
        `label_lower_bound`, `label_upper_bound`, `qid`

        (returned in this order). Columns of ``local_data`` that the data
        source reports as consumed for one of the auxiliary fields are
        removed from the returned features.

        If ``self.qid`` is set, the data is first sorted by query id; a
        non-string ``self.qid`` is replaced by the sorted ids.
        """
        # sort dataframe by qid if exists (required by DMatrix)
        if self.qid is not None:
            _qid, local_data = ensure_sorted_by_qid(local_data, self.qid)
            if not isinstance(self.qid, str):
                self.qid = _qid

        exclude_cols: Set[str] = set()  # Exclude these columns from `x`

        def _extract(column):
            # Fetch one auxiliary field and remember which column(s) of the
            # dataframe it consumed. The data source may report a single
            # column name or a list of names.
            values, exclude = data_source.get_column(local_data, column)
            if exclude:
                if isinstance(exclude, list):
                    exclude_cols.update(exclude)
                else:
                    exclude_cols.add(exclude)
            return values

        label = _extract(self.label)
        weight = _extract(self.weight)
        feature_weights = _extract(self.feature_weights)
        qid = _extract(self.qid)
        base_margin = _extract(self.base_margin)
        label_lower_bound = _extract(self.label_lower_bound)
        label_upper_bound = _extract(self.label_upper_bound)

        x = local_data
        if exclude_cols:
            x = x[[col for col in x.columns if col not in exclude_cols]]

        return (
            x,
            label,
            weight,
            feature_weights,
            base_margin,
            label_lower_bound,
            label_upper_bound,
            qid,
        )

    def load_data(
        self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None
    ) -> Tuple[Dict, int]:
        """Load the data; to be implemented by subclasses."""
        raise NotImplementedError


class _CentralRayDMatrixLoader(_RayDMatrixLoader):
    """Load full dataset from a central location and put into object store"""

    def get_data_source(self) -> Type[DataSource]:
        """Find, cache and return the data source class matching the data.

        Only data sources that support central loading are considered. The
        row count reported by the chosen source is cached in ``_cached_n``.

        Raises:
            ValueError: If no data source accepts the data, or if the label
                check below rejects the label object.
        """
        if self.data_source:
            return self.data_source

        selected = None
        for candidate in data_sources:
            if not candidate.supports_central_loading:
                continue

            try:
                matches = bool(candidate.is_data_type(self.data, self.filetype))
            except Exception as exc:
                # If checking the data throws an exception, the data source
                # is not available.
                logger.warning(
                    f"Checking data source {candidate.__name__} failed "
                    f"with exception: {exc}"
                )
                continue

            if matches:
                selected = candidate
                break

        if not selected:
            raise ValueError(
                "Unknown data source type: {} with FileType: {}."
                "\nFIX THIS by passing a supported data type. Supported "
                "data types include pandas.DataFrame, pandas.Series, "
                "np.ndarray, and CSV/Parquet file paths. If you specify a "
                "file, path, consider passing the `filetype` argument to "
                "specify the type of the source. Use the `RayFileType` "
                "enum for that. If using Modin, Dask, or Petastorm, "
                "make sure the library is installed.".format(
                    type(self.data), self.filetype
                )
            )

        label = self.label
        # NOTE(review): because of the double negation this is true when the
        # label has the SAME type as the data, although the intent appears to
        # be checking labels of a different type - confirm before changing,
        # since enforcing it may reject inputs that are accepted today.
        label_needs_check = (
            label is not None
            and not isinstance(label, str)
            and not type(self.data) != type(label)  # noqa: E721
        )
        if label_needs_check:
            # For a Parquet data source, a list label is treated as
            # multi-label data and is always accepted.
            multi_label_parquet = (
                isinstance(label, List) and selected.__name__ == "Parquet"
            )
            if not (selected.is_data_type(label) or multi_label_parquet):
                raise ValueError(
                    "The passed `data` and `label` types are not compatible."
                    "\nFIX THIS by passing the same types to the "
                    "`RayDMatrix` - e.g. a `pandas.DataFrame` as `data` "
                    "and `label`. The `label` can always be a string. Got "
                    "{} for the main data and {} for the label.".format(
                        type(self.data), type(label)
                    )
                )

        self.data_source = selected
        self._cached_n = selected.get_n(self.data)
        return self.data_source

    def load_data(
        self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None
    ) -> Tuple[Dict, int]:
        """
        Load data into memory

        The full dataset is loaded once, split into its components, and for
        every actor rank the selected rows of each component are stored in
        the Ray object store.

        Returns:
            Tuple of a dict mapping each rank to a dict of object references,
            and the total number of loaded rows.

        Raises:
            RuntimeError: If the data source needs partitions and there are
                more actors than shards.
        """
        if not ray.is_initialized():
            ray.init()

        os.environ.pop("OMP_NUM_THREADS", None)

        source = self.get_data_source()

        max_num_shards = self._cached_n or source.get_n(self.data)
        if num_actors > max_num_shards and source.needs_partitions:
            raise RuntimeError(
                f"Trying to shard data for {num_actors} actors, but the "
                f"maximum number of shards (i.e. the number of data rows) "
                f"is {max_num_shards}. Consider using fewer actors."
            )

        # We're doing central data loading here, so we don't pass any indices,
        # yet. Instead, we'll be selecting the rows below.
        local_df = source.load_data(
            self.data, ignore=self.ignore, indices=None, **self.kwargs
        )
        x, y, w, fw, b, ll, lu, qid = self._split_dataframe(
            local_df, data_source=source
        )

        n = sum(len(part) for part in x) if isinstance(x, list) else len(x)

        def _rows(values, indices):
            # Optional components stay None; otherwise pick this rank's rows.
            return None if values is None else values.iloc[indices]

        refs = {}
        for actor_rank in range(num_actors):
            # Here we actually want to split the data.
            indices = _get_sharding_indices(sharding, actor_rank, num_actors, n)
            refs[actor_rank] = {
                "data": ray.put(x.iloc[indices]),
                "label": ray.put(_rows(y, indices)),
                "weight": ray.put(_rows(w, indices)),
                # Feature weights are stored whole, not per row selection.
                "feature_weights": ray.put(fw),
                "base_margin": ray.put(_rows(b, indices)),
                "label_lower_bound": ray.put(_rows(ll, indices)),
                "label_upper_bound": ray.put(_rows(lu, indices)),
                "qid": ray.put(_rows(qid, indices)),
            }

        return refs, n


class _DistributedRayDMatrixLoader(_RayDMatrixLoader):
    """Load each shard individually."""

    def get_data_source(self) -> Type[DataSource]:
        """Find, cache and return the data source class matching the data.

        A string ``self.data`` is first normalized to a list: a Petastorm
        location or an existing file is wrapped in a list, a directory is
        expanded to the sorted list of contained Parquet/CSV files. Only
        data sources that support distributed loading are considered. The
        shard count reported by the chosen source is cached in ``_cached_n``.

        Raises:
            ValueError: If the data is not a valid distributed dataset, if
                the label is neither a string nor a list, or if no data
                source accepts the data.
        """
        if self.data_source:
            return self.data_source

        invalid_data = False
        if isinstance(self.data, str):
            if self.filetype == RayFileType.PETASTORM:
                self.data = [self.data]
            elif os.path.isdir(self.data):
                # `**` only spans "zero or more directories" when
                # recursive=True; without it, files located directly in the
                # directory would not be matched.
                if self.filetype == RayFileType.PARQUET:
                    self.data = sorted(
                        glob.glob(f"{self.data}/**/*.parquet", recursive=True)
                    )
                elif self.filetype == RayFileType.CSV:
                    self.data = sorted(
                        glob.glob(f"{self.data}/**/*.csv", recursive=True)
                    )
                else:
                    invalid_data = True
            elif os.path.exists(self.data):
                self.data = [self.data]
            else:
                invalid_data = True

        # Todo (krfricke): It would be good to have a more general way to
        # check for compatibility here. Combine with test below?
        if (
            not (
                isinstance(self.data, (Iterable, RayDataset))
                or hasattr(self.data, "__partitioned__")
            )
            or invalid_data
        ):
            raise ValueError(
                f"Distributed data loading only works with already "
                f"distributed datasets. These should be specified through a "
                f"list of locations (or a single string). "
                f"Got: {type(self.data)}."
                f"\nFIX THIS by passing a list of files (e.g. on S3) to the "
                f"RayDMatrix."
            )

        if (
            self.label is not None
            and not isinstance(self.label, str)
            and not isinstance(self.label, List)
        ):
            raise ValueError(
                f"Invalid `label` value for distributed datasets: "
                f"{self.label}. Only strings are supported. "
                f"\nFIX THIS by passing a string indicating the label "
                f"column of the dataset as the `label` argument."
            )

        data_source = None
        for source in data_sources:
            if not source.supports_distributed_loading:
                continue

            try:
                if source.is_data_type(self.data, self.filetype):
                    data_source = source
                    break
            except Exception as exc:
                # If checking the data throws an exception, the data source
                # is not available.
                logger.warning(
                    f"Checking data source {source.__name__} failed "
                    f"with exception: {exc}"
                )
                continue

        if not data_source:
            raise ValueError(
                f"Invalid data source type: {type(self.data)} "
                f"with FileType: {self.filetype} for a distributed dataset."
                "\nFIX THIS by passing a supported data type. Supported "
                "data types for distributed datasets are a list of "
                "CSV or Parquet sources. If using "
                "Modin, Dask, or Petastorm, make sure the library is "
                "installed."
            )

        self.data_source = data_source
        self._cached_n = data_source.get_n(self.data)
        return self.data_source

    def assert_enough_shards_for_actors(self, num_actors: int):
        """Raise if there are fewer shards than actors.

        Raises:
            RuntimeError: If the data source needs partitions and
                ``num_actors`` exceeds the number of available shards.
        """
        data_source = self.get_data_source()

        # Ray Datasets will be automatically split to match the number
        # of actors. The check has to look at the data object:
        # `data_source` is a DataSource class and never a dataset instance.
        if isinstance(self.data, RayDataset):
            return

        max_num_shards = self._cached_n or data_source.get_n(self.data)
        if num_actors > max_num_shards and data_source.needs_partitions:
            raise RuntimeError(
                f"Trying to shard data for {num_actors} actors, but the "
                f"maximum number of shards is {max_num_shards}. If you "
                f"want to shard the dataset by rows, consider "
                f"centralized loading by passing `distributed=False` to "
                f"the `RayDMatrix`. Otherwise consider using fewer actors "
                f"or re-partitioning your data."
            )

    def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool:
        """Ask the data source for a fixed shard-to-actor assignment.

        Returns True if an assignment exists (newly created or from an
        earlier call), False if the label is not a column name or the data
        source did not return an assignment.
        """
        if not isinstance(self.label, str):
            # Currently we only support fixed data sharding for datasets
            # that contain both the label and the data.
            return False

        if self.actor_shards:
            # Only assign once
            return True

        data_source = self.get_data_source()
        data, actor_shards = data_source.get_actor_shards(self.data, actors)
        if not actor_shards:
            return False

        self.data = data
        self.actor_shards = actor_shards
        return True

    def load_data(
        self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None
    ) -> Tuple[Dict, int]:
        """
        Load data into memory

        Loads only the shard(s) belonging to ``rank`` and stores the
        components in the Ray object store.

        Returns:
            Tuple of a dict mapping ``rank`` to a dict of object references,
            and the number of rows loaded for this rank (0 if the rank got
            no shard).

        Raises:
            ValueError: If no rank is given or Ray is not initialized.
        """
        # `ray.is_initialized` must be called; the bare function object is
        # always truthy and would make this check a no-op.
        if rank is None or not ray.is_initialized():
            raise ValueError(
                "Distributed loading should be done by the actors, not by the "
                "driver program. "
                "\nFIX THIS by refraining from calling `RayDMatrix.load()` "
                "manually for distributed datasets. Hint: You can check if "
                "`RayDMatrix.distributed` is set to True or False."
            )

        if "OMP_NUM_THREADS" in os.environ:
            del os.environ["OMP_NUM_THREADS"]

        data_source = self.get_data_source()

        if self.actor_shards:
            # Fixed assignment: always load whatever was assigned to the rank.
            indices = self.actor_shards[rank]
            load_shard = True
        else:
            total = self._cached_n or data_source.get_n(self.data)
            indices = _get_sharding_indices(sharding, rank, num_actors, total)
            # A rank without any index has nothing to load.
            load_shard = bool(indices)

        if load_shard:
            local_df = data_source.load_data(
                self.data, ignore=self.ignore, indices=indices, **self.kwargs
            )
            x, y, w, fw, b, ll, lu, qid = self._split_dataframe(
                local_df, data_source=data_source
            )

            if isinstance(x, list):
                n = sum(len(a) for a in x)
            else:
                n = len(x)
        else:
            x = y = w = fw = b = ll = lu = qid = None
            n = 0

        refs = {
            rank: {
                "data": ray.put(x),
                "label": ray.put(y),
                "weight": ray.put(w),
                "feature_weights": ray.put(fw),
                "base_margin": ray.put(b),
                "label_lower_bound": ray.put(ll),
                "label_upper_bound": ray.put(lu),
                "qid": ray.put(qid),
            }
        }

        return refs, n


@PublicAPI(stability="beta")
class RayDMatrix:
    """XGBoost on Ray DMatrix class.

    This is the data object that the training and prediction functions
    expect. This wrapper manages distributed data by sharding the data for the
    workers and storing the shards in the object store.

    If this class is called without the ``num_actors`` argument, it will
    be lazy loaded. Thus, it will return immediately and only load the data
    and store it in the Ray object store after ``load_data(num_actors)`` or
    ``get_data(rank, num_actors)`` is called at the beginning of training.

    If this class is instantiated with the ``num_actors`` argument and the
    data is loaded centrally (not distributed), it will directly load the
    data and store it in the object store. If this should be deferred, pass
    ``lazy=True`` as an argument.

    Loading the data will store it in the Ray object store. This object then
    stores references to the data shards in the Ray object store. Actors
    can request these shards with the ``get_data(rank)`` method, returning
    dataframes according to the actor rank.

    The total number of actors has to remain constant and cannot be changed
    once it has been set.

    Two ``RayDMatrix`` objects compare equal (and hash equal) when they share
    the same internal unique id, which is generated once per instantiation.

    Args:
        data: Data object. Can be a pandas dataframe, pandas series,
            numpy array, modin dataframe, string pointing to
            a csv or parquet file, or list of strings pointing to csv or
            parquet files.
        label: Optional label object. Can be a pandas series, numpy array,
            modin series, string pointing to a csv or parquet file, or
            a string indicating the column of the data dataframe that
            contains the label. If this is not a string it must be of the
            same type as the data argument.
        weight: Optional sample weights, forwarded to the data loader.
            Cannot be combined with ``qid`` (``NotImplementedError``).
        feature_weights: Optional feature weights, forwarded to the data
            loader.
        base_margin: Optional base margin, forwarded to the data loader.
        missing: Optional ``missing`` value. Stored on this object and
            forwarded to the data loader.
        label_lower_bound: Optional lower label bound, forwarded to the
            data loader.
        label_upper_bound: Optional upper label bound, forwarded to the
            data loader.
        feature_names: Optional feature names. Stored on this object and
            forwarded to the data loader.
        feature_types: Optional feature types. Stored on this object and
            forwarded to the data loader.
        qid: Optional query ids. Stored on this object and forwarded to
            the data loader. Use this instead of ``group``, which is
            rejected with a ``ValueError``.
        enable_categorical: Stored on this object and forwarded to the
            data loader.
        num_actors: Number of actors to shard this data for. If this is
            not None, data will be loaded and stored into the object store
            after initialization. If this is None, it will be set by
            the ``xgboost_ray.train()`` function, and it will be loaded and
            stored in the object store then. Defaults to None.
        filetype: Type of data to read.
            This is disregarded if a data object like a pandas dataframe
            is passed as the ``data`` argument. For filenames,
            the filetype is automatically detected via the file name
            (e.g. ``.csv`` will be detected as ``RayFileType.CSV``).
            Passing this argument will overwrite the detected filetype.
            If the filetype cannot be determined from the ``data`` object,
            passing this is mandatory. Defaults to ``None`` (auto detection).
        ignore: Exclude these columns from the
            dataframe after loading the data.
        distributed: If True, use distributed loading
            (each worker loads a share of the dataset). If False, use
            central loading (the head node loads the whole dataset and
            distributes it). If None, auto-detect and default to
            distributed loading, if possible.
        sharding: How to shard the data for different
            workers. ``RayShardingMode.INTERLEAVED`` will divide the data
            per row, i.e. every i-th row will be passed to the first worker,
            every (i+1)th row to the second worker, etc.
            ``RayShardingMode.BATCH`` will divide the data in batches, i.e.
            the first 0-(m-1) rows will be passed to the first worker, the
            m-(2m-1) rows to the second worker, etc. Defaults to
            ``RayShardingMode.INTERLEAVED``. If using distributed data
            loading, sharding happens on a per-file basis, and not on a
            per-row basis, i.e. for interleaved every ith *file* will be
            passed into the first worker, etc.
        lazy: If ``num_actors`` is passed, setting this to ``True``
            will defer data loading and storing until ``load_data()`` or
            ``get_data()`` is called. Defaults to ``False``.
        **kwargs: Keyword arguments will be passed to the data loading
            function. For instance, with ``RayFileType.PARQUET``, these
            arguments will be passed to ``pandas.read_parquet()``.

    Raises:
        ValueError: If a ``group`` keyword argument is passed, or if
            ``distributed=True`` is requested for a data source that
            cannot be loaded in a distributed fashion.
        NotImplementedError: If both ``qid`` and ``weight`` are passed.


    .. code-block:: python

        from xgboost_ray import RayDMatrix, RayFileType

        files = ["data_one.parquet", "data_two.parquet"]

        columns = ["feature_1", "feature_2", "label_column"]

        dtrain = RayDMatrix(
            files,
            num_actors=4,  # Will shard the data for four workers
            label="label_column",  # Will select this column as the label
            columns=columns,  # Will be passed to ``pandas.read_parquet()``
            filetype=RayFileType.PARQUET)

    """

    def __init__(
        self,
        data: Data,
        label: Optional[Data] = None,
        weight: Optional[Data] = None,
        feature_weights: Optional[Data] = None,
        base_margin: Optional[Data] = None,
        missing: Optional[float] = None,
        label_lower_bound: Optional[Data] = None,
        label_upper_bound: Optional[Data] = None,
        feature_names: Optional[List[str]] = None,
        feature_types: Optional[List[np.dtype]] = None,
        qid: Optional[Data] = None,
        enable_categorical: Optional[bool] = None,
        num_actors: Optional[int] = None,
        filetype: Optional[RayFileType] = None,
        ignore: Optional[List[str]] = None,
        distributed: Optional[bool] = None,
        sharding: RayShardingMode = RayShardingMode.INTERLEAVED,
        lazy: bool = False,
        **kwargs,
    ):

        # ``group`` would otherwise be silently forwarded to the data loading
        # function via ``**kwargs``; reject it explicitly.
        if kwargs.get("group", None) is not None:
            raise ValueError(
                "`group` parameter is not supported. "
                "If you are using XGBoost-Ray, use `qid` parameter instead. "
                "If you are using LightGBM-Ray, ranking is not yet supported."
            )

        if qid is not None and weight is not None:
            raise NotImplementedError("per-group weight is not implemented.")

        # Identity used by ``__hash__``/``__eq__``.
        self._uid = uuid.uuid4().int

        self.feature_names = feature_names
        self.feature_types = feature_types
        self.qid = qid
        self.enable_categorical = enable_categorical
        self.missing = missing

        self.num_actors = num_actors
        self.sharding = sharding

        if distributed is None:
            distributed = _detect_distributed(data)
        else:
            if distributed and not _can_load_distributed(data):
                raise ValueError(
                    f"You passed `distributed=True` to the `RayDMatrix` but "
                    f"the specified data source of type {type(data)} cannot "
                    f"be loaded in a distributed fashion. "
                    f"\nFIX THIS by passing a list of sources (e.g. parquet "
                    f"files stored in a network location) instead."
                )

        self.distributed = distributed

        # Both loaders take the same arguments; only the class differs.
        if self.distributed:
            self.loader = _DistributedRayDMatrixLoader(
                data=data,
                label=label,
                missing=missing,
                weight=weight,
                feature_weights=feature_weights,
                base_margin=base_margin,
                label_lower_bound=label_lower_bound,
                label_upper_bound=label_upper_bound,
                feature_names=feature_names,
                feature_types=feature_types,
                enable_categorical=enable_categorical,
                filetype=filetype,
                ignore=ignore,
                qid=qid,
                **kwargs,
            )
        else:
            self.loader = _CentralRayDMatrixLoader(
                data=data,
                label=label,
                missing=missing,
                weight=weight,
                feature_weights=feature_weights,
                base_margin=base_margin,
                label_lower_bound=label_lower_bound,
                label_upper_bound=label_upper_bound,
                feature_names=feature_names,
                feature_types=feature_types,
                enable_categorical=enable_categorical,
                filetype=filetype,
                ignore=ignore,
                qid=qid,
                **kwargs,
            )

        # Maps actor rank -> {field name -> object store reference}.
        self.refs: Dict[int, Dict[str, ray.ObjectRef]] = {}
        # Number of rows reported by the loader; set by ``load_data()``.
        self.n = None

        self.loaded = False

        # Eager loading only happens for central (non-distributed) loading.
        if not distributed and num_actors is not None and not lazy:
            self.load_data(num_actors)

    @property
    def has_label(self):
        """Whether the underlying loader was given a label."""
        return self.loader.label is not None

    def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool:
        """Ask the loader to assign shards to the given actors.

        If the loader reports success, the sharding mode of this matrix is
        switched to ``RayShardingMode.FIXED``.

        Returns:
            The success flag returned by the loader.
        """
        success = self.loader.assign_shards_to_actors(actors)
        if success:
            self.sharding = RayShardingMode.FIXED
        return success

    def assert_enough_shards_for_actors(self, num_actors: int):
        """Delegate the shard-count check for ``num_actors`` to the loader."""
        self.loader.assert_enough_shards_for_actors(num_actors=num_actors)

    def load_data(self, num_actors: Optional[int] = None, rank: Optional[int] = None):
        """Load data, putting it into the Ray object store.

        If a rank is given, only data for this rank is loaded (for
        distributed data sources only).

        This is a no-op if the data has already been loaded.

        Raises:
            ValueError: If ``num_actors`` differs from a previously set
                value, or if no number of actors is known at all.
        """
        if not self.loaded:
            if num_actors is not None:
                if self.num_actors is not None and num_actors != self.num_actors:
                    raise ValueError(
                        f"The `RayDMatrix` was initialized or `load_data()` "
                        f"has been called with a different number of "
                        f"`actors`. Existing value: {self.num_actors}. "
                        f"Current value: {num_actors}."
                        f"\nFIX THIS by not instantiating the matrix with "
                        f"`num_actors` and making sure calls to `load_data()` "
                        f"or `get_data()` use the same numbers of actors "
                        f"at each call."
                    )
                self.num_actors = num_actors
            if self.num_actors is None:
                raise ValueError(
                    "Trying to load data for `RayDMatrix` object, but "
                    "`num_actors` is not set."
                    "\nFIX THIS by passing `num_actors` on instantiation "
                    "of the `RayDMatrix` or when calling `load_data()`."
                )
            refs, self.n = self.loader.load_data(
                self.num_actors, self.sharding, rank=rank
            )
            self.refs.update(refs)
            self.loaded = True

    def get_data(
        self, rank: int, num_actors: Optional[int] = None
    ) -> Dict[str, Union[None, pd.DataFrame, List[Optional[pd.DataFrame]]]]:
        """Get data, i.e. return dataframe for a specific actor.

        This method is called from an actor, given its rank and the
        total number of actors. If the data is not yet loaded, loading
        is triggered.

        Returns:
            A dict mapping each stored field name to the object fetched
            from the object store for this rank.
        """
        self.load_data(num_actors=num_actors, rank=rank)

        refs = self.refs[rank]
        # Fetch all references in one call before resolving them one by one.
        # NOTE(review): presumably this lets Ray pull the objects together
        # instead of sequentially - confirm before removing.
        ray.get(list(refs.values()))

        data = {k: ray.get(v) for k, v in refs.items()}

        return data

    def unload_data(self):
        """Delete object references to clear object store"""
        # The per-rank dicts are emptied in place (not replaced), and the
        # matrix is marked as not loaded so a later call reloads the data.
        for rank in list(self.refs.keys()):
            for name in list(self.refs[rank].keys()):
                del self.refs[rank][name]
        self.loaded = False

    def update_matrix_properties(self, matrix: "xgb.DMatrix"):
        """Delegate updating the properties of ``matrix`` to the loader."""
        self.loader.update_matrix_properties(matrix)

    def __hash__(self):
        return self._uid

    def __eq__(self, other):
        # Only other RayDMatrix objects can be equal. Returning
        # ``NotImplemented`` (instead of calling ``other.__hash__()``) avoids
        # a ``TypeError`` when comparing against unhashable objects.
        if not isinstance(other, RayDMatrix):
            return NotImplemented
        return self._uid == other._uid


class RayQuantileDMatrix(RayDMatrix):
    """Marker subclass of ``RayDMatrix``.

    Adds no behavior of its own; it currently only exists so that the
    matrix type can be detected.
    """


class RayDeviceQuantileDMatrix(RayDMatrix):
    """``RayDMatrix`` variant used for type detection of device matrices.

    Requires ``cupy`` to be importable and does not support
    ``label_lower_bound`` / ``label_upper_bound``. Both keys are also
    removed from the dict returned by ``get_data()``.

    Raises:
        RuntimeError: If cupy is not installed, or if a label bound other
            than ``None`` is passed.
    """

    def __init__(
        self,
        data: Data,
        label: Optional[Data] = None,
        weight: Optional[Data] = None,
        base_margin: Optional[Data] = None,
        missing: Optional[float] = None,
        label_lower_bound: Optional[Data] = None,
        label_upper_bound: Optional[Data] = None,
        feature_names: Optional[List[str]] = None,
        feature_types: Optional[List[np.dtype]] = None,
        qid: Optional[Data] = None,
        enable_categorical: Optional[bool] = None,
        *args,
        **kwargs,
    ):
        if cp is None:
            raise RuntimeError(
                "RayDeviceQuantileDMatrix requires cupy to be installed."
                "\nFIX THIS by installing cupy: `pip install cupy-cudaXYZ` "
                "where XYZ is your local CUDA version."
            )
        # Compare against None explicitly: truthiness is ambiguous for numpy
        # arrays and pandas objects (raises ValueError) and would silently
        # accept falsy non-None values.
        if label_lower_bound is not None or label_upper_bound is not None:
            raise RuntimeError(
                "RayDeviceQuantileDMatrix does not support "
                "`label_lower_bound` and `label_upper_bound` (just as the "
                "xgboost.DeviceQuantileDMatrix). Please pass None instead."
            )
        # NOTE(review): extra positional ``*args`` bind to the parent's
        # leading positional parameters (starting with ``data``) and would
        # therefore clash with the keywords passed here.
        super(RayDeviceQuantileDMatrix, self).__init__(
            *args,
            data=data,
            label=label,
            weight=weight,
            base_margin=base_margin,
            missing=missing,
            label_lower_bound=None,
            label_upper_bound=None,
            feature_names=feature_names,
            feature_types=feature_types,
            qid=qid,
            enable_categorical=enable_categorical,
            **kwargs,
        )

    def get_data(
        self, rank: int, num_actors: Optional[int] = None
    ) -> Dict[str, Union[None, pd.DataFrame, List[Optional[pd.DataFrame]]]]:
        """Return the data for ``rank`` without the label bound entries."""
        data_dict = super(RayDeviceQuantileDMatrix, self).get_data(
            rank=rank, num_actors=num_actors
        )
        # Remove some dict keys here that are generated automatically
        data_dict.pop("label_lower_bound", None)
        data_dict.pop("label_upper_bound", None)
        return data_dict


def _can_load_distributed(source: Data) -> bool:
    """Returns True if it might be possible to use distributed data loading.

    Args:
        source: The data object passed to the ``RayDMatrix``.

    Returns:
        ``False`` for plain numbers, for strings that do not end with
        ``.parquet``, for sequences that are empty or whose first element
        is not a string, and for pandas dataframes/series and numpy
        arrays. ``True`` for modin data, ``RayDataset`` objects and
        anything else.
    """
    from xgboost_ray.data_sources.modin import Modin

    if isinstance(source, (int, float, bool)):
        return False
    elif Modin.is_data_type(source):
        return True
    elif isinstance(source, RayDataset):
        return True
    elif isinstance(source, str):
        # Strings should point to files or URLs
        # Usually parquet files point to directories
        return source.endswith(".parquet")
    elif isinstance(source, Sequence):
        # Sequence of strings should point to files or URLs.
        # An empty sequence has nothing to load; report False instead of
        # failing with an IndexError on ``source[0]``.
        return len(source) > 0 and isinstance(source[0], str)
    elif isinstance(source, Iterable):
        # If we get an iterable but not a sequence, the best we can do
        # is check if we have a known non-distributed object
        if isinstance(source, (pd.DataFrame, pd.Series, np.ndarray)):
            return False

    # Per default, allow distributed loading.
    return True


def _detect_distributed(source: Data) -> bool:
    """Decide whether distributed loading should be tried for ``source``.

    Distributed loading is never chosen when ``_can_load_distributed``
    rules it out. Modin data and ``RayDataset`` objects always use it.
    Any other non-string iterable only uses it when it is a sequence whose
    first element is a string; everything else defaults to ``True``.
    """
    from xgboost_ray.data_sources.modin import Modin

    if not _can_load_distributed(source):
        return False

    if Modin.is_data_type(source) or isinstance(source, RayDataset):
        return True

    # Non-iterables and plain strings: assume distributed loading is possible.
    if not isinstance(source, Iterable) or isinstance(source, str):
        return True

    # An iterable that is not a sequence of strings is not auto-detected as
    # distributed; callers can still force it with ``distributed=True``.
    is_sequence_of_strings = isinstance(source, Sequence) and isinstance(
        source[0], str
    )
    return is_sequence_of_strings


def _get_sharding_indices(
    sharding: RayShardingMode, rank: int, num_actors: int, n: int
):
    """Compute the row indices owned by the worker with rank ``rank``.

    Args:
        sharding: ``RayShardingMode.BATCH`` for contiguous blocks or
            ``RayShardingMode.INTERLEAVED`` for every ``num_actors``-th row.
        rank: Rank of the worker.
        num_actors: Total number of workers.
        n: Total number of rows.

    Returns:
        List of integer row indices.

    Raises:
        ValueError: For any other sharding mode.
    """
    if sharding == RayShardingMode.BATCH:
        # based on numpy.array_split
        # github.com/numpy/numpy/blob/v1.21.0/numpy/lib/shape_base.py
        # The first ``remainder`` workers receive one extra row each.
        base_size, remainder = divmod(n, num_actors)
        section_sizes = (
            [0] + remainder * [base_size + 1] + (num_actors - remainder) * [base_size]
        )
        boundaries = np.array(section_sizes).cumsum()
        return list(range(boundaries[rank], boundaries[rank + 1]))

    if sharding == RayShardingMode.INTERLEAVED:
        return list(range(rank, n, num_actors))

    raise ValueError(
        f"Invalid value for `sharding` parameter: "
        f"{sharding}"
        f"\nFIX THIS by passing any item of the "
        f"`RayShardingMode` enum, for instance "
        f"`RayShardingMode.BATCH`."
    )


@DeveloperAPI
def combine_data(sharding: RayShardingMode, data: Iterable) -> np.ndarray:
    """Merge per-worker arrays back into a single array.

    Args:
        sharding: Sharding mode the arrays were produced with. Only
            ``RayShardingMode.BATCH`` and ``RayShardingMode.INTERLEAVED``
            are accepted.
        data: Iterable of arrays, one per worker, ordered by rank. Empty
            arrays are dropped before merging.

    Returns:
        For ``BATCH`` the arrays are stacked one after another. For
        ``INTERLEAVED`` the rows are woven together round-robin, and the
        surplus rows of longer arrays are appended at the end.

    Raises:
        ValueError: If ``sharding`` is not one of the two accepted modes.
    """
    if sharding not in (RayShardingMode.BATCH, RayShardingMode.INTERLEAVED):
        raise ValueError(
            f"Invalid value for `sharding` parameter: "
            f"{sharding}"
            f"\nFIX THIS by passing any item of the "
            f"`RayShardingMode` enum, for instance "
            f"`RayShardingMode.BATCH`."
        )

    # discard empty arrays that show up with BATCH
    shards = [shard for shard in data if len(shard)]

    # 1-d is the most common case; objective="multi:softprob" returns
    # n-dimensional arrays that need to be handled differently
    one_dimensional = shards[0].ndim == 1

    if sharding == RayShardingMode.BATCH:
        if one_dimensional:
            return np.concatenate(shards)
        return np.vstack(shards)

    # INTERLEAVED: sometimes the lengths are off by 1 for uneven divisions,
    # so weave only the common prefix and append the surplus afterwards.
    common_len = min(len(shard) for shard in shards)
    heads = [shard[0:common_len] for shard in shards]

    if one_dimensional:
        woven = np.ravel(np.column_stack(heads))
    else:
        # the number of classes will be constant, though
        num_classes = shards[0].shape[1]
        woven = np.hstack(heads).reshape(len(heads) * common_len, num_classes)

    surplus = [shard[common_len:] for shard in shards if len(shard) > common_len]
    return np.concatenate([woven] + surplus)


================================================
FILE: xgboost_ray/session.py
================================================
from typing import Optional

from ray.util.annotations import DeveloperAPI, PublicAPI
from ray.util.queue import Queue


@DeveloperAPI
class RayXGBoostSession:
    """Holds an actor rank and an optional queue for sending items.

    Args:
        rank: Rank of the actor this session belongs to.
        queue: Queue that ``put_queue()`` writes to. May be ``None`` and
            set later via ``set_queue()``.
    """

    def __init__(self, rank: int, queue: Optional[Queue]):
        self._queue = queue
        self._rank = rank

    def get_actor_rank(self):
        """Return the rank this session was created with."""
        return self._rank

    def set_queue(self, queue):
        """Replace the queue used by ``put_queue()``."""
        self._queue = queue

    def put_queue(self, item):
        """Put ``(rank, item)`` on the session queue.

        Raises:
            ValueError: If no queue has been set.
        """
        target = self._queue
        if target is not None:
            target.put((self._rank, item))
            return
        raise ValueError(
            "Trying to put something into session queue, but queue "
            "was not initialized. This is probably a bug, please raise "
            "an issue at https://github.com/ray-project/xgboost_ray"
        )


# Module-level session object. It stays ``None`` until ``init_session()``
# creates a ``RayXGBoostSession``; ``get_session()`` reads it.
_session = None


@DeveloperAPI
def init_session(*args, **kwargs):
    """Create the module-level ``RayXGBoostSession``.

    All arguments are forwarded to the ``RayXGBoostSession`` constructor.

    Raises:
        ValueError: If a session has already been initialized.
    """
    global _session
    if not _session:
        _session = RayXGBoostSession(*args, **kwargs)
        return
    raise ValueError(
        "Trying to initialize RayXGBoostSession twice."
        "\nFIX THIS by not calling `init_session()` manually."
    )


@DeveloperAPI
def get_session() -> RayXGBoostSession:
    """Return the module-level session.

    Raises:
        ValueError: If no session exists or the stored object is not a
            ``RayXGBoostSession``.
    """
    if _session and isinstance(_session, RayXGBoostSession):
        return _session
    raise ValueError(
        "Trying to access RayXGBoostSession from outside an XGBoost run."
        "\nFIX THIS by calling function in `session.py` like "
        "`get_actor_rank()` only from within an XGBoost actor session."
    )


@DeveloperAPI
def set_session_queue(queue: Queue):
    """Set ``queue`` as the queue of the current session."""
    get_session().set_queue(queue)


@PublicAPI
def get_actor_rank() -> int:
    """Return the actor rank stored in the current session."""
    return get_session().get_actor_rank()


@PublicAPI
def get_rabit_rank() -> int:
    """Return the rank reported by xgboost's communication layer.

    Tries ``xgboost.collective`` first and falls back to ``xgboost.rabit``
    when that raises ``ImportError`` or ``AttributeError``.
    """
    import xgboost

    try:
        # From xgboost>=1.7.0, rabit is replaced by a collective communicator
        rank = xgboost.collective.get_rank()
    except (ImportError, AttributeError):
        rank = xgboost.rabit.get_rank()
    return rank


@PublicAPI
def put_queue(*args, **kwargs):
    """Forward all arguments to ``put_queue()`` of the current session."""
    get_session().put_queue(*args, **kwargs)


================================================
FILE: xgboost_ray/sklearn.py
================================================
"""scikit-learn wrapper for xgboost-ray. Based on xgboost 1.4.0
sklearn wrapper, with some provisions made for 1.5.0 and compatibility
for legacy versions (not everything may work).
Seems to error out with 1.0.0, but 0.90 and 1.1.0 work fine.
Requires xgboost>=0.90"""

# Copyright 2021 by XGBoost Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File based on:
# https://github.com/dmlc/xgboost/blob/c6a0bdbb5a68232cd59ea556c981c633cc0646ca/python-package/xgboost/sklearn.py

# License:
# https://github.com/dmlc/xgboost/blob/c6a0bdbb5a68232cd59ea556c981c633cc0646ca/LICENSE

import functools
import inspect
import warnings
from inspect import _finddoc
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
from packaging.version import Version
from ray.util.annotations import DeveloperAPI, PublicAPI
from xgboost import Booster
from xgboost import __version__ as xgboost_version
from xgboost.sklearn import (
    XGBClassifier,
    XGBModel,
    XGBRanker,
    XGBRegressor,
    XGBRFClassifier,
    XGBRFRegressor,
    _objective_decorator,
)

from xgboost_ray.main import LEGACY_WARNING, XGBOOST_VERSION, RayParams, predict, train
from xgboost_ray.matrix import RayDMatrix

# avoiding exception in xgboost==0.9.0
try:
    from xgboost.sklearn import _deprecate_positional_args
except ImportError:

    def _deprecate_positional_args(f):
        """Dummy decorator, does nothing"""

        @functools.wraps(f)
        def inner_f(*args, **kwargs):
            return f(*args, **kwargs)

        return inner_f


# If _wrap_evaluation_matrices has new arguments added in xgboost, update
# RayXGBMixin._ray_get_wrap_evaluation_matrices_compat_kwargs
try:
    from xgboost.sklearn import _wrap_evaluation_matrices
except ImportError:
    # copied from the file in the top comment
    def _wrap_evaluation_matrices(
        missing: float,
        X: Any,
        y: Any,
        group: Optional[Any],
        qid: Optional[Any],
        sample_weight: Optional[Any],
        base_margin: Optional[Any],
        feature_weights: Optional[Any],
        eval_set: Optional[List[Tuple[Any, Any]]],
        sample_weight_eval_set: Optional[List[Any]],
        base_margin_eval_set: Optional[List[Any]],
        eval_group: Optional[List[Any]],
        eval_qid: Optional[List[Any]],
        create_dmatrix: Callable,
        label_transform: Callable = lambda x: x,
    ) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]:
        """Convert array_like evaluation matrices into DMatrix.
        Perform validation on the way.
        """
        train_dmatrix = create_dmatrix(
            data=X,
            label=label_transform(y),
            group=group,
            qid=qid,
            weight=sample_weight,
            base_margin=base_margin,
            feature_weights=feature_weights,
            missing=missing,
        )

        n_validation = 0 if eval_set is None else len(eval_set)

        def validate_or_none(meta: Optional[List], name: str) -> List:
            if meta is None:
                return [None] * n_validation
            if len(meta) != n_validation:
                raise ValueError(
                    f"{name}'s length does not eqaul to `eval_set`, "
                    + f"expecting {n_validation}, got {len(meta)}"
                )
            return meta

        if eval_set is not None:
            sample_weight_eval_set = validate_or_none(
                sample_weight_eval_set, "sample_weight_eval_set"
            )
            base_margin_eval_set = validate_or_none(
                base_margin_eval_set, "base_margin_eval_set"
            )
            eval_group = validate_or_none(eval_group, "eval_group")
            eval_qid = validate_or_none(eval_qid, "eval_qid")

            evals = []
            for i, (valid_X, valid_y) in enumerate(eval_set):
                # Skip the duplicated entry.
                if all(
                    (
                        valid_X is X,
                        valid_y is y,
                        sample_weight_eval_set[i] is sample_weight,
                        base_margin_eval_set[i] is base_margin,
                        eval_group[i] is group,
                        eval_qid[i] is qid,
                    )
                ):
                    evals.append(train_dmatrix)
                else:
                    m = create_dmatrix(
                        data=valid_X,
                        label=label_transform(valid_y),
                        weight=sample_weight_eval_set[i],
                        group=eval_group[i],
                        qid=eval_qid[i],
                        base_margin=base_margin_eval_set[i],
                        missing=missing,
                    )
                    evals.append(m)
            nevals = len(evals)
            eval_names = ["validation_{}".format(i) for i in range(nevals)]
            evals = list(zip(evals, eval_names))
        else:
            if any(
                meta is not None
                for meta in [
                    sample_weight_eval_set,
                    base_margin_eval_set,
                    eval_group,
                    eval_qid,
                ]
            ):
                raise ValueError(
                    "`eval_set` is not set but one of the other evaluation "
                    "meta info is not None."
                )
            evals = []

        return train_dmatrix, evals


try:
    from xgboost.sklearn import _cls_predict_proba
except ImportError:
    # copied from the file in the top comment
    def _cls_predict_proba(n_classes: int, prediction, vstack: Callable):
        assert len(prediction.shape) <= 2
        if len(prediction.shape) == 2 and prediction.shape[1] == n_classes:
            return prediction
        # binary logistic function
        classone_probs = prediction
        classzero_probs = 1.0 - classone_probs
        return vstack((classzero_probs, classone_probs)).transpose()


try:
    from xgboost.sklearn import _is_cudf_df, _is_cudf_ser, _is_cupy_array
except ImportError:
    _is_cudf_df = None
    _is_cudf_ser = None
    _is_cupy_array = None


# Docstring fragment documenting the Ray-specific arguments ``ray_params``,
# ``_remote`` and ``ray_dmatrix_params``.
# NOTE(review): presumably spliced into the docstrings of the estimator
# methods; the usage is outside this part of the file - confirm.
_RAY_PARAMS_DOC = """ray_params : None or RayParams or Dict
            Parameters to configure Ray-specific behavior.
            See :class:`RayParams` for a list of valid
            configuration parameters. Will override ``n_jobs`` attribute
            with own ``num_actors`` parameter.
        _remote : bool
            Whether to run the driver process in a remote
            function. This is enabled by default in Ray client mode.
        ray_dmatrix_params : dict
            Dict of parameters (such as sharding mode) passed to the
            internal RayDMatrix initialization.

"""

# ``(old, new)`` pair that ``_treat_estimator_doc`` unpacks into
# ``str.replace``: the first string is the ``n_jobs`` paragraph to look for,
# the second one is the Ray-specific text that replaces it. The first string
# must stay verbatim, otherwise the replacement silently does nothing.
_N_JOBS_DOC_REPLACE = (
    """    n_jobs : int
        Number of parallel threads used to run xgboost.  When used with other Scikit-Learn
        algorithms like grid search, you may choose which algorithm to parallelize and
        balance the threads.  Creating thread contention will significantly slow down both
        algorithms.""",  # noqa: E501, W291
    """    n_jobs : int
        Number of Ray actors used to run xgboost in parallel.
        In order to set number of threads per actor, pass a :class:`RayParams`
        object to the relevant method as a ``ray_params`` argument. Will be
        overriden by the ``num_actors`` parameter of ``ray_params`` argument
        should it be passed to a method.""",  # noqa: E501, W291
)


def _get_doc(object: Any) -> Optional[str]:
    """Same as ``inspect.getdoc``, but without ``cleandoc`` applied."""
    try:
        doc = object.__doc__
    except AttributeError:
        return None
    if doc is None:
        try:
            doc = _finddoc(object)
        except (AttributeError, TypeError):
            return None
    if not isinstance(doc, str):
        return None
    return doc


def _treat_estimator_doc(doc: Optional[str]) -> Optional[str]:
    """Helper function to make nececssary changes in estimator docstrings"""
    if doc:
        doc = (
            doc.replace(*_N_JOBS_DOC_REPLACE)
            .replace(
                "scikit-learn API for XGBoost",
                "scikit-learn API for Ray-distributed XGBoost",
            )
            .replace(":doc:`tree method\n        </treemethod>`", "tree method")
        )
    return doc


def _treat_X_doc(doc: Optional[str]) -> Optional[str]:
    if doc:
        doc = doc.replace(
            "Data to predict with.",
            "Data to predict with. Can also be a ``RayDMatrix``.",
        )
        doc = doc.replace(
            "Feature matrix.", "Feature matrix. Can also be a ``RayDMatrix``."
        )
        doc = doc.replace(
            "Feature matrix", "Feature matrix. Can also be a ``RayDMatrix``."
        )
    return doc


def _xgboost_version_warn(f):
    """Decorator to warn when xgboost version is < 1.4.0"""

    @functools.wraps(f)
    def inner_f(*args, **kwargs):
        if XGBOOST_VERSION < Version("1.4.0"):
            warnings.warn(LEGACY_WARNING)
        return f(*args, **kwargs)

    return inner_f


def _check_if_params_are_ray_dmatrix(
    X,
    sample_weight,
    base_margin,
    eval_set,
    sample_weight_eval_set,
    base_margin_eval_set,
    eval_qid=None,
):
    """Validate the mix of ``RayDMatrix`` and array-like inputs.

    If ``X`` is a ``RayDMatrix``, every ``eval_set`` element must be a
    ``(RayDMatrix, str)`` tuple, and warnings are emitted for the arguments
    that are ignored in that case. If ``X`` is not a ``RayDMatrix``, no
    ``eval_set`` element may contain one.

    Returns:
        ``(X, eval_set)`` if ``X`` is a ``RayDMatrix`` (with a falsy
        ``eval_set`` normalized to ``()``), otherwise ``(None, ())``.

    Raises:
        ValueError: If ``eval_set`` does not match the type of ``X``.
    """
    eval_set = eval_set or ()

    if not isinstance(X, RayDMatrix):
        for eval_x, eval_y in eval_set:
            if isinstance(eval_x, RayDMatrix) or isinstance(eval_y, RayDMatrix):
                raise ValueError(
                    "If X is not a RayDMatrix, all `eval_set` "
                    "elements must be (array_like, array_like)"
                    " tuples."
                )
        return None, ()

    # ``y`` is always reported; the others only when they were passed.
    ignored_train_args = ["y"] + [
        arg_name
        for arg_name, arg_value in (
            ("sample_weight", sample_weight),
            ("base_margin", base_margin),
        )
        if arg_value is not None
    ]
    warnings.warn(
        f"X is a RayDMatrix, {', '.join(ignored_train_args)} will be ignored!"
    )

    for eval_data, eval_name in eval_set:
        if not (isinstance(eval_data, RayDMatrix) and isinstance(eval_name, str)):
            raise ValueError(
                "If X is a RayDMatrix, all elements of "
                "`eval_set` must be (RayDMatrix, str) "
                "tuples."
            )

    ignored_eval_args = [
        arg_name
        for arg_name, arg_value in (
            ("sample_weight_eval_set", sample_weight_eval_set),
            ("base_margin_eval_set", base_margin_eval_set),
            ("eval_qid", eval_qid),
        )
        if arg_value is not None
    ]
    if ignored_eval_args:
        warnings.warn(
            "`eval_set` is composed of RayDMatrix tuples, "
            f"{', '.join(ignored_eval_args)} will be ignored!"
        )

    return X, eval_set


@DeveloperAPI
class RayXGBMixin:
    """Mixin class to provide xgboost-ray functionality"""

    def _ray_set_ray_params_n_jobs(
        self, ray_params: Optional[Union[RayParams, dict]], n_jobs: Optional[int]
    ) -> RayParams:
        """Helper function to set num_actors in ray_params if not
        set by the user.

        When ``ray_params`` is ``None`` a new ``RayParams`` is built with
        ``num_actors=n_jobs`` (falling back to 1 for a missing or
        non-positive ``n_jobs``). Otherwise ``ray_params`` is returned
        unchanged, with a warning if ``n_jobs`` was also set.
        """
        if ray_params is None:
            if not n_jobs or n_jobs < 1:
                n_jobs = 1
            ray_params = RayParams(num_actors=n_jobs)
        elif n_jobs is not None:
            warnings.warn(
                "`ray_params` is not `None` and will override "
                "the `n_jobs` attribute."
            )
        return ray_params

    def _ray_predict(
        self: "XGBModel",
        X,
        output_margin=False,
        validate_features=True,
        base_margin=None,
        iteration_range=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ):
        """Distributed predict via Ray.

        ``X`` is wrapped in a ``RayDMatrix`` (using ``base_margin``,
        ``self.missing`` and ``ray_dmatrix_params``) unless it already is
        one, in which case ``base_margin`` is ignored with a warning.

        ``iteration_range`` is forwarded to ``predict`` only when it is not
        ``None``, so the call is unchanged for callers that do not set it.
        Any extra ``kwargs`` are forwarded as well.
        """
        ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs)
        ray_dmatrix_params = ray_dmatrix_params or {}

        if not isinstance(X, RayDMatrix):
            test = RayDMatrix(
                X, base_margin=base_margin, missing=self.missing, **ray_dmatrix_params
            )
        else:
            test = X
            if base_margin is not None:
                warnings.warn("X is a RayDMatrix, base_margin will be ignored!")

        predict_kwargs = dict(kwargs)
        if iteration_range is not None:
            # Previously this argument was accepted but silently dropped.
            predict_kwargs["iteration_range"] = iteration_range

        return predict(
            self.get_booster(),
            data=test,
            output_margin=output_margin,
            validate_features=validate_features,
            ray_params=ray_params,
            _remote=_remote,
            **predict_kwargs,
        )

    def _ray_get_wrap_evaluation_matrices_compat_kwargs(
        self, label_transform=None
    ) -> dict:
        """Build the optional kwargs for ``_wrap_evaluation_matrices``.

        The signature of ``_wrap_evaluation_matrices`` is inspected and only
        the parameters it actually declares are included:
        ``label_transform`` (identity when none is given),
        ``enable_categorical`` and ``feature_types`` (the latter two only if
        the estimator has an attribute of that name).
        """
        ret = {}
        wrap_evaluation_matrices_parameters = inspect.signature(
            _wrap_evaluation_matrices
        ).parameters
        if "label_transform" in wrap_evaluation_matrices_parameters:
            # XGBoost < 1.6.0
            identity_func = lambda x: x  # noqa
            ret["label_transform"] = label_transform or identity_func
        if (
            hasattr(self, "enable_categorical")
            and "enable_categorical" in wrap_evaluation_matrices_parameters
        ):
            ret["enable_categorical"] = self.enable_categorical
        if (
            hasattr(self, "feature_types")
            and "feature_types" in wrap_evaluation_matrices_parameters
        ):
            ret["feature_types"] = self.feature_types
        return ret

    # copied from the file in the top comment
    # provided here for compatibility with legacy xgboost versions
    # will be overwritten by vanilla xgboost if possible
    def _configure_fit(
        self,
        booster: Optional[Union[Booster, "XGBModel", str]],
        eval_metric: Optional[Union[Callable, str, List[str]]],
        params: Dict[str, Any],
    ) -> Tuple[Optional[Union[Booster, str]], Optional[Callable], Dict[str, Any]]:
        """Resolve the warm-start model and the evaluation metric.

        Returns:
            ``(model, feval, params)`` where ``model`` is the underlying
            booster if ``booster`` is an ``XGBModel`` (otherwise ``booster``
            itself), ``feval`` is ``eval_metric`` if it is callable (else
            ``None``), and ``params`` is the input dict, updated in place
            with ``eval_metric`` when that is a non-callable value.
        """
        # pylint: disable=protected-access, no-self-use
        if isinstance(booster, XGBModel):
            # Handle the case when xgb_model is a sklearn model object
            model: Optional[Union[Booster, str]] = booster._Booster
        else:
            model = booster

        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None and feval is None:
            # Built-in metric name(s): hand them to xgboost through params.
            params.update({"eval_metric": eval_metric})
        return model, feval, params

    # copied from the file in the top comment
    # provided here for compatibility with legacy xgboost versions
    # will be overwritten by vanilla xgboost if possible
    def _set_evaluation_result(self, evals_result) -> None:
        """Store a non-empty ``evals_result`` on ``self.evals_result_``."""
        if evals_result:
            # NOTE(review): each iteration re-assigns the first metric of an
            # evaluation set to itself; kept as in the legacy xgboost code
            # this was copied from.
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result_ = evals_result


@PublicAPI(stability="beta")
class RayXGBRegressor(XGBRegressor, RayXGBMixin):
    __init__ = _xgboost_version_warn(XGBRegressor.__init__)

    # NOTE: the docstrings of ``fit`` and ``predict`` are assigned right after
    # each definition from the corresponding xgboost docstrings.
    @_deprecate_positional_args
    def fit(
        self,
        X,
        y,
        *,
        sample_weight=None,
        base_margin=None,
        eval_set=None,
        eval_metric=None,
        early_stopping_rounds=None,
        verbose=True,
        xgb_model: Optional[Union[Booster, str, "XGBModel"]] = None,
        sample_weight_eval_set=None,
        base_margin_eval_set=None,
        feature_weights=None,
        callbacks=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
    ):
        metrics_history = {}
        ray_dmatrix_params = ray_dmatrix_params or {}

        # Use X / eval_set directly if the caller already built RayDMatrix
        # objects; otherwise train_dmatrix comes back as None.
        train_dmatrix, evals = _check_if_params_are_ray_dmatrix(
            X,
            sample_weight,
            base_margin,
            eval_set,
            sample_weight_eval_set,
            base_margin_eval_set,
        )

        if train_dmatrix is None:

            # changed in xgboost-ray: matrices are RayDMatrix instances and
            # the user supplied ray_dmatrix_params take precedence.
            def _make_ray_dmatrix(**dmatrix_kwargs):
                return RayDMatrix(**{**dmatrix_kwargs, **ray_dmatrix_params})

            compat_kwargs = self._ray_get_wrap_evaluation_matrices_compat_kwargs()
            train_dmatrix, evals = _wrap_evaluation_matrices(
                missing=self.missing,
                X=X,
                y=y,
                group=None,
                qid=None,
                sample_weight=sample_weight,
                base_margin=base_margin,
                feature_weights=feature_weights,
                eval_set=eval_set,
                sample_weight_eval_set=sample_weight_eval_set,
                base_margin_eval_set=base_margin_eval_set,
                eval_group=None,
                eval_qid=None,
                create_dmatrix=_make_ray_dmatrix,
                **compat_kwargs,
            )

        params = self.get_xgb_params()

        obj = None
        if callable(self.objective):
            obj = _objective_decorator(self.objective)
            params["objective"] = "reg:squarederror"

        try:
            model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
        except TypeError:
            # XGBoost >= 1.6.0
            configured = self._configure_fit(
                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
            )
            model, feval, params, early_stopping_rounds, callbacks = configured

        # remove those as they will be set in RayXGBoostActor
        for actor_managed_key in ("n_jobs", "nthread"):
            params.pop(actor_managed_key, None)

        ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs)

        extra_results = {}

        booster = train(
            params,
            train_dmatrix,
            self.get_num_boosting_rounds(),
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            evals_result=metrics_history,
            obj=obj,
            feval=feval,
            verbose_eval=verbose,
            xgb_model=model,
            callbacks=callbacks,
            # changed in xgboost-ray:
            additional_results=extra_results,
            ray_params=ray_params,
            _remote=_remote,
        )
        self._Booster = booster
        self.additional_results_ = extra_results
        self._set_evaluation_result(metrics_history)
        return self

    fit.__doc__ = _treat_X_doc(_get_doc(XGBRegressor.fit)) + _RAY_PARAMS_DOC

    def _can_use_inplace_predict(self) -> bool:
        # Prediction always goes through the distributed ``_ray_predict``.
        return False

    def predict(
        self,
        X,
        output_margin=False,
        validate_features=True,
        base_margin=None,
        iteration_range=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ):
        predictions = self._ray_predict(
            X,
            output_margin=output_margin,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
            ray_params=ray_params,
            _remote=_remote,
            ray_dmatrix_params=ray_dmatrix_params,
            **kwargs,
        )
        return predictions

    predict.__doc__ = _treat_X_doc(_get_doc(XGBRegressor.predict)) + _RAY_PARAMS_DOC

    def load_model(self, fname):
        """Load a model from ``fname``, creating an empty booster first if
        this estimator does not have one yet."""
        booster_missing = not hasattr(self, "_Booster")
        if booster_missing:
            self._Booster = Booster()
        return super().load_model(fname)


RayXGBRegressor.__doc__ = _treat_estimator_doc(_get_doc(XGBRegressor))


class RayXGBRFRegressor(RayXGBRegressor):
    # Random-forest variant of ``RayXGBRegressor``: all trees are grown in a
    # single boosting round (``num_parallel_tree`` trees, one round).
    # The class docstring is assigned below from xgboost's ``XGBRFRegressor``.

    # too much work to make this compatible with 0.90
    if xgboost_version == "0.90":

        def __init__(self, *args, **kwargs):
            raise ValueError("RayXGBRFRegressor not available with xgboost<1.0.0")

    else:

        @_deprecate_positional_args
        @_xgboost_version_warn
        def __init__(
            self,
            *,
            learning_rate=1,
            subsample=0.8,
            colsample_bynode=0.8,
            reg_lambda=1e-5,
            **kwargs,
        ):
            super().__init__(
                learning_rate=learning_rate,
                subsample=subsample,
                colsample_bynode=colsample_bynode,
                reg_lambda=reg_lambda,
                **kwargs,
            )

    def get_xgb_params(self):
        """Return the parent's xgboost params plus ``num_parallel_tree``.

        The forest size is resolved through the parent's
        ``get_num_boosting_rounds()`` rather than the raw ``n_estimators``
        attribute, so that an unset (``None``) ``n_estimators`` maps to the
        parent's default instead of being passed on as ``None``.
        """
        params = super().get_xgb_params()
        # ``super()`` skips this class's own override, which always returns 1.
        params["num_parallel_tree"] = super().get_num_boosting_rounds()
        return params

    def get_num_boosting_rounds(self):
        """Return 1: the whole forest is built in one boosting round."""
        return 1


RayXGBRFRegressor.__doc__ = _treat_estimator_doc(_get_doc(XGBRFRegressor))


@PublicAPI(stability="beta")
class RayXGBClassifier(XGBClassifier, RayXGBMixin):
    __init__ = _xgboost_version_warn(XGBClassifier.__init__)

    # NOTE: the docstrings of ``fit``, ``predict`` and ``predict_proba`` are
    # assigned after each definition from the corresponding xgboost docstrings.
    @_deprecate_positional_args
    def fit(
        self,
        X,
        y,
        *,
        sample_weight=None,
        base_margin=None,
        eval_set=None,
        eval_metric=None,
        early_stopping_rounds=None,
        verbose=True,
        xgb_model=None,
        sample_weight_eval_set=None,
        base_margin_eval_set=None,
        feature_weights=None,
        callbacks=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
    ):
        metrics_history = {}
        ray_dmatrix_params = ray_dmatrix_params or {}

        params = self.get_xgb_params()

        train_dmatrix, evals = _check_if_params_are_ray_dmatrix(
            X,
            sample_weight,
            base_margin,
            eval_set,
            sample_weight_eval_set,
            base_margin_eval_set,
        )
        x_is_ray_dmatrix = train_dmatrix is not None

        if x_is_ray_dmatrix:
            # Labels live inside the RayDMatrix, so the number of classes
            # has to come from the estimator's parameters.
            if "num_class" not in params:
                raise ValueError(
                    "`num_class` must be set during initalization if X"
                    " is a RayDMatrix"
                )
            declared_classes = params["num_class"]
            if XGBOOST_VERSION < Version("2.0.0"):
                self.classes_ = list(range(0, declared_classes))
            self.n_classes_ = declared_classes
            if self.n_classes_ <= 2:
                params.pop("num_class")
        else:
            if len(X.shape) != 2:
                # Simply raise an error here since there might be many
                # different ways of reshaping
                raise ValueError(
                    "Please reshape the input data X into 2-dimensional " "matrix."
                )

            if XGBOOST_VERSION < Version("2.0.0"):
                self.classes_ = np.unique(y)
            self.n_classes_ = len(np.unique(y))

        # Labels are passed through unchanged in both cases.
        def label_transform(labels):
            return labels

        obj = None
        if callable(self.objective):
            obj = _objective_decorator(self.objective)
            # Use default value. Is it really not used ?
            params["objective"] = "binary:logistic"

        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying
            # XGB instance
            params["objective"] = "multi:softprob"
            params["num_class"] = self.n_classes_

        try:
            model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
        except TypeError:
            # XGBoost >= 1.6.0
            configured = self._configure_fit(
                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
            )
            model, feval, params, early_stopping_rounds, callbacks = configured

        if train_dmatrix is None:

            # changed in xgboost-ray: matrices are RayDMatrix instances and
            # the user supplied ray_dmatrix_params take precedence.
            def _make_ray_dmatrix(**dmatrix_kwargs):
                return RayDMatrix(**{**dmatrix_kwargs, **ray_dmatrix_params})

            compat_kwargs = self._ray_get_wrap_evaluation_matrices_compat_kwargs(
                label_transform=label_transform
            )
            train_dmatrix, evals = _wrap_evaluation_matrices(
                missing=self.missing,
                X=X,
                y=y,
                group=None,
                qid=None,
                sample_weight=sample_weight,
                base_margin=base_margin,
                feature_weights=feature_weights,
                eval_set=eval_set,
                sample_weight_eval_set=sample_weight_eval_set,
                base_margin_eval_set=base_margin_eval_set,
                eval_group=None,
                eval_qid=None,
                create_dmatrix=_make_ray_dmatrix,
                **compat_kwargs,
            )

        # remove those as they will be set in RayXGBoostActor
        for actor_managed_key in ("n_jobs", "nthread"):
            params.pop(actor_managed_key, None)

        ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs)

        extra_results = {}

        booster = train(
            params,
            train_dmatrix,
            self.get_num_boosting_rounds(),
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            evals_result=metrics_history,
            obj=obj,
            feval=feval,
            verbose_eval=verbose,
            xgb_model=model,
            callbacks=callbacks,
            # changed in xgboost-ray:
            additional_results=extra_results,
            ray_params=ray_params,
            _remote=_remote,
        )
        self._Booster = booster

        if not callable(self.objective):
            # Remember the objective that was actually used for training.
            self.objective = params["objective"]

        self.additional_results_ = extra_results
        self._set_evaluation_result(metrics_history)
        return self

    fit.__doc__ = _treat_X_doc(_get_doc(XGBClassifier.fit)) + _RAY_PARAMS_DOC

    def _can_use_inplace_predict(self) -> bool:
        # Prediction always goes through the distributed ``_ray_predict``.
        return False

    def predict(
        self,
        X,
        output_margin=False,
        validate_features=True,
        base_margin=None,
        iteration_range: Optional[Tuple[int, int]] = None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ):
        class_probs = self._ray_predict(
            X=X,
            output_margin=output_margin,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
            ray_params=ray_params,
            _remote=_remote,
            ray_dmatrix_params=ray_dmatrix_params,
            **kwargs,
        )
        if output_margin:
            # If output_margin is active, simply return the scores
            return class_probs

        is_multiclass_output = len(class_probs.shape) > 1
        if is_multiclass_output:
            # turns softprob into softmax
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            # turns soft logit into class label
            positive_mask = class_probs > 0.5
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[positive_mask] = 1

        if not hasattr(self, "_le"):
            return column_indexes
        # Map the class indexes back through the stored label encoder.
        return self._le.inverse_transform(column_indexes)

    predict.__doc__ = _treat_X_doc(_get_doc(XGBModel.predict)) + _RAY_PARAMS_DOC

    def predict_proba(
        self,
        X,
        validate_features=False,
        base_margin=None,
        iteration_range: Optional[Tuple[int, int]] = None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ) -> np.ndarray:
        # Raw margins are requested only for the "multi:softmax" objective.
        want_margin = self.objective == "multi:softmax"
        class_probs = self._ray_predict(
            X=X,
            output_margin=want_margin,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
            ray_params=ray_params,
            _remote=_remote,
            ray_dmatrix_params=ray_dmatrix_params,
            **kwargs,
        )
        # If model is loaded from a raw booster there's no `n_classes_`
        n_classes = getattr(self, "n_classes_", None)
        return _cls_predict_proba(n_classes, class_probs, np.vstack)

    predict_proba.__doc__ = (
        _treat_X_doc(_get_doc(XGBClassifier.predict_proba)) + _RAY_PARAMS_DOC
    )

    def load_model(self, fname):
        """Load a model from ``fname``, creating an empty booster first if
        this estimator does not have one yet."""
        booster_missing = not hasattr(self, "_Booster")
        if booster_missing:
            self._Booster = Booster()
        return super().load_model(fname)


RayXGBClassifier.__doc__ = _treat_estimator_doc(_get_doc(XGBClassifier))


class RayXGBRFClassifier(RayXGBClassifier):
    # Random-forest variant of ``RayXGBClassifier``: all trees are grown in a
    # single boosting round (``num_parallel_tree`` trees, one round).
    # The class docstring is assigned below from xgboost's ``XGBRFClassifier``.

    # too much work to make this compatible with 0.90
    if xgboost_version == "0.90":

        def __init__(self, *args, **kwargs):
            raise ValueError("RayXGBRFClassifier not available with xgboost<1.0.0")

    else:

        @_deprecate_positional_args
        @_xgboost_version_warn
        def __init__(
            self,
            *,
            learning_rate=1,
            subsample=0.8,
            colsample_bynode=0.8,
            reg_lambda=1e-5,
            **kwargs,
        ):
            super().__init__(
                learning_rate=learning_rate,
                subsample=subsample,
                colsample_bynode=colsample_bynode,
                reg_lambda=reg_lambda,
                **kwargs,
            )

    def get_xgb_params(self):
        """Return the parent's xgboost params plus ``num_parallel_tree``.

        The forest size is resolved through the parent's
        ``get_num_boosting_rounds()`` rather than the raw ``n_estimators``
        attribute, so that an unset (``None``) ``n_estimators`` maps to the
        parent's default instead of being passed on as ``None``.
        """
        params = super().get_xgb_params()
        # ``super()`` skips this class's own override, which always returns 1.
        params["num_parallel_tree"] = super().get_num_boosting_rounds()
        return params

    def get_num_boosting_rounds(self):
        """Return 1: the whole forest is built in one boosting round."""
        return 1


RayXGBRFClassifier.__doc__ = _treat_estimator_doc(_get_doc(XGBRFClassifier))


@PublicAPI(stability="beta")
class RayXGBRanker(XGBRanker, RayXGBMixin):
    __init__ = _xgboost_version_warn(XGBRanker.__init__)

    # NOTE: the docstrings of ``fit`` and ``predict`` are assigned right after
    # each definition from the corresponding xgboost docstrings.
    #
    # ``fit`` requires ``qid`` (``group`` / ``eval_group`` are rejected) and,
    # when ``eval_set`` is given, ``eval_qid``. Callable evaluation metrics
    # are rejected with a ``ValueError``. Returns ``self``.
    @_deprecate_positional_args
    def fit(
        self,
        X,
        y,
        *,
        group=None,
        qid=None,
        sample_weight=None,
        base_margin=None,
        eval_set=None,
        eval_group=None,
        eval_qid=None,
        eval_metric=None,
        early_stopping_rounds=None,
        verbose=False,
        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
        sample_weight_eval_set=None,
        base_margin_eval_set=None,
        feature_weights=None,
        callbacks=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
    ):

        if not (group is None and eval_group is None):
            raise ValueError("Use `qid` instead of `group` for RayXGBRanker.")
        if qid is None:
            raise ValueError("`qid` is required for ranking.")

        if eval_set is not None:
            if eval_qid is None:
                raise ValueError("`eval_qid `is required if" " `eval_set` is not None")

        evals_result = {}
        ray_dmatrix_params = ray_dmatrix_params or {}

        params = self.get_xgb_params()

        # Use X / eval_set directly if the caller already built RayDMatrix
        # objects; otherwise train_dmatrix comes back as None.
        train_dmatrix, evals = _check_if_params_are_ray_dmatrix(
            X,
            sample_weight,
            base_margin,
            eval_set,
            sample_weight_eval_set,
            base_margin_eval_set,
            eval_qid,
        )

        if train_dmatrix is None:
            train_dmatrix, evals = _wrap_evaluation_matrices(
                missing=self.missing,
                X=X,
                y=y,
                group=group,
                qid=qid,
                sample_weight=sample_weight,
                base_margin=base_margin,
                feature_weights=feature_weights,
                eval_set=eval_set,
                sample_weight_eval_set=sample_weight_eval_set,
                base_margin_eval_set=base_margin_eval_set,
                eval_group=eval_group,
                eval_qid=eval_qid,
                # changed in xgboost-ray:
                create_dmatrix=lambda **kwargs: RayDMatrix(
                    **{**kwargs, **ray_dmatrix_params}
                ),
                **self._ray_get_wrap_evaluation_matrices_compat_kwargs(),
            )

        try:
            model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
        except TypeError:
            # XGBoost >= 1.6.0
            (
                model,
                feval,
                params,
                early_stopping_rounds,
                callbacks,
            ) = self._configure_fit(
                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
            )
        if callable(feval):
            raise ValueError(
                "Custom evaluation metric is not yet supported for XGBRanker."
            )

        # remove those as they will be set in RayXGBoostActor
        params.pop("n_jobs", None)
        params.pop("nthread", None)

        ray_params = self._ray_set_ray_params_n_jobs(ray_params, self.n_jobs)

        additional_results = {}

        self._Booster = train(
            params,
            train_dmatrix,
            # Same accessor as the regressor and classifier use; it resolves
            # an unset (``None``) ``n_estimators`` instead of passing the raw
            # attribute on as the number of boosting rounds.
            self.get_num_boosting_rounds(),
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            evals_result=evals_result,
            feval=feval,
            verbose_eval=verbose,
            xgb_model=model,
            callbacks=callbacks,
            # changed in xgboost-ray:
            additional_results=additional_results,
            ray_params=ray_params,
            _remote=_remote,
        )

        # Remember the objective that was actually used for training.
        self.objective = params["objective"]

        self.additional_results_ = additional_results

        self._set_evaluation_result(evals_result)
        return self

    fit.__doc__ = _treat_X_doc(_get_doc(XGBRanker.fit)) + _RAY_PARAMS_DOC

    def _can_use_inplace_predict(self) -> bool:
        # Prediction always goes through the distributed ``_ray_predict``.
        return False

    def predict(
        self,
        X,
        output_margin=False,
        validate_features=True,
        base_margin=None,
        iteration_range=None,
        ray_params: Union[None, RayParams, Dict] = None,
        _remote: Optional[bool] = None,
        ray_dmatrix_params: Optional[Dict] = None,
        **kwargs,
    ):
        return self._ray_predict(
            X,
            output_margin=output_margin,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
            ray_params=ray_params,
            _remote=_remote,
            ray_dmatrix_params=ray_dmatrix_params,
            **kwargs,
        )

    predict.__doc__ = _treat_X_doc(_get_doc(XGBRanker.predict)) + _RAY_PARAMS_DOC

    def load_model(self, fname):
        """Load a model from ``fname``, creating an empty booster first if
        this estimator does not have one yet."""
        if not hasattr(self, "_Booster"):
            self._Booster = Booster()
        return super().load_model(fname)


RayXGBRanker.__doc__ = _treat_estimator_doc(_get_doc(XGBRanker))


================================================
FILE: xgboost_ray/tests/__init__.py
================================================


================================================
FILE: xgboost_ray/tests/conftest.py
================================================
from contextlib import contextmanager
from functools import partial

import pytest
import ray

try:
    # Ray 1.3+
    from ray._private.cluster_utils import Cluster
except ImportError:
    from ray.cluster_utils import Cluster


def get_default_fixure_system_config():
    """Return a fresh dict with the default ``_system_config`` values used by
    the test fixtures."""
    return dict(
        object_timeout_milliseconds=200,
        health_check_initial_delay_ms=0,
        health_check_failure_threshold=10,
        object_store_full_delay_ms=100,
    )


def get_default_fixture_ray_kwargs():
    """Return a fresh dict of default node keyword arguments for the test
    fixtures, embedding the default system config under ``_system_config``."""
    return dict(
        num_cpus=1,
        object_store_memory=150 * 1024 * 1024,
        dashboard_port=None,
        namespace="default_test_namespace",
        _system_config=get_default_fixure_system_config(),
    )


@contextmanager
def _ray_start_cluster(**kwargs):
    """Context manager that yields a ``Cluster`` with ``num_nodes`` nodes.

    Keyword arguments:
        num_nodes: Number of nodes to add to the cluster (default 0).
        do_init: Whether to call ``ray.init`` against the cluster once the
            first node has been added. Defaults to ``True`` when
            ``num_nodes > 0`` and ``False`` otherwise.
        **kwargs: Remaining arguments override the defaults from
            ``get_default_fixture_ray_kwargs()`` and are passed to
            ``cluster.add_node``.

    On exit ``ray.shutdown()`` and ``cluster.shutdown()`` are always called,
    including when the body of the ``with`` block or the cluster setup
    raises, so a failing test does not leave the cluster behind.
    """
    # num_nodes & do_init are not arguments for ray.init, so remove them.
    num_nodes = kwargs.pop("num_nodes", 0)
    if "do_init" in kwargs:
        do_init = kwargs.pop("do_init")
    else:
        do_init = num_nodes > 0

    init_kwargs = get_default_fixture_ray_kwargs()
    init_kwargs.update(kwargs)

    cluster = Cluster()
    try:
        remote_nodes = []
        for i in range(num_nodes):
            if i > 0:
                # Only the first node is given the system config.
                init_kwargs.pop("_system_config", None)
            remote_nodes.append(cluster.add_node(**init_kwargs))
            # We assume driver will connect to the head (first node),
            # so ray init will be invoked if do_init is true
            if len(remote_nodes) == 1 and do_init:
                ray.init(address=cluster.address)
        yield cluster
    finally:
        # Teardown: runs on normal exit and when an exception propagates.
        try:
            ray.shutdown()
        finally:
            cluster.shutdown()


# This fixture attaches a cluster-starting context manager factory to the
# requesting test class as ``ray_start_cluster``.
@pytest.fixture(scope="function")
def ray_start_cluster(request):
    """Bind ``_ray_start_cluster`` (pre-filled with the fixture parameters,
    if any) to ``request.cls.ray_start_cluster``."""
    fixture_params = getattr(request, "param", {})
    cluster_factory = partial(_ray_start_cluster, **fixture_params)
    request.cls.ray_start_cluster = cluster_factory


================================================
FILE: xgboost_ray/tests/env_info.sh
================================================
#!/bin/bash
# shellcheck disable=SC2005
#
# Print a short report about the test environment: Python version, Ray
# version and the installed pip packages. Each lookup falls back to a
# "not installed" message when the command fails.

printf '%s\n' "Test environment information"
printf '%s\n' "----------------------------"
printf 'Python version: %s\n' "$(python --version 2>/dev/null || echo 'Python not installed')"
printf 'Ray version: %s\n' "$(ray --version 2>/dev/null || echo 'Ray not installed')"
printf '%s\n' "Installed pip packages:"
printf '%s\n' "$(python -m pip freeze 2>/dev/null || echo 'Pip not installed')"
printf '%s\n' "----------------------------"


================================================
FILE: xgboost_ray/tests/fault_tolerance.py
================================================
import os
import time
from collections import defaultdict
from typing import Dict, Set, Tuple

import ray
from ray.actor import ActorHandle

from xgboost_ray.callback import DistributedCallback
from xgboost_ray.compat import TrainingCallback
from xgboost_ray.session import get_actor_rank


@ray.remote(num_cpus=0)
class FaultToleranceManager:
    """Ray actor holding the shared state used to inject faults in tests.

    It tracks a global boosting round, which ranks are scheduled to be
    killed at which round, during which round windows a rank should keep
    sleeping, and a per-rank log of iterations.
    """

    def __init__(self, start_boost_round: int = 0):
        # Dict from actor rank -> list of
        # (global_boost_round, actor_boost_round) tuples
        self.training_logs = defaultdict(list)

        # Dict from actor rank -> (start, end) boost round windows to sleep
        self.delayed_return: Dict[int, Set[Tuple[int, int]]] = defaultdict(set)

        # Dict from boost_round -> actor ranks to die
        self.scheduled_kill: Dict[int, Set[int]] = defaultdict(set)

        self.global_boost_round = start_boost_round

    def schedule_kill(self, rank: int, boost_round: int):
        """Kill an actor when reaching this global boost round"""
        ranks_to_kill = self.scheduled_kill[boost_round]
        ranks_to_kill.add(rank)

    def delay_return(self, rank: int, start_boost_round: int, end_boost_round: int):
        """Do not allow an actor to finish data loading between these rounds"""
        window = (start_boost_round, end_boost_round)
        self.delayed_return[rank].add(window)

    def inc_boost_round(self, rank: int):
        """Increase global boosting round (only when called for rank 0)"""
        if rank != 0:
            return
        self.global_boost_round += 1

    def log_iteration(self, rank: int, boost_round: int):
        """Log iteration"""
        entry = (self.global_boost_round, boost_round)
        self.training_logs[rank].append(entry)

    def should_die(self, rank: int):
        """Returns True if the actor should terminate the training job now."""
        die = False
        # Loop through all rounds until now to deal with race conditions
        for past_round in range(self.global_boost_round + 1):
            pending_ranks = self.scheduled_kill[past_round]
            if rank in pending_ranks:
                # Each scheduled kill fires only once.
                pending_ranks.remove(rank)
                die = True
        return die

    def should_sleep(self, rank: int):
        """Returns True if the actor should not finish data loading, yet."""
        sleep_windows = self.delayed_return[rank]
        current_round = self.global_boost_round
        return any(start <= current_round < end for start, end in sleep_windows)

    def get_logs(self):
        """Return the per-rank iteration logs."""
        return self.training_logs


class DelayedLoadingCallback(DistributedCallback):
    """Used to control when actors return to training"""

    def __init__(self, ft_manager: ActorHandle, reload_data=True, sleep_time=0.5):
        self.sleep_time = sleep_time
        self.reload_data = reload_data
        self.ft_manager = ft_manager

    def after_data_loading(self, actor, data, *args, **kwargs):
        """Block after data loading for as long as the fault tolerance
        manager reports that this actor's rank should sleep."""
        print(f"Rank {actor.rank} - after load")
        while True:
            keep_waiting = ray.get(self.ft_manager.should_sleep.remote(actor.rank))
            if not keep_waiting:
                break
            time.sleep(self.sleep_time)
        print(f"Rank {actor.rank} - returning now")


class DieCallback(TrainingCallback):
    """Used to control when actors should die during training.

    Also can add delay to each boosting round.
    """

    def __init__(self, ft_manager: ActorHandle, training_delay: float = 0):
        self.ft_manager = ft_manager
        self.training_delay = training_delay
        super().__init__()

    def before_iteration(self, model, epoch, evals_log):
        """Kill the current process if the manager scheduled this rank to die."""
        must_die = ray.get(self.ft_manager.should_die.remote(get_actor_rank()))
        if not must_die:
            return
        pid = os.getpid()
        print(f"Killing process: {pid}")
        print(f"Rank {get_actor_rank()} will now die.")
        time.sleep(1)
        os.kill(pid, 9)
        time.sleep(10)  # Don't continue training, just die

    def after_iteration(self, model, epoch, evals_log):
        """Log the finished iteration, optionally sleep, and let rank 0
        advance the global boosting round."""
        rank = get_actor_rank()
        # ray.get to make sure this is up to date in the next iteration
        ray.get(self.ft_manager.log_iteration.remote(rank, epoch))
        if self.training_delay > 0:
            time.sleep(self.training_delay)
        if rank == 0:
            ray.get(self.ft_manager.inc_boost_round.remote(rank))


================================================
FILE: xgboost_ray/tests/release/benchmark_cpu_gpu.py
================================================
import argparse
import glob
import os
import shutil
import time

import ray

from xgboost_ray import (
    RayDeviceQuantileDMatrix,
    RayDMatrix,
    RayFileType,
    RayParams,
    train,
)
from xgboost_ray.tests.utils import create_parquet_in_tempdir

# Drop OMP_NUM_THREADS from the environment if it is set.
os.environ.pop("OMP_NUM_THREADS", None)


def train_ray(
    path,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    smoke_test=False,
    ray_params=None,
    xgboost_params=None,
    **kwargs,
):
    """Train a model on parquet data with xgboost_ray and measure the time.

    Args:
        path: Dataset location. When ``num_files`` is non-zero it is used as
            the base of the glob ``{path}/**/*.parquet``; otherwise it is
            handed to the data matrix unchanged.
        num_workers: Number of actors used for the matrix and for the
            default ``RayParams``.
        num_boost_rounds: Number of boosting rounds.
        num_files: Number of parquet files to train on. The sorted list of
            matched files is repeated until it is long enough and then
            truncated. ``0`` disables the file selection.
        regression: Use ``reg:squarederror`` instead of ``binary:logistic``.
        use_gpu: Use ``gpu_hist`` and one GPU per actor in the defaults. A
            ``RayDeviceQuantileDMatrix`` is used only if ``cupy`` imports.
        smoke_test: Use 1 instead of 4 CPUs per actor in the default
            ``RayParams``.
        ray_params: Optional ``RayParams`` replacing the defaults.
        xgboost_params: Optional XGBoost parameter dict. It is copied before
            the objective and evaluation metrics are added, so the caller's
            dict is left untouched.
        **kwargs: Forwarded to ``train``.

    Returns:
        Tuple ``(bst, taken)`` of the trained booster and the training time
        in seconds.

    Raises:
        ValueError: If ``num_files`` is set but no parquet file matches.
    """
    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        if not files:
            # Doubling an empty list below would never terminate
            raise ValueError(f"No parquet files found in: {path}")
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    matrix_cls = RayDeviceQuantileDMatrix if use_device_matrix else RayDMatrix
    dtrain = matrix_cls(
        path,
        num_actors=num_workers,
        label="labels",
        ignore=["partition"],
        filetype=RayFileType.PARQUET,
    )

    if xgboost_params:
        # Work on a copy so the caller's dict is not modified
        config = dict(xgboost_params)
    else:
        config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
        return_metric = "error"
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )
        return_metric = "rmse"

    start = time.time()
    evals_result = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params
        or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=4 if not smoke_test else 1,
            gpus_per_actor=0 if not use_gpu else 1,
        ),
        evals=[(dtrain, "train")],
        **kwargs,
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    # "error" is only recorded for classification; regression records "rmse"
    print(
        "Final training error: {:.4f}".format(evals_result["train"][return_metric][-1])
    )
    return bst, taken


if __name__ == "__main__":
    # Command line: three positional sizes plus optional switches
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument("num_workers", type=int, help="num workers")
    parser.add_argument("num_rounds", type=int, help="num boost rounds")
    parser.add_argument("num_files", type=int, help="num files")
    parser.add_argument(
        "--file", default="/data/parted.parquet", type=str, help="data file"
    )
    for flag, help_text in (
        ("--regression", "regression"),
        ("--gpu", "gpu"),
        ("--smoke-test", "smoke test"),
    ):
        parser.add_argument(flag, action="store_true", default=False, help=help_text)

    args = parser.parse_args()

    num_workers = args.num_workers
    num_boost_rounds = args.num_rounds
    num_files = args.num_files

    temp_dir = None
    if not args.smoke_test:
        use_gpu = args.gpu
        path = args.file
        if not os.path.exists(path):
            raise ValueError(
                f"Benchmarking data not found: {path}."
                f"\nFIX THIS by running `python create_test_data.py` first."
            )
    else:
        # Smoke tests train on freshly generated data and never use GPUs
        use_gpu = False
        temp_dir, path = create_parquet_in_tempdir(
            filename="smoketest.parquet",
            num_rows=args.num_workers * 500,
            num_features=4,
            num_classes=2,
            num_partitions=args.num_workers * 10,
        )

    # Smoke tests start a local Ray instance, everything else joins a cluster
    init_kwargs = {"num_cpus": num_workers} if args.smoke_test else {"address": "auto"}
    init_start = time.time()
    ray.init(**init_kwargs)
    init_taken = time.time() - init_start

    full_start = time.time()
    bst, train_taken = train_ray(
        path=path,
        num_workers=num_workers,
        num_boost_rounds=num_boost_rounds,
        num_files=num_files,
        regression=args.regression,
        use_gpu=use_gpu,
        smoke_test=args.smoke_test,
    )
    full_taken = time.time() - full_start
    print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds ({init_taken:.2f} for init)")

    if args.smoke_test:
        shutil.rmtree(temp_dir, ignore_errors=True)
    else:
        # Append one result row to res.csv
        row = (
            num_workers,
            num_files,
            int(use_gpu),
            num_boost_rounds,
            init_taken,
            full_taken,
            train_taken,
        )
        with open("res.csv", "at") as fp:
            fp.write(",".join(map(str, row)) + "\n")


================================================
FILE: xgboost_ray/tests/release/benchmark_ft.py
================================================
import argparse
import glob
import os
from typing import Dict, List

import numpy as np
import ray
from ray import tune
from ray.tune import CLIReporter

from xgboost_ray import (
    RayDeviceQuantileDMatrix,
    RayDMatrix,
    RayFileType,
    RayParams,
    RayShardingMode,
    train,
)
from xgboost_ray.callback import EnvironmentCallback
from xgboost_ray.matrix import _get_sharding_indices
from xgboost_ray.tests.fault_tolerance import (
    DelayedLoadingCallback,
    DieCallback,
    FaultToleranceManager,
)
from xgboost_ray.tests.utils import create_parquet_in_tempdir

# Drop an inherited OMP_NUM_THREADS setting from this process' environment
os.environ.pop("OMP_NUM_THREADS", None)


def train_ray(
    train_files,
    eval_files,
    num_workers,
    num_boost_round,
    regression=False,
    use_gpu=False,
    ray_params=None,
    xgboost_params=None,
    ft_manager=None,
    aws=None,
    **kwargs,
):
    """Train and evaluate a model with xgboost_ray for the FT benchmark.

    Args:
        train_files: Parquet data passed to the training matrix.
        eval_files: Parquet data passed to the evaluation matrix.
        num_workers: Number of actors. Also written to
            ``ray_params.num_actors``.
        num_boost_round: Number of boosting rounds.
        regression: Use ``reg:squarederror``/``rmse`` instead of
            ``binary:logistic``/``error``.
        use_gpu: Force ``tree_method="gpu_hist"`` and one GPU per actor. A
            ``RayDeviceQuantileDMatrix`` is used only if ``cupy`` imports.
        ray_params: Optional ``RayParams``. The object is modified in place
            (``num_actors``, ``gpus_per_actor``, ``distributed_callbacks``).
        xgboost_params: Optional XGBoost parameter dict. It is copied before
            being extended, so the caller's dict is left untouched.
        ft_manager: Optional fault tolerance manager handle. When given, a
            ``DelayedLoadingCallback`` and a ``DieCallback`` are installed.
        aws: Optional dict of environment variables. When given it is
            passed to an ``EnvironmentCallback`` and also merged into this
            process' ``os.environ``.
        **kwargs: Forwarded to ``train``.

    Returns:
        Tuple ``(bst, results)``. ``results`` holds the last ``logloss`` and
        ``error``/``rmse`` values for the train and eval sets and
        ``total_n`` from the additional results.
    """
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_gpu and use_device_matrix:
        matrix_cls = RayDeviceQuantileDMatrix
    else:
        matrix_cls = RayDMatrix

    def _build_matrix(files):
        # Train and eval matrices only differ in the files they read
        return matrix_cls(
            files,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )

    dtrain = _build_matrix(train_files)
    deval = _build_matrix(eval_files)

    # Work on a copy so the caller's dict is not modified by the updates below
    config = dict(xgboost_params) if xgboost_params else {"tree_method": "hist"}

    if use_gpu:
        config.update({"tree_method": "gpu_hist"})

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
        return_metric = "error"
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )
        return_metric = "rmse"

    xgboost_callbacks = []
    distributed_callbacks = []
    if ft_manager:
        delay_callback = DelayedLoadingCallback(
            ft_manager, reload_data=True, sleep_time=0.1
        )
        distributed_callbacks.append(delay_callback)

        die_callback = DieCallback(ft_manager, training_delay=0.1)
        xgboost_callbacks.append(die_callback)

    if aws:
        aws_callback = EnvironmentCallback(aws)
        distributed_callbacks.append(aws_callback)
        os.environ.update(aws)

    ray_params = ray_params or RayParams()
    ray_params.num_actors = num_workers
    ray_params.gpus_per_actor = 0 if not use_gpu else 1
    ray_params.distributed_callbacks = distributed_callbacks

    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_round,
        ray_params=ray_params,
        evals=[(dtrain, "train"), (deval, "eval")],
        callbacks=xgboost_callbacks,
        **kwargs,
    )

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    print(
        "Final training error: {:.4f}".format(evals_result["train"][return_metric][-1])
    )

    results = {
        "train-logloss": evals_result["train"]["logloss"][-1],
        f"train-{return_metric}": evals_result["train"][return_metric][-1],
        "eval-logloss": evals_result["eval"]["logloss"][-1],
        f"eval-{return_metric}": evals_result["eval"][return_metric][-1],
        "total_n": additional_results["total_n"],
    }

    return bst, results


def ft_setup(
    workers: List[int],
    num_rounds: int,
    die_round_factor: float = 0.25,
    comeback_round_factor: float = 0.75,
):
    """Set up a fault tolerance manager and schedule kills and comebacks.

    Args:
        workers: Ranks of the workers to kill. ``None`` disables fault
            injection.
        num_rounds: Total number of boosting rounds of the training run.
        die_round_factor: Fraction of ``num_rounds`` after which the workers
            are killed.
        comeback_round_factor: Fraction of ``num_rounds`` at which the
            workers are allowed to return.

    Returns:
        The ``FaultToleranceManager`` actor handle, or ``None`` if
        ``workers`` is ``None``.
    """
    if workers is None:
        return None

    ft_manager = FaultToleranceManager.remote()

    # Translate the round fractions into absolute boosting rounds
    die_round = int(die_round_factor * num_rounds)
    comeback_round = int(comeback_round_factor * num_rounds)

    for worker in workers:
        ft_manager.schedule_kill.remote(rank=worker, boost_round=die_round)
        # Delay the return of the worker that is killed, not of a fixed rank
        ft_manager.delay_return.remote(
            rank=worker,
            start_boost_round=die_round - 2,
            end_boost_round=comeback_round - 1,
        )

    print(
        f"Scheduled workers {list(workers)} to die at round {die_round} "
        f"and to come back at round {comeback_round} "
        f"(total {num_rounds} training rounds)"
    )

    return ft_manager


def run_experiments(config, files, aws):
    """Run one fault tolerance experiment as a Ray Tune function trainable.

    ``config["condition"]`` selects the experiment:

    * ``"calibrate"``: plain training with early stopping after 10 rounds.
    * ``"fewer_workers"``: training without the affected workers and
      without their shards of the training files.
    * ``"non_elastic"``, ``"elastic_no_comeback"``, ``"elastic_comeback"``:
      training in which the affected workers are killed after 50% of the
      boosting rounds.

    Args:
        config: Tune config with the keys ``condition``, ``num_boost_round``,
            ``num_workers``, ``affected_workers``, ``regression``,
            ``use_gpu``, ``seed``, ``xgboost_params`` and ``ray_params``.
        files: Data files. They are shuffled in place; the first 80% are
            used for training and the rest for evaluation.
        aws: Passed through to ``train_ray``.

    Returns:
        The results dict of ``train_ray``, or ``{metric: inf}`` for failure
        conditions with zero affected workers.

    Raises:
        ValueError: If the condition is unknown.
    """
    os.environ["RXGB_ALLOW_ELASTIC_TUNE"] = "1"

    condition = config["condition"]

    num_boost_round = config["num_boost_round"]
    num_workers = config["num_workers"]
    num_affected_workers = config["affected_workers"]
    regression = config["regression"]
    use_gpu = config["use_gpu"]
    seed = config["seed"]

    metric = "eval-rmse" if regression else "eval-error"

    xgboost_params: Dict = config["xgboost_params"]
    ray_params: RayParams = config["ray_params"]

    # Select a fixed subset of the files for evaluation only
    np.random.seed(seed)
    np.random.shuffle(files)
    split_at = int(0.8 * len(files))
    train_files = list(files[:split_at])
    eval_files = list(files[split_at:])

    affected_workers = None
    if num_affected_workers:
        affected_workers = np.random.choice(
            np.arange(1, num_workers), size=num_affected_workers, replace=False
        ).tolist()

    np.random.seed(seed)  # Re-seed because of conditional evaluation

    # Dataset to train on
    sharding_mode = RayShardingMode.INTERLEAVED

    # Arguments that are identical for every train_ray call below
    shared_args = dict(
        eval_files=eval_files,
        num_boost_round=num_boost_round,
        regression=regression,
        use_gpu=use_gpu,
        ray_params=ray_params,
        xgboost_params=xgboost_params,
        aws=aws,
    )

    if condition == "calibrate":
        ray_params.num_actors = num_workers

        _, results = train_ray(
            train_files=train_files,
            num_workers=num_workers,
            ft_manager=None,
            early_stopping_rounds=10,
            **shared_args,
        )
        return results

    if condition == "fewer_workers":
        # Sanity check: Just train with fewer workers
        final_files, final_workers = train_files, num_workers

        if affected_workers is not None:
            # Collect the file indices that belong to the affected workers
            dropped = []
            for rank in affected_workers:
                dropped.extend(
                    _get_sharding_indices(
                        sharding=sharding_mode,
                        rank=rank,
                        num_actors=num_workers,
                        n=len(train_files),
                    )
                )

            keep = np.ones(len(train_files), dtype=bool)
            keep[dropped] = False

            final_files = np.array(train_files)[keep].tolist()
            final_workers = num_workers - len(affected_workers)

        ray_params.num_actors = final_workers

        _, results = train_ray(
            train_files=final_files,
            num_workers=final_workers,
            ft_manager=None,
            **shared_args,
        )
        return results

    if num_affected_workers == 0:
        # No duplicate baseline runs
        return {metric: float("inf")}

    # condition -> (elastic_training, comeback_round_factor)
    # All conditions kill the actors after 50% of the rounds. They differ in
    # whether training is elastic and in the comeback round factor.
    failure_plans = {
        "non_elastic": (False, 0.0),
        "elastic_no_comeback": (True, 1.1),
        "elastic_comeback": (True, 0.75),
    }
    if condition not in failure_plans:
        raise ValueError("Unknown condition:", condition)

    elastic, comeback_factor = failure_plans[condition]
    ray_params.elastic_training = elastic
    ray_params.max_failed_actors = len(affected_workers)
    ray_params.max_actor_restarts = 1

    ft_manager = ft_setup(
        workers=affected_workers,
        num_rounds=num_boost_round,
        die_round_factor=0.5,
        comeback_round_factor=comeback_factor,
    )

    _, results = train_ray(
        train_files=train_files,
        num_workers=num_workers,
        ft_manager=ft_manager,
        **shared_args,
    )
    return results


if __name__ == "__main__":
    # Only needed by this script entry point to remove the smoke test data
    import shutil

    parser = argparse.ArgumentParser(description="Process some integers.")

    parser.add_argument("num_workers", type=int, help="num workers")
    parser.add_argument("num_rounds", type=int, help="num boost rounds")
    parser.add_argument("num_files", type=int, help="num files")

    parser.add_argument("--cpu", default=0, type=int, help="num cpus per worker")

    parser.add_argument(
        "--file", default="/data/parted.parquet", type=str, help="data file"
    )

    parser.add_argument(
        "--regression", action="store_true", default=False, help="regression"
    )

    parser.add_argument("--gpu", action="store_true", default=False, help="gpu")

    parser.add_argument(
        "--calibrate", action="store_true", default=False, help="calibrate boost rounds"
    )

    parser.add_argument(
        "--smoke-test", action="store_true", default=False, help="smoke test"
    )

    args = parser.parse_args()

    num_workers = args.num_workers
    num_boost_round = args.num_rounds
    num_files = args.num_files
    use_gpu = args.gpu

    aws = None

    temp_dir = None
    if args.smoke_test:
        # NOTE(review): benchmark_cpu_gpu.py uses the second return value of
        # create_parquet_in_tempdir as a dataset path, while it is used as a
        # list of files here. Confirm the helper's return type.
        temp_dir, files = create_parquet_in_tempdir(
            filename="smoketest.parquet",
            num_rows=args.num_workers * 500,
            num_features=4,
            num_classes=2,
            num_partitions=args.num_workers * 10,
        )
        # Smoke tests never use GPUs, even if --gpu was passed
        use_gpu = False
    else:
        path = args.file
        if path.startswith("s3://"):
            # S3 paths are given as "<base>#<number of partitions>"
            base, num_partitions = path.split("#", maxsplit=1)
            num_partitions = int(num_partitions)
            files = [
                f"{base}/partition={i}/part_{i}.parquet" for i in range(num_partitions)
            ]
            print(
                f"Using S3 dataset with base {base} and "
                f"{num_partitions} partitions."
            )
            try:
                aws = {
                    "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
                    "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
                    "AWS_SESSION_TOKEN": os.environ["AWS_SESSION_TOKEN"],
                }
            except KeyError as e:
                raise ValueError(
                    "Trying to access AWS S3, but credentials are not set "
                    "in the environment. Did you forget to set your "
                    "credentials?"
                ) from e

        elif not os.path.exists(path):
            raise ValueError(
                f"Benchmarking data not found: {path}."
                f"\nFIX THIS by running `python create_test_data.py` first."
            )
        else:
            files = sorted(glob.glob(f"{path}/**/*.parquet"))
            print(
                f"Using local dataset with base {path} and " f"{len(files)} partitions."
            )

    if num_files:
        # Repeat the file list until it is long enough, then truncate it
        while num_files > len(files):
            files = files + files
        files = files[0:num_files]

    if args.smoke_test:
        ray.init(num_cpus=num_workers)
    else:
        ray.init(address="auto")

    ray_params = RayParams(
        num_actors=num_workers,
        cpus_per_actor=args.cpu,
        checkpoint_frequency=1,
    )

    config = {
        "num_workers": num_workers,
        "num_boost_round": num_boost_round,
        "seed": 1000,
        "condition": tune.grid_search(
            [
                "fewer_workers",
                "non_elastic",
                "elastic_no_comeback",
                "elastic_comeback",
            ]
        ),
        "affected_workers": tune.grid_search([0, 1, 2, 3]),
        "regression": args.regression,
        # Use the resolved value so that the smoke test override is honored
        "use_gpu": use_gpu,
        "xgboost_params": {},
        "ray_params": ray_params,
    }

    if args.calibrate:
        config["condition"] = "calibrate"
        config["affected_workers"] = 0

    metric = "eval-error" if not args.regression else "eval-rmse"
    train_metric = "train-error" if not args.regression else "train-rmse"

    reporter = CLIReporter(
        parameter_columns=["condition", "affected_workers"],
        metric_columns=[
            metric,
            "eval-logloss",
            train_metric,
            "total_n",
            "time_total_s",
        ],
        print_intermediate_tables=True,
    )

    analysis = tune.run(
        tune.with_parameters(run_experiments, files=files, aws=aws),
        config=config,
        metric=metric,
        mode="min",
        resources_per_trial=ray_params.get_tune_resources(),
        reuse_actors=True,
        progress_reporter=reporter,
        log_to_file=True,
        verbose=2,
    )

    print(f"Best config: {analysis.best_config} " f"with result {analysis.best_result}")

    if temp_dir is not None:
        # Remove the generated smoke test data, like benchmark_cpu_gpu.py does
        shutil.rmtree(temp_dir, ignore_errors=True)


================================================
FILE: xgboost_ray/tests/release/cluster_cpu.yaml
================================================
# Cluster configuration for the xgboost_ray CPU release tests.
# Templated values are read from the environment when the file is rendered:
#   NUM_WORKERS          number of worker nodes (defaults to 0)
#   RAY_WHEEL            Ray package passed to pip (defaults to "ray")
#   XGBOOST_RAY_PACKAGE  xgboost_ray package passed to pip
#                        (defaults to "xgboost_ray")
cluster_name: xgboost_ray_release_tests_cpu_{{env["NUM_WORKERS"] | default(0)}}

max_workers: {{env["NUM_WORKERS"] | default(0)}}
upscaling_speed: 9999

idle_timeout_minutes: 15

docker:
    image: anyscale/ray:nightly
    container_name: ray_container
    pull_before_run: true
    run_options:
      - --privileged

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

# A single node type (4 CPUs) is used for the head node and all workers;
# min_workers equals max_workers, so the worker count is fixed.
available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: {{env["NUM_WORKERS"] | default(0)}}
        max_workers: {{env["NUM_WORKERS"] | default(0)}}

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

file_mounts_sync_continuously: false

# Install Ray, the test dependencies and xgboost_ray
setup_commands:
    - pip install -U {{env["RAY_WHEEL"] | default("ray")}}
    - pip install dask pytest
    - pip install -U {{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}


================================================
FILE: xgboost_ray/tests/release/cluster_ft.yaml
================================================
# Cluster configuration for the xgboost_ray fault tolerance release tests.
# Templated values are read from the environment when the file is rendered:
#   RAY_WHEEL            Ray package passed to pip (defaults to "ray")
#   XGBOOST_RAY_PACKAGE  xgboost_ray package handed to setup_xgboost.sh
#                        (defaults to "xgboost_ray")
cluster_name: xgboost_ray_release_tests_ft_cluster

max_workers: 9

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: anyscale/ray:nightly
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

# A single node type (16 CPUs) is used for the head node and all workers;
# min_workers equals max_workers, so the worker count is fixed at 9.
available_node_types:
    cpu_16_ondemand:
        node_config:
            InstanceType: m5.4xlarge
        resources: {"CPU": 16}
        min_workers: 9
        max_workers: 9

# NOTE(review): this mounts the local directory to /release_tests, while the
# last setup command runs ~/xgboost_tests/setup_xgboost.sh and
# cluster_gpu.yaml mounts to ~/xgboost_tests. Confirm the script is
# available at that path on this cluster.
file_mounts: {
  "/release_tests": "./"
}


auth:
    ssh_user: ubuntu

head_node_type: cpu_16_ondemand
worker_default_node_type: cpu_16_ondemand

# Install the S3 tooling and Ray, then run the xgboost_ray setup script
setup_commands:
    - pip install -U awscli fsspec petastorm s3fs botocore
    - pip install -U {{env["RAY_WHEEL"] | default("ray")}}
    - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh

file_mounts_sync_continuously: false


================================================
FILE: xgboost_ray/tests/release/cluster_gpu.yaml
================================================
# Cluster configuration for the xgboost_ray GPU release tests.
# Templated values are read from the environment when the file is rendered:
#   NUM_WORKERS          number of worker nodes (defaults to 0)
#   RAY_WHEEL            Ray package passed to pip (defaults to "ray")
#   XGBOOST_RAY_PACKAGE  xgboost_ray package handed to setup_xgboost.sh
#                        (defaults to "xgboost_ray")
cluster_name: xgboost_ray_release_tests_gpu_{{env["NUM_WORKERS"] | default(0)}}

max_workers: {{env["NUM_WORKERS"] | default(0)}}
upscaling_speed: 9999

idle_timeout_minutes: 15

docker:
    image: anyscale/ray:nightly-gpu
    container_name: ray_container
    pull_before_run: true
    run_options:
      - --privileged

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

# A single node type (4 CPUs, 1 GPU) is used for the head node and all
# workers; min_workers equals max_workers, so the worker count is fixed.
available_node_types:
    gpu_4_ondemand:
        node_config:
            InstanceType: p2.xlarge
        resources: {"CPU": 4, "GPU": 1}
        min_workers: {{env["NUM_WORKERS"] | default(0)}}
        max_workers: {{env["NUM_WORKERS"] | default(0)}}

auth:
    ssh_user: ubuntu

head_node_type: gpu_4_ondemand
worker_default_node_type: gpu_4_ondemand

# The local directory is mounted to ~/xgboost_tests, where the setup command
# below expects setup_xgboost.sh.
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

# Install pyarrow, cupy and Ray, then run the xgboost_ray setup script
setup_commands:
    - pip install -U pyarrow cupy-cuda101
    - pip install -U {{env["RAY_WHEEL"] | default("ray")}}
    - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh


================================================
FILE: xgboost_ray/tests/release/create_learnable_data.py
================================================
import argparse
import os

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_regression

if __name__ == "__main__":
    # Script entry point: generate a learnable synthetic dataset with
    # scikit-learn and write it as a parquet dataset partitioned by the
    # "partition" column.
    if "OMP_NUM_THREADS" in os.environ:
        del os.environ["OMP_NUM_THREADS"]

    parser = argparse.ArgumentParser(description="Create fake data.")
    # nargs="?" makes the positional optional so that the default can apply
    parser.add_argument(
        "filename", type=str, nargs="?", default="/data/parted.parquet/"
    )
    parser.add_argument(
        "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows"
    )
    parser.add_argument(
        "-p",
        "--num-partitions",
        required=False,
        type=int,
        default=100,
        help="num partitions",
    )
    parser.add_argument(
        "-c",
        "--num-cols",
        required=False,
        type=int,
        default=4,
        help="num columns (features)",
    )
    parser.add_argument(
        "-C", "--num-classes", required=False, type=int, default=2, help="num classes"
    )
    parser.add_argument(
        "-s", "--seed", required=False, type=int, default=1234, help="random seed"
    )
    parser.add_argument(
        "-T",
        "--target",
        required=False,
        type=float,
        default=0.8,
        help="target accuracy",
    )

    args = parser.parse_args()

    seed = int(args.seed)
    np.random.seed(seed)

    # The num-rows default is the float 1e8, hence the explicit conversions
    num_rows = int(args.num_rows)
    num_cols = int(args.num_cols)
    num_classes = int(args.num_classes)
    target = float(args.target)

    if num_classes > 0:
        # Classification data; 1 - target is passed as flip_y
        x, y = make_classification(
            n_samples=num_rows,
            n_features=num_cols,
            n_informative=num_cols // 2,
            n_redundant=num_cols // 10,
            n_repeated=0,
            n_classes=num_classes,
            n_clusters_per_class=2,
            flip_y=1 - target,
            random_state=seed,
        )
    else:
        # Zero (or fewer) classes selects regression data
        x, y = make_regression(
            n_samples=num_rows,
            n_features=num_cols,
            n_informative=num_cols // 2,
            n_targets=1,
            noise=0.1,
            random_state=seed,
        )

    filename = args.filename
    num_partitions = args.num_partitions

    data = pd.DataFrame(x, columns=[f"feature_{i}" for i in range(num_cols)])

    # np.repeat needs an integer repeat count, so use integer floor division
    rows_per_partition = len(data) // num_partitions

    partition_arr = np.repeat(np.arange(num_partitions), repeats=rows_per_partition)
    if len(partition_arr) < len(data):
        # If this was not evenly divided, append
        missing = len(data) - len(partition_arr)
        partition_arr = np.append(partition_arr, np.arange(missing))

    partition = pd.Series(partition_arr, copy=False, dtype=np.int32)

    data["labels"] = y
    data["partition"] = partition

    os.makedirs(filename, 0o755, exist_ok=True)

    # Write partition-wise to avoid OOM errors
    # NOTE(review): partition_filename_cb is forwarded to the pyarrow engine;
    # confirm the installed pyarrow version still accepts it.
    for i in range(num_partitions):
        part = data[partition_arr == i]
        part.to_parquet(
            filename,
            partition_cols=["partition"],
            engine="pyarrow",
            partition_filename_cb=lambda key: f"part_{key[0]}.parquet",
        )


================================================
FILE: xgboost_ray/tests/release/create_test_data.py
================================================
import argparse
import os

import numpy as np

from xgboost_ray.tests.utils import create_parquet

if __name__ == "__main__":
    # Script entry point: create a partitioned parquet test dataset with
    # create_parquet from the given command line options.
    if "OMP_NUM_THREADS" in os.environ:
        del os.environ["OMP_NUM_THREADS"]

    parser = argparse.ArgumentParser(description="Create fake data.")
    # nargs="?" makes the positional optional so that the default can apply
    parser.add_argument(
        "filename",
        type=str,
        nargs="?",
        default="/data/parted.parquet/",
        help="output path of the parquet dataset",
    )
    parser.add_argument(
        "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows"
    )
    parser.add_argument(
        "-p",
        "--num-partitions",
        required=False,
        type=int,
        default=100,
        help="num partitions",
    )
    parser.add_argument(
        "-c",
        "--num-cols",
        required=False,
        type=int,
        default=4,
        help="num columns (features)",
    )
    parser.add_argument(
        "-C", "--num-classes", required=False, type=int, default=2, help="num classes"
    )
    parser.add_argument(
        "-s", "--seed", required=False, type=int, default=1234, help="random seed"
    )

    args = parser.parse_args()

    np.random.seed(args.seed)
    # The num-rows default is the float 1e8, hence the explicit int() calls
    create_parquet(
        args.filename,
        num_rows=int(args.num_rows),
        num_partitions=int(args.num_partitions),
        num_features=int(args.num_cols),
        num_classes=int(args.num_classes),
    )


================================================
FILE: xgboost_ray/tests/release/custom_objective_metric.py
================================================
import ray

from xgboost_ray.tests.test_xgboost_api import XGBoostAPITest


class XGBoostDistributedAPITest(XGBoostAPITest):
    """Variant of ``XGBoostAPITest`` that initializes Ray with ``address="auto"``."""

    def _init_ray(self):
        """Call ``ray.init(address="auto")`` unless Ray is already initialized."""
        if ray.is_initialized():
            return
        ray.init(address="auto")


if __name__ == "__main__":
    import sys

    import pytest

    # Run only the distributed test class of this file and use pytest's
    # return value as the process exit status
    exit_code = pytest.main(["-v", f"{__file__}::XGBoostDistributedAPITest"])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/release/run_e2e_gpu.sh
================================================
#!/bin/bash
# End-to-end GPU run: starts the GPU cluster, submits the CPU/GPU benchmark
# in GPU mode and takes the session down again.
# Must be run from a directory that contains an initialized Anyscale project.

if [ ! -f "./.anyscale.yaml" ]; then
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
fi

# Use the current epoch seconds to get a fresh session name per invocation
NOW=$(date +%s)
export SESSION_NAME="xgboost_ray_ci_gpu_${NOW}"
export NUM_WORKERS=3
# Install xgboost_ray from the commit in GITHUB_SHA, falling back to master
export XGBOOST_RAY_PACKAGE="git+https://github.com/ray-project/xgboost_ray.git@${GITHUB_SHA:-master}#egg=xgboost_ray"
# Makes submit_cpu_gpu_benchmark.sh run the command without --tmux
export NO_TMUX=1

./start_gpu_cluster.sh
# Positional arguments: num_workers=4, num_rounds=100, num_files=100
./submit_cpu_gpu_benchmark.sh 4 100 100 --gpu --file /data/classification.parquet
# NOTE(review): there is no `set -e`, so this runs even if a previous step
# failed and the script exits with the status of `anyscale down`. Confirm
# that this is intended.
anyscale down "${SESSION_NAME}"


================================================
FILE: xgboost_ray/tests/release/setup_xgboost.sh
================================================
#!/bin/bash
# Node setup for the release tests: installs pytest and xgboost_ray and
# creates the classification test dataset in /data.
# XGBOOST_RAY_PACKAGE selects the package to install (default: xgboost_ray).

pip install pytest
# Uninstall any existing xgboost_ray repositories
pip uninstall -y xgboost_ray || true

# Install xgboost package
pip install -U "${XGBOOST_RAY_PACKAGE:-xgboost_ray}"

# Create test dataset
# Failures of the preparation steps are tolerated (|| true) so that the
# script continues.
sudo mkdir -p /data || true
sudo chown ray:1000 /data || true
rm -rf /data/classification.parquet || true
# Copy the mounted test scripts to the home directory; a failed copy is only
# reported and does not stop the script
cp -R /tmp/ray_tmp_mount/xgboost_tests ~/xgboost_tests || echo "Copy failed"
python ~/xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2


================================================
FILE: xgboost_ray/tests/release/start_cpu_cluster.sh
================================================
#!/bin/bash
# Starts the CPU release test cluster defined in cluster_cpu.yaml.
# Must be run from a directory that contains an initialized Anyscale project.
# Environment:
#   XGBOOST_RAY_PACKAGE  package to install (default: xgboost_ray)
#   NUM_WORKERS          number of worker nodes (default: 3)
#   SESSION_NAME         session name (default: xgboost_ray_release_cpu_<epoch>)

if [ ! -f "./.anyscale.yaml" ]; then
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
fi

# Exported so that the templated cluster config can read them
export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}"
export NUM_WORKERS="${NUM_WORKERS:-3}"

SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_cpu_$(date +%s)}

echo "Starting CPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)"
echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}"

CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_cpu.yaml ${SESSION_NAME}"

echo "Running: ${CMD}"
# Intentionally unquoted: the command string is split into words here
${CMD}


================================================
FILE: xgboost_ray/tests/release/start_ft_cluster.sh
================================================
#!/bin/bash
# Starts the fault tolerance release test cluster defined in cluster_ft.yaml.
# Must be run from a directory that contains an initialized Anyscale project.
# Environment:
#   XGBOOST_RAY_PACKAGE  package to install (default: xgboost_ray)
#   SESSION_NAME         session name (default: xgboost_ray_release_ft_<epoch>)

if [ ! -f "./.anyscale.yaml" ]; then
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
fi

# Exported so that the templated cluster config can read it
export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}"

SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_ft_$(date +%s)}

echo "Starting FT cluster"
echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}"

CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_ft.yaml ${SESSION_NAME}"

echo "Running: ${CMD}"
# Intentionally unquoted: the command string is split into words here
${CMD}


================================================
FILE: xgboost_ray/tests/release/start_gpu_cluster.sh
================================================
#!/bin/bash
# Starts the GPU release test cluster defined in cluster_gpu.yaml.
# Must be run from a directory that contains an initialized Anyscale project.
# Environment:
#   XGBOOST_RAY_PACKAGE  package to install (default: xgboost_ray)
#   NUM_WORKERS          number of worker nodes (default: 3)
#   SESSION_NAME         session name (default: xgboost_ray_release_gpu_<epoch>)

if [ ! -f "./.anyscale.yaml" ]; then
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
fi

# Exported so that the templated cluster config can read them
export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}"
export NUM_WORKERS="${NUM_WORKERS:-3}"

SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_gpu_$(date +%s)}

echo "Starting GPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)"
echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}"

CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_gpu.yaml ${SESSION_NAME}"

echo "Running: ${CMD}"
# Intentionally unquoted: the command string is split into words here
${CMD}


================================================
FILE: xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh
================================================
#!/bin/bash
# Run benchmark_cpu_gpu.py on an Anyscale session via `anyscale exec`.
#
# Every argument given to this script is appended to the benchmark command.
# Set SESSION_NAME to add `--session-name <name>`; set NO_TMUX=1 to drop the
# `--tmux` flag.

# Bail out early when the working directory has no .anyscale.yaml.
[ -f "./.anyscale.yaml" ] || {
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
}

ANYSCALE_CMD="python ~/xgboost_tests/benchmark_cpu_gpu.py $*"

# Expands to the empty string when SESSION_NAME is unset or empty.
SESSION_STR="${SESSION_NAME:+--session-name ${SESSION_NAME}}"

# --tmux is passed unless NO_TMUX is exactly "1".
case "${NO_TMUX}" in
  1) TMUX="" ;;
  *) TMUX="--tmux" ;;
esac

CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}"

echo "Running: ${CMD}"
# Left unquoted so the string is split into the command and its arguments.
${CMD}


================================================
FILE: xgboost_ray/tests/release/submit_ft_benchmark.sh
================================================
#!/bin/bash
# Run benchmark_ft.py on an Anyscale session via `anyscale exec`.
#
# Every argument given to this script is appended to the benchmark command.
# Set SESSION_NAME to add `--session-name <name>`; set NO_TMUX=1 to drop the
# `--tmux` flag.

# Bail out early when the working directory has no .anyscale.yaml.
[ -f "./.anyscale.yaml" ] || {
  echo "Anyscale project not initialized. Please run 'anyscale init'"
  exit 1
}

ANYSCALE_CMD="python ~/xgboost_tests/benchmark_ft.py $*"

# Expands to the empty string when SESSION_NAME is unset or empty.
SESSION_STR="${SESSION_NAME:+--session-name ${SESSION_NAME}}"

# --tmux is passed unless NO_TMUX is exactly "1".
case "${NO_TMUX}" in
  1) TMUX="" ;;
  *) TMUX="--tmux" ;;
esac

CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}"

echo "Running: ${CMD}"
# Left unquoted so the string is split into the command and its arguments.
${CMD}


================================================
FILE: xgboost_ray/tests/release/tune_cluster.yaml
================================================
# Ray cluster configuration for the Tune placement release test.
# tune_placement.py documents launching it with `ray up -y tune_cluster.yaml`.
cluster_name: xgboost_ray_release_tests_tune
# Fixed-size cluster: min, max and initial worker counts are all 4.
min_workers: 4
max_workers: 4
initial_workers: 4
autoscaling_mode: default
docker:
    image: "rayproject/ray:latest"
    container_name: ray_container
    pull_before_run: false
    run_options:
        - --privileged
target_utilization_fraction: 0.8
idle_timeout_minutes: 5
provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: true
auth:
    ssh_user: ubuntu
# Head and workers use the same instance type and AMI; only workers request
# spot instances.
head_node:
    InstanceType: m5.xlarge
    ImageId: ami-05ac7a76b4c679a79
worker_nodes:
    InstanceType: m5.xlarge
    ImageId: ami-05ac7a76b4c679a79
    InstanceMarketOptions:
        MarketType: spot

# The local directory is mounted at /release_tests on the cluster nodes.
file_mounts: {
  "/release_tests": "./"
}
cluster_synced_files: []
file_mounts_sync_continuously: true
initialization_commands: []
# NOTE(review): xgboost-ray is installed twice below - first from the
# ray-project repository, then from the `colocation` branch of a personal
# fork. Presumably the second install replaces the first; confirm whether the
# fork install is still intended.
# The last two commands regenerate the small test dataset at
# /data/tune_test.parquet.
setup_commands:
    - pip install -U ray
    - pip install -U git+https://github.com/ray-project/xgboost_ray#egg=xgboost-ray
    - pip install -U git+https://github.com/amogkam/xgboost_ray.git@colocation#egg=xgboost-ray
    - mkdir -p /data
    - rm -rf /data/tune_test.parquet || true
    - python /release_tests/create_test_data.py /data/tune_test.parquet --seed 1234 --num-rows 2000 --num-cols 4 --num-partitions 40 --num-classes 2
head_setup_commands: []
worker_setup_commands: []
# The head node advertises 0 units of the custom "actor_cpus" resource, each
# worker node advertises 4.
head_start_ray_commands:
    - ray stop
    - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"actor_cpus\": 0}'"
worker_start_ray_commands:
    - ray stop
    - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"actor_cpus\": 4}'"
metadata:
    anyscale:
        working_dir: "/release_tests"


================================================
FILE: xgboost_ray/tests/release/tune_placement.py
================================================
"""
NOTE: This example is currently broken (very outdated) and not run in CI.

Test Ray Tune trial placement across cluster nodes.

Example: Run this script on a cluster with 4 worker nodes with 4 CPUs each.

    ray up -y tune_cluster.yaml

    ray attach tune_cluster.yaml

    python /release_tests/tune_placement.py 4 4 10 10 --fake-data

This starts 4 trials à 4 actors training 10 boost rounds on 10 data
partitions per actor. This will use fake data created before training.

This test will then confirm that actors of the same trial are PACKed
on the same nodes. In practice we check that each node IP address only
hosts actors of the same Ray Tune trial.
"""

import argparse
import json
import os
import shutil
import tempfile
import time
from collections import defaultdict

import ray
import ray.train
from benchmark_cpu_gpu import train_ray
from ray import tune
from ray.tune.integration.docker import DockerSyncer
from ray.tune.session import get_trial_id
from ray.util import get_node_ip_address

from xgboost_ray import RayParams
from xgboost_ray.compat import TrainingCallback
from xgboost_ray.session import put_queue
from xgboost_ray.tests.utils import create_parquet

# Remove OMP_NUM_THREADS from this process' environment if it is set.
os.environ.pop("OMP_NUM_THREADS", None)


class PlacementCallback(TrainingCallback):
    """Training callback that reports the node IP of the worker it runs on.

    Besides reporting the IP before training, it pauses after the iterations
    with epoch 1 and epoch 2.
    """

    def before_training(self, model):
        """Put this node's IP address on the xgboost_ray queue.

        Returns ``model`` unchanged.
        """
        put_queue(get_node_ip_address())
        return model

    def after_iteration(self, model, epoch, evals_log):
        """Sleep 2 seconds after epoch 1 and 8 seconds after epoch 2.

        NOTE(review): presumably the pauses keep the trials running long
        enough to overlap - confirm.
        """
        pause_seconds = {1: 2, 2: 8}.get(epoch)
        if pause_seconds is not None:
            time.sleep(pause_seconds)


def tune_test(
    path,
    num_trials,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    fake_data=False,
    smoke_test=False,
):
    """Run Ray Tune trials of ``train_ray`` and check where their actors ran.

    Every trial trains with a ``PlacementCallback`` and stores the node IPs
    returned through ``additional_results["callback_returns"]`` in a
    checkpoint file named ``callback_returns.json``. After all trials have
    finished, the IPs are grouped per node and the function verifies that
    each node IP was reported by a single trial only.

    Args:
        path: Data file to train on. Only used when neither ``fake_data``
            nor ``smoke_test`` is set; it must exist in that case.
        num_trials: Number of Tune samples to run.
        num_workers: Number of xgboost_ray actors per trial.
        num_boost_rounds: Forwarded to ``train_ray``.
        num_files: Forwarded to ``train_ray``.
        regression: Forwarded to ``train_ray``.
        use_gpu: Request one GPU per actor and use the ``gpu_hist`` tree
            method instead of ``hist``.
        fake_data: Generate a parquet dataset under
            ``/tmp/release_test_data`` instead of reading ``path``.
        smoke_test: Also generates the dataset; forwarded to ``train_ray``.

    Raises:
        ValueError: Inside a trial if ``path`` is needed but missing, and
            after tuning if a node hosted actors of different trials.
    """
    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=0,
        num_actors=num_workers,
        cpus_per_actor=1,
        gpus_per_actor=0 if not use_gpu else 1,
    )

    def local_train(config):
        """Tune trainable: train once and report the actors' node IPs."""
        temp_dir = None
        if fake_data or smoke_test:
            temp_dir = "/tmp/release_test_data"
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

            os.makedirs(temp_dir, 0o755)
            local_path = os.path.join(temp_dir, "smoketest.parquet")

            # Use the enclosing ``num_workers`` argument rather than the
            # module-level ``args``, which only exists when this file is run
            # as a script.
            create_parquet(
                filename=local_path,
                num_rows=num_workers * 500,
                num_features=4,
                num_classes=2,
                num_partitions=num_workers * 10,
            )
        else:
            if not os.path.exists(path):
                raise ValueError(
                    f"Benchmarking data not found: {path}."
                    f"\nFIX THIS by running `python create_test_data.py` "
                    f"on all nodes first."
                )
            local_path = path

        xgboost_params = {
            "tree_method": "hist" if not use_gpu else "gpu_hist",
        }

        xgboost_params.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )

        # Sampled hyperparameters take precedence over the defaults above.
        xgboost_params.update(config)

        additional_results = {}

        bst, time_taken = train_ray(
            path=local_path,
            num_workers=num_workers,
            num_boost_rounds=num_boost_rounds,
            num_files=num_files,
            regression=regression,
            use_gpu=use_gpu,
            smoke_test=smoke_test,
            ray_params=ray_params,
            xgboost_params=xgboost_params,
            # kwargs
            additional_results=additional_results,
            callbacks=[PlacementCallback()],
        )

        bst.save_model("tuned.xgb")

        # Flatten the per-actor lists of reported IPs into one list.
        trial_ips = [
            ip for ips in additional_results["callback_returns"] for ip in ips
        ]

        tune_trial = get_trial_id()
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            with open(
                os.path.join(temp_checkpoint_dir, "callback_returns.json"), "wt"
            ) as f:
                json.dump({tune_trial: trial_ips}, f)
            ray.train.report(
                {}, checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir)
            )

        if temp_dir:
            shutil.rmtree(temp_dir)

    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }

    analysis = tune.run(
        local_train,
        config=search_space,
        num_samples=num_trials,
        sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer),
        resources_per_trial=ray_params.get_tune_resources(),
    )

    # In our PACK scheduling, we expect that each IP hosts only workers
    # for one Ray Tune trial.
    ip_to_trials = defaultdict(list)
    for trial in analysis.trials:
        with open(
            os.path.join(trial.checkpoint.value, "callback_returns.json"), "rt"
        ) as f:
            trial_to_ips = json.load(f)
        for tune_trial, ips in trial_to_ips.items():
            for node_ip in ips:
                ip_to_trials[node_ip].append(tune_trial)

    # Print the trial IDs of every node before deciding on failure so the
    # full placement is visible in the output.
    fail = False
    for ip, trial_ids in ip_to_trials.items():
        print(f"For IP {ip} got trial IDs {trial_ids}")
        fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids)

    if fail:
        raise ValueError("Different trial IDs found on same node.")
    else:
        print("Success.")


if __name__ == "__main__":
    # Command line entry point: parse the arguments, connect to Ray and run
    # the placement test once.
    parser = argparse.ArgumentParser(description="Test Ray Tune placement strategy")

    parser.add_argument("num_trials", type=int, help="num trials")
    parser.add_argument("num_workers", type=int, help="num workers (per trial)")
    parser.add_argument("num_rounds", type=int, help="num boost rounds")
    parser.add_argument("num_files", type=int, help="num files (per trial)")

    parser.add_argument(
        "--file", default="/data/parted.parquet", type=str, help="data file"
    )

    # ``store_true`` flags default to False.
    parser.add_argument("--regression", action="store_true", help="regression")
    parser.add_argument("--gpu", action="store_true", help="gpu")
    parser.add_argument("--fake-data", action="store_true", help="fake data")
    parser.add_argument("--smoke-test", action="store_true", help="smoke test")

    # ``args`` stays a module-level name for backward compatibility.
    args = parser.parse_args()

    num_trials = args.num_trials
    num_workers = args.num_workers
    num_boost_rounds = args.num_rounds
    num_files = args.num_files

    if args.smoke_test:
        # Smoke tests never use GPUs and start a local Ray instance with one
        # CPU per worker.
        use_gpu = False
        ray.init(num_cpus=num_workers)
    else:
        # Otherwise connect to the already running cluster.
        use_gpu = args.gpu
        ray.init(address="auto")

    full_start = time.time()
    tune_test(
        path=args.file,
        num_trials=num_trials,
        num_workers=num_workers,
        num_boost_rounds=num_boost_rounds,
        num_files=num_files,
        regression=args.regression,
        use_gpu=use_gpu,
        fake_data=args.fake_data,
        smoke_test=args.smoke_test,
    )
    full_taken = time.time() - full_start
    print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds ")


================================================
FILE: xgboost_ray/tests/test_client.py
================================================
import os

import pytest
import ray
from ray.util.client.ray_client_helpers import ray_start_client_server

from xgboost_ray.data_sources.ray_dataset import RAY_DATASET_AVAILABLE


@pytest.fixture
def start_client_server_4_cpus():
    """Start Ray with 4 CPUs and yield the client from ``ray_start_client_server``."""
    ray.init(num_cpus=4)
    with ray_start_client_server() as ray_client:
        yield ray_client


@pytest.fixture
def start_client_server_5_cpus():
    """Start Ray with 5 CPUs and yield the client from ``ray_start_client_server``."""
    ray.init(num_cpus=5)
    with ray_start_client_server() as ray_client:
        yield ray_client


@pytest.fixture
def start_client_server_5_cpus_modin(monkeypatch):
    """Start Ray with 5 CPUs for Modin and yield the Ray client.

    ``__MODIN_AUTOIMPORT_PANDAS__`` is set to ``"1"`` both in this process
    (through ``monkeypatch``) and in the Ray runtime environment.
    """
    modin_env = {"__MODIN_AUTOIMPORT_PANDAS__": "1"}
    monkeypatch.setenv("__MODIN_AUTOIMPORT_PANDAS__", "1")
    ray.init(num_cpus=5, runtime_env={"env_vars": modin_env})
    with ray_start_client_server() as ray_client:
        yield ray_client


def test_simple_train(start_client_server_4_cpus):
    """Run the ``simple`` example with 4 single-CPU actors over Ray client."""
    assert ray.util.client.ray.is_connected()
    from xgboost_ray.examples.simple import main as run_example

    run_example(num_actors=4, cpus_per_actor=1)


@pytest.mark.skipif(os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests")
def test_simple_tune(start_client_server_4_cpus):
    """Run the ``simple_tune`` example over Ray client.

    Only runs when the ``TUNE`` environment variable is set to ``"1"``.
    """
    assert ray.util.client.ray.is_connected()
    from xgboost_ray.examples.simple_tune import main

    main(cpus_per_actor=1, num_actors=1, num_samples=4)


def test_simple_dask(start_client_server_5_cpus):
    """Run the ``simple_dask`` example with 4 single-CPU actors over Ray client."""
    assert ray.util.client.ray.is_connected()
    from xgboost_ray.examples.simple_dask import main as run_example

    run_example(cpus_per_actor=1, num_actors=4)


def test_simple_modin(start_client_server_5_cpus_modin):
    """Run the ``simple_modin`` example with 4 single-CPU actors over Ray client."""
    assert ray.util.client.ray.is_connected()
    from xgboost_ray.examples.simple_modin import main as run_example

    run_example(cpus_per_actor=1, num_actors=4)


def test_client_actor_cpus(start_client_server_5_cpus):
    """Check ``_ray_get_actor_cpus`` inside actors created through Ray client.

    A 2-CPU actor must report 2 CPUs, both when scheduled normally and when
    scheduled into a placement group with a single ``{"CPU": 2}`` bundle.
    """
    assert ray.util.client.ray.is_connected()
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

    @ray.remote
    class DummyTrainActor:
        def test(self):
            import xgboost_ray

            return xgboost_ray.main._ray_get_actor_cpus()

    # Case 1: no placement group.
    plain_actor = DummyTrainActor.options(num_cpus=2).remote()
    assert ray.get(plain_actor.test.remote()) == 2

    # Case 2: the actor is scheduled into a ready placement group.
    placement_group = ray.util.placement_group([{"CPU": 2}])
    ray.get(placement_group.ready())
    strategy = PlacementGroupSchedulingStrategy(placement_group=placement_group)
    grouped_actor = DummyTrainActor.options(
        num_cpus=2, scheduling_strategy=strategy
    ).remote()
    assert ray.get(grouped_actor.test.remote()) == 2


@pytest.mark.skipif(
    not RAY_DATASET_AVAILABLE,
    reason="Ray datasets are not available in this version of Ray",
)
def test_simple_ray_dataset(start_client_server_5_cpus):
    """Run the ``simple_ray_dataset`` example with 4 single-CPU actors."""
    assert ray.util.client.ray.is_connected()
    from xgboost_ray.examples.simple_ray_dataset import main as run_example

    run_example(cpus_per_actor=1, num_actors=4)


if __name__ == "__main__":
    # Allow running this test module directly: hand the file to pytest in
    # verbose mode and exit with pytest's return code.
    import sys

    import pytest  # noqa: F811

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_colocation.py
================================================
import os
import shutil
import tempfile
import unittest
from unittest.mock import patch

import numpy as np
import pytest
import ray
from ray.util.queue import _QueueActor

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.main import _train
from xgboost_ray.util import _EventActor


class _MockQueueActor(_QueueActor):
    """Queue actor that can additionally report the node it runs on."""

    def get_node_id(self):
        """Return the node ID from this actor's Ray runtime context."""
        runtime_context = ray.get_runtime_context()
        return runtime_context.get_node_id()


class _MockEventActor(_EventActor):
    """Event actor that can additionally report the node it runs on."""

    def get_node_id(self):
        """Return the node ID from this actor's Ray runtime context."""
        runtime_context = ray.get_runtime_context()
        return runtime_context.get_node_id()


@pytest.mark.usefixtures("ray_start_cluster")
class TestColocation(unittest.TestCase):
    """Placement tests for xgboost_ray's communication and training actors.

    ``self.ray_start_cluster`` is not defined in this class; presumably the
    ``ray_start_cluster`` fixture attaches it (verify against conftest.py).
    """

    def setUp(self) -> None:
        """Create a small binary classification dataset and XGBoost params."""
        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 0
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 1
            ]
            * repeat
        )
        self.y = np.array([0, 1, 0, 1] * repeat)

        self.params = {
            "booster": "gbtree",
            "tree_method": "hist",
            "nthread": 1,
            "max_depth": 2,
            "objective": "binary:logistic",
            "seed": 1000,
        }

        self.kwargs = {}

        self.tmpdir = str(tempfile.mkdtemp())

        # Remove a stale lock file left behind by an earlier run.
        self.die_lock_file = "/tmp/died_worker.lock"
        if os.path.exists(self.die_lock_file):
            os.remove(self.die_lock_file)

    def tearDown(self) -> None:
        """Delete the temporary directory and shut Ray down."""
        if os.path.exists(self.tmpdir):
            shutil.rmtree(self.tmpdir)
        ray.shutdown()

    @patch("ray.util.queue._QueueActor", _MockQueueActor)
    @patch("xgboost_ray.util._EventActor", _MockEventActor)
    def test_communication_colocation(self):
        """Checks that Queue and Event actors are colocated with the driver."""
        os.environ["RXGB_COMMUNICATION_SOFT_PLACEMENT"] = "0"

        # The variable is removed in ``finally`` so that a failing assertion
        # cannot leak the setting into later tests.
        try:
            with self.ray_start_cluster() as cluster:
                cluster.add_node(num_cpus=3)
                cluster.add_node(num_cpus=3)
                cluster.wait_for_nodes()
                ray.init(address=cluster.address)

                local_node = ray.get_runtime_context().get_node_id()

                # Note that these will have the same IP in the test cluster
                assert len(ray.nodes()) == 2
                assert local_node in [node["NodeID"] for node in ray.nodes()]

                def _mock_train(*args, _training_state, **kwargs):
                    assert (
                        ray.get(_training_state.queue.actor.get_node_id.remote())
                        == local_node
                    )
                    assert (
                        ray.get(_training_state.stop_event.actor.get_node_id.remote())
                        == local_node
                    )
                    return _train(*args, _training_state=_training_state, **kwargs)

                with patch("xgboost_ray.main._train") as mocked:
                    mocked.side_effect = _mock_train
                    train(
                        self.params,
                        RayDMatrix(self.x, self.y),
                        num_boost_round=2,
                        ray_params=RayParams(max_actor_restarts=1, num_actors=6),
                    )
        finally:
            os.environ.pop("RXGB_COMMUNICATION_SOFT_PLACEMENT", None)

    def test_no_tune_spread(self):
        """Tests whether workers are spread when not using Tune."""
        with self.ray_start_cluster() as cluster:
            cluster.add_node(num_cpus=2)
            cluster.add_node(num_cpus=2)
            cluster.wait_for_nodes()
            ray.init(address=cluster.address)

            ray_params = RayParams(max_actor_restarts=1, num_actors=2, cpus_per_actor=2)

            def _mock_train(*args, _training_state, **kwargs):
                try:
                    return _train(*args, _training_state=_training_state, **kwargs)
                finally:
                    # Checked whether or not training succeeded: the two
                    # actors must live on different nodes.
                    assert len(_training_state.actors) == 2
                    if not any(a is None for a in _training_state.actors):
                        actor_infos = ray.state.actors()
                        actor_nodes = []
                        for a in _training_state.actors:
                            actor_info = actor_infos.get(a._actor_id.hex())
                            actor_node = actor_info["Address"]["NodeID"]
                            actor_nodes.append(actor_node)
                        assert actor_nodes[0] != actor_nodes[1]

            with patch("xgboost_ray.main._train", _mock_train):
                train(
                    self.params,
                    RayDMatrix(self.x, self.y),
                    num_boost_round=4,
                    ray_params=ray_params,
                )

    def test_tune_pack(self):
        """Tests whether workers are packed when using Tune."""
        try:
            from ray import tune
        except ImportError:
            # skipTest raises, so nothing below runs without Tune.
            self.skipTest("Tune is not installed.")
        with self.ray_start_cluster() as cluster:
            num_actors = 2
            cluster.add_node(num_cpus=3)
            cluster.add_node(num_cpus=3)
            ray.init(address=cluster.address)

            ray_params = RayParams(
                max_actor_restarts=1, num_actors=num_actors, cpus_per_actor=1
            )

            def _mock_train(*args, _training_state, **kwargs):
                try:
                    return _train(*args, _training_state=_training_state, **kwargs)
                finally:
                    # Checked whether or not training succeeded: both actors
                    # must live on the same node.
                    assert len(_training_state.actors) == num_actors
                    if not any(a is None for a in _training_state.actors):
                        actor_infos = ray.state.actors()
                        actor_nodes = []
                        for a in _training_state.actors:
                            actor_info = actor_infos.get(a._actor_id.hex())
                            actor_node = actor_info["Address"]["NodeID"]
                            actor_nodes.append(actor_node)
                        assert actor_nodes[0] == actor_nodes[1]

            def train_func(params, x, y, ray_params):
                def inner_func(config):
                    with patch("xgboost_ray.main._train", _mock_train):
                        train(
                            params,
                            RayDMatrix(x, y),
                            num_boost_round=4,
                            ray_params=ray_params,
                        )

                return inner_func

            tune.run(
                train_func(self.params, self.x, self.y, ray_params),
                resources_per_trial=ray_params.get_tune_resources(),
                num_samples=1,
            )

    def test_timeout(self):
        """Checks that an error occurs when placement group setup times out."""
        os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = "5"

        # Remove the shortened timeout afterwards; it used to stay set for
        # every test that ran later in the same process.
        try:
            with self.ray_start_cluster() as cluster:
                ray.init(address=cluster.address)

                with self.assertRaises(TimeoutError):
                    train(
                        self.params,
                        RayDMatrix(self.x, self.y),
                        num_boost_round=2,
                        ray_params=RayParams(
                            max_actor_restarts=1,
                            num_actors=2,
                            resources_per_actor={"invalid": 1},
                        ),
                    )
        finally:
            os.environ.pop("RXGB_PLACEMENT_GROUP_TIMEOUT_S", None)


if __name__ == "__main__":
    # Allow running this test module directly: hand the file to pytest in
    # verbose mode and exit with pytest's return code.
    import sys

    import pytest  # noqa: F811

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_data_source.py
================================================
import unittest
from typing import List, Sequence
from unittest.mock import patch

import numpy as np
import pandas as pd
import ray
from ray import ObjectRef

from xgboost_ray.data_sources import Dask, Modin, Partitioned
from xgboost_ray.data_sources.dask import DASK_INSTALLED
from xgboost_ray.data_sources.modin import MODIN_INSTALLED
from xgboost_ray.main import _RemoteRayXGBoostActor


class _DistributedDataSourceTest:
    def setUp(self):
        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.repeat(range(8), 16).reshape((32, 4))
        self.y = np.array([0, 1, 2, 3] * repeat)

        self._init_ray()

    def tearDown(self) -> None:
        if ray.is_initialized():
            ray.shutdown()

    def _init_ray(self):
        if not ray.is_initialized():
            ray.init(num_cpus=1)

    def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts):
        raise NotImplementedError

    def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts):
        raise NotImplementedError

    def testAssignEvenTrivial(self):
        """Assign actors to co-located partitions, trivial case.

        In this test case, partitions are already evenly distributed across
        nodes.
        """
        part_nodes = [0, 0, 1, 1, 2, 2, 3, 3]
        actor_nodes = [0, 1, 2, 3]

        expected_actor_parts = {
            0: [0, 1],
            1: [2, 3],
            2: [4, 5],
            3: [6, 7],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)
        self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts)

    def testAssignEvenRedistributeOne(self):
        """Assign actors to co-located partitions, non-trivial case.

        Here two actors are located on node0, but only one can be filled
        with co-located partitions. The other one has to fetch data from
        node1.
        """
        part_nodes = [0, 0, 0, 1, 1, 1, 2, 2]
        actor_nodes = [0, 0, 1, 2]

        expected_actor_parts = {
            0: [0, 2],
            1: [1, 5],
            2: [3, 4],
            3: [6, 7],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)
        self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts)

    def testAssignEvenRedistributeMost(self):
        """Assign actors to co-located partitions, redistribute case.

        In this test case, partitions are all on one node.
        """
        part_nodes = [0, 0, 0, 0, 0, 0, 0, 0]
        actor_nodes = [0, 1, 2, 3]

        expected_actor_parts = {
            0: [0, 1],
            1: [2, 5],
            2: [3, 6],
            3: [4, 7],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)

        # This part of the test never works - Modin materializes partitions
        # onto different nodes while unwrapping.
        # self._testDataSourceAssignment(part_nodes, actor_nodes,
        #                           expected_actor_parts)

    def testAssignUnevenTrivial(self):
        """Assign actors to co-located partitions, trivial uneven case.

        In this test case, not all actors get the same amount of partitions.
        """
        part_nodes = [0, 0, 0, 1, 1, 2, 2, 2]
        actor_nodes = [0, 1, 2]

        expected_actor_parts = {
            0: [0, 1, 2],
            1: [3, 4],
            2: [5, 6, 7],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)
        self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts)

    def testAssignUnevenRedistribute(self):
        """Assign actors to co-located partitions, redistribute uneven case.

        In this test case, not all actors get the same amount of partitions.
        Some actors have to fetch partitions from other nodes
        """
        part_nodes = [0, 0, 1, 1, 1, 1, 2, 3]
        actor_nodes = [0, 1, 2]

        expected_actor_parts = {
            0: [0, 1, 5],
            1: [2, 3, 4],
            2: [6, 7],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)
        self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts)

    def testAssignUnevenRedistributeColocated(self):
        """Assign actors to co-located partitions, redistribute uneven case.

        Here we have an uneven split of partitions. One actor does not get
        a co-located shard assigned in favor for a non-coloated actor.
        """
        part_nodes = [0, 0, 0, 0, 0, 0, 0]
        actor_nodes = [0, 0, 1]

        expected_actor_parts = {
            0: [0, 2, 4],
            1: [1, 3],
            2: [5, 6],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)
        self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts)

    def testAssignUnevenRedistributeAll(self):
        """Assign actors to co-located partitions, redistribute uneven case.

        In this test case, not all actors get the same amount of partitions.
        Some actors have to fetch partitions from other nodes
        """
        part_nodes = [1, 1, 1, 1, 0, 0, 0]
        actor_nodes = [1, 1, 2]

        expected_actor_parts = {
            0: [0, 2, 4],
            1: [1, 3],
            2: [5, 6],
        }
        self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts)
        self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts)


@unittest.skipIf(
    not MODIN_INSTALLED, reason="Modin is not installed in a supported version."
)
class ModinDataSourceTest(_DistributedDataSourceTest, unittest.TestCase):
    """Runs the shared partition-assignment scenarios against ``Modin``.

    The scenarios themselves (``testAssign*``) are inherited from
    ``_DistributedDataSourceTest``; this class provides the two checks they
    call.
    """

    def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts):
        """Check ``Modin.get_actor_shards`` with faked node locations.

        ``self.x`` is split into one partition per entry of ``part_nodes``
        and stored in the Ray object store. Node names are synthetic
        (``"node<n>"``), so no multi-node cluster is needed.
        """
        partitions = [ray.put(p) for p in np.array_split(self.x, len(part_nodes))]

        # Dict from partition (obj ref) to node host
        part_to_node = dict(zip(partitions, [f"node{n}" for n in part_nodes]))
        # List of (object ref of the node name, partition object ref) pairs.
        node_to_part = [(ray.put(n), p) for p, n in part_to_node.items()]

        # Dict from actor rank to node host
        actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes))

        actor_to_parts = self._getActorToParts(actors_to_node, node_to_part)

        # Each actor rank must have received exactly the expected partition
        # object refs, in the expected order.
        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                self.assertEqual(
                    actor_to_parts[actor_rank][i],
                    partitions[part_id],
                    msg=f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}.",
                )

    def _getActorToParts(self, actors_to_node, node_to_part):
        """Call ``Modin.get_actor_shards`` with its lookups replaced.

        ``unwrap_partitions`` is patched to hand back ``node_to_part``
        unchanged and ``get_actor_rank_ips`` is patched to return
        ``actors_to_node``. Returns the second element of the
        ``get_actor_shards`` result.
        """

        def unwrap(data, *args, **kwargs):
            return data

        def actor_ranks(actors):
            return actors_to_node

        with patch(
            "modin.distributed.dataframe.pandas.unwrap_partitions"
        ) as mock_unwrap, patch(
            "xgboost_ray.data_sources.modin.get_actor_rank_ips"
        ) as mock_ranks:
            mock_unwrap.side_effect = unwrap
            mock_ranks.side_effect = actor_ranks

            _, actor_to_parts = Modin.get_actor_shards(data=node_to_part, actors=[])

        return actor_to_parts

    def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts):
        """Check the assignment with a real Modin dataframe and real actors.

        Returns early (without failing) when fewer alive Ray nodes are
        available than the scenario references, and when Modin did not keep
        the partitions in the requested order or on the requested IPs.
        """
        node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]]
        # Node indices in the scenario are used as indices into node_ips.
        if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1:
            print("Not running on cluster, skipping rest of this test.")
            return

        actor_node_ips = [node_ips[nid] for nid in actor_nodes]
        part_node_ips = [node_ips[nid] for nid in part_nodes]

        # Initialize data frames on remote nodes
        # This way we can control which partition is on which node
        @ray.remote(num_cpus=0.1)
        def create_remote_df(arr):
            return ray.put(pd.DataFrame(arr))

        partitions = np.array_split(self.x, len(part_nodes))
        # Each task requests the "node:<ip>" resource of the node that
        # should hold its partition.
        node_dfs: Sequence[ObjectRef] = ray.get(
            [
                create_remote_df.options(resources={f"node:{pip}": 0.1}).remote(
                    partitions[pid]
                )
                for pid, pip in enumerate(part_node_ips)
            ]
        )
        node_ip_dfs = [
            (ray.put(part_node_ips[pid]), node_df)
            for pid, node_df in enumerate(node_dfs)
        ]

        # Create modin dataframe from distributed partitions
        from modin.distributed.dataframe.pandas import (
            from_partitions,
            unwrap_partitions,
        )

        modin_df = from_partitions(node_ip_dfs, axis=0)

        # Sanity check
        unwrapped = unwrap_partitions(modin_df, axis=0, get_ip=True)
        ip_objs, df_objs = zip(*unwrapped)

        try:
            # Compare the first value of every partition to detect reordering.
            self.assertSequenceEqual(
                [df[0][0] for df in partitions],
                [df[0][0] for df in ray.get(list(df_objs))],
                msg="Modin mixed up the partition order",
            )

            self.assertSequenceEqual(
                part_node_ips,
                ray.get(list(ip_objs)),
                msg="Modin moved partitions to different IPs",
            )
        except AssertionError as exc:
            # A failed sanity check is reported but does not fail the test.
            print(f"Modin part of the test failed: {exc}")
            print(
                "This is a stochastic test failure. Ignoring the rest " "of this test."
            )
            return

        # Create ray actors
        actors = [
            _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote(
                rank=rank, num_actors=len(actor_nodes)
            )
            for rank, nip in enumerate(actor_node_ips)
        ]

        # Calculate shards
        _, actor_to_parts = Modin.get_actor_shards(modin_df, actors)

        # Compare the assigned dataframes by content with the expected
        # partitions.
        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                assigned_df = ray.get(actor_to_parts[actor_rank][i])
                part_df = pd.DataFrame(partitions[part_id])

                self.assertTrue(
                    assigned_df.equals(part_df),
                    msg=f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}.",
                )


@unittest.skipIf(
    not DASK_INSTALLED, reason="Dask is not installed in a supported version."
)
class DaskDataSourceTest(_DistributedDataSourceTest, unittest.TestCase):
    """Checks how the Dask data source assigns partitions to actors."""

    def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts):
        """Verify the assignment computed from mocked node locations.

        Partitions are stood in for by their integer IDs and hosts by the
        synthetic names ``node<N>``, so no Dask objects are created here.

        Args:
            part_nodes: Node index hosting each partition, in partition order.
            actor_nodes: Node index hosting each actor, in rank order.
            expected_actor_parts: Mapping of actor rank to the partition IDs
                it is expected to receive, in order.
        """
        partitions = list(range(len(part_nodes)))

        # (host name, partition id) pairs, listed in partition order
        node_to_part = [(f"node{n}", pid) for pid, n in enumerate(part_nodes)]

        # Actor rank -> host name
        actors_to_node = {rank: f"node{n}" for rank, n in enumerate(actor_nodes)}

        actor_to_parts = self._getActorToParts(actors_to_node, node_to_part)

        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                failure = (
                    f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}."
                )
                self.assertEqual(
                    actor_to_parts[actor_rank][i], partitions[part_id], msg=failure
                )

    def _getActorToParts(self, actors_to_node, node_to_part):
        """Call ``Dask.get_actor_shards`` with both location lookups mocked.

        ``get_ip_to_parts`` is replaced by a grouping of the given
        ``(node, partition id)`` pairs and ``get_actor_rank_ips`` by a function
        returning ``actors_to_node`` unchanged.
        """

        def ip_to_parts(data, *args, **kwargs):
            from collections import defaultdict

            grouped = defaultdict(list)
            for node, pid in data:
                grouped[node].append(pid)
            return grouped

        def actor_ranks(actors):
            return actors_to_node

        parts_patch = patch(
            "xgboost_ray.data_sources.dask.get_ip_to_parts",
            side_effect=ip_to_parts,
        )
        ranks_patch = patch(
            "xgboost_ray.data_sources.dask.get_actor_rank_ips",
            side_effect=actor_ranks,
        )
        with parts_patch, ranks_patch:
            _, actor_to_parts = Dask.get_actor_shards(data=node_to_part, actors=[])

        return actor_to_parts

    def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts):
        """Verify the assignment for real Dask partitions placed on cluster nodes.

        Currently skipped unconditionally; the body after the skip is kept
        for when data-locality aware scheduling works again.
        """
        self.skipTest("Data-locality aware scheduling using Dask is currently broken.")

        import dask
        import dask.dataframe as dd
        from ray.util.dask import ray_dask_get

        dask.config.set(scheduler=ray_dask_get)

        node_ips = [
            node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]
        ]
        # The node indices are used to index ``node_ips``, so the largest
        # index decides how many alive nodes are needed.
        required_nodes = max(max(actor_nodes), max(part_nodes)) + 1
        if len(node_ips) < required_nodes:
            print("Not running on cluster, skipping rest of this test.")
            return

        actor_node_ips = [node_ips[nid] for nid in actor_nodes]
        part_node_ips = [node_ips[nid] for nid in part_nodes]

        # Build each data frame in a task pinned to the target node, which
        # lets the test decide where every partition lives.
        @ray.remote(num_cpus=0.1)
        def create_remote_df(arr):
            return dd.from_array(arr)

        partitions = np.array_split(self.x, len(part_nodes))
        pending = [
            create_remote_df.options(resources={f"node:{pip}": 0.1}).remote(
                partitions[pid]
            )
            for pid, pip in enumerate(part_node_ips)
        ]
        node_dfs = ray.get(pending)
        node_dfs_concat = dd.concat(node_dfs).persist()

        # Ask every partition which node it is being processed on
        partition_locations_df = node_dfs_concat.map_partitions(
            lambda df: pd.DataFrame([ray.util.get_node_ip_address()])
        ).compute()
        location_column = partition_locations_df[0]
        partition_locations = [
            location_column.iloc[idx] for idx in range(partition_locations_df.size)
        ]

        # Dask data frame assembled from the distributed partitions
        dask_df = dd.concat(node_dfs, axis=0)

        # Sanity check: bail out quietly if Dask reordered or moved partitions
        try:
            self.assertSequenceEqual(
                [df[0][0] for df in partitions],
                [df[0][0] for df in dask_df.partitions.compute()],
                msg="Dask mixed up the partition order",
            )

            self.assertSequenceEqual(
                part_node_ips,
                partition_locations,
                msg="Dask moved partitions to different IPs",
            )
        except AssertionError as exc:
            print(f"Dask part of the test failed: {exc}")
            print("This is a stochastic test failure. Ignoring the rest of this test.")
            return

        # One actor per requested node, pinned via the node resource
        actors = []
        for rank, nip in enumerate(actor_node_ips):
            pinned_actor_cls = _RemoteRayXGBoostActor.options(
                resources={f"node:{nip}": 0.1}
            )
            actors.append(
                pinned_actor_cls.remote(rank=rank, num_actors=len(actor_nodes))
            )

        # Calculate shards
        _, actor_to_parts = Dask.get_actor_shards(dask_df, actors)

        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                assigned_df = ray.get(actor_to_parts[actor_rank][i])
                part_df = pd.DataFrame(partitions[part_id])
                failure = (
                    f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}."
                )
                self.assertTrue(assigned_df.equals(part_df), msg=failure)


# Ray Datasets data source is not tested, as we do not make use of xgboost-ray
# partition-to-actor assign logic. Furthermore, xgboost-ray with Ray Datasets
# is tested in ray-project/ray.


class PartitionedSourceTest(_DistributedDataSourceTest, unittest.TestCase):
    """Checks how the ``__partitioned__`` data source assigns partitions."""

    def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts):
        """Verify the assignment computed from synthetic ``node<N>`` locations.

        Args:
            part_nodes: Node index hosting each partition, in partition order.
            actor_nodes: Node index hosting each actor, in rank order.
            expected_actor_parts: Mapping of actor rank to the partition IDs
                it is expected to receive, in order.
        """
        chunks = np.array_split(self.x, len(part_nodes))
        partitions = [ray.put(pd.DataFrame(chunk)) for chunk in chunks]

        # Partition (object ref) -> host name
        part_to_node = {ref: f"node{n}" for ref, n in zip(partitions, part_nodes)}

        # Actor rank -> host name
        actors_to_node = {rank: f"node{n}" for rank, n in enumerate(actor_nodes)}

        actor_to_parts = self._getActorToParts(
            actors_to_node, partitions, part_to_node, part_nodes
        )

        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                failure = (
                    f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}."
                )
                self.assertEqual(
                    actor_to_parts[actor_rank][i], partitions[part_id], msg=failure
                )

    def _mk_partitioned(self, part_to_node, nr, nc, shapes):
        """Build an object whose ``__partitioned__`` describes the partitions.

        Partitions are stacked along the first axis: each one starts at the
        running sum of the row counts of the partitions before it.

        Args:
            part_to_node: Mapping of partition object ref to its location.
            nr: Total number of rows.
            nc: Total number of columns.
            shapes: Mapping of partition object ref to its shape.
        """

        class Parted:
            """Class exposing __partitioned__"""

            def __init__(self, parted):
                self.__partitioned__ = parted

        layout = {}
        row_offset = 0
        for idx, (partref, node) in enumerate(part_to_node.items()):
            part_shape = shapes[partref]
            layout[(idx, 0)] = {
                "start": (row_offset, 0),
                "shape": part_shape,
                "data": partref,
                "location": [node],
            }
            row_offset = row_offset + part_shape[0]

        data = {
            "shape": (nr, nc),
            "partition_tiling": (len(part_to_node), 1),
            "get": lambda x: ray.get(x),
            "partitions": layout,
        }
        return Parted(data)

    def _getActorToParts(self, actors_to_node, partitions, part_to_node, part_nodes):
        """Call ``Partitioned.get_actor_shards`` with actor locations mocked.

        ``get_actor_rank_ips`` is replaced by a function that returns
        ``actors_to_node`` unchanged. ``part_nodes`` is accepted but not used.
        """

        def actor_ranks(actors):
            return actors_to_node

        nr, nc = self.x.shape
        shapes = {ref: ray.get(ref).shape for ref in partitions}

        with patch(
            "xgboost_ray.data_sources.partitioned.get_actor_rank_ips",
            side_effect=actor_ranks,
        ):
            data = self._mk_partitioned(part_to_node, nr, nc, shapes)
            _, actor_to_parts = Partitioned.get_actor_shards(data=data, actors=[])

        return actor_to_parts

    def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts):
        """Verify the assignment for partitions placed on real cluster nodes.

        Returns early (after printing a notice) when fewer alive nodes are
        available than the node indices require.
        """
        node_ips = [
            node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]
        ]
        required_nodes = max(max(actor_nodes), max(part_nodes)) + 1
        if len(node_ips) < required_nodes:
            print("Not running on cluster, skipping rest of this test.")
            return

        actor_node_ips = [node_ips[nid] for nid in actor_nodes]
        part_node_ips = [node_ips[nid] for nid in part_nodes]

        # Build each data frame in a task pinned to the target node, which
        # lets the test decide where every partition lives.
        @ray.remote(num_cpus=0.1)
        def create_remote_df(arr):
            return ray.put(pd.DataFrame(arr))

        partitions = np.array_split(self.x, len(part_nodes))
        node_dfs = {}
        shapes = {}
        for pid, pip in enumerate(part_node_ips):
            pinned_task = create_remote_df.options(resources={f"node:{pip}": 0.1})
            # The task returns an object ref, so ``ray.get`` yields that ref
            pref = ray.get(pinned_task.remote(partitions[pid]))
            node_dfs[pref] = pip
            shapes[pref] = partitions[pid].shape

        # Structure with __partitioned__ built from the distributed partitions
        nr, nc = self.x.shape
        parted = self._mk_partitioned(node_dfs, nr, nc, shapes)

        # One actor per requested node, pinned via the node resource
        actors = []
        for rank, nip in enumerate(actor_node_ips):
            pinned_actor_cls = _RemoteRayXGBoostActor.options(
                resources={f"node:{nip}": 0.1}
            )
            actors.append(
                pinned_actor_cls.remote(rank=rank, num_actors=len(actor_nodes))
            )

        # Calculate shards
        _, actor_to_parts = Partitioned.get_actor_shards(parted, actors)

        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                assigned_df = ray.get(actor_to_parts[actor_rank][i])
                part_df = pd.DataFrame(partitions[part_id])
                failure = (
                    f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}."
                )
                self.assertTrue(assigned_df.equals(part_df), msg=failure)


if __name__ == "__main__":
    # Running the file directly hands it to pytest in verbose mode and
    # uses pytest's return value as the process exit status.
    import sys

    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_end_to_end.py
================================================
import os
import shutil
import tempfile
import unittest

import numpy as np
import ray
import xgboost as xgb
from ray.exceptions import RayActorError, RayTaskError
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from scipy.sparse import csr_matrix

from xgboost_ray import RayDMatrix, RayParams, RayShardingMode, predict, train
from xgboost_ray.callback import DistributedCallback
from xgboost_ray.main import RayXGBoostTrainingError
from xgboost_ray.tests.utils import get_num_trees


def _make_callback(tmpdir: str) -> DistributedCallback:
    """Create a distributed callback that records each hook in a log file.

    On ``on_init`` the callback opens ``rank_<rank>.log`` inside ``tmpdir``
    in append mode and stores the handle on the actor as ``log_fp``; every
    hook then appends one line naming the actor rank and the hook.

    Args:
        tmpdir: Directory that receives the per-rank log files.

    Returns:
        A callback instance to pass via ``distributed_callbacks``.
    """

    def _emit(actor, line):
        # Flush after every line so the file content is on disk as soon as
        # the hook returns.
        actor.log_fp.write(line)
        actor.log_fp.flush()

    class TestDistributedCallback(DistributedCallback):
        logdir = tmpdir

        def on_init(self, actor, *args, **kwargs):
            log_file = os.path.join(self.logdir, f"rank_{actor.rank}.log")
            # Append mode: the same file collects lines from later inits too
            actor.log_fp = open(log_file, "at")
            _emit(actor, f"Actor {actor.rank}: Init\n")

        def before_data_loading(self, actor, data, *args, **kwargs):
            _emit(actor, f"Actor {actor.rank}: Before loading\n")

        def after_data_loading(self, actor, data, *args, **kwargs):
            _emit(actor, f"Actor {actor.rank}: After loading\n")

        def before_train(self, actor, *args, **kwargs):
            _emit(actor, f"Actor {actor.rank}: Before train\n")

        def after_train(self, actor, result_dict, *args, **kwargs):
            _emit(actor, f"Actor {actor.rank}: After train\n")

        def before_predict(self, actor, *args, **kwargs):
            _emit(actor, f"Actor {actor.rank}: Before predict\n")

        def after_predict(self, actor, predictions, *args, **kwargs):
            _emit(actor, f"Actor {actor.rank}: After predict\n")

    return TestDistributedCallback()


def _major_minor(version):
    """Return the first two numeric components of ``version`` as an int tuple.

    Version strings must not be compared as plain strings: that ordering is
    lexicographic, so ``"1.10.0" <= "1.2.0"`` is true. Integer tuples compare
    component-wise instead.
    """
    import re

    return tuple(int(part) for part in re.findall(r"\d+", version)[:2])


class XGBoostRayEndToEndTest(unittest.TestCase):
    """In this test suite we validate Ray-XGBoost multi class prediction.

    First, we validate that XGBoost is able to achieve 100% accuracy on
    a simple training task.

    Then we split the dataset into two halves. These halves don't have access
    to all relevant data, so overfit on their respective data. I.e. the first
    half always predicts feature 2 -> label 2, while the second half always
    predicts feature 2 -> label 3.

    We then train using Ray XGBoost. Again both halves will be trained
    separately, but because of Rabit's allreduce, they should end up being
    able to achieve 100% accuracy, again."""

    def setUp(self):
        """Create the four-class toy dataset and the shared booster params."""
        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 2
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 3
            ]
            * repeat
        )
        self.y = np.array([0, 1, 2, 3] * repeat)

        self.params = {
            "booster": "gbtree",
            "nthread": 1,
            "max_depth": 2,
            "objective": "multi:softmax",
            "num_class": 4,
        }

    def tearDown(self):
        """Shut Ray down if a test started it."""
        # ``ray.is_initialized`` is a function and has to be called; the bare
        # function object is always truthy.
        if ray.is_initialized():
            ray.shutdown()

    def testSingleTraining(self):
        """Test that XGBoost learns to predict full matrix"""
        dtrain = xgb.DMatrix(self.x, self.y)
        bst = xgb.train(self.params, dtrain, num_boost_round=2)

        x_mat = xgb.DMatrix(self.x)
        pred_y = bst.predict(x_mat)
        self.assertSequenceEqual(list(self.y), list(pred_y))

    def testHalfTraining(self):
        """Test that XGBoost learns to predict half matrices individually"""
        x_first = self.x[::2]
        y_first = self.y[::2]

        x_second = self.x[1::2]
        y_second = self.y[1::2]

        # Test case: The first model only sees feature 2 --> label 2
        # and the second model only sees feature 2 --> label 3
        test_X = xgb.DMatrix(np.array([[0, 0, 1, 1], [0, 0, 1, 0]]))
        test_y_first = [2, 2]
        test_y_second = [3, 3]

        # First half
        dtrain = xgb.DMatrix(x_first, y_first)
        bst = xgb.train(self.params, dtrain, num_boost_round=2)

        x_mat = xgb.DMatrix(x_first)
        pred_y = bst.predict(x_mat)
        self.assertSequenceEqual(list(y_first), list(pred_y))

        pred_test = bst.predict(test_X)
        self.assertSequenceEqual(test_y_first, list(pred_test))

        # Second half
        dtrain = xgb.DMatrix(x_second, y_second)
        bst = xgb.train(self.params, dtrain, num_boost_round=2)

        x_mat = xgb.DMatrix(x_second)
        pred_y = bst.predict(x_mat)
        self.assertSequenceEqual(list(y_second), list(pred_y))

        pred_test = bst.predict(test_X)
        self.assertSequenceEqual(test_y_second, list(pred_test))

    def test_client_actor_cpus(self):
        """Check ``_ray_get_actor_cpus`` inside actors created with 2 CPUs.

        The value is checked once for a plainly scheduled actor and once for
        an actor scheduled into a placement group.
        """
        ray.init(num_cpus=5, num_gpus=0)

        @ray.remote
        class DummyTrainActor:
            def test(self):
                import xgboost_ray

                return xgboost_ray.main._ray_get_actor_cpus()

        actor = DummyTrainActor.options(num_cpus=2).remote()
        self.assertEqual(ray.get(actor.test.remote()), 2)

        pg = ray.util.placement_group([{"CPU": 2}])
        ray.get(pg.ready())
        actor2 = DummyTrainActor.options(
            num_cpus=2,
            scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg),
        ).remote()
        self.assertEqual(ray.get(actor2.test.remote()), 2)

    def _testJointTraining(self, sharding=RayShardingMode.INTERLEAVED, softprob=False):
        """Train with Ray. The data will be split, but the trees
        should be combined together and find the true model.

        Args:
            sharding: Sharding mode passed to every ``RayDMatrix``.
            softprob: If true, train with ``multi:softprob`` and reduce the
                predictions to class ids before comparing.
        """
        params = self.params.copy()
        if softprob:
            params["objective"] = "multi:softprob"

        def as_labels(raw):
            # With softprob the class id is taken as the argmax over axis 1
            if softprob:
                raw = np.argmax(raw, axis=1)
            return list(raw.astype(int))

        bst = train(
            params,
            RayDMatrix(self.x, self.y, sharding=sharding),
            ray_params=RayParams(num_actors=2),
        )

        # Local prediction with the jointly trained booster
        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), as_labels(pred_y))

        # Distributed prediction
        x_mat = RayDMatrix(self.x, sharding=sharding)
        pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2))
        self.assertSequenceEqual(list(self.y), as_labels(pred_y))

        # try on an odd number of rows
        bst = train(
            params,
            RayDMatrix(self.x[:-1], self.y[:-1], sharding=sharding),
            ray_params=RayParams(num_actors=2),
        )

        x_mat = RayDMatrix(self.x[:-1], sharding=sharding)
        pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2))
        self.assertSequenceEqual(list(self.y[:-1]), as_labels(pred_y))

    def testJointTrainingInterleaved(self):
        """Joint training with interleaved sharding (softmax and softprob)."""
        ray.init(num_cpus=2, num_gpus=0)
        self._testJointTraining(sharding=RayShardingMode.INTERLEAVED)
        self._testJointTraining(sharding=RayShardingMode.INTERLEAVED, softprob=True)

    def testJointTrainingBatch(self):
        """Joint training with batch sharding (softmax and softprob)."""
        ray.init(num_cpus=2, num_gpus=0)
        self._testJointTraining(sharding=RayShardingMode.BATCH)
        self._testJointTraining(sharding=RayShardingMode.BATCH, softprob=True)

    def testTrainPredict(
        self, init=True, remote=None, softprob=False, **ray_param_dict
    ):
        """Train with evaluation and predict

        Args:
            init: Whether to call ``ray.init`` first.
            remote: Forwarded as ``_remote`` to ``train`` and ``predict``.
            softprob: If true, use the ``multi:softprob`` objective.
            **ray_param_dict: Extra keyword arguments for ``RayParams``.
        """
        if init:
            ray.init(num_cpus=2, num_gpus=0)

        dtrain = RayDMatrix(self.x, self.y)

        params = self.params
        if softprob:
            params = params.copy()
            params["objective"] = "multi:softprob"

        evals_result = {}
        bst = train(
            params,
            dtrain,
            num_boost_round=38,
            ray_params=RayParams(num_actors=2, **ray_param_dict),
            evals=[(dtrain, "dtrain")],
            evals_result=evals_result,
            _remote=remote,
        )

        self.assertEqual(get_num_trees(bst), 38)

        self.assertIn("dtrain", evals_result)

        x_mat = RayDMatrix(self.x)
        pred_y = predict(
            bst,
            x_mat,
            ray_params=RayParams(num_actors=2, **ray_param_dict),
            _remote=remote,
        )

        if softprob:
            self.assertEqual(pred_y.shape[1], len(np.unique(self.y)))
            pred_y = np.argmax(pred_y, axis=1)

        self.assertSequenceEqual(list(self.y), list(pred_y))

    def testTrainPredictSoftprob(self):
        """Train with evaluation and predict on softprob objective
        (which returns predictions in a 2d array)
        """
        self.testTrainPredict(init=True, softprob=True)

    def testTrainPredictRemote(self):
        """Train with evaluation and predict in a remote call"""
        self.testTrainPredict(init=True, remote=True)

    def testTrainPredictClient(self):
        """Train with evaluation and predict in a client session"""
        # Compare numerically; as strings "1.10.0" would sort below "1.2.0"
        if _major_minor(ray.__version__) <= (1, 2):
            self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")
        from ray.util.client.ray_client_helpers import ray_start_client_server

        ray.init(num_cpus=2, num_gpus=0)
        self.assertFalse(ray.util.client.ray.is_connected())
        with ray_start_client_server():
            self.assertTrue(ray.util.client.ray.is_connected())

            self.testTrainPredict(init=False, remote=None)

    def testDistributedCallbacksTrainPredict(self, init=True, remote=False):
        """Test distributed callbacks for train/predict"""
        tmpdir = tempfile.mkdtemp()
        # Registered up front so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        test_callback = _make_callback(tmpdir)

        self.testTrainPredict(
            init=init, remote=remote, distributed_callbacks=[test_callback]
        )
        rank_0_log_file = os.path.join(tmpdir, "rank_0.log")
        rank_1_log_file = os.path.join(tmpdir, "rank_1.log")
        self.assertTrue(os.path.exists(rank_0_log_file))
        self.assertTrue(os.path.exists(rank_1_log_file))

        with open(rank_0_log_file, "rt") as log_fp:
            rank_0_log = log_fp.read()

        # The train hooks come first, followed by the predict hooks; each
        # phase starts with its own "Init" line.
        self.assertEqual(
            rank_0_log,
            "Actor 0: Init\n"
            "Actor 0: Before loading\n"
            "Actor 0: After loading\n"
            "Actor 0: Before train\n"
            "Actor 0: After train\n"
            "Actor 0: Init\n"
            "Actor 0: Before loading\n"
            "Actor 0: After loading\n"
            "Actor 0: Before predict\n"
            "Actor 0: After predict\n",
        )

    def testDistributedCallbacksTrainPredictClient(self):
        """Test distributed callbacks for train/predict via Ray client"""

        # Compare numerically; as strings "1.10.0" would sort below "1.2.0"
        if _major_minor(ray.__version__) <= (1, 2):
            self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")
        from ray.util.client.ray_client_helpers import ray_start_client_server

        ray.init(num_cpus=2, num_gpus=0)
        self.assertFalse(ray.util.client.ray.is_connected())
        with ray_start_client_server():
            self.assertTrue(ray.util.client.ray.is_connected())

            self.testDistributedCallbacksTrainPredict(init=False, remote=None)

    def testFailPrintErrors(self):
        """Test that XGBoost training errors are propagated"""
        x = np.random.uniform(0, 1, size=(100, 4))
        y = np.random.randint(0, 2, size=100)

        train_set = RayDMatrix(x, y)

        # ``assertRaises`` makes the test fail when no error is raised at
        # all; a plain try/except would pass silently in that case.
        with self.assertRaises(RuntimeError) as ctx:
            train(
                {
                    "objective": "multi:softmax",
                    "num_class": 2,
                    "eval_metric": ["logloss", "error"],
                },  # This will error
                train_set,
                evals=[(train_set, "train")],
                ray_params=RayParams(num_actors=1, max_actor_restarts=0),
            )

        # Expected chain: RuntimeError <- RayActorError <- RayTaskError,
        # whose ``cause`` is the training error raised in the actor.
        actor_error = ctx.exception.__cause__
        self.assertTrue(actor_error)
        self.assertIsInstance(actor_error, RayActorError)

        task_error = actor_error.__cause__
        self.assertTrue(task_error)
        self.assertIsInstance(task_error, RayTaskError)

        self.assertTrue(task_error.cause)
        self.assertIsInstance(task_error.cause, RayXGBoostTrainingError)

        self.assertIn("label and prediction size not match", str(task_error))

    def testKwargsValidation(self):
        """Test that an unknown keyword argument to ``train`` is rejected."""
        x = np.random.uniform(0, 1, size=(100, 4))
        # The upper bound is exclusive, so 2 is needed to draw both classes
        y = np.random.randint(0, 2, size=100)

        train_set = RayDMatrix(x, y)

        with self.assertRaisesRegex(TypeError, "totally_invalid_kwarg"):
            train(
                {
                    "objective": "multi:softmax",
                    "num_class": 2,
                    "eval_metric": ["logloss", "error"],
                },
                train_set,
                evals=[(train_set, "train")],
                ray_params=RayParams(num_actors=1, max_actor_restarts=0),
                totally_invalid_kwarg="",
            )

    def testRanking(self):
        """Train a pairwise ranking model with ``qid`` groups.

        Both recorded training metrics (``auc`` and ``aucpr``) are required
        to be non-decreasing over the boosting rounds.
        """
        Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
        Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
        X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4)).toarray()
        # One row of labels per query group (see ``qid`` below)
        y = np.array(
            [0.0, 1.0, 1.0, 0.0, 0.0]
            + [0.0, 1.0, 0.0, 1.0, 0.0]
            + [0.0, 1.0, 0.0, 0.0, 1.0]
            + [0.0, 1.0, 1.0, 0.0, 0.0]
        )

        qid = np.array([0] * 5 + [1] * 5 + [2] * 5 + [3] * 5)
        dtrain = RayDMatrix(X, label=y, qid=qid)

        params = {
            "eta": 1,
            "objective": "rank:pairwise",
            "eval_metric": ["auc", "aucpr"],
            "max_depth": 1,
        }
        evals_result = {}
        train(
            params,
            dtrain,
            10,
            evals=[(dtrain, "train")],
            evals_result=evals_result,
            ray_params=RayParams(num_actors=2, max_actor_restarts=0),
        )
        for metric in ("auc", "aucpr"):
            record = evals_result["train"][metric]
            # ``all`` is required: a bare generator expression is always
            # truthy and would make the assertion vacuous.
            self.assertTrue(
                all(p <= q for p, q in zip(record, record[1:])),
                msg=f"{metric} decreased during training: {record}",
            )

    @unittest.skipIf(
        _major_minor(xgb.__version__) < (1, 3),
        f"not supported in xgb version {xgb.__version__}",
    )
    def testFeatureWeightsParam(self):
        """Test the feature_weights parameter for xgb version >= 1.3.0.
        Adapted from the official demo codes:
        http://xgboost.readthedocs.io/en/stable/python/examples/
        feature_weights.html"""

        rng = np.random.RandomState(1994)

        kRows = 1000
        kCols = 10

        X = rng.randn(kRows, kCols)
        y = rng.randn(kRows)
        # Feature i gets weight i, so feature 0 has weight zero
        fw = np.arange(kCols, dtype=float)
        train_set = RayDMatrix(X, y, feature_weights=fw)

        evals_result = {}
        bst = train(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["rmse", "error"],
                "colsample_bynode": 0.1,
            },
            train_set,
            num_boost_round=250,
            evals_result=evals_result,
            evals=[(train_set, "train")],
            verbose_eval=False,
            ray_params=RayParams(num_actors=2, cpus_per_actor=1),
        )

        feature_map = bst.get_fscore()
        # feature zero has 0 weight
        self.assertIsNone(feature_map.get("f0"))
        self.assertEqual(max(feature_map.values()), feature_map.get("f9"))


if __name__ == "__main__":
    # Running the file directly hands it to pytest in verbose mode and
    # uses pytest's return value as the process exit status.
    import sys

    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_fault_tolerance.py
================================================
import logging
import os
import shutil
import tempfile
import time
import unittest
from unittest.mock import DEFAULT, MagicMock, patch

import numpy as np
import ray
import xgboost as xgb

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.main import RayXGBoostActorAvailable
from xgboost_ray.tests.fault_tolerance import (
    DelayedLoadingCallback,
    DieCallback,
    FaultToleranceManager,
)
from xgboost_ray.tests.utils import (
    _checkpoint_callback,
    _fail_callback,
    _kill_callback,
    flatten_obj,
    get_num_trees,
    tree_obj,
)


class _FakeTask(MagicMock):
    """Mock task whose readiness is controlled through the ``ready`` flag."""

    # Value reported by ``is_ready``; defaults to not ready.
    ready = False

    def is_ready(self):
        """Return the current value of ``ready``."""
        return self.ready


class XGBoostRayFaultToleranceTest(unittest.TestCase):
    """In this test suite we validate fault tolerance when a Ray actor dies.

    For this, we set up a callback that makes one worker die exactly once.
    """

    def setUp(self):
        """Prepare data, parameters, scratch paths and a local Ray runtime.

        ``RXGB_ELASTIC_RESTART_DISABLED`` is reset to ``"0"`` for every test.
        Its previous value is restored through a cleanup so the setting
        (which individual tests override) does not leak into tests that run
        afterwards in the same process.
        """
        # Set default
        env_key = "RXGB_ELASTIC_RESTART_DISABLED"
        previous_value = os.environ.get(env_key)

        def restore_env():
            if previous_value is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous_value

        self.addCleanup(restore_env)
        os.environ[env_key] = "0"

        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 2
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 3
            ]
            * repeat
        )
        self.y = np.array([0, 1, 2, 3] * repeat)

        self.params = {
            "booster": "gbtree",
            "nthread": 1,
            "max_depth": 2,
            "objective": "multi:softmax",
            "num_class": 4,
        }

        self.tmpdir = str(tempfile.mkdtemp())

        # Keep the lock files inside the fresh per-test directory instead of
        # fixed /tmp paths: they cannot pre-exist and cannot collide with
        # another test process running at the same time.
        self.die_lock_file = os.path.join(self.tmpdir, "died_worker.lock")
        self.die_lock_file_2 = os.path.join(self.tmpdir, "died_worker_2.lock")

        ray.init(num_cpus=2, num_gpus=0, log_to_driver=True)

    def tearDown(self) -> None:
        """Remove scratch files and always shut the Ray runtime down.

        The shutdown sits in a ``finally`` clause: if deleting the temporary
        directory fails, Ray would otherwise stay initialized and the
        ``ray.init`` call in the next ``setUp`` would fail as well.
        """
        try:
            if os.path.exists(self.tmpdir):
                shutil.rmtree(self.tmpdir)
        finally:
            ray.shutdown()

            for lock_file in (self.die_lock_file, self.die_lock_file_2):
                if os.path.exists(lock_file):
                    os.remove(lock_file)

    def testTrainingContinuationKilled(self):
        """Training completes with both actors after one of them was killed."""
        extra_results = {}
        captured = {}

        def remember_actors(actors, *args, **kwargs):
            # Snapshot the actor list handed to `_shutdown` for later checks.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=remember_actors):
            bst = train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[_kill_callback(self.die_lock_file)],
                num_boost_round=20,
                ray_params=RayParams(max_actor_restarts=1, num_actors=2),
                additional_results=extra_results,
            )

        self.assertEqual(20, get_num_trees(bst))

        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        # End with two working actors
        final_actors = captured["actors"]
        for rank in (0, 1):
            self.assertTrue(final_actors[rank])

        # Two workers finished, so N=32
        self.assertEqual(extra_results["total_n"], 32)

    def testTrainingContinuationElasticKilled(self):
        """Elastic training finishes on the surviving actor only.

        Elastic restarts are disabled through the environment, so the killed
        rank-0 actor is expected to stay ``None`` until shutdown and only the
        remaining actor's 16 rows are counted in ``total_n``.
        """
        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1"

        logging.getLogger().setLevel(logging.DEBUG)

        additional_results = {}
        keep_actors = {}

        def keep(actors, *args, **kwargs):
            # Snapshot the actor list handed to `_shutdown` for later checks.
            keep_actors["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown") as mocked:
            mocked.side_effect = keep
            bst = train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[_kill_callback(self.die_lock_file)],
                num_boost_round=20,
                ray_params=RayParams(
                    max_actor_restarts=1,
                    num_actors=2,
                    elastic_training=True,
                    max_failed_actors=1,
                ),
                additional_results=additional_results,
            )

        self.assertEqual(20, get_num_trees(bst))

        x_mat = xgb.DMatrix(self.x)
        pred_y = bst.predict(x_mat)
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        actors = keep_actors["actors"]
        # First actor does not get recreated
        self.assertIsNone(actors[0])
        self.assertTrue(actors[1])

        # Only one worker finished, so n=16
        self.assertEqual(additional_results["total_n"], 16)

    def testTrainingContinuationElasticKilledRestarted(self):
        """Elastic training continues after a kill and re-adds the actor."""
        logging.getLogger().setLevel(logging.DEBUG)

        # Kill rank 0 at round 6 and delay the return of rank 1 from
        # round 12 to round 21.
        ft_manager = FaultToleranceManager.remote()
        ft_manager.schedule_kill.remote(rank=0, boost_round=6)
        ft_manager.delay_return.remote(rank=1, start_boost_round=12, end_boost_round=21)

        die_callback = DieCallback(ft_manager, training_delay=0.25)
        delay_callback = DelayedLoadingCallback(
            ft_manager, reload_data=True, sleep_time=0.1
        )

        extra_results = {}
        captured = {}

        def remember_actors(actors, *args, **kwargs):
            # Snapshot the actor list handed to `_shutdown` for later checks.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=remember_actors):
            bst = train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[die_callback],
                num_boost_round=20,
                ray_params=RayParams(
                    max_actor_restarts=1,
                    num_actors=2,
                    elastic_training=True,
                    max_failed_actors=1,
                    distributed_callbacks=[delay_callback],
                ),
                additional_results=extra_results,
            )

        self.assertEqual(20, get_num_trees(bst))

        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        # First actor gets recreated
        final_actors = captured["actors"]
        for rank in (0, 1):
            self.assertTrue(final_actors[rank])

        # Both workers finished, so n=32
        self.assertEqual(extra_results["total_n"], 32)

    def testTrainingContinuationElasticMultiKilled(self):
        """The model still has 20 boosting rounds after two actor kills."""
        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1"

        logging.getLogger().setLevel(logging.DEBUG)

        # Rank 0 is killed at iteration 6, rank 1 at iteration 14.
        kill_rank_0 = _kill_callback(self.die_lock_file, fail_iteration=6, actor_rank=0)
        kill_rank_1 = _kill_callback(
            self.die_lock_file_2, fail_iteration=14, actor_rank=1
        )
        elastic_params = RayParams(
            max_actor_restarts=2,
            num_actors=2,
            elastic_training=True,
            max_failed_actors=2,
        )

        extra_results = {}
        bst = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[kill_rank_0, kill_rank_1],
            num_boost_round=20,
            ray_params=elastic_params,
            additional_results=extra_results,
        )

        self.assertEqual(20, get_num_trees(bst))

        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

    def testTrainingContinuationElasticFailed(self):
        """Elastic training continues after one actor's training failed."""
        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1"

        extra_results = {}
        captured = {}

        def remember_actors(actors, *args, **kwargs):
            # Snapshot the actor list handed to `_shutdown` for later checks.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=remember_actors):
            bst = train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[_fail_callback(self.die_lock_file)],
                num_boost_round=20,
                ray_params=RayParams(
                    max_actor_restarts=1,
                    num_actors=2,
                    elastic_training=True,
                    max_failed_actors=1,
                ),
                additional_results=extra_results,
            )

        self.assertEqual(20, get_num_trees(bst))

        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        # End with two working actors since only the training failed
        final_actors = captured["actors"]
        for rank in (0, 1):
            self.assertTrue(final_actors[rank])

        # Two workers finished, so n=32
        self.assertEqual(extra_results["total_n"], 32)

    def testTrainingStop(self):
        """Without allowed restarts, a killed actor aborts training."""
        # `train()` is expected to surface the failure as a RuntimeError.
        with self.assertRaises(RuntimeError):
            dtrain = RayDMatrix(self.x, self.y)
            kill_once = _kill_callback(self.die_lock_file)
            no_restarts = RayParams(max_actor_restarts=0, num_actors=2)
            train(
                self.params,
                dtrain,
                callbacks=[kill_once],
                num_boost_round=20,
                ray_params=no_restarts,
            )

    def testTrainingStopElastic(self):
        """Elastic training stops once more actors die than are tolerated."""
        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0"

        # Kill rank 0 at round 3 and rank 1 at round 6, and delay the return
        # of rank 0 from round 4 to round 5.
        ft_manager = FaultToleranceManager.remote()
        ft_manager.schedule_kill.remote(rank=0, boost_round=3)
        ft_manager.schedule_kill.remote(rank=1, boost_round=6)
        ft_manager.delay_return.remote(rank=0, start_boost_round=4, end_boost_round=5)

        die_callback = DieCallback(ft_manager, training_delay=0.25)
        delay_callback = DelayedLoadingCallback(
            ft_manager, reload_data=True, sleep_time=0.1
        )

        # The `train()` function raises a RuntimeError
        with self.assertRaises(RuntimeError):
            dtrain = RayDMatrix(self.x, self.y)
            elastic_params = RayParams(
                elastic_training=True,
                max_failed_actors=1,
                max_actor_restarts=1,
                num_actors=2,
                distributed_callbacks=[delay_callback],
            )
            train(
                self.params,
                dtrain,
                callbacks=[die_callback],
                num_boost_round=20,
                ray_params=elastic_params,
            )

    def testCheckpointContinuationValidity(self):
        """Test that checkpoints are stored and loaded correctly.

        A first run trains two rounds and returns a checkpoint after every
        iteration. A second run is started from the last of those checkpoints
        and returns checkpoints both before and after each iteration, so the
        very first one must still match the model it was started from.
        """

        # Train once, get checkpoint via callback returns
        res_1 = {}
        bst_1 = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[_checkpoint_callback(frequency=1, before_iteration_=False)],
            num_boost_round=2,
            ray_params=RayParams(num_actors=2),
            additional_results=res_1,
        )
        # `callback_returns` is indexed by actor rank first, then by the
        # order in which the returns were collected.
        last_checkpoint_1 = res_1["callback_returns"][0][-1]
        last_checkpoint_other_rank_1 = res_1["callback_returns"][1][-1]

        # Sanity check
        lc1 = xgb.Booster()
        lc1.load_model(last_checkpoint_1)
        # Both ranks must report the same checkpoint, and reloading it must
        # reproduce both the raw bytes and the trees of the returned booster.
        self.assertEqual(last_checkpoint_1, last_checkpoint_other_rank_1)
        self.assertEqual(last_checkpoint_1, lc1.save_raw())
        self.assertEqual(bst_1.get_dump(), lc1.get_dump())

        # Start new training run, starting from existing model
        res_2 = {}
        bst_2 = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[
                _checkpoint_callback(frequency=1, before_iteration_=True),
                _checkpoint_callback(frequency=1, before_iteration_=False),
            ],
            num_boost_round=4,
            ray_params=RayParams(num_actors=2),
            additional_results=res_2,
            xgb_model=lc1,
        )
        # First and last checkpoint of the second run, for both ranks.
        first_checkpoint_2 = res_2["callback_returns"][0][0]
        first_checkpoint_other_actor_2 = res_2["callback_returns"][1][0]
        last_checkpoint_2 = res_2["callback_returns"][0][-1]
        last_checkpoint_other_actor_2 = res_2["callback_returns"][1][-1]

        fcp_bst = xgb.Booster()
        fcp_bst.load_model(first_checkpoint_2)

        lcp_bst = xgb.Booster()
        lcp_bst.load_model(last_checkpoint_2)

        # Sanity check
        self.assertEqual(first_checkpoint_2, first_checkpoint_other_actor_2)
        self.assertEqual(last_checkpoint_2, last_checkpoint_other_actor_2)
        self.assertEqual(bst_2.get_dump(), lcp_bst.get_dump())

        # Training should not have proceeded for the first checkpoint,
        # so trees should be equal
        self.assertEqual(lc1.get_dump(), fcp_bst.get_dump())

        # Training should have proceeded for the last checkpoint,
        # so trees should not be equal
        self.assertNotEqual(fcp_bst.get_dump(), lcp_bst.get_dump())

    def testSameResultWithAndWithoutError(self):
        """A failure during training must not change the resulting model.

        Three models are compared: ten uninterrupted rounds, two chained
        runs of five rounds each, and ten rounds with a failure injected at
        iteration 7.
        """
        no_restart = {"max_actor_restarts": 0, "num_actors": 2}

        # Reference: ten rounds in a single run.
        bst_noerror = train(
            self.params,
            RayDMatrix(self.x, self.y),
            num_boost_round=10,
            ray_params=RayParams(**no_restart),
        )

        # Five rounds, then five more continuing from the first model.
        bst_first_half = train(
            self.params,
            RayDMatrix(self.x, self.y),
            num_boost_round=5,
            ray_params=RayParams(**no_restart),
        )
        bst_two_part = train(
            self.params,
            RayDMatrix(self.x, self.y),
            num_boost_round=5,
            ray_params=RayParams(**no_restart),
            xgb_model=bst_first_half,
        )

        # Ten rounds with one failure and a single allowed restart.
        res_error = {}
        bst_error = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[_fail_callback(self.die_lock_file, fail_iteration=7)],
            num_boost_round=10,
            ray_params=RayParams(
                max_actor_restarts=1, num_actors=2, checkpoint_frequency=5
            ),
            additional_results=res_error,
        )

        reference = flatten_obj({"tree": tree_obj(bst_noerror)})
        flat_error = flatten_obj({"tree": tree_obj(bst_error)})
        flat_two_part = flatten_obj({"tree": tree_obj(bst_two_part)})

        for key, expected in reference.items():
            self.assertAlmostEqual(expected, flat_error[key])
            self.assertAlmostEqual(expected, flat_two_part[key])

        # We fail at iteration 7, but checkpoints are saved at iteration 5
        # Thus we have two additional returns here.
        rank_0_returns = res_error["callback_returns"][0]
        print("Callback returns:", rank_0_returns)
        self.assertEqual(len(rank_0_returns), 10 + 2)

    @patch("xgboost_ray.main._PrepareActorTask", _FakeTask)
    @patch("xgboost_ray.elastic._PrepareActorTask", _FakeTask)
    @patch("xgboost_ray.main._RemoteRayXGBoostActor", MagicMock)
    @patch.dict(os.environ, {"RXGB_ELASTIC_RESTART_GRACE_PERIOD_S": "30"})
    def testMaybeScheduleNewActors(self):
        """Test scheduling of new actors if resources become available.

        Context: We are training with num_actors=8, of which 3 actors are
        dead. The mocked actor creation succeeds for all three of them.

        In this test, we walk through the `_maybe_schedule_new_actors` and
        `_update_scheduled_actor_states` methods, checking their state
        after each call.

        The restart grace period is set to 30 seconds through `patch.dict`
        on the environment, so the override is reverted when the test ends
        instead of leaking into other tests.
        """
        from xgboost_ray.elastic import (
            _maybe_schedule_new_actors,
            _update_scheduled_actor_states,
        )
        from xgboost_ray.main import _TrainingState

        # Three actors are dead
        actors = [
            MagicMock(),
            None,
            MagicMock(),
            MagicMock(),
            None,
            MagicMock(),
            None,
            MagicMock(),
        ]
        dead_ranks = (1, 4, 6)

        # Mock training state
        state = _TrainingState(
            actors=actors,
            queue=MagicMock(),
            stop_event=MagicMock(),
            checkpoint=MagicMock(),
            additional_results={},
            failed_actor_ranks=set(),
        )

        created_actors = []

        def fake_create_actor(rank, *args, **kwargs):
            # Record which ranks were requested instead of starting actors.
            created_actors.append(rank)
            return MagicMock()

        def schedule_new_actors():
            # Both scheduling attempts use the exact same arguments.
            _maybe_schedule_new_actors(
                training_state=state,
                num_cpus_per_actor=8,
                num_gpus_per_actor=0,
                resources_per_actor={"custom": 1.0},
                load_data=[],
                ray_params=RayParams(
                    num_actors=8,
                    elastic_training=True,
                    max_failed_actors=1,
                    max_actor_restarts=2,
                ),
            )

        with patch("xgboost_ray.elastic._create_actor") as create_actor:
            create_actor.side_effect = fake_create_actor

            schedule_new_actors()

            # 3 new actors should have been created
            self.assertEqual(len(created_actors), 3)
            self.assertEqual(len(state.pending_actors), 3)

            # The number of created actors shouldn't change even
            # if we run this function again.
            schedule_new_actors()

            self.assertEqual(len(created_actors), 3)
            self.assertEqual(len(state.pending_actors), 3)

            # The actors have not yet been promoted because the
            # loading task has not finished.
            for rank in dead_ranks:
                self.assertFalse(actors[rank])

            # Update status, nothing should change
            _update_scheduled_actor_states(training_state=state)

            for rank in dead_ranks:
                self.assertFalse(actors[rank])

            # Set loading task status to finished, but only for first actor
            for _, (_, task) in state.pending_actors.items():
                task.ready = True
                break

            # Update status. This shouldn't raise RayXGBoostActorAvailable
            # because we still have a grace period to wait for the second
            # actor.
            _update_scheduled_actor_states(training_state=state)

            # Grace period is set through ENV.ELASTIC_RESTART_GRACE_PERIOD_S
            # Allow for some slack in test execution
            self.assertGreaterEqual(state.restart_training_at, time.time() + 22)

            # The first actor should have been promoted to full actor
            self.assertTrue(actors[1])
            self.assertFalse(actors[4])
            self.assertFalse(actors[6])

            # Set loading task status to finished for all actors
            for _, (_, task) in state.pending_actors.items():
                task.ready = True

            # Update status. This should now raise RayXGBoostActorAvailable
            # immediately as there are no pending actors left to wait for.
            with self.assertRaises(RayXGBoostActorAvailable):
                _update_scheduled_actor_states(training_state=state)

            # All restarted actors should have been promoted to full actors
            for rank in dead_ranks:
                self.assertTrue(actors[rank])

    def testFaultToleranceManager(self):
        """Check the iterations logged per rank by the fault tolerance manager.

        Rank 1 is scheduled to be killed at boosting round 16 and its return
        is delayed from round 14 to round 68. The manager's logs are then
        inspected to confirm which global steps each rank took part in.
        """
        ft_manager = FaultToleranceManager.remote()

        ft_manager.schedule_kill.remote(rank=1, boost_round=16)
        ft_manager.delay_return.remote(rank=1, start_boost_round=14, end_boost_round=68)

        delay_callback = DelayedLoadingCallback(
            ft_manager, reload_data=True, sleep_time=0.1
        )
        die_callback = DieCallback(ft_manager, training_delay=0.25)

        res_1 = {}
        train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[die_callback],
            num_boost_round=100,
            ray_params=RayParams(
                num_actors=2,
                checkpoint_frequency=1,
                elastic_training=True,
                max_failed_actors=1,
                max_actor_restarts=1,
                distributed_callbacks=[delay_callback],
            ),
            additional_results=res_1,
        )

        logs = ray.get(ft_manager.get_logs.remote())

        print(logs)

        # The first 99 entries logged for rank 0 must be the global steps
        # 0..98 in order.
        self.assertSequenceEqual([g for g, _ in logs[0][0:99]], range(99))

        # Which steps exactly are executed is stochastic. The rank 1 actor
        # will die at iteration 16, so at least 15 will be logged (though 16
        # might be logged as well). Iterations 17 to 67 should never be
        # logged. It comes back some time after iteration 68, so this might
        # be iter 68, 69, or later. We just make sure it comes back at all
        # by checking that iteration 75 was logged.
        global_steps = [g for g, _ in logs[1]]
        self.assertTrue(global_steps)
        self.assertIn(15, global_steps)
        self.assertNotIn(17, global_steps)
        self.assertNotIn(67, global_steps)
        self.assertIn(75, global_steps)


if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode and uses
    # pytest's return code as the process exit status.
    import sys

    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_matrix.py
================================================
import inspect
import os
import tempfile
import unittest

import numpy as np
import pandas as pd
import ray
import xgboost as xgb

try:
    import ray.data as ray_data
except (ImportError, ModuleNotFoundError):

    ray_data = None

from xgboost_ray import RayDMatrix
from xgboost_ray.matrix import RayShardingMode, _get_sharding_indices, concat_dataframes


class XGBoostRayDMatrixTest(unittest.TestCase):
    """This test suite validates core RayDMatrix functionality."""

    def setUp(self):
        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 2
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 3
            ]
            * repeat
        )
        self.y = np.array([0, 1, 2, 3] * repeat)
        self.multi_y = np.array(
            [
                [1, 0, 0, 0],
                [0, 1, 0, 0],
                [0, 0, 1, 1],
                [0, 0, 1, 0],
            ]
            * repeat
        )

    @classmethod
    def setUpClass(cls):
        """Start one Ray runtime that is shared by every test in the class."""
        ray.init()

    @classmethod
    def tearDownClass(cls):
        """Shut the shared Ray runtime down after the last test has run."""
        ray.shutdown()

    def testSameObject(self):
        """A matrix passed twice to a remote task still equals itself there."""

        @ray.remote
        def same(one, two):
            return one == two

        matrix = RayDMatrix(self.x, self.y)
        is_same = ray.get(same.remote(matrix, matrix))
        self.assertTrue(is_same)

    def testColumnOrdering(self):
        """When excluding cols, the remaining col order should be preserved."""

        cols = [str(i) for i in range(50)]
        df = pd.DataFrame(np.random.randn(1, len(cols)), columns=cols)
        # The last column serves as the label and is therefore excluded.
        matrix = RayDMatrix(df, label=cols[-1], num_actors=1)
        data = matrix.get_data(0)["data"]

        # Use a unittest assertion instead of a bare `assert`: it is not
        # stripped under `python -O` and reports a diff on failure.
        self.assertListEqual(data.columns.tolist(), cols[:-1])

    def _testMatrixCreation(self, in_x, in_y, multi_label=False, **kwargs):
        """Assert that a RayDMatrix built from the inputs yields the toy data.

        The data is first fetched for a single actor and compared with the
        fixture arrays, then fetched as two shards (ranks 0 and 1 of 2) whose
        concatenation must match the fixture again.

        Args:
            in_x: Data argument passed to ``RayDMatrix``.
            in_y: Label argument passed to ``RayDMatrix``.
            multi_label: Compare labels against ``self.multi_y`` instead of
                ``self.y``.
            **kwargs: Extra ``RayDMatrix`` arguments; ``sharding`` defaults
                to ``RayShardingMode.BATCH`` when not given.
        """
        kwargs.setdefault("sharding", RayShardingMode.BATCH)
        expected_y = self.multi_y if multi_label else self.y

        def unpack(shard):
            # Data and labels given as lists are concatenated first.
            features, labels = shard["data"], shard["label"]
            if isinstance(features, list):
                features = concat_dataframes(features)
            if isinstance(labels, list):
                labels = concat_dataframes(labels)
            return features, labels

        # Single actor: the only shard must hold the complete dataset.
        single = RayDMatrix(in_x, in_y, **kwargs)
        x, y = unpack(single.get_data(rank=0, num_actors=1))

        self.assertTrue(np.allclose(self.x, x))
        self.assertTrue(np.allclose(expected_y, y))

        # Multi actor check
        sharded = RayDMatrix(in_x, in_y, **kwargs)
        x1, y1 = unpack(sharded.get_data(rank=0, num_actors=2))
        sharded.unload_data()
        x2, y2 = unpack(sharded.get_data(rank=1, num_actors=2))

        self.assertTrue(np.allclose(self.x, concat_dataframes([x1, x2])))
        self.assertTrue(np.allclose(expected_y, concat_dataframes([y1, y2])))

    def testFromNumpy(self):
        """Build a matrix from NumPy arrays for both data and labels."""
        self._testMatrixCreation(self.x, self.y)

    def testFromPandasDfDf(self):
        """Build a matrix from a pandas DataFrame with DataFrame labels."""
        features = pd.DataFrame(self.x)
        labels = pd.DataFrame(self.y)
        self._testMatrixCreation(features, labels)

    def testFromPandasDfSeries(self):
        """Build a matrix from a pandas DataFrame with Series labels."""
        features = pd.DataFrame(self.x)
        labels = pd.Series(self.y)
        self._testMatrixCreation(features, labels)

    def testFromPandasDfString(self):
        """Build a matrix from a pandas DataFrame with a label column name."""
        frame = pd.DataFrame(self.x)
        frame["label"] = self.y
        self._testMatrixCreation(frame, "label")

    def testFromModinDfDf(self):
        """Build a matrix from a Modin DataFrame with DataFrame labels."""
        from xgboost_ray.data_sources.modin import MODIN_INSTALLED

        if not MODIN_INSTALLED:
            # `skipTest` raises, so nothing below runs without Modin.
            self.skipTest("Modin not installed.")

        from modin.pandas import DataFrame

        features = DataFrame(self.x)
        labels = DataFrame(self.y)
        self._testMatrixCreation(features, labels, distributed=False)

    def testFromModinDfSeries(self):
        """Build a matrix from a Modin DataFrame with Series labels."""
        from xgboost_ray.data_sources.modin import MODIN_INSTALLED

        if not MODIN_INSTALLED:
            # `skipTest` raises, so nothing below runs without Modin.
            self.skipTest("Modin not installed.")

        from modin.pandas import DataFrame, Series

        features = DataFrame(self.x)
        labels = Series(self.y)
        self._testMatrixCreation(features, labels, distributed=False)

    def testFromModinDfString(self):
        """Build a matrix from a Modin DataFrame with a label column name."""
        from xgboost_ray.data_sources.modin import MODIN_INSTALLED

        if not MODIN_INSTALLED:
            # `skipTest` raises, so nothing below runs without Modin.
            self.skipTest("Modin not installed.")

        from modin.pandas import DataFrame

        frame = DataFrame(self.x)
        frame["label"] = self.y

        # Exercise both centralized and distributed loading.
        for distributed in (False, True):
            self._testMatrixCreation(frame, "label", distributed=distributed)

    def testFromDaskDfSeries(self):
        """Build a matrix from Dask dataframe objects for data and labels."""
        from xgboost_ray.data_sources.dask import DASK_INSTALLED

        if not DASK_INSTALLED:
            # `skipTest` raises, so nothing below runs without Dask.
            self.skipTest("Dask not installed.")

        import dask.dataframe as dd

        features = dd.from_array(self.x)
        labels = dd.from_array(self.y)
        self._testMatrixCreation(features, labels, distributed=False)

    def testFromDaskDfArray(self):
        """Build a matrix from a Dask dataframe with Dask array labels."""
        from xgboost_ray.data_sources.dask import DASK_INSTALLED

        if not DASK_INSTALLED:
            # `skipTest` raises, so nothing below runs without Dask.
            self.skipTest("Dask not installed.")

        import dask.array as da
        import dask.dataframe as dd

        features = dd.from_array(self.x)
        labels = da.from_array(self.y)
        self._testMatrixCreation(features, labels, distributed=False)

    def testFromDaskDfString(self):
        """Build a matrix from a Dask dataframe with a label column name."""
        from xgboost_ray.data_sources.dask import DASK_INSTALLED

        if not DASK_INSTALLED:
            # `skipTest` raises, so nothing below runs without Dask.
            self.skipTest("Dask not installed.")

        import dask.dataframe as dd

        frame = dd.from_array(self.x)
        frame["label"] = dd.from_array(self.y)

        # Exercise both centralized and distributed loading.
        for distributed in (False, True):
            self._testMatrixCreation(frame, "label", distributed=distributed)

    def testFromPetastormParquetString(self):
        """Load a single Parquet file that is addressed by a file:// URL."""
        try:
            import petastorm  # noqa: F401
        except ImportError:
            # `skipTest` raises, so nothing below runs without Petastorm.
            self.skipTest("Petastorm not installed.")

        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_path = os.path.join(tmp_dir, "data.parquet")

            frame = pd.DataFrame(self.x, columns=list("abcd"))
            frame["label"] = pd.Series(self.y)
            frame.to_parquet(parquet_path)

            # Exercise both centralized and distributed loading.
            for distributed in (False, True):
                self._testMatrixCreation(
                    f"file://{parquet_path}", "label", distributed=distributed
                )

    def testFromPetastormMultiParquetString(self):
        """Load two Parquet files that are addressed by file:// URLs.

        Skipped when Petastorm is not installed, consistent with
        ``testFromPetastormParquetString``.
        """
        try:
            import petastorm  # noqa: F401
        except ImportError:
            # `skipTest` raises, so nothing below runs without Petastorm.
            self.skipTest("Petastorm not installed.")

        with tempfile.TemporaryDirectory() as tmp_dir:
            data_file_1 = os.path.join(tmp_dir, "data_1.parquet")
            data_file_2 = os.path.join(tmp_dir, "data_2.parquet")

            data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
            data_df["label"] = pd.Series(self.y)

            # Split the rows in half, one file per half.
            half = len(data_df) // 2
            data_df[:half].to_parquet(data_file_1)
            data_df[half:].to_parquet(data_file_2)

            urls = [f"file://{data_file_1}", f"file://{data_file_2}"]

            # Exercise both centralized and distributed loading.
            for distributed in (False, True):
                self._testMatrixCreation(urls, "label", distributed=distributed)

    def testFromCSVString(self):
        """Load one CSV file; distributed loading must raise ValueError."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            csv_path = os.path.join(tmp_dir, "data.csv")

            frame = pd.DataFrame(self.x, columns=list("abcd"))
            frame["label"] = pd.Series(self.y)
            frame.to_csv(csv_path, header=True, index=False)

            self._testMatrixCreation(csv_path, "label", distributed=False)
            # The distributed attempt on the single file is expected to fail.
            with self.assertRaises(ValueError):
                self._testMatrixCreation(csv_path, "label", distributed=True)

    def testFromMultiCSVString(self):
        """Load two CSV files, both centralized and distributed."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            csv_paths = [
                os.path.join(tmp_dir, "data_1.csv"),
                os.path.join(tmp_dir, "data_2.csv"),
            ]

            frame = pd.DataFrame(self.x, columns=list("abcd"))
            frame["label"] = pd.Series(self.y)

            # Split the rows in half, one file per half.
            half = len(frame) // 2
            frame[:half].to_csv(csv_paths[0], header=True, index=False)
            frame[half:].to_csv(csv_paths[1], header=True, index=False)

            for distributed in (False, True):
                self._testMatrixCreation(
                    list(csv_paths), "label", distributed=distributed
                )

    def testFromParquetStringMultiLabel(self):
        """Load one Parquet file that carries four label columns."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_path = os.path.join(tmp_dir, "data.parquet")

            frame = pd.DataFrame(self.x, columns=list("abcd"))
            labels = [f"label_{label}" for label in range(4)]
            frame[labels] = self.multi_y
            frame.to_parquet(parquet_path)

            # Exercise both centralized and distributed loading.
            for distributed in (False, True):
                self._testMatrixCreation(
                    parquet_path, labels, multi_label=True, distributed=distributed
                )

    def testFromParquetString(self):
        """Load one Parquet file, both centralized and distributed."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_path = os.path.join(tmp_dir, "data.parquet")

            frame = pd.DataFrame(self.x, columns=list("abcd"))
            frame["label"] = pd.Series(self.y)
            frame.to_parquet(parquet_path)

            for distributed in (False, True):
                self._testMatrixCreation(
                    parquet_path, "label", distributed=distributed
                )

    def testFromMultiParquetStringMultiLabel(self):
        """Load two Parquet files that carry four label columns."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_paths = [
                os.path.join(tmp_dir, "data_1.parquet"),
                os.path.join(tmp_dir, "data_2.parquet"),
            ]

            frame = pd.DataFrame(self.x, columns=list("abcd"))
            labels = [f"label_{label}" for label in range(4)]
            frame[labels] = self.multi_y

            # Split the rows in half, one file per half.
            half = len(frame) // 2
            frame[:half].to_parquet(parquet_paths[0])
            frame[half:].to_parquet(parquet_paths[1])

            for distributed in (False, True):
                self._testMatrixCreation(
                    list(parquet_paths),
                    labels,
                    multi_label=True,
                    distributed=distributed,
                )

    def testFromMultiParquetString(self):
        """Create a matrix from two parquet files that each hold half the rows."""
        frame = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
        frame["label"] = pd.Series(self.y)
        half = len(frame) // 2

        with tempfile.TemporaryDirectory() as tmp_dir:
            first_parquet = os.path.join(tmp_dir, "data_1.parquet")
            second_parquet = os.path.join(tmp_dir, "data_2.parquet")

            frame[0:half].to_parquet(first_parquet)
            frame[half:].to_parquet(second_parquet)

            for distributed in (False, True):
                self._testMatrixCreation(
                    [first_parquet, second_parquet],
                    "label",
                    distributed=distributed,
                )

    def testDetectDistributed(self):
        """Check the ``distributed`` flag RayDMatrix reports for several inputs.

        Covered inputs: one parquet path, one CSV path, a list of three parquet
        paths, a list of three CSV paths and, when ``ray_data`` is truthy, a
        dataset read from the parquet file.
        """
        frame = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
        frame["label"] = pd.Series(self.y)

        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_path = os.path.join(tmp_dir, "file.parquet")
            csv_path = os.path.join(tmp_dir, "file.csv")

            frame.to_parquet(parquet_path)
            frame.to_csv(csv_path)

            self.assertTrue(RayDMatrix(parquet_path, lazy=True).distributed)

            # Single CSV files should not be distributed
            self.assertFalse(RayDMatrix(csv_path, lazy=True).distributed)

            # Lists of files are expected to be distributed for both formats.
            for path in (parquet_path, csv_path):
                self.assertTrue(RayDMatrix([path] * 3, lazy=True).distributed)

            if ray_data:
                dataset = ray_data.read_parquet(parquet_path)
                self.assertTrue(RayDMatrix(dataset).distributed)

    def testTooManyActorsDistributed(self):
        """Expect a RuntimeError when four actors are requested for one file.

        The error may come from constructing the matrix or from the explicit
        shard check; both happen inside the ``assertRaises`` block.
        """
        requested_actors = 4
        with self.assertRaises(RuntimeError):
            matrix = RayDMatrix(
                ["foo.csv"], num_actors=requested_actors, distributed=True
            )
            matrix.assert_enough_shards_for_actors(requested_actors)

    def testTooManyActorsCentral(self):
        """Expect a RuntimeError when 34 actors are requested centrally."""
        frame = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])

        # Callable form of assertRaises: only the constructor call is guarded.
        self.assertRaises(
            RuntimeError, RayDMatrix, frame, num_actors=34, distributed=False
        )

    def testBatchShardingAllActorsGetIndices(self):
        """Check that batch sharding yields a truthy result for ranks 0-15."""
        count = 16
        for rank in range(count):
            indices = _get_sharding_indices(RayShardingMode.BATCH, rank, count, 100)
            self.assertTrue(indices)

    def testLegacyParams(self):
        """Test if all params can be set regardless of xgb version"""
        in_x = self.x
        in_y = self.y
        weight = np.array([1] * len(in_y))
        qid = np.array([0] + [1] * len(in_y - 1))
        base_margin = np.array([1] * len(in_y))
        label_lower_bound = np.array([0.1] * len(in_y))
        label_upper_bound = np.array([1] * len(in_y))
        self._testMatrixCreation(
            in_x,
            in_y,
            weight=weight,
            base_margin=base_margin,
            label_lower_bound=label_lower_bound,
            label_upper_bound=label_upper_bound,
        )
        self._testMatrixCreation(
            in_x,
            in_y,
            qid=qid,
            base_margin=base_margin,
            label_lower_bound=label_lower_bound,
            label_upper_bound=label_upper_bound,
        )

    # NOTE(review): the skip condition compares version strings, i.e.
    # lexicographically - confirm this is acceptable for the tested versions.
    @unittest.skipIf(
        xgb.__version__ < "1.3.0", f"not supported in xgb version {xgb.__version__}"
    )
    def testFeatureWeightsParam(self):
        """Test the feature_weights parameter for xgb version >= 1.3.0"""
        # NOTE(review): one weight per label (len of self.y), not per column of
        # self.x - confirm this length is what the check is meant to exercise.
        weights = np.arange(len(self.y))
        self._testMatrixCreation(self.x, self.y, feature_weights=weights)

    @unittest.skipIf(
        "qid" not in inspect.signature(xgb.DMatrix).parameters,
        f"not supported in xgb version {xgb.__version__}",
    )
    def testQidSortedBehaviorXGBoost(self):
        """Test that data with unsorted qid is sorted in RayDMatrix"""
        from xgboost import DMatrix

        features, labels = self.x, self.y
        unsorted_qid = np.array([1, 2] * 16)

        # Plain xgboost is expected to reject the unsorted query ids ...
        with self.assertRaises(ValueError):
            DMatrix(data=features, label=labels, qid=unsorted_qid)
        # ... and to accept the same ids once sorted (no exception).
        DMatrix(data=features, label=labels, qid=np.sort(unsorted_qid))

        # test RayDMatrix handles sorting automatically
        ray_matrix = RayDMatrix(features, labels, qid=unsorted_qid)
        shard_kwargs = ray_matrix.get_data(rank=0, num_actors=1)
        DMatrix(**shard_kwargs)

    @unittest.skipIf(
        "qid" not in inspect.signature(xgb.DMatrix).parameters,
        f"not supported in xgb version {xgb.__version__}",
    )
    def testQidSortedParquet(self):
        """Build an xgboost DMatrix from two parquet files with unsorted qids.

        Each file stores the features, a ``label`` column and a ``group``
        column of alternating query ids; the data returned by RayDMatrix for
        rank 0 must be accepted by ``xgboost.DMatrix``.
        """
        from xgboost import DMatrix

        qid_by_file = (
            ("file1.parquet", np.array([2, 4] * 16)),
            ("file2.parquet", np.array([1, 3] * 16)),
        )

        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_paths = []
            for file_name, unsorted_qid in qid_by_file:
                frame = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
                frame["label"] = pd.Series(self.y)
                frame["group"] = pd.Series(unsorted_qid)

                target = os.path.join(tmp_dir, file_name)
                frame.to_parquet(target)
                parquet_paths.append(target)

            mat = RayDMatrix(
                parquet_paths,
                columns=["a", "b", "c", "d", "label", "group"],
                label="label",
                qid="group",
            )
            DMatrix(**mat.get_data(rank=0, num_actors=1))


if __name__ == "__main__":
    # Running the module directly hands this file to pytest in verbose mode
    # and uses pytest's return value as the process exit status.
    import sys

    import pytest

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)


================================================
FILE: xgboost_ray/tests/test_sklearn.py
================================================
"""Copied almost verbatim from https://github.com/dmlc/xgboost/blob/a5c852660b1056204aa2e0cbfcd5b4ecfbf31adf/tests/python/test_with_sklearn.py
in order to ensure 1:1 coverage, with minimal modifications.
Some tests were disabled due to not being applicable for a
distributed setting."""  # noqa: E501

# Copyright 2021 by XGBoost Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File based on:
# https://github.com/dmlc/xgboost/blob/a5c852660b1056204aa2e0cbfcd5b4ecfbf31adf/tests/python/test_with_sklearn.py

# License:
# https://github.com/dmlc/xgboost/blob/a5c852660b1056204aa2e0cbfcd5b4ecfbf31adf/LICENSE

import json
import os
import shutil

# import io
# from contextlib import redirect_stdout, redirect_stderr
import tempfile
import unittest

# import collections
# import importlib.util
import numpy as np
import ray
import xgboost as xgb
from packaging.version import Version

from xgboost_ray.main import XGBOOST_VERSION, RayDMatrix, RayParams, predict, train
from xgboost_ray.matrix import RayShardingMode
from xgboost_ray.sklearn import (
    RayXGBClassifier,
    RayXGBRanker,
    RayXGBRegressor,
    RayXGBRFClassifier,
    RayXGBRFRegressor,
)


def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the quotient from becoming ``nan``) for large inputs.

    Args:
        x: Scalar or array-like of numbers.

    Returns:
        Values of the same shape as ``x`` that sum to 1; an empty input
        yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max rejects empty input; nothing to normalise anyway.
        return np.exp(x)
    e = np.exp(x - np.max(x))
    return e / np.sum(e)


def softprob_obj(classes):
    """Build a custom softprob objective for ``classes`` classes.

    Args:
        classes: Number of classes; valid labels are ``0 .. classes - 1``.

    Returns:
        A callable ``objective(labels, predt)`` that returns ``(grad, hess)``,
        each reshaped to ``(rows * classes, 1)`` where ``rows`` is
        ``labels.shape[0]``.

    Raises:
        AssertionError: From the returned callable when a label lies outside
            ``[0, classes)``.
    """

    def objective(labels, predt):
        rows = labels.shape[0]
        # Validate once up front. The former per-element check used ``or``
        # (``target >= 0 or target <= classes``) and therefore never failed.
        assert np.all(
            (labels >= 0) & (labels < classes)
        ), "labels must be in [0, classes)"

        grad = np.zeros((rows, classes), dtype=float)
        hess = np.zeros((rows, classes), dtype=float)
        eps = 1e-6
        for r in range(predt.shape[0]):
            target = labels[r]
            p = softmax(predt[r, :])
            for c in range(predt.shape[1]):
                g = p[c] - 1.0 if c == target else p[c]
                # Keep the hessian strictly positive.
                h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
                grad[r, c] = g
                hess[r, c] = h

        grad = grad.reshape((rows * classes, 1))
        hess = hess.reshape((rows * classes, 1))
        return grad, hess

    return objective


def get_basescore(model: xgb.XGBModel) -> float:
    """Get base score from an XGBoost sklearn estimator.

    The value is read from the booster's saved JSON config under
    ``learner -> learner_model_param -> base_score`` and converted to float.
    """
    config = json.loads(model.get_booster().save_config())
    learner_model_param = config["learner"]["learner_model_param"]
    return float(learner_model_param["base_score"])


class TemporaryDirectory(object):
    """Context manager for tempfile.mkdtemp()

    Entering creates a fresh directory and returns its path; leaving removes
    the directory tree again.
    """

    def __enter__(self):
        created = tempfile.mkdtemp()
        # Remember the path so that __exit__ knows what to delete.
        self.name = created
        return created

    def __exit__(self, exc_type, exc_value, traceback):
        shutil.rmtree(self.name)


class XGBoostRaySklearnTest(unittest.TestCase):
    def setUp(self):
        """Store a fixed seed and a ``RandomState`` built from it."""
        seed = 1994
        self.seed = seed
        self.rng = np.random.RandomState(seed)

    def tearDown(self) -> None:
        """Shut Ray down if it is still initialized."""
        if not ray.is_initialized():
            return
        ray.shutdown()

    def _init_ray(self):
        """Initialize Ray with four CPUs unless it is already initialized."""
        if ray.is_initialized():
            return
        ray.init(num_cpus=4)

    def run_binary_classification(self, cls, ray_dmatrix_params=None):
        """Fit ``cls`` on the two-class digits data and check the error rate.

        Args:
            cls: Estimator class; instantiated with ``random_state=42``.
            ray_dmatrix_params: Passed as ``ray_dmatrix_params`` to both
                ``fit`` and ``predict``.

        For each of the two shuffled folds the share of misclassified test
        rows must stay below 0.1.
        """
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        digits = load_digits(n_class=2)
        X, y = digits["data"], digits["target"]
        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)

        for train_idx, test_idx in folds.split(X, y):
            estimator = cls(random_state=42)
            fitted = estimator.fit(
                X[train_idx],
                y[train_idx],
                eval_metric=["auc", "logloss"],
                ray_dmatrix_params=ray_dmatrix_params,
            )
            preds = fitted.predict(X[test_idx], ray_dmatrix_params=ray_dmatrix_params)
            labels = y[test_idx]

            wrong = 0
            for pos in range(len(preds)):
                if int(preds[pos] > 0.5) != labels[pos]:
                    wrong += 1
            err = wrong / float(len(preds))
            assert err < 0.1

    def test_binary_classification(self):
        """Run the binary classification check with RayXGBClassifier."""
        estimator_cls = RayXGBClassifier
        self.run_binary_classification(estimator_cls)

    def test_binary_classification_dmatrix_params(self):
        """Run the binary classification check with batch sharding params."""
        dmatrix_params = {"sharding": RayShardingMode.BATCH}
        self.run_binary_classification(
            RayXGBClassifier, ray_dmatrix_params=dmatrix_params
        )

    # ray: added for legacy CI test
    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_binary_rf_classification(self):
        """Run the binary classification check with RayXGBRFClassifier."""
        estimator_cls = RayXGBRFClassifier
        self.run_binary_classification(estimator_cls)

    def test_multiclass_classification(self):
        """Train RayXGBClassifier on iris and check predictions and shapes.

        Per fold, class predictions and margin predictions must misclassify
        fewer than 40% of the test rows. Afterwards ``predict_proba`` is
        checked for shape with the default and a custom softprob objective.
        """
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import KFold

        def check_pred(preds, labels, output_margin):
            # Margin output has one score per class, so take the arg max.
            if output_margin:
                predicted = [row.argmax() for row in preds]
            else:
                predicted = preds
            wrong = sum(
                1 for guess, label in zip(predicted, labels) if guess != label
            )
            err = wrong / float(len(preds))
            assert err < 0.4

        iris = load_iris()
        X, y = iris["data"], iris["target"]
        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_idx, test_idx in folds.split(X, y):
            xgb_model = RayXGBClassifier().fit(X[train_idx], y[train_idx])
            if hasattr(xgb_model.get_booster(), "num_boosted_rounds"):
                boosted = xgb_model.get_booster().num_boosted_rounds()
                assert boosted == xgb_model.get_num_boosting_rounds()

            preds = xgb_model.predict(X[test_idx])
            # test other params in XGBClassifier().fit
            preds_margin = xgb_model.predict(X[test_idx], output_margin=True)
            preds_plain = xgb_model.predict(X[test_idx], output_margin=False)
            labels = y[test_idx]

            for predicted, is_margin in (
                (preds, False),
                (preds_margin, True),
                (preds_plain, False),
            ):
                check_pred(predicted, labels, output_margin=is_margin)

        def assert_proba_shape(model):
            proba = model.predict_proba(X)
            assert proba.shape[0] == X.shape[0]
            assert proba.shape[1] == model.n_classes_

        cls = RayXGBClassifier(n_estimators=4).fit(X, y)
        assert cls.n_classes_ == 3
        assert_proba_shape(cls)

        # custom objective, the default is multi:softprob
        # so no transformation is required.
        cls = RayXGBClassifier(n_estimators=4, objective=softprob_obj(3)).fit(X, y)
        assert_proba_shape(cls)

    def test_stacking_regression(self):
        """Fit and score a StackingRegressor that contains RayXGBRegressor.

        Only successful execution is checked; the score is not asserted.
        """
        self._init_ray()

        from sklearn.datasets import load_diabetes
        from sklearn.ensemble import RandomForestRegressor, StackingRegressor
        from sklearn.linear_model import RidgeCV
        from sklearn.model_selection import train_test_split

        X, y = load_diabetes(return_X_y=True)
        final_estimator = RandomForestRegressor(n_estimators=10, random_state=42)
        reg = StackingRegressor(
            estimators=[
                ("gbm", RayXGBRegressor(objective="reg:squarederror")),
                ("lr", RidgeCV()),
            ],
            final_estimator=final_estimator,
        )

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        fitted = reg.fit(X_train, y_train)
        fitted.score(X_test, y_test)

    def test_stacking_classification(self):
        """Fit and score a StackingClassifier that contains RayXGBClassifier.

        Only successful execution is checked; the score is not asserted.
        """
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.ensemble import StackingClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import StandardScaler
        from sklearn.svm import LinearSVC

        X, y = load_iris(return_X_y=True)
        scaled_svc = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
        clf = StackingClassifier(
            estimators=[("gbm", RayXGBClassifier()), ("svr", scaled_svc)],
            final_estimator=LogisticRegression(),
        )

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        fitted = clf.fit(X_train, y_train)
        fitted.score(X_test, y_test)

    # exact tree method doesn't support distributed training
    # def test_feature_importances_weight(self)

    # def test_feature_importances_gain(self)

    def test_select_feature(self):
        """Use a fitted RayXGBClassifier in SelectFromModel with one feature."""
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.feature_selection import SelectFromModel

        digits = load_digits(n_class=2)
        X, y = digits["data"], digits["target"]

        cls = RayXGBClassifier()
        cls.fit(X, y)

        # prefit=True: the selector relies on the estimator fitted above.
        selector = SelectFromModel(cls, prefit=True, max_features=1)
        n_selected = selector.transform(X).shape[1]
        assert n_selected == 1

    def test_num_parallel_tree(self):
        """Check dump sizes and the stored ``num_parallel_tree`` setting.

        A regressor with 4 estimators and 4 parallel trees must dump 16
        trees. Unless the xgboost version is 0.90, a random forest regressor
        with 4 estimators must dump 4 trees and report ``num_parallel_tree``
        as 4 in its saved config.
        """
        self._init_ray()

        from sklearn.datasets import fetch_california_housing

        ds = fetch_california_housing()
        features, target = ds["data"], ds["target"]

        reg = RayXGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method="hist")
        bst = reg.fit(X=features, y=target)
        assert len(bst.get_booster().get_dump(dump_format="json")) == 16

        if XGBOOST_VERSION == Version("0.90"):
            return

        reg = RayXGBRFRegressor(n_estimators=4)
        bst = reg.fit(X=features, y=target)
        assert len(bst.get_booster().get_dump(dump_format="json")) == 4

        # The config section that holds the setting depends on the version.
        if XGBOOST_VERSION >= Version("1.6.0"):
            section = "gbtree_model_param"
        else:
            section = "gbtree_train_param"
        config = json.loads(bst.get_booster().save_config())
        stored = config["learner"]["gradient_booster"][section]["num_parallel_tree"]
        assert int(stored) == 4

    def test_california_housing_regression(self):
        """Train RayXGBRegressor per fold and bound the mean squared error.

        Default predictions must stay below 25; predictions made with an
        explicit ``output_margin`` (True and False) must stay below 350.
        """
        self._init_ray()

        from sklearn.datasets import fetch_california_housing
        from sklearn.metrics import mean_squared_error
        from sklearn.model_selection import KFold

        ds = fetch_california_housing()
        X, y = ds["data"], ds["target"]
        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_idx, test_idx in folds.split(X, y):
            xgb_model = RayXGBRegressor().fit(X[train_idx], y[train_idx])

            preds = xgb_model.predict(X[test_idx])
            # test other params in XGBRegressor().fit
            preds_margin = xgb_model.predict(X[test_idx], output_margin=True)
            preds_plain = xgb_model.predict(X[test_idx], output_margin=False)
            labels = y[test_idx]

            assert mean_squared_error(preds, labels) < 25
            for explicit_preds in (preds_margin, preds_plain):
                assert mean_squared_error(explicit_preds, labels) < 350

    # NOTE(review): skipIf decorates a helper here, not a ``test_*`` method -
    # confirm that skipping through the helper is the intended behaviour.
    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def run_california_housing_rf_regression(self, tree_method):
        """Train RayXGBRFRegressor per fold and require an MSE below 35.

        Args:
            tree_method: Passed to the regressor as ``tree_method``.
        """
        from sklearn.datasets import fetch_california_housing
        from sklearn.metrics import mean_squared_error
        from sklearn.model_selection import KFold

        X, y = fetch_california_housing(return_X_y=True)
        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_idx, test_idx in folds.split(X, y):
            regressor = RayXGBRFRegressor(random_state=42, tree_method=tree_method)
            xgb_model = regressor.fit(X[train_idx], y[train_idx])

            fold_mse = mean_squared_error(xgb_model.predict(X[test_idx]), y[test_idx])
            assert fold_mse < 35

    def test_california_housing_rf_regression(self):
        """Run the random forest regression check with the "hist" method."""
        self._init_ray()

        tree_method = "hist"
        self.run_california_housing_rf_regression(tree_method)

    def test_parameter_tuning(self):
        """Grid-search RayXGBRegressor and check the winning configuration."""
        self._init_ray()

        from sklearn.datasets import fetch_california_housing
        from sklearn.model_selection import GridSearchCV

        ds = fetch_california_housing()
        X, y = ds["data"], ds["target"]

        param_grid = {"max_depth": [2, 4, 6], "n_estimators": [50, 100, 200]}
        clf = GridSearchCV(
            RayXGBRegressor(learning_rate=0.1),
            param_grid,
            cv=3,
            verbose=1,
        )
        clf.fit(X, y)

        assert clf.best_score_ < 0.7
        expected_best = {"n_estimators": 200, "max_depth": 4}
        assert clf.best_params_ == expected_best

    def test_regression_with_custom_objective(self):
        """Train RayXGBRegressor with callable objectives.

        A least-squares objective must reach a mean squared error below 25 on
        every fold, and an objective that raises must make ``fit`` fail with
        ``RuntimeError``, which shows the callable is actually invoked.
        """
        self._init_ray()

        from sklearn.datasets import fetch_california_housing
        from sklearn.metrics import mean_squared_error
        from sklearn.model_selection import KFold

        def objective_ls(y_true, y_pred):
            # Gradient and hessian of 0.5 * (y_pred - y_true) ** 2.
            grad = y_pred - y_true
            hess = np.ones(len(y_true))
            return grad, hess

        ds = fetch_california_housing()
        y = ds["target"]
        X = ds["data"]
        kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in kf.split(X, y):
            xgb_model = RayXGBRegressor(objective=objective_ls).fit(
                X[train_index], y[train_index]
            )
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            # Checked per fold: the assertion used to sit after the loop, so
            # only the last fold's predictions were ever verified.
            assert mean_squared_error(preds, labels) < 25

        # Test that the custom objective function is actually used
        class XGBCustomObjectiveException(Exception):
            pass

        def dummy_objective(y_true, y_pred):
            raise XGBCustomObjectiveException()

        xgb_model = RayXGBRegressor(objective=dummy_objective)
        # TODO figure out how to assertRaises XGBCustomObjectiveException
        with self.assertRaises(RuntimeError):
            xgb_model.fit(X, y)

    def test_classification_with_custom_objective(self):
        """Train RayXGBClassifier with callable objectives.

        A logistic objective must misclassify fewer than 10% of the test rows
        in each fold, and an objective that raises must make ``fit`` fail
        with ``RuntimeError``.
        """
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        def logregobj(y_true, y_pred):
            # Map raw scores to probabilities before taking the derivatives.
            prob = 1.0 / (1.0 + np.exp(-y_pred))
            return prob - y_true, prob * (1.0 - prob)

        digits = load_digits(n_class=2)
        X, y = digits["data"], digits["target"]
        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_idx, test_idx in folds.split(X, y):
            xgb_model = RayXGBClassifier(objective=logregobj)
            xgb_model.fit(X[train_idx], y[train_idx])
            preds = xgb_model.predict(X[test_idx])
            labels = y[test_idx]

            wrong = 0
            for pos in range(len(preds)):
                if int(preds[pos] > 0.5) != labels[pos]:
                    wrong += 1
            err = wrong / float(len(preds))
            assert err < 0.1

        # Test that the custom objective function is actually used
        class XGBCustomObjectiveException(Exception):
            pass

        def dummy_objective(y_true, y_preds):
            raise XGBCustomObjectiveException()

        failing_model = RayXGBClassifier(objective=dummy_objective)
        # TODO figure out how to assertRaises XGBCustomObjectiveException
        with self.assertRaises(RuntimeError):
            failing_model.fit(X, y)

        # cls = RayXGBClassifier(use_label_encoder=False)
        # cls.fit(X, y)

        # is_called = [False]

        # def wrapped(y, p):
        #     is_called[0] = True
        #     return logregobj(y, p)

        # cls.set_params(objective=wrapped)
        # cls.predict(X)  # no throw
        # cls.fit(X, y)

        # assert is_called[0]

    def test_sklearn_api(self):
        """Fit a gbtree RayXGBClassifier on iris; error must stay below 0.2."""
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split

        iris = load_iris()
        tr_d, te_d, tr_l, te_l = train_test_split(
            iris.data, iris.target, train_size=120, test_size=0.2
        )

        classifier = RayXGBClassifier(
            booster="gbtree", n_estimators=10, random_state=self.seed
        )
        classifier.fit(tr_d, tr_l)

        preds = classifier.predict(te_d)
        wrong = sum(1 for pred, label in zip(preds, te_l) if pred != label)
        err = wrong * 1.0 / len(te_l)
        assert err < 0.2

    def test_sklearn_api_gblinear(self):
        """Fit a gblinear RayXGBClassifier on iris; error must stay below 0.5."""
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split

        iris = load_iris()
        tr_d, te_d, tr_l, te_l = train_test_split(
            iris.data, iris.target, train_size=120
        )

        classifier = RayXGBClassifier(
            booster="gblinear", n_estimators=100, random_state=self.seed
        )
        classifier.fit(tr_d, tr_l)

        preds = classifier.predict(te_d)
        wrong = sum(1 for pred, label in zip(preds, te_l) if pred != label)
        err = wrong * 1.0 / len(te_l)
        assert err < 0.5

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_sklearn_random_state(self):
        """Check how ``random_state`` shows up in ``get_xgb_params``.

        Integer seeds must be reported unchanged; a ``RandomState`` instance
        must be reported as an ``int``.
        """
        self._init_ray()

        for seed in (402, 401):
            clf = RayXGBClassifier(random_state=seed)
            assert clf.get_xgb_params()["random_state"] == seed

        generator = np.random.RandomState(seed=403)
        clf = RayXGBClassifier(random_state=generator)
        assert isinstance(clf.get_xgb_params()["random_state"], int)

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_sklearn_n_jobs(self):
        """Check that ``n_jobs`` is reported unchanged by ``get_xgb_params``."""
        self._init_ray()

        for jobs in (1, 2):
            clf = RayXGBClassifier(n_jobs=jobs)
            assert clf.get_xgb_params()["n_jobs"] == jobs

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.3.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_parameters_access(self):
        """Check parameter access through ``get_params`` and ``set_params``.

        Keyword parameters must be readable via ``get_params``. The booster
        config must report ``nthread`` as set at construction (4) and, after
        ``set_params(nthread=16)``, as 16 both before and after ``predict``.
        """
        self._init_ray()

        from sklearn import datasets

        def configured_nthread(model):
            config = json.loads(model.get_booster().save_config())
            return int(config["learner"]["generic_param"]["nthread"])

        params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
        clf = RayXGBClassifier(n_estimators=1000, **params)
        expected = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_estimators": 1000}
        for key, value in expected.items():
            assert clf.get_params()[key] == value

        clf = RayXGBClassifier(n_estimators=1, nthread=4)
        X, y = datasets.load_iris(return_X_y=True)
        clf.fit(X, y)
        assert configured_nthread(clf) == 4

        clf.set_params(nthread=16)
        assert configured_nthread(clf) == 16

        clf.predict(X)
        assert configured_nthread(clf) == 16

    def test_kwargs_error(self):
        """Expect a TypeError when ``n_jobs`` is passed twice."""
        self._init_ray()

        duplicate_kwargs = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
        with self.assertRaises(TypeError):
            # n_jobs arrives both explicitly and through the unpacked dict.
            instance = RayXGBClassifier(n_jobs=1000, **duplicate_kwargs)
            assert isinstance(instance, RayXGBClassifier)

    def test_kwargs_grid_search(self):
        """Grid-search over ``max_leaves`` and expect distinct mean scores."""
        self._init_ray()

        from sklearn import datasets
        from sklearn.model_selection import GridSearchCV

        clf = RayXGBClassifier(n_estimators=1, learning_rate=1.0, tree_method="hist")
        assert clf.get_params()["tree_method"] == "hist"

        # 'max_leaves' is not a default argument of XGBClassifier
        # Check we can still do grid search over this parameter
        grid_cv = GridSearchCV(clf, {"max_leaves": range(2, 5)}, cv=5)
        iris = datasets.load_iris()
        grid_cv.fit(iris.data, iris.target)

        # Expect unique results for each parameter value.
        # This confirms sklearn is able to successfully update the parameter.
        mean_scores = grid_cv.cv_results_["mean_test_score"]
        assert len(set(mean_scores)) == len(mean_scores)

    def test_sklearn_clone(self):
        """Clone a classifier whose ``n_jobs`` was changed after construction.

        Only successful execution of ``clone`` is checked.
        """
        self._init_ray()

        from sklearn.base import clone

        estimator = RayXGBClassifier(n_jobs=2)
        estimator.n_jobs = -1
        clone(estimator)

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_sklearn_get_default_params(self):
        """Check ``base_score`` before and after fitting on four rows.

        The parameter must be ``None`` on a fresh classifier and the fitted
        booster must report a base score of 0.5.
        """
        self._init_ray()

        from sklearn.datasets import load_digits

        digits = load_digits(n_class=2)
        X, y = digits["data"], digits["target"]

        cls = RayXGBClassifier()
        assert cls.get_params()["base_score"] is None

        cls.fit(X[:4, ...], y[:4, ...])
        np.testing.assert_equal(get_basescore(cls), 0.5)

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.1.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_validation_weights_xgbmodel(self):
        """Check that evaluation-set weights change the reported logloss.

        ``xgb.sklearn.XGBModel`` is fitted without and with weights on the
        evaluation set; the logloss of both boosting rounds must differ.
        Passing fewer weight arrays than evaluation sets must raise for both
        ``XGBModel`` and ``RayXGBClassifier``.
        """
        self._init_ray()

        from sklearn.datasets import make_hastie_10_2

        # prepare training and test data
        X, y = make_hastie_10_2(n_samples=2000, random_state=42)
        _, y = np.unique(y, return_inverse=True)
        X_train, X_test = X[:1600], X[1600:]
        y_train, y_test = y[:1600], y[1600:]

        # instantiate model
        clf = xgb.sklearn.XGBModel(
            objective="binary:logistic", n_estimators=2, random_state=123
        )

        # train it using instance weights only in the training set
        weights_train = np.random.choice([1, 2], len(X_train))

        def fit_and_get_logloss(**extra_fit_kwargs):
            # Fit on the training data and return the test-set logloss history.
            clf.fit(
                X_train,
                y_train,
                sample_weight=weights_train,
                eval_set=[(X_test, y_test)],
                eval_metric="logloss",
                verbose=False,
                **extra_fit_kwargs,
            )
            return clf.evals_result()["validation_0"]["logloss"]

        # evaluate logloss metric on test set *without* using weights
        logloss_without_weights = fit_and_get_logloss()

        # now use weights for the test set
        np.random.seed(0)
        weights_test = np.random.choice([1, 2], len(X_test))
        logloss_with_weights = fit_and_get_logloss(
            sample_weight_eval_set=[weights_test]
        )

        # check that the logloss in the test set is actually different
        # when using weights than when not using them
        for round_idx in (0, 1):
            assert (
                logloss_with_weights[round_idx] != logloss_without_weights[round_idx]
            )

        with self.assertRaises((ValueError, AssertionError)):
            # length of eval set and sample weight doesn't match.
            clf.fit(
                X_train,
                y_train,
                sample_weight=weights_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                sample_weight_eval_set=[weights_train],
            )

        with self.assertRaises((ValueError, AssertionError)):
            ray_clf = RayXGBClassifier()
            ray_clf.fit(
                X_train,
                y_train,
                sample_weight=weights_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                sample_weight_eval_set=[weights_train],
            )

    def test_validation_weights_xgbclassifier(self):
        """Check that evaluation-set weights change the reported logloss.

        ``RayXGBClassifier`` is fitted without and with weights on the
        evaluation set; the logloss of both boosting rounds must differ.
        """
        self._init_ray()

        from sklearn.datasets import make_hastie_10_2

        # prepare training and test data
        X, y = make_hastie_10_2(n_samples=2000, random_state=42)
        _, y = np.unique(y, return_inverse=True)
        X_train, X_test = X[:1600], X[1600:]
        y_train, y_test = y[:1600], y[1600:]

        # instantiate model
        clf = RayXGBClassifier(
            objective="binary:logistic", n_estimators=2, random_state=123
        )

        # train it using instance weights only in the training set
        weights_train = np.random.choice([1, 2], len(X_train))

        def fit_and_get_logloss(**extra_fit_kwargs):
            # Fit on the training data and return the test-set logloss history.
            clf.fit(
                X_train,
                y_train,
                sample_weight=weights_train,
                eval_set=[(X_test, y_test)],
                eval_metric="logloss",
                verbose=False,
                **extra_fit_kwargs,
            )
            return clf.evals_result()["validation_0"]["logloss"]

        # evaluate logloss metric on test set *without* using weights
        logloss_without_weights = fit_and_get_logloss()

        # now use weights for the test set
        np.random.seed(0)
        weights_test = np.random.choice([1, 2], len(X_test))
        logloss_with_weights = fit_and_get_logloss(
            sample_weight_eval_set=[weights_test]
        )

        # check that the logloss in the test set is actually different
        # when using weights than when not using them
        for round_idx in (0, 1):
            assert (
                logloss_with_weights[round_idx] != logloss_without_weights[round_idx]
            )

    def save_load_model(self, model_path):
        """Round-trip a fitted classifier through ``model_path`` and verify it.

        For each of two shuffled folds of the two-class digits data, a
        ``RayXGBClassifier`` is fitted, saved to ``model_path`` and loaded
        into a fresh estimator. The restored model must expose ``classes_``
        and a native booster, misclassify fewer than 10% of the held-out
        rows, and produce the same margins as a native ``xgb.Booster``
        loaded from the same file. Loading the file into a plain
        ``xgb.XGBModel`` is expected to raise ``TypeError``.
        """
        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        digits = load_digits(n_class=2)
        X, y = digits["data"], digits["target"]
        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_idx, test_idx in folds.split(X, y):
            X_test, y_true = X[test_idx], y[test_idx]

            # Persist a freshly fitted model, then restore it elsewhere.
            RayXGBClassifier().fit(X[train_idx], y[train_idx]).save_model(model_path)
            restored = RayXGBClassifier()
            restored.load_model(model_path)

            assert isinstance(restored.classes_, np.ndarray)
            assert isinstance(restored._Booster, xgb.Booster)

            preds = restored.predict(X_test)
            n_wrong = sum(1 for p, t in zip(preds, y_true) if int(p > 0.5) != t)
            assert n_wrong / float(len(preds)) < 0.1
            assert restored.get_booster().attr("scikit_learn") is None

            # test native booster
            margin_sklearn = restored.predict(X_test, output_margin=True)
            native = xgb.Booster(model_file=model_path)
            margin_native = native.predict(xgb.DMatrix(X_test), output_margin=True)
            assert np.allclose(margin_sklearn, margin_native)

            with self.assertRaises(TypeError):
                plain_model = xgb.XGBModel()
                plain_model.load_model(model_path)

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.3.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_save_load_model(self):
        """Exercise model save/load for two file names and a native booster.

        First runs ``save_load_model`` with and without a ``.json`` suffix.
        Then trains a native booster, saves it, and checks that both
        ``RayXGBClassifier`` and ``xgb.XGBModel`` reproduce its predictions
        after loading the file.
        """
        self._init_ray()

        for file_name in ("digits.model", "digits.model.json"):
            with TemporaryDirectory() as tempdir:
                self.save_load_model(os.path.join(tempdir, file_name))

        from sklearn.datasets import load_digits

        with TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "digits.model.json")
            digits = load_digits(n_class=2)
            X, y = digits["data"], digits["target"]

            # Reference predictions come from a natively trained booster.
            native = xgb.train(
                {"tree_method": "hist", "objective": "binary:logistic"},
                dtrain=xgb.DMatrix(X, y),
                num_boost_round=4,
            )
            expected = native.predict(xgb.DMatrix(X))
            native.save_model(model_path)

            ray_clf = RayXGBClassifier()
            ray_clf.load_model(model_path)

            proba = ray_clf.predict_proba(X)
            assert proba.shape[0] == X.shape[0]
            assert proba.shape[1] == 2  # binary

            # Column 1 holds the positive-class probability.
            positive_proba = ray_clf.predict_proba(X)[:, 1]
            assert np.allclose(expected, positive_proba)

            plain_model = xgb.XGBModel()
            plain_model.load_model(model_path)
            assert np.allclose(expected, plain_model.predict(X))

    # # forcing it to be last as it's the longest test by far
    # def test_zzzzzzz_RFECV(self):
    #     self._init_ray()

    #     from sklearn.datasets import fetch_california_housing
    #     from sklearn.datasets import load_breast_cancer
    #     from sklearn.datasets import load_iris
    #     from sklearn.feature_selection import RFECV

    #     # Regression
    #     X, y = fetch_california_housing(return_X_y=True)
    #     bst = RayXGBRegressor(
    #         booster="gblinear",
    #         learning_rate=0.1,
    #         n_estimators=10,
    #         objective="reg:squarederror",
    #         random_state=0,
    #         verbosity=0,
    #     )
    #     rfecv = RFECV(
    #         estimator=bst, step=1, cv=3, scoring="neg_mean_squared_error")
    #     rfecv.fit(X, y)

    #     # Binary classification
    #     X, y = load_breast_cancer(return_X_y=True)
    #     bst = RayXGBClassifier(
    #         booster="gblinear",
    #         learning_rate=0.1,
    #         n_estimators=10,
    #         objective="binary:logistic",
    #         random_state=0,
    #         verbosity=0,
    #         use_label_encoder=False,
    #     )
    #     rfecv = RFECV(estimator=bst, step=1, cv=3, scoring="roc_auc")
    #     rfecv.fit(X, y)

    #     # Multi-class classification
    #     X, y = load_iris(return_X_y=True)
    #     bst = RayXGBClassifier(
    #         base_score=0.4,
    #         booster="gblinear",
    #         learning_rate=0.1,
    #         n_estimators=10,
    #         objective="multi:softprob",
    #         random_state=0,
    #         reg_alpha=0.001,
    #         reg_lambda=0.01,
    #         scale_pos_weight=0.5,
    #         verbosity=0,
    #         use_label_encoder=False,
    #     )
    #     rfecv = RFECV(estimator=bst, step=1, cv=3, scoring="neg_log_loss")
    #     rfecv.fit(X, y)

    #     X[0:4, :] = np.nan  # verify scikit_learn doesn't throw with nan
    #     reg = RayXGBRegressor()
    #     rfecv = RFECV(estimator=reg)
    #     rfecv.fit(X, y)

    #     cls = RayXGBClassifier(use_label_encoder=False)
    #     rfecv = RFECV(
    #         estimator=cls, step=1, cv=3, scoring="neg_mean_squared_error")
    #     rfecv.fit(X, y)

    def test_XGBClassifier_resume(self):
        """Check that training can be resumed from a model stored on disk.

        A first classifier is trained for 8 rounds. A second classifier is
        then fitted with ``xgb_model`` pointing at (a) the file written by
        the sklearn wrapper's ``save_model`` and (b) the file written by the
        underlying booster's ``save_model``. In both cases the resumed model
        must predict differently from, and achieve a lower log loss than,
        the first model.
        """
        self._init_ray()

        from sklearn.datasets import load_breast_cancer
        from sklearn.metrics import log_loss

        with TemporaryDirectory() as tempdir:
            model1_path = os.path.join(tempdir, "test_XGBClassifier.model")
            model1_booster_path = os.path.join(tempdir, "test_XGBClassifier.booster")

            X, Y = load_breast_cancer(return_X_y=True)

            model1 = RayXGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
            model1.fit(X, Y)

            pred1 = model1.predict(X)
            # sklearn's signature is log_loss(y_true, y_pred): the ground
            # truth labels go first, the model output second.
            log_loss1 = log_loss(y_true=Y, y_pred=pred1)

            # file name of stored xgb model
            model1.save_model(model1_path)
            model2 = RayXGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
            model2.fit(X, Y, xgb_model=model1_path)

            pred2 = model2.predict(X)
            log_loss2 = log_loss(y_true=Y, y_pred=pred2)

            assert np.any(pred1 != pred2)
            assert log_loss1 > log_loss2

            # file name of 'Booster' instance Xgb model
            model1.get_booster().save_model(model1_booster_path)
            model2 = RayXGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
            model2.fit(X, Y, xgb_model=model1_booster_path)

            pred2 = model2.predict(X)
            log_loss2 = log_loss(y_true=Y, y_pred=pred2)

            assert np.any(pred1 != pred2)
            assert log_loss1 > log_loss2

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_constraint_parameters(self):
        """Check that ``interaction_constraints`` reaches the booster config.

        The location of the training parameters inside the saved config
        depends on the XGBoost version, so the lookup path is chosen from
        ``XGBOOST_VERSION`` before the value is compared.
        """
        self._init_ray()

        constraints = "[[0, 1], [2, 3, 4]]"
        reg = RayXGBRegressor(interaction_constraints=constraints)
        X = np.random.randn(10, 10)
        y = np.random.randn(10)
        reg.fit(X, y)

        config = json.loads(reg.get_booster().save_config())
        booster_config = config["learner"]["gradient_booster"]

        # Pick the version-specific section holding the training parameters.
        if XGBOOST_VERSION >= Version("2.0.0"):
            train_param = booster_config["tree_train_param"]
        elif XGBOOST_VERSION >= Version("1.6.0"):
            train_param = booster_config["updater"]["grow_histmaker"]["train_param"]
        else:
            train_param = booster_config["updater"]["prune"]["train_param"]

        assert train_param["interaction_constraints"] == constraints

    # TODO check why this is not working (output is empty, probably due to Ray)
    # def test_parameter_validation(self):
    #     self._init_ray()

    #     reg = RayXGBRegressor(foo='bar', verbosity=1)
    #     X = np.random.randn(10, 10)
    #     y = np.random.randn(10)
    #     out = io.StringIO()
    #     err = io.StringIO()
    #     with redirect_stdout(out), redirect_stderr(err):
    #         reg.fit(X, y)
    #     output = out.getvalue().strip()

    #     print(output)
    #     assert output.find('foo') != -1

    #     reg = RayXGBRegressor(n_estimators=2,
    #                           missing=3,
    #                           importance_type='gain',
    #                           verbosity=1)
    #     X = np.random.randn(10, 10)
    #     y = np.random.randn(10)
    #     out = io.StringIO()
    #     err = io.StringIO()
    #     with redirect_stdout(out), redirect_stderr(err):
    #         reg.fit(X, y)
    #     output = out.getvalue().strip()

    #     assert len(output) == 0

    # def test_deprecate_position_arg(self):
    #     self._init_ray()

    #     from sklearn.datasets import load_digits

    #     X, y = load_digits(return_X_y=True, n_class=2)
    #     w = y
    #     with self.assertWarns(FutureWarning):
    #         RayXGBRegressor(3, learning_rate=0.1)
    #     model = RayXGBRegressor(n_estimators=1)
    #     with self.assertWarns(FutureWarning):
    #         model.fit(X, y, w)

    #     with self.assertWarns(FutureWarning):
    #         RayXGBClassifier(1, use_label_encoder=False)
    #     model = RayXGBClassifier(n_estimators=1, use_label_encoder=False)
    #     with self.assertWarns(FutureWarning):
    #         model.fit(X, y, w)

    #     with self.assertWarns(FutureWarning):
    #         RayXGBRanker("rank:ndcg", learning_rate=0.1)
    #     model = RayXGBRanker(n_estimators=1)
    #     group = np.repeat(1, X.shape[0])
    #     with self.assertWarns(FutureWarning):
    #         model.fit(X, y, group)

    #     with self.assertWarns(FutureWarning):
    #         RayXGBRFRegressor(1, learning_rate=0.1)
    #     model = RayXGBRFRegressor(n_estimators=1)
    #     with self.assertWarns(FutureWarning):
    #         model.fit(X, y, w)

    #     with self.assertWarns(FutureWarning):
    #         RayXGBRFClassifier(1, use_label_encoder=True)
    #     model = RayXGBRFClassifier(n_estimators=1)
    #     with self.assertWarns(FutureWarning):
    #         model.fit(X, y, w)

    def test_pandas_input(self):
        """Fit on pandas input and wrap the model in a prefit calibrator.

        Builds a 100x6 frame of random 0/1 values whose first column is the
        target, fits a ``RayXGBClassifier`` on it, and checks that
        ``CalibratedClassifierCV`` keeps the classifier as its underlying
        estimator and reports the classes ``[0, 1]``.
        """
        self._init_ray()

        import pandas as pd
        from sklearn.calibration import CalibratedClassifierCV

        n_rows, n_cols = 100, 6
        rng = np.random.RandomState(self.seed)
        values = rng.randint(low=0, high=2, size=n_rows * n_cols)
        values = values.reshape(n_rows, n_cols)

        # First column is the target, the remaining ones are k1..k5.
        frame = pd.DataFrame(values)
        frame.columns = ["status"] + ["k" + str(i) for i in range(1, n_cols)]

        target = frame["status"]
        features = frame.drop(columns=["status"])

        model = RayXGBClassifier()
        model.fit(features, target)
        calibrated = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
        calibrated.fit(features, target)

        wrapped = calibrated.calibrated_classifiers_[0]
        try:
            estimator = wrapped.base_estimator
        except AttributeError:
            # sklearn>=1.2
            estimator = wrapped.estimator
        assert isinstance(estimator, RayXGBClassifier)

        found_classes = np.array(calibrated.classes_)
        self.assertTrue(np.allclose(found_classes, np.array([0, 1])))

    # def run_feature_weights(self, X, y, fw, model=RayXGBRegressor):
    #     with TemporaryDirectory() as tmpdir:
    #         colsample_bynode = 0.5
    #         reg = model(
    #           tree_method='hist', colsample_bynode=colsample_bynode
    #         )

    #         reg.fit(X, y, feature_weights=fw)
    #         model_path = os.path.join(tmpdir, 'model.json')
    #         reg.save_model(model_path)
    #         with open(model_path) as fd:
    #             model = json.load(fd)

    #         parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
    #                                    'json_parser.py')
    #         spec = importlib.util.spec_from_file_location(
    #             "JsonParser", parser_path)
    #         foo = importlib.util.module_from_spec(spec)
    #         spec.loader.exec_module(foo)
    #         model = foo.Model(model)
    #         splits = {}
    #         total_nodes = 0
    #         for tree in model.trees:
    #             n_nodes = len(tree.nodes)
    #             total_nodes += n_nodes
    #             for n in range(n_nodes):
    #                 if tree.is_leaf(n):
    #                     continue
    #                 if splits.get(tree.split_index(n), None) is None:
    #                     splits[tree.split_index(n)] = 1
    #                 else:
    #                     splits[tree.split_index(n)] += 1

    #         od = collections.OrderedDict(sorted(splits.items()))
    #         tuples = [(k, v) for k, v in od.items()]
    #         k, v = list(zip(*tuples))
    #         w = np.polyfit(k, v, deg=1)
    #         return w

    # def test_feature_weights(self):
    #     kRows = 512
    #     kCols = 64
    #     X = self.rng.randn(kRows, kCols)
    #     y = self.rng.randn(kRows)

    #     fw = np.ones(shape=(kCols, ))
    #     for i in range(kCols):
    #         fw[i] *= float(i)
    #     poly_increasing = self.run_feature_weights(X, y, fw, RayXGBRegressor)

    #     fw = np.ones(shape=(kCols, ))
    #     for i in range(kCols):
    #         fw[i] *= float(kCols - i)
    #     poly_decreasing = self.run_feature_weights(X, y, fw, RayXGBRegressor)

    #     # Approximated test; this depends on the implementation of the random
    #     # number generator in the standard library.
    #     assert poly_increasing[0] > 0.08
    #     assert poly_decreasing[0] < -0.08

    def run_boost_from_prediction(self, tree_method):
        """Check that boosting from a base margin equals one longer training.

        Trains 4 rounds, feeds the resulting margins as ``base_margin`` into
        a second 4-round model, and asserts that its predictions equal those
        of a single model trained for 8 rounds.

        Args:
            tree_method: Value passed as ``tree_method`` to every classifier.
        """
        from sklearn.datasets import load_breast_cancer

        def make_classifier(n_estimators):
            # All three models differ only in the number of boosting rounds.
            return RayXGBClassifier(
                learning_rate=0.3,
                random_state=0,
                n_estimators=n_estimators,
                tree_method=tree_method,
            )

        X, y = load_breast_cancer(return_X_y=True)

        # Stage one: 4 rounds, keep the raw margins.
        first_stage = make_classifier(4)
        first_stage.fit(X=X, y=y)
        margin = first_stage.predict(X, output_margin=True)

        # Stage two: 4 more rounds starting from the stage-one margins.
        second_stage = make_classifier(4)
        second_stage.fit(X=X, y=y, base_margin=margin)
        staged_predictions = second_stage.predict(X, base_margin=margin)

        # Reference: 8 rounds in one go.
        reference = make_classifier(8)
        reference.fit(X=X, y=y)
        reference_predictions = reference.predict(X)

        assert np.all(staged_predictions == reference_predictions)

    def boost_from_prediction(self, tree_method):
        """Call ``_init_ray`` and then run the boost-from-prediction check.

        Args:
            tree_method: Forwarded unchanged to ``run_boost_from_prediction``.
        """
        self._init_ray()
        self.run_boost_from_prediction(tree_method)

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.0.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_boost_from_prediction_hist(self):
        """Run the boost-from-prediction check with the ``hist`` tree method."""
        # Go through the wrapper so that ``_init_ray`` is called first, like
        # every other Ray-backed test in this class.
        self.boost_from_prediction("hist")

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.2.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_boost_from_prediction_approx(self):
        """Run the boost-from-prediction check with the ``approx`` tree method."""
        # Go through the wrapper so that ``_init_ray`` is called first, like
        # every other Ray-backed test in this class.
        self.boost_from_prediction("approx")

    # Updater `grow_colmaker` or `exact` tree method doesn't support
    # distributed training
    def test_boost_from_prediction_exact(self):
        """Expect ``ValueError`` when the ``exact`` tree method is requested."""
        with self.assertRaises(ValueError):
            # Go through the wrapper so that ``_init_ray`` is called first,
            # like every other Ray-backed test in this class.
            self.boost_from_prediction("exact")

    @unittest.skipIf(
        XGBOOST_VERSION < Version("1.4.0"),
        f"not supported in xgb version {xgb.__version__}",
    )
    def test_estimator_type(self):
        """Check ``_estimator_type`` and that it is enforced on model load.

        A saved classifier must fail to load into a regressor with
        ``TypeError`` and load cleanly into another classifier.
        """
        self._init_ray()

        expected_types = (
            (RayXGBClassifier, "classifier"),
            (RayXGBRFClassifier, "classifier"),
            (RayXGBRegressor, "regressor"),
            (RayXGBRFRegressor, "regressor"),
            (RayXGBRanker, "ranker"),
        )
        for estimator_cls, expected in expected_types:
            assert estimator_cls._estimator_type == expected

        from sklearn.datasets import load_digits

        X, y = load_digits(n_class=2, return_X_y=True)
        fitted = RayXGBClassifier(n_estimators=2).fit(X, y)
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cls.json")
            fitted.save_model(path)

            # A classifier file must be rejected by a regressor ...
            wrong_kind = RayXGBRegressor()
            with self.assertRaises(TypeError):
                wrong_kind.load_model(path)

            # ... and accepted by another classifier.
            same_kind = RayXGBClassifier()
            same_kind.load_model(path)  # no error

    def test_ranking(self):
        """Compare ``RayXGBRanker`` against the functional train/predict API.

        Fits a ranker through the sklearn-style wrapper and through
        ``train``/``predict`` with equivalent parameters on the same random
        data, and asserts that both produce (almost) equal predictions on a
        test set.
        """
        # Start Ray the same way as the other Ray-backed tests in this class
        # instead of relying on it being initialised implicitly.
        self._init_ray()

        # generate random data
        x_train = np.random.rand(1000, 10)
        y_train = np.random.randint(5, size=1000)
        # 20 query ids, each repeated for 50 consecutive rows.
        train_qid = np.repeat(np.array([list(range(20))]), 50)

        x_valid = np.random.rand(200, 10)
        y_valid = np.random.randint(5, size=200)
        # 4 query ids, each repeated for 50 consecutive rows.
        valid_qid = np.repeat(np.array([list(range(4))]), 50)

        x_test = np.random.rand(100, 10)

        params = {
            "objective": "rank:pairwise",
            "learning_rate": 0.1,
            "gamma": 1.0,
            "min_child_weight": 0.1,
            "max_depth": 6,
            "n_estimators": 4,
            "random_state": 1,
            "n_jobs": 2,
        }
        model = RayXGBRanker(**params)
        model.fit(
            x_train,
            y_train,
            qid=train_qid,
            eval_set=[(x_valid, y_valid)],
            eval_qid=[valid_qid],
        )
        assert model.evals_result()

        pred = model.predict(x_test)

        train_data = RayDMatrix(x_train, y_train, qid=train_qid)
        valid_data = RayDMatrix(x_valid, y_valid, qid=valid_qid)
        test_data = RayDMatrix(x_test)

        # Same settings expressed with the native parameter names
        # (``eta`` instead of ``learning_rate``, rounds passed to ``train``).
        params_orig = {
            "objective": "rank:pairwise",
            "eta": 0.1,
            "gamma": 1.0,
            "min_child_weight": 0.1,
            "max_depth": 6,
            "random_state": 1,
        }
        xgb_model_orig = train(
            params_orig,
            train_data,
            num_boost_round=4,
            evals=[(valid_data, "validation")],
            ray_params=RayParams(num_actors=2, max_actor_restarts=0),
        )
        pred_orig = predict(
            xgb_model_orig,
            test_data,
            ray_params=RayParams(num_actors=2, max_actor_restarts=0),
        )

        np.testing.assert_almost_equal(pred, pred_orig)


if __name__ == "__main__":
    # When executed as a script, run this module's tests verbosely through
    # pytest and hand pytest's return value to ``sys.exit``.
    import sys

    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_sklearn_matrix.py
================================================
import unittest

import numpy as np
import ray
import xgboost as xgb
from packaging.version import Version
from sklearn.model_selection import train_test_split

from xgboost_ray.main import XGBOOST_VERSION, RayDMatrix
from xgboost_ray.sklearn import RayXGBClassifier, RayXGBRegressor

# True when the installed XGBoost version lies in [1.0.0, 1.6.0); used as a
# skip condition for the classifier tests.
has_label_encoder = Version("1.0.0") <= XGBOOST_VERSION < Version("1.6.0")


class XGBoostRaySklearnMatrixTest(unittest.TestCase):
    """Tests for passing ``RayDMatrix`` objects to the sklearn-style estimators."""

    def setUp(self):
        """Create the seed, random state and estimator parameters."""
        self.seed = 1994
        self.rng = np.random.RandomState(self.seed)
        self.params = dict(n_estimators=10)

    def tearDown(self) -> None:
        """Shut Ray down if a test started it."""
        if not ray.is_initialized():
            return
        ray.shutdown()

    def _init_ray(self):
        """Start a 4-CPU Ray instance unless one is already running."""
        if ray.is_initialized():
            return
        ray.init(num_cpus=4)

    @unittest.skipIf(
        has_label_encoder, f"not supported in xgb version {xgb.__version__}"
    )
    def testClassifierNoLabelEncoder(self, n_class=2):
        """Fit and predict a classifier on ``RayDMatrix`` input.

        Also checks the error cases: missing ``num_class``, and an
        ``eval_set`` whose kind does not match the kind of training input.

        Args:
            n_class: Number of digit classes to load and to pass as
                ``num_class``.
        """
        self._init_ray()

        from sklearn.datasets import load_digits

        X, y = load_digits(n_class=n_class, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

        dtrain = RayDMatrix(X_train, y_train)
        dtest = RayDMatrix(X_test, y_test)

        # Matrix input carries no labels the estimator could inspect, so
        # ``num_class`` has to be given explicitly.
        with self.assertRaisesRegex(Exception, "num_class"):
            RayXGBClassifier(**self.params).fit(dtrain, None)

        # Matrix training input combined with an array eval set.
        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
            RayXGBClassifier(**self.params).fit(
                dtrain, None, eval_set=[(X_test, y_test)]
            )

        # Array training input combined with a matrix eval set.
        with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"):
            RayXGBClassifier(**self.params).fit(
                X_train, y_train, eval_set=[(dtest, "eval")]
            )

        RayXGBClassifier(num_class=n_class, **self.params).fit(dtrain, None)

        fitted = RayXGBClassifier(num_class=n_class, **self.params).fit(
            dtrain, None, eval_set=[(dtest, "eval")]
        )

        fitted.predict(dtest)
        fitted.predict_proba(dtest)

    @unittest.skipIf(
        has_label_encoder, f"not supported in xgb version {xgb.__version__}"
    )
    def testClassifierMulticlassNoLabelEncoder(self):
        """Repeat the classifier test with three classes."""
        self.testClassifierNoLabelEncoder(n_class=3)

    def testRegressor(self):
        """Fit and predict a regressor on ``RayDMatrix`` input.

        Also checks that an ``eval_set`` whose kind does not match the kind
        of training input is rejected.
        """
        self._init_ray()

        from sklearn.datasets import fetch_california_housing

        X, y = fetch_california_housing(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

        dtrain = RayDMatrix(X_train, y_train)
        dtest = RayDMatrix(X_test, y_test)

        # Matrix training input combined with an array eval set.
        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
            RayXGBRegressor(**self.params).fit(
                dtrain, None, eval_set=[(X_test, y_test)]
            )

        # Array training input combined with a matrix eval set.
        with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"):
            RayXGBRegressor(**self.params).fit(
                X_train, y_train, eval_set=[(dtest, "eval")]
            )

        RayXGBRegressor(**self.params).fit(dtrain, None)

        fitted = RayXGBRegressor(**self.params).fit(
            dtrain, None, eval_set=[(dtest, "eval")]
        )

        fitted.predict(dtest)


================================================
FILE: xgboost_ray/tests/test_tune.py
================================================
import os
import shutil
import tempfile
import unittest
from unittest.mock import MagicMock, patch

import numpy as np
import ray
from ray import tune
from ray.tune import TuneError
from ray.tune.integration.xgboost import (
    TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback,
)

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.tune import TuneReportCheckpointCallback, _try_add_tune_callback


class XGBoostRayTuneTest(unittest.TestCase):
    """Integration tests for running ``xgboost_ray.train`` inside Ray Tune."""

    def setUp(self):
        """Start Ray and prepare the dataset, config and trainable factory.

        Sets ``self.params`` (Tune config), ``self.train_func`` (factory
        returning a Tune trainable) and ``self.experiment_dir`` (temporary
        directory removed again in ``tearDown``).
        """
        ray.init(num_cpus=4)
        repeat = 8  # Repeat data a couple of times for stability
        x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 2
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 3
            ]
            * repeat
        )
        y = np.array([0, 1, 2, 3] * repeat)

        self.params = {
            "xgb": {
                "booster": "gbtree",
                "nthread": 1,
                "max_depth": 2,
                "objective": "multi:softmax",
                "num_class": 4,
                "eval_metric": ["mlogloss", "merror"],
            },
            "num_boost_round": tune.choice([1, 3]),
        }

        def train_func(
            ray_params, callbacks=None, check_for_spread_strategy=False, **kwargs
        ):
            """Return a Tune trainable that trains on the fixture data.

            Args:
                ray_params: ``RayParams`` forwarded to ``train``.
                callbacks: Optional callbacks forwarded to ``train``.
                check_for_spread_strategy: If true, the trainable asserts
                    that the trial resources use the ``"SPREAD"`` strategy.
                **kwargs: Extra keyword arguments forwarded to ``train``.
            """

            def _inner_train(config):
                if check_for_spread_strategy:
                    assert (
                        ray.train.get_context().get_trial_resources().strategy
                        == "SPREAD"
                    )
                train_set = RayDMatrix(x, y)
                train(
                    config["xgb"],
                    dtrain=train_set,
                    ray_params=ray_params,
                    num_boost_round=config["num_boost_round"],
                    evals=[(train_set, "train")],
                    callbacks=callbacks,
                    **kwargs,
                )

            return _inner_train

        self.train_func = train_func
        self.experiment_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Shut Ray down and delete the temporary experiment directory."""
        ray.shutdown()
        shutil.rmtree(self.experiment_dir)

    # noinspection PyTypeChecker
    @patch.dict(os.environ, {"TUNE_RESULT_DELIM": "/"})
    def testNumIters(self):
        """Test that the number of reported tune results is correct"""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        params = self.params.copy()
        params["num_boost_round"] = tune.grid_search([1, 3])

        # TODO(justinvyu): Remove this once the xgboost integration
        # has been updated on the Ray side.
        try:
            callback = TuneReportCheckpointCallback(
                frequency=1, checkpoint_at_end=False
            )
        except TypeError:
            callback = TuneReportCheckpointCallback(frequency=1)

        analysis = tune.run(
            self.train_func(ray_params, callbacks=[callback]),
            # Use the grid-search config built above so that both round
            # counts (1 and 3) are exercised instead of a single random
            # draw from ``self.params``.
            config=params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
        )

        # One result is reported per boosting round, so the final training
        # iteration of each trial must equal its configured round count.
        self.assertSequenceEqual(
            list(analysis.results_df["training_iteration"]),
            list(analysis.results_df["config/num_boost_round"]),
        )

    def testNumItersClient(self):
        """Test ray client mode"""
        import re

        # Compare the version numerically: plain string comparison would
        # order e.g. "1.10.0" before "1.2.0" and skip the test by mistake.
        version_match = re.match(r"(\d+)\.(\d+)\.(\d+)", ray.__version__)
        if version_match and tuple(map(int, version_match.groups())) <= (1, 2, 0):
            self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")

        from ray.util.client.ray_client_helpers import ray_start_client_server

        self.assertFalse(ray.util.client.ray.is_connected())
        with ray_start_client_server():
            self.assertTrue(ray.util.client.ray.is_connected())
            self.testNumIters()

    def testPlacementOptions(self):
        """Test that ``placement_options`` reach the trial's resources."""
        ray_params = RayParams(
            cpus_per_actor=1, num_actors=1, placement_options={"strategy": "SPREAD"}
        )
        tune.run(
            self.train_func(ray_params, check_for_spread_strategy=True),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
        )

    def testElasticFails(self):
        """Test if error is thrown when using Tune with elastic training."""
        ray_params = RayParams(cpus_per_actor=1, num_actors=1, elastic_training=True)
        with self.assertRaises(TuneError):
            tune.run(
                self.train_func(ray_params),
                config=self.params,
                resources_per_trial=ray_params.get_tune_resources(),
                num_samples=1,
            )

    def testReplaceTuneCheckpoints(self):
        """Test if ray.tune.integration.xgboost callbacks are replaced"""
        # Report and checkpointing callback
        in_cp = [OrigTuneReportCheckpointCallback(metrics="met")]
        in_dict = {"callbacks": in_cp}

        # Patch the train context lookup while the callbacks are processed.
        with patch("ray.train.get_context") as mocked:
            mocked.return_value = MagicMock(return_value=True)
            _try_add_tune_callback(in_dict)

        replaced = in_dict["callbacks"][0]
        self.assertTrue(isinstance(replaced, TuneReportCheckpointCallback))

        self.assertSequenceEqual(replaced._metrics, ["met"])

    def testEndToEndCheckpointing(self):
        """Test that a run with this package's callback leaves a checkpoint."""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        analysis = tune.run(
            self.train_func(
                ray_params, callbacks=[TuneReportCheckpointCallback(frequency=1)]
            ),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
            metric="train-mlogloss",
            mode="min",
            log_to_file=True,
            local_dir=self.experiment_dir,
        )

        self.assertTrue(os.path.exists(analysis.best_checkpoint.path))

    def testEndToEndCheckpointingOrigTune(self):
        """Test that a run with Ray Tune's own callback leaves a checkpoint."""
        ray_params = RayParams(cpus_per_actor=1, num_actors=2)
        analysis = tune.run(
            self.train_func(
                ray_params, callbacks=[OrigTuneReportCheckpointCallback(frequency=1)]
            ),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
            metric="train-mlogloss",
            mode="min",
            local_dir=self.experiment_dir,
        )

        self.assertTrue(os.path.exists(analysis.best_checkpoint.path))


if __name__ == "__main__":
    # When executed as a script, run this module's tests verbosely through
    # pytest and hand pytest's return value to ``sys.exit``.
    import sys

    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)


================================================
FILE: xgboost_ray/tests/test_xgboost_api.py
================================================
import unittest
from typing import Tuple

import numpy as np
import ray
import xgboost as xgb

from xgboost_ray import RayDMatrix, RayParams, train
from xgboost_ray.compat import TrainingCallback

# The custom objective and metric helpers defined below follow the XGBoost
# documentation:
# https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html
from xgboost_ray.session import get_actor_rank, put_queue


def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    """Return ``(log1p(predt) - log1p(y)) / (predt + 1)`` element-wise.

    ``y`` is the label array obtained from ``dtrain.get_label()``.
    """
    labels = dtrain.get_label()
    log_diff = np.log1p(predt) - np.log1p(labels)
    return log_diff / (predt + 1)


def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    """Return ``(-log1p(predt) + log1p(y) + 1) / (predt + 1) ** 2`` element-wise.

    ``y`` is the label array obtained from ``dtrain.get_label()``.
    """
    labels = dtrain.get_label()
    numerator = -np.log1p(predt) + np.log1p(labels) + 1
    denominator = np.power(predt + 1, 2)
    return numerator / denominator


def squared_log(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    """Return the ``(gradient, hessian)`` pair for ``predt`` and ``dtrain``.

    Entries of ``predt`` below -1 are overwritten in place with
    ``-1 + 1e-6`` before the derivatives are computed, so ``log1p`` is
    never evaluated below -1.
    """
    too_small = predt < -1
    predt[too_small] = -1 + 1e-6
    return gradient(predt, dtrain), hessian(predt, dtrain)


def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Return the metric name ``"PyRMSLE"`` and its value.

    The value is the square root of the mean squared difference between
    ``log1p`` of the labels and ``log1p`` of the predictions. Entries of
    ``predt`` below -1 are overwritten in place with ``-1 + 1e-6`` first.
    """
    labels = dtrain.get_label()
    too_small = predt < -1
    predt[too_small] = -1 + 1e-6
    squared_log_errors = np.power(np.log1p(labels) - np.log1p(predt), 2)
    mean_error = np.sum(squared_log_errors) / len(labels)
    return "PyRMSLE", float(np.sqrt(mean_error))


class XGBoostAPITest(unittest.TestCase):
    """This test suite validates core XGBoost API functionality.

    Each test trains on a tiny, repeated four-row dataset and checks that
    training through ``xgboost_ray.train`` supports custom objectives,
    custom metrics and user callbacks.
    """

    def setUp(self):
        """Build the toy dataset, the default parameters and extra kwargs."""
        repeat = 8  # Repeat data a couple of times for stability
        self.x = np.array(
            [
                [1, 0, 0, 0],  # Feature 0 -> Label 0
                [0, 1, 0, 0],  # Feature 1 -> Label 1
                [0, 0, 1, 1],  # Feature 2+3 -> Label 0
                [0, 0, 1, 0],  # Feature 2+!3 -> Label 1
            ]
            * repeat
        )
        self.y = np.array([0, 1, 0, 1] * repeat)

        self.params = {
            "booster": "gbtree",
            "tree_method": "hist",
            "nthread": 1,
            "max_depth": 2,
            "objective": "binary:logistic",
            "seed": 1000,
        }

        # Extra keyword arguments forwarded to every ``train()`` call.
        self.kwargs = {}

    def tearDown(self) -> None:
        """Shut Ray down after each test if it is still initialized."""
        if ray.is_initialized():
            ray.shutdown()

    def _init_ray(self):
        """Start a local Ray instance with 4 CPUs unless one is running."""
        if not ray.is_initialized():
            ray.init(num_cpus=4)

    def testCustomObjectiveFunction(self):
        """Ensure that custom objective functions work.

        Runs a custom objective function with pure XGBoost and
        XGBoost on Ray and compares the prediction outputs."""
        self._init_ray()

        # The built-in objective is removed so only ``squared_log`` is used.
        params = self.params.copy()
        params.pop("objective", None)

        bst_xgb = xgb.train(params, xgb.DMatrix(self.x, self.y), obj=squared_log)

        bst_ray = train(
            params,
            RayDMatrix(self.x, self.y),
            ray_params=RayParams(num_actors=2),
            obj=squared_log,
            **self.kwargs,
        )

        x_mat = xgb.DMatrix(self.x)
        pred_y_xgb = np.round(bst_xgb.predict(x_mat))
        pred_y_ray = np.round(bst_ray.predict(x_mat))

        # Both boosters must agree with each other and with the labels.
        self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray))
        self.assertSequenceEqual(list(self.y), list(pred_y_ray))

    def testCustomMetricFunction(self):
        """Ensure that custom metric functions work.

        Runs a custom objective together with a custom evaluation metric
        (``feval``) with pure XGBoost and XGBoost on Ray, then compares the
        prediction outputs and the recorded ``PyRMSLE`` metric values."""
        self._init_ray()

        # Drop the built-in objective and default metric so that only the
        # custom objective/metric pair is in effect.
        params = self.params.copy()
        params.pop("objective", None)
        params["disable_default_eval_metric"] = 1

        dtrain_xgb = xgb.DMatrix(self.x, self.y)
        evals_result_xgb = {}
        bst_xgb = xgb.train(
            params,
            dtrain_xgb,
            obj=squared_log,
            feval=rmsle,
            evals=[(dtrain_xgb, "dtrain")],
            evals_result=evals_result_xgb,
        )

        dtrain_ray = RayDMatrix(self.x, self.y)
        evals_result_ray = {}
        bst_ray = train(
            params,
            dtrain_ray,
            ray_params=RayParams(num_actors=2),
            obj=squared_log,
            feval=rmsle,
            evals=[(dtrain_ray, "dtrain")],
            evals_result=evals_result_ray,
            **self.kwargs,
        )

        x_mat = xgb.DMatrix(self.x)
        pred_y_xgb = np.round(bst_xgb.predict(x_mat))
        pred_y_ray = np.round(bst_ray.predict(x_mat))

        self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray))
        self.assertSequenceEqual(list(self.y), list(pred_y_ray))

        # The metric histories only need to match within an absolute
        # tolerance of 0.1.
        self.assertTrue(
            np.allclose(
                evals_result_xgb["dtrain"]["PyRMSLE"],
                evals_result_ray["dtrain"]["PyRMSLE"],
                atol=0.1,
            )
        )

    def testCallbacks(self):
        """Ensure that user callbacks run on the actors and can report back.

        A callback puts ``("rank", <actor rank>)`` on the queue after every
        iteration. The test asserts that ``callback_returns`` holds two
        entries and that every item in entry 0 carries rank 0 while every
        item in entry 1 carries rank 1."""

        class _Callback(TrainingCallback):
            def after_iteration(self, model, epoch, evals_log):
                # Report the rank of the actor executing this callback.
                print(f"My rank: {get_actor_rank()}")
                put_queue(("rank", get_actor_rank()))

        callback = _Callback()

        additional_results = {}
        train(
            self.params,
            RayDMatrix(self.x, self.y),
            ray_params=RayParams(num_actors=2),
            callbacks=[callback],
            additional_results=additional_results,
            **self.kwargs,
        )

        self.assertEqual(len(additional_results["callback_returns"]), 2)
        self.assertTrue(
            all(rank == 0 for (_, rank) in additional_results["callback_returns"][0])
        )
        self.assertTrue(
            all(rank == 1 for (_, rank) in additional_results["callback_returns"][1])
        )


if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode and
    # propagates pytest's return value as the process exit status.
    import sys

    import pytest

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)


================================================
FILE: xgboost_ray/tests/utils.py
================================================
import json
import os
import tempfile
import time
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost_ray.compat import TrainingCallback
from xgboost_ray.session import get_actor_rank, put_queue


def get_num_trees(bst: xgb.Booster):
    """Return the number of dumped trees in ``bst`` integer-divided by 4.

    Every tree of the booster is dumped as JSON and parsed before counting.
    """
    parsed_trees = list(map(json.loads, bst.get_dump(dump_format="json")))
    # NOTE(review): the divisor 4 presumably matches the number of trees
    # grown per boosting round in the calling tests - confirm at call sites.
    return len(parsed_trees) // 4


def create_data(num_rows: int, num_cols: int, dtype: np.dtype = np.float32):
    """Create a DataFrame of uniformly distributed random features.

    Args:
        num_rows: Number of rows to generate.
        num_cols: Number of feature columns, named ``feature_0`` ...
            ``feature_{num_cols - 1}``.
        dtype: Data type the frame is created with.

    Returns:
        A ``pd.DataFrame`` whose values are drawn with
        ``np.random.uniform(0.0, 10.0)``.
    """
    values = np.random.uniform(0.0, 10.0, size=(num_rows, num_cols))
    column_names = [f"feature_{col_idx}" for col_idx in range(num_cols)]
    return pd.DataFrame(values, columns=column_names, dtype=dtype)


def create_labels(
    num_rows: int, num_classes: int = 2, dtype: Optional[np.dtype] = None
):
    """Create a random label series named ``"label"``.

    Args:
        num_rows: Number of labels to generate.
        num_classes: Number of classes. ``0`` requests regression labels
            drawn uniformly from ``[0, 1)``; any other value requests integer
            class labels in ``[0, num_classes)``.
        dtype: Data type of the series. When ``None``, ``np.float32`` is used
            for regression labels and ``np.int32`` for class labels.

    Returns:
        A ``pd.Series`` of length ``num_rows``.
    """
    if num_classes == 0:
        # Create regression label
        values = np.random.uniform(0, 1, size=num_rows)
        default_dtype = np.float32
    else:
        values = np.random.randint(0, num_classes, size=num_rows)
        default_dtype = np.int32

    # Compare against None explicitly: an ``np.dtype`` instance such as
    # ``np.dtype("float64")`` has length 0 and is therefore falsy, so a
    # ``dtype or default`` fallback would silently discard it.
    if dtype is None:
        dtype = default_dtype

    return pd.Series(values, dtype=dtype, name="label")


def create_parquet(
    filename: str,
    num_rows: int,
    num_features: int,
    num_classes: int = 2,
    num_partitions: int = 1,
):
    """Write a random, partitioned parquet dataset into ``filename``.

    Each partition receives ``num_rows // num_partitions`` rows (any
    remainder is not generated), a ``labels`` column and a ``partition``
    column holding the partition index, which is used as partition column.

    Args:
        filename: Directory the dataset is written to; created if missing.
        num_rows: Total number of rows requested.
        num_features: Number of feature columns per row.
        num_classes: Passed to ``create_labels``.
        num_partitions: Number of partitions to write.
    """

    def _partition_file_name(key):
        return f"part_{key[0]}.parquet"

    rows_per_partition = num_rows // num_partitions
    for partition_idx in range(num_partitions):
        print(f"Creating partition {partition_idx}")
        frame = create_data(rows_per_partition, num_features)
        frame["labels"] = create_labels(rows_per_partition, num_classes)
        frame["partition"] = pd.Series(
            np.full(rows_per_partition, partition_idx), dtype=np.int32
        )

        os.makedirs(filename, 0o755, exist_ok=True)
        # NOTE(review): ``partition_filename_cb`` is forwarded to the pyarrow
        # engine; verify it is still accepted by the pinned pyarrow version.
        frame.to_parquet(
            filename,
            partition_cols=["partition"],
            engine="pyarrow",
            partition_filename_cb=_partition_file_name,
        )


def create_parquet_in_tempdir(
    filename: str,
    num_rows: int,
    num_features: int,
    num_classes: int = 2,
    num_partitions: int = 1,
) -> Tuple[str, str]:
    """Create a parquet dataset inside a freshly created temporary directory.

    The remaining arguments are forwarded to ``create_parquet``.

    Returns:
        A tuple ``(temp_dir, path)`` where ``path`` is ``filename`` joined
        onto the new temporary directory. The directory is not removed here.
    """
    tmp_root = tempfile.mkdtemp()
    parquet_path = os.path.join(tmp_root, filename)
    dataset_options = {
        "num_rows": num_rows,
        "num_features": num_features,
        "num_classes": num_classes,
        "num_partitions": num_partitions,
    }
    create_parquet(parquet_path, **dataset_options)
    return tmp_root, parquet_path


def flatten_obj(obj: Union[List, Dict], keys=None, base=None):
    """Flatten nested lists/dicts into one dict keyed by ``/``-joined paths.

    List elements contribute their index and dict entries their key (both
    converted with ``str``) to the path. Any other value is stored as a leaf.

    Args:
        obj: The object to flatten.
        keys: Path components leading to ``obj``.
        base: Dict that collects the results; a new one is created if None.

    Returns:
        The dict holding the flattened entries (``base`` when given).
    """
    path = keys or []
    if base is None:
        base = {}  # An empty dict passed in by the caller is kept as is

    if isinstance(obj, list):
        children = enumerate(obj)
    elif isinstance(obj, dict):
        children = obj.items()
    else:
        base["/".join(path)] = obj
        return base

    for child_key, child in children:
        flatten_obj(child, path + [str(child_key)], base)
    return base


def tree_obj(bst: xgb.Booster):
    """Return the trees of ``bst`` as a list of parsed JSON objects."""
    dumped_trees = bst.get_dump(dump_format="json")
    return list(map(json.loads, dumped_trees))


def _kill_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6):
    """Returns a callback to kill an actor process.

    After every iteration the targeted actor puts ``(epoch, time.time())``
    on the queue. At ``fail_iteration`` it creates the lock file, waits two
    seconds and sends signal 9 to its own process, unless the lock file
    already exists.

    Args:
        die_lock_file: A file lock used to prevent race conditions
            when killing the actor.
        actor_rank: The rank of the actor to kill.
        fail_iteration: The iteration after which the actor is killed.

    """

    class _KillCallback(TrainingCallback):
        def after_iteration(self, model, epoch, evals_log):
            # Only the targeted actor reports progress or dies.
            if get_actor_rank() != actor_rank:
                return
            put_queue((epoch, time.time()))

            # An existing lock file means the kill already happened once.
            if epoch != fail_iteration or os.path.exists(die_lock_file):
                return

            own_pid = os.getpid()
            print(f"Killing process: {own_pid}")
            with open(die_lock_file, "wt") as lock_fp:
                lock_fp.write("")

            time.sleep(2)
            print(f"Testing: Rank {get_actor_rank()} will now die.")
            os.kill(own_pid, 9)

    return _KillCallback()


def _fail_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6):
    """Returns a callback to cause an Xgboost actor to fail training.

    After every iteration the targeted actor puts ``(epoch, time.time())``
    on the queue. At ``fail_iteration`` it creates the lock file, waits two
    seconds and calls ``sys.exit(1)``, unless the lock file already exists.

    Args:
        die_lock_file: A file lock used to prevent race conditions
            when causing the actor to fail.
        actor_rank: The rank of the actor to fail.
        fail_iteration: The iteration after which the training for
            the specified actor fails.

    """

    class _FailCallback(TrainingCallback):
        def after_iteration(self, model, epoch, evals_log):
            # Only the targeted actor reports progress or fails.
            if get_actor_rank() != actor_rank:
                return
            put_queue((epoch, time.time()))

            # An existing lock file means the failure already happened once.
            if epoch != fail_iteration or os.path.exists(die_lock_file):
                return

            with open(die_lock_file, "wt") as lock_fp:
                lock_fp.write("")
            time.sleep(2)
            import sys

            print(f"Testing: Rank {get_actor_rank()} will now fail.")
            sys.exit(1)

    return _FailCallback()


def _checkpoint_callback(frequency: int = 1, before_iteration_=False):
    """Returns a callback to checkpoint a model.

    A checkpoint consists of ``model.save_raw()`` being put on the queue
    whenever ``epoch % frequency == 0``.

    Args:
        frequency: The interval at which checkpointing occurs. If
            frequency is set to n, checkpointing occurs every n epochs.
        before_iteration_: If True, the same checkpoint logic is also
            installed as the ``before_iteration`` hook.

    """

    class _CheckpointCallback(TrainingCallback):
        def after_iteration(self, model, epoch, evals_log):
            checkpoint_due = epoch % frequency == 0
            if checkpoint_due:
                put_queue(model.save_raw())

    def _before_iteration(self, model, epoch, evals_log):
        # Reuse the after-iteration logic for the before-iteration hook.
        self.after_iteration(model, epoch, evals_log)

    if before_iteration_:
        setattr(_CheckpointCallback, "before_iteration", _before_iteration)

    return _CheckpointCallback()


def _sleep_callback(sleep_iteration: int = 6, sleep_seconds: int = 5):
    """Returns a callback to sleep after an iteration.

    This artificially inflates training time.

    Args:
        sleep_iteration: The iteration after which the actor should
            sleep.
        sleep_seconds: Time in seconds the actor should sleep.

    """

    class _SleepCallback(TrainingCallback):
        def after_iteration(self, model, epoch, evals_log):
            # Every other iteration passes through untouched.
            if epoch != sleep_iteration:
                return

            announcement = (
                f"Testing: Rank {get_actor_rank()} will now sleep "
                f"for {sleep_seconds} seconds."
            )
            print(announcement)
            time.sleep(sleep_seconds)

    return _SleepCallback()


================================================
FILE: xgboost_ray/tune.py
================================================
import logging
from typing import Dict, Optional

import ray
from ray.util.annotations import PublicAPI

from xgboost_ray.session import get_rabit_rank, put_queue
from xgboost_ray.util import force_on_current_node
from xgboost_ray.xgb import xgboost as xgb

try:
    from ray import train, tune  # noqa: F401
except (ImportError, ModuleNotFoundError) as e:
    raise RuntimeError(
        "Ray Train and Ray Tune are required dependencies of xgboost_ray. "
        'Please install with: `pip install "ray[train]"`'
    ) from e

import ray.train
from ray.tune.integration.xgboost import TuneReportCallback as OrigTuneReportCallback
from ray.tune.integration.xgboost import (
    TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback,
)


class TuneReportCheckpointCallback(OrigTuneReportCheckpointCallback):
    """Variant of Ray Tune's XGBoost callback for use with ``xgboost_ray``.

    Instead of running the parent hooks directly, each hook wraps the parent
    call in a zero-argument callable and hands it to ``put_queue``. This is
    only done when ``get_rabit_rank()`` is 0.
    """

    def after_iteration(self, model, epoch: int, evals_log: Dict):
        """Queue the parent's ``after_iteration`` call for this iteration.

        Args:
            model: The model object passed to the callback.
            epoch: Index of the iteration that just finished.
            evals_log: Evaluation history passed to the callback.
        """
        # NOTE: We need to update `evals_log` here (even though the super method
        # already does it) because the actual callback method gets run
        # in a different process, so *this* instance of the callback will not have
        # access to the `evals_log` dict in `after_training`.
        self._evals_log = evals_log

        # Only rank 0 enqueues the deferred parent call.
        if get_rabit_rank() == 0:
            put_queue(
                lambda: super(TuneReportCheckpointCallback, self).after_iteration(
                    model=model, epoch=epoch, evals_log=evals_log
                )
            )

    def after_training(self, model):
        """Queue the parent's ``after_training`` call and return ``model``.

        The model is returned unchanged; only rank 0 enqueues the call.
        """
        if get_rabit_rank() == 0:
            put_queue(
                lambda: super(TuneReportCheckpointCallback, self).after_training(
                    model=model
                )
            )
        return model


class TuneReportCallback(OrigTuneReportCallback):
    """Deprecated callback; any attempt to construct it raises."""

    def __new__(cls: type, *args, **kwargs):
        # TODO(justinvyu): [code_removal] Remove in Ray 2.11.
        deprecation_message = (
            "`TuneReportCallback` is deprecated. "
            "Use `xgboost_ray.tune.TuneReportCheckpointCallback` instead."
        )
        raise DeprecationWarning(deprecation_message)


def _try_add_tune_callback(kwargs: Dict):
    """Ensure ``kwargs["callbacks"]`` contains a Tune reporting callback.

    Nothing happens unless ``ray.train.get_context().get_trial_resources()``
    returns something other than None. In that case:

    * ``xgboost_ray`` ``TuneReportCheckpointCallback`` instances are kept.
    * Instances of the original Ray Tune callback are replaced by the
      ``xgboost_ray`` one (reusing their ``_metrics`` and ``_frequency``)
      and a warning is logged.
    * If neither was present, ``TuneReportCheckpointCallback(frequency=0)``
      is appended.

    Args:
        kwargs: Keyword arguments for training; ``"callbacks"`` is
            overwritten in place with the adjusted list.

    Returns:
        True if the callbacks were processed, False otherwise.
    """
    trial_resources = ray.train.get_context().get_trial_resources()
    if trial_resources is None:
        return False

    REPLACE_MSG = (
        "Replaced `{orig}` with `{target}`. If you want to "
        "avoid this warning, pass `{target}` as a callback "
        "directly in your calls to `xgboost_ray.train()`."
    )

    user_callbacks = kwargs.get("callbacks", []) or []
    resolved_callbacks = []
    found_tune_callback = False

    for cb in user_callbacks:
        resolved = cb
        # The subclass check has to come first: the xgboost_ray callback is
        # also an instance of the original Tune callback.
        if isinstance(cb, TuneReportCheckpointCallback):
            found_tune_callback = True
        elif isinstance(cb, OrigTuneReportCheckpointCallback):
            resolved = TuneReportCheckpointCallback(
                metrics=cb._metrics, frequency=cb._frequency
            )
            logging.warning(
                REPLACE_MSG.format(
                    orig="ray.tune.integration.xgboost.TuneReportCheckpointCallback",
                    target="xgboost_ray.tune.TuneReportCheckpointCallback",
                )
            )
            found_tune_callback = True
        resolved_callbacks.append(resolved)

    if not found_tune_callback:
        resolved_callbacks.append(TuneReportCheckpointCallback(frequency=0))

    kwargs["callbacks"] = resolved_callbacks
    return True


def _get_tune_resources(
    num_actors: int,
    cpus_per_actor: int,
    gpus_per_actor: int,
    resources_per_actor: Optional[Dict],
    placement_options: Optional[Dict],
):
    """Returns object to use for ``resources_per_trial`` with Ray Tune.

    The result requests one empty head bundle followed by one bundle per
    actor. Each actor bundle holds the CPU and GPU counts, updated with
    ``resources_per_actor`` when given.

    Args:
        num_actors: Number of actor bundles to request.
        cpus_per_actor: Value of the ``"CPU"`` entry of each actor bundle.
        gpus_per_actor: Value of the ``"GPU"`` entry of each actor bundle.
        resources_per_actor: Extra entries merged into each actor bundle;
            they take precedence over the CPU/GPU entries.
        placement_options: Keyword arguments for ``PlacementGroupFactory``.
            ``strategy`` defaults to ``"PACK"`` when not provided. The dict
            passed in is left unmodified.

    Returns:
        A ``ray.tune.PlacementGroupFactory`` describing the bundles.
    """
    from ray.tune import PlacementGroupFactory

    head_bundle = {}
    child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor}
    child_bundle_extra = {} if resources_per_actor is None else resources_per_actor
    child_bundles = [{**child_bundle, **child_bundle_extra} for _ in range(num_actors)]
    bundles = [head_bundle] + child_bundles

    # Work on a copy: calling ``setdefault`` on the caller's dict would leak
    # the default strategy back into the object that was passed in.
    options = dict(placement_options or {})
    options.setdefault("strategy", "PACK")

    return PlacementGroupFactory(bundles, **options)


@PublicAPI(stability="beta")
def load_model(model_path):
    """Loads the model stored in the provided model_path.

    If using Ray Client, this will automatically handle loading the path on
    the server by using a Ray task.

    Args:
        model_path: Path of the stored model.

    Returns:
        xgb.Booster object of the model stored in the provided model_path

    """

    def load_model_fn(model_path):
        booster = xgb.Booster()
        booster.load_model(model_path)
        return booster

    # Without Ray Client the model can be read directly in this process.
    if not ray.util.client.ray.is_connected():
        return load_model_fn(model_path)

    # If using Ray Client, the best model is saved on the server.
    # So we have to wrap the model loading in a ray task.
    remote_load = force_on_current_node(ray.remote(load_model_fn))
    return ray.get(remote_load.remote(model_path))


================================================
FILE: xgboost_ray/util.py
================================================
import asyncio
from typing import Dict, List, Optional

import ray
from ray.util.annotations import DeveloperAPI


@DeveloperAPI
class Unavailable:
    """Placeholder type of which no instance should ever exist."""

    def __init__(self):
        # Instantiation is always an error.
        message = "This class should never be instantiated."
        raise RuntimeError(message)


class _EventActor:
    def __init__(self):
        self._event = asyncio.Event()

    def set(self):
        self._event.set()

    def clear(self):
        self._event.clear()

    def is_set(self):
        return self._event.is_set()


@DeveloperAPI
class Event:
    """Event flag backed by a remote ``_EventActor``.

    Args:
        actor_options: Options passed to ``.options()`` when the backing
            actor is created. Defaults to no options.
    """

    def __init__(self, actor_options: Optional[Dict] = None):
        options = actor_options or {}
        remote_actor_cls = ray.remote(_EventActor)
        self.actor = remote_actor_cls.options(**options).remote()

    def set(self):
        """Submit a call setting the flag; the result is not awaited."""
        self.actor.set.remote()

    def clear(self):
        """Submit a call clearing the flag; the result is not awaited."""
        self.actor.clear.remote()

    def is_set(self):
        """Fetch and return the current state of the flag."""
        state_ref = self.actor.is_set.remote()
        return ray.get(state_ref)

    def shutdown(self):
        """Kill the backing actor, if any, and drop the handle."""
        if self.actor:
            ray.kill(self.actor)
        self.actor = None


@DeveloperAPI
class MultiActorTask:
    """Utility class to hold multiple futures.

    The `is_ready()` method will return True once all futures are ready.

    Args:
        pending_futures: List of object references (futures)
            that should be tracked.
    """

    def __init__(self, pending_futures: Optional[List[ray.ObjectRef]] = None):
        self._pending_futures = pending_futures or []
        self._ready_futures = []

    def is_ready(self):
        """Move finished futures to the ready list and report completion.

        Returns:
            True if no pending futures remain, False otherwise.
        """
        pending = self._pending_futures
        if not pending:
            return True

        # Poll without blocking until a poll reports nothing ready anymore.
        while True:
            ready, _ = ray.wait(pending, timeout=0)
            if not ready:
                break
            for finished in ready:
                pending.remove(finished)
                self._ready_futures.append(finished)

        return not pending


@DeveloperAPI
def get_current_node_resource_key() -> str:
    """Get the Ray resource key for current node.
    It can be used for actor placement.
    If using Ray Client, this will return the resource key for the node that
    is running the client server.

    Raises:
        ValueError: If no ``node:``-prefixed resource key is found for the
            current node.
    """
    current_node_id = ray.get_runtime_context().get_node_id()
    for node in ray.nodes():
        if node["NodeID"] != current_node_id:
            continue
        # Found the node.
        for resource_key in node["Resources"].keys():
            if resource_key.startswith("node:"):
                return resource_key

    # Reached only when the loop finished without returning a key.
    raise ValueError("Cannot found the node dictionary for current node.")


@DeveloperAPI
def force_on_current_node(task_or_actor):
    """Given a task or actor, place it on the current node.

    If the task or actor that is passed in already has custom resource
    requirements, then they will be overridden.

    If using Ray Client, the current node is the client server node.

    Returns:
        The result of ``task_or_actor.options(...)`` requesting ``0.01`` of
        the current node's resource key.
    """
    current_node_key = get_current_node_resource_key()
    return task_or_actor.options(resources={current_node_key: 0.01})


================================================
FILE: xgboost_ray/xgb.py
================================================
# Import shim for ``xgboost``: other modules import the name from here so
# that a missing installation does not fail at import time.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Static type checkers always resolve the real module.
    import xgboost
else:
    try:
        import xgboost
    except ImportError:
        # At runtime the name is bound to ``None`` when xgboost is missing.
        xgboost = None

__all__ = ["xgboost"]