Repository: dholzmueller/pytabkit Branch: main Commit: c126ea51187c Files: 157 Total size: 2.0 MB Directory structure: gitextract_xlrx7g0c/ ├── .github/ │ └── workflows/ │ └── testing.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE.txt ├── README.md ├── docs/ │ ├── Makefile │ ├── make.bat │ ├── requirements.txt │ └── source/ │ ├── bench/ │ │ ├── 00_installation.md │ │ ├── 01_running_the_benchmark.md │ │ ├── 02_stored_data.md │ │ ├── 03_code.md │ │ ├── adding_models.md │ │ ├── download_results.md │ │ ├── refine_then_calibrate.md │ │ └── using_the_scheduler.md │ ├── conf.py │ ├── index.rst │ └── models/ │ ├── 00_overview.md │ ├── 01_sklearn_interfaces.rst │ ├── 02_hpo.md │ ├── 03_training_implementation.md │ ├── examples.md │ ├── nn_classes.md │ └── quantile_reg.md ├── examples/ │ └── tutorial_notebook.ipynb ├── original_requirements/ │ ├── conda_env_2024_06_25.yml │ ├── conda_env_2024_10_28.yml │ ├── conda_env_2025_01_15.yml │ └── requirements_2024_06_25.txt ├── pyproject.toml ├── pytabkit/ │ ├── __about__.py │ ├── __init__.py │ ├── bench/ │ │ ├── __init__.py │ │ ├── alg_wrappers/ │ │ │ ├── __init__.py │ │ │ ├── general.py │ │ │ └── interface_wrappers.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── get_uci.py │ │ │ ├── import_talent_benchmark.py │ │ │ ├── import_tasks.py │ │ │ ├── paths.py │ │ │ ├── tasks.py │ │ │ └── uci_file_ops.py │ │ ├── eval/ │ │ │ ├── __init__.py │ │ │ ├── analysis.py │ │ │ ├── colors.py │ │ │ ├── evaluation.py │ │ │ ├── plotting.py │ │ │ ├── runtimes.py │ │ │ └── tables.py │ │ ├── run/ │ │ │ ├── __init__.py │ │ │ ├── results.py │ │ │ └── task_execution.py │ │ └── scheduling/ │ │ ├── __init__.py │ │ ├── execution.py │ │ ├── jobs.py │ │ ├── resource_manager.py │ │ ├── resources.py │ │ └── schedulers.py │ └── models/ │ ├── __init__.py │ ├── alg_interfaces/ │ │ ├── __init__.py │ │ ├── alg_interfaces.py │ │ ├── autogluon_model_interfaces.py │ │ ├── base.py │ │ ├── calibration.py │ │ ├── catboost_interfaces.py │ │ ├── ensemble_interfaces.py │ │ ├── lightgbm_interfaces.py │ │ ├── nn_interfaces.py │ │ ├── other_interfaces.py │ │ ├── resource_computation.py │ │ ├── resource_params.py │ │ ├── rtdl_interfaces.py │ │ ├── sub_split_interfaces.py │ │ ├── tabm_interface.py │ │ ├── tabr_interface.py │ │ ├── xgboost_interfaces.py │ │ └── xrfm_interfaces.py │ ├── data/ │ │ ├── __init__.py │ │ ├── conversion.py │ │ ├── data.py │ │ ├── nested_dict.py │ │ └── splits.py │ ├── hyper_opt/ │ │ ├── __init__.py │ │ ├── coord_opt.py │ │ └── hyper_optimizers.py │ ├── nn_models/ │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── base.py │ │ ├── categorical.py │ │ ├── models.py │ │ ├── nn.py │ │ ├── pipeline.py │ │ ├── rtdl_num_embeddings.py │ │ ├── rtdl_resnet.py │ │ ├── tabm.py │ │ ├── tabr.py │ │ ├── tabr_context_freeze.py │ │ └── tabr_lib.py │ ├── optim/ │ │ ├── __init__.py │ │ ├── adopt.py │ │ ├── optimizers.py │ │ └── scheduling_adam.py │ ├── sklearn/ │ │ ├── __init__.py │ │ ├── default_params.py │ │ ├── sklearn_base.py │ │ └── sklearn_interfaces.py │ ├── torch_utils.py │ ├── training/ │ │ ├── __init__.py │ │ ├── auc_mu.py │ │ ├── coord.py │ │ ├── lightning_callbacks.py │ │ ├── lightning_modules.py │ │ ├── logging.py │ │ ├── metrics.py │ │ ├── nn_creator.py │ │ └── scheduling.py │ └── utils.py ├── scripts/ │ ├── analyze_hpo_best_params.py │ ├── analyze_tasks.py │ ├── check_missing_values.py │ ├── copy_algs.py │ ├── create_plots_and_tables.py │ ├── create_probclass_plots.py │ ├── create_xrfm_ablations_table.py │ ├── custom_paths.py.default │ ├── download_data.py │ 
├── estimate_resource_params.py │ ├── get_sklearn_names.py │ ├── make_plot_animation.py │ ├── meta_hyperopt.py │ ├── move_algs.py │ ├── move_many_algs.py │ ├── print_complete_results.py │ ├── print_runtimes.py │ ├── ray_slurm_launch.py │ ├── ray_slurm_template.sh │ ├── rename_alg.py │ ├── rename_tag.py │ ├── run_evaluation.py │ ├── run_experiments.py │ ├── run_experiments_unused.py │ ├── run_probclass_experiments.py │ ├── run_single_task.py │ ├── run_slurm.py │ ├── run_time_measurement.py │ └── run_xrfm_large_ablations.py └── tests/ ├── __init__.py ├── test_bench.py ├── test_ensemble.py ├── test_metrics.py ├── test_rtdl_nns.py ├── test_sklearn_interfaces.py ├── test_tabr.py └── test_variants.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/testing.yml ================================================ name: 'test' on: push: branches: - "main" - "dev" pull_request: branches: - '*' jobs: test: strategy: fail-fast: false matrix: os: [windows-latest, ubuntu-latest, macos-latest] python-version: ['3.9', '3.10', '3.11', '3.12'] # 3.13 fails on Windows because it doesn't find a ray version runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install uv uses: astral-sh/setup-uv@v3 with: # Install a specific version of uv. version: "0.5.4" - name: Install hatch run: uv pip install --system hatch - name: Install swig run: uv pip install --system swig - name: Run tests run: hatch test # removed codecov upload in v1.7.3 ================================================ FILE: .gitignore ================================================ *.pyc *.pdf *.zip *.ckpt experiments/*/ experiments/trace.json !experiments/meta_hpo !experiments/prototypes public_export dist files plots lightning_logs docs/build docs/source/modules.rst docs/source/pytabkit.* .coverage* .idea catboost_info tab_bench_data rtdl_checkpoints examples/.ipynb_checkpoints scripts/custom_paths.py ================================================ FILE: .readthedocs.yaml ================================================ # Read the Docs configuration file for Sphinx projects # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the OS, Python version and other tools you might need build: os: ubuntu-22.04 tools: python: "3.10" # You can also specify other tool versions: # nodejs: "20" # rust: "1.70" # golang: "1.20" jobs: pre_build: - sphinx-apidoc -o docs/source/ pytabkit # Build documentation in the "docs/" directory with Sphinx sphinx: configuration: docs/source/conf.py # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs # builder: "dirhtml" builder: "html" # Fail on all warnings to avoid broken references # fail_on_warning: true # Optionally build your docs in additional formats such as PDF and ePub # formats: # - pdf # - epub # Optional but recommended, declare the Python requirements required # to build your documentation # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - requirements: docs/requirements.txt ================================================ FILE: LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS 
FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: README.md ================================================ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dholzmueller/pytabkit/blob/main/examples/tutorial_notebook.ipynb) [![](https://readthedocs.org/projects/pytabkit/badge/?version=latest&style=flat-default)](https://pytabkit.readthedocs.io/en/latest/) [![test](https://github.com/dholzmueller/pytabkit/actions/workflows/testing.yml/badge.svg)](https://github.com/dholzmueller/pytabkit/actions/workflows/testing.yml) [![Downloads](https://img.shields.io/pypi/dm/pytabkit)](https://pypistats.org/packages/pytabkit) # PyTabKit: Tabular ML models and benchmarking (NeurIPS 2024) [Paper](https://arxiv.org/abs/2407.04491) | [Documentation](https://pytabkit.readthedocs.io) | [RealMLP-TD-S standalone implementation](https://github.com/dholzmueller/realmlp-td-s_standalone) | [Grinsztajn et al. benchmark code](https://github.com/LeoGrin/tabular-benchmark/tree/better_by_default) | [Data archive](https://doi.org/10.18419/darus-4555) | |-------------------------------------------|--------------------------------------------------|---------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------| PyTabKit provides **scikit-learn interfaces for modern tabular classification and regression methods** benchmarked in our [paper](https://arxiv.org/abs/2407.04491), see below. It also contains the code we used for **benchmarking** these methods on our benchmarks. ![Meta-test benchmark results](./figures/meta-test_benchmark_results.png) ## When (not) to use pytabkit - **To get the best possible results**: - Generally we recommend AutoGluon for the best possible results, though it does not include all the models from pytabkit. AutoGluon 1.4 includes RealMLP (though not in a default configuration) and TabM (in the "extreme" preset for <= 30K samples). - To get the best possible results from `pytabkit`, we recommend using `Ensemble_HPO_Classifier(n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True, n_hpo_steps=50)` with a `val_metric_name` corresponding to your target metric (e.g., `class_error`, `cross_entropy`, `brier`, `1-auc_ovr`), or the corresponding `Regressor`. (This might take very long to fit.) - For only a single model, we recommend using `RealMLP_HPO_Classifier(n_cv=8, hpo_space_name='tabarena-new', use_caruana_ensembling=True, n_hyperopt_steps=50)`, also with `val_metric_name` as above, or the corresponding `Regressor`. - **Models**: [TabArena](https://github.com/AutoGluon/tabarena) also includes some newer models like RealMLP and TabM with more general preprocessing (missing numericals, text, etc.), as well as very good boosted tree implementations. `pytabkit` is currently still easier to use and supports vectorized cross-validation for RealMLP, which can significantly speed up the training. - **Benchmarking**: While pytabkit can be good for quick benchmarking for development, for method evaluation we recommend [TabArena](https://github.com/AutoGluon/tabarena). ## Installation (new in 1.4.0: optional model dependencies) ```bash pip install pytabkit[models] ``` - RealMLP (and TabM) can be used without the `[models]` part. 
- For xRFM on GPU, faster kernels will be used if you install `kermac[cu12]` or `kermac[cu11]` (depending on your CUDA version). - If you want to use **TabR**, you have to manually install [faiss](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md), which is only available on **conda**. - Please install torch separately if you want to control the version (CPU/GPU etc.) - Use `pytabkit[models,autogluon,extra,hpo,bench,dev]` to install additional dependencies for the other models, AutoGluon models, extra preprocessing, hyperparameter optimization methods beyond random search (hyperopt/SMAC), the benchmarking part, and testing/documentation. For the hpo part, you might need to install *swig* (e.g. via pip) if the build of *pyrfr* fails. See also the [documentation](https://pytabkit.readthedocs.io). To run the data download for the meta-train benchmark, you need one of rar, unrar, or 7-zip to be installed on the system. ## Using the ML models Most of our machine learning models are directly available via scikit-learn interfaces. For example, you can use RealMLP-TD for classification as follows: ```python from pytabkit import RealMLP_TD_Classifier model = RealMLP_TD_Classifier() # or TabR_S_D_Classifier, CatBoost_TD_Classifier, etc. model.fit(X_train, y_train) model.predict(X_test) ``` The code above will automatically select a GPU if available, try to detect categorical columns in dataframes, preprocess numerical variables and regression targets (no standardization required), and use a training-validation split for early stopping. All of this (and much more) can be configured through the constructor and the parameters of the fit() method. For example, it is possible to do bagging (ensembling of models on 5-fold cross-validation) simply by passing `n_cv=5` to the constructor. Here is an example for some of the parameters that can be set explicitly: ```python from pytabkit import RealMLP_TD_Classifier model = RealMLP_TD_Classifier(device='cpu', random_state=0, n_cv=1, n_refit=0, n_epochs=256, batch_size=256, hidden_sizes=[256] * 3, val_metric_name='cross_entropy', use_ls=False, # for metrics like AUC / log-loss lr=0.04, verbosity=2) model.fit(X_train, y_train, X_val, y_val, cat_col_names=['Education']) model.predict_proba(X_test) ``` See [this notebook](https://colab.research.google.com/github/dholzmueller/pytabkit/blob/main/examples/tutorial_notebook.ipynb) for more examples. Missing numerical values are currently *not* allowed and need to be imputed beforehand. 
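For example, missing numerical values can be imputed before fitting, e.g. with scikit-learn's `SimpleImputer` (a minimal sketch on synthetic data; median imputation is just one simple choice):

```python
import numpy as np
from sklearn.impute import SimpleImputer

from pytabkit import RealMLP_TD_Classifier

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 4))
X[rng.random(X.shape) < 0.1] = np.nan  # inject some missing values
y = (rng.standard_normal(200) > 0).astype(int)

X_imp = SimpleImputer(strategy='median').fit_transform(X)  # no NaNs remain

model = RealMLP_TD_Classifier(n_epochs=16)  # few epochs just to keep the demo fast
model.fit(X_imp, y)
print(model.predict_proba(X_imp[:5]))
```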
### Available ML models Our ML models are available in up to three variants, all with best-epoch selection: - library defaults (D) - our tuned defaults (TD) - random search hyperparameter optimization (HPO), sometimes also Tree-structured Parzen Estimator (HPO-TPE) or weighted ensembling (Ensemble) We provide the following ML models: - **RealMLP** (TD, HPO, Ensemble): Our new neural net models with tuned defaults (TD), random search hyperparameter optimization (HPO), or weighted ensembling (Ensemble) - **XGB**, **LGBM**, **CatBoost** (D, TD, HPO, HPO-TPE): Interfaces for the gradient-boosted tree libraries XGBoost, LightGBM, and CatBoost - **MLP**, **ResNet**, **FTT** (D, HPO): Models from [Revisiting Deep Learning Models for Tabular Data](https://proceedings.neurips.cc/paper_files/paper/2021/hash/9d86d83f925f2149e9edb0ac3b49229c-Abstract.html) - **MLP-PLR** (D, HPO): MLP with numerical embeddings from [On Embeddings for Numerical Features in Tabular Deep Learning](https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html) - **TabR** (D, HPO): TabR model from [TabR: Tabular Deep Learning Meets Nearest Neighbors](https://openreview.net/forum?id=rhgIgTSSxW) - **TabM** (D, HPO): TabM model from [TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling](https://arxiv.org/abs/2410.24210) - **XRFM** (D, HPO): xRFM model from [this paper](https://arxiv.org/abs/2508.10053) ([original repo](https://github.com/dmbeaglehole/xRFM)) - **RealTabR** (D): Our new TabR variant with default parameters - **Ensemble-TD**: Weighted ensemble of all TD models (RealMLP, XGB, LGBM, CatBoost) ## Post-hoc calibration and refinement stopping To use post-hoc temperature scaling and refinement stopping from our paper [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195), you can pass the following parameters to the scikit-learn interfaces: ```python from pytabkit import RealMLP_TD_Classifier clf = RealMLP_TD_Classifier( val_metric_name='ref-ll-ts', # short for 'refinement_logloss_ts-mix_all' calibration_method='ts-mix', # temperature scaling with Laplace smoothing use_ls=False # recommended for cross-entropy loss ) ``` Other calibration methods and validation metrics from [probmetrics](https://github.com/dholzmueller/probmetrics) can be used as well. For reproducing the results from this paper, we refer to the [documentation](https://pytabkit.readthedocs.io/en/latest/bench/refine_then_calibrate.html). ## Benchmarking code Our benchmarking code has functionality for - dataset download - running methods in a highly parallel fashion on single-node/multi-node/multi-GPU hardware, with automatic scheduling that tries to respect RAM constraints - analyzing/plotting results For more details, we refer to the [documentation](https://pytabkit.readthedocs.io). ## Preprocessing code While many preprocessing methods are implemented in this repository, a standalone version of our robust scaling + smooth clipping can be found [here](https://github.com/dholzmueller/realmlp-td-s_standalone/blob/main/preprocessing.py#L65C7-L65C37).
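To convey the idea (the linked file is the authoritative version), here is a rough sketch: each numerical feature is centered and rescaled based on robust statistics (median and interquartile range), and the result is passed through a smooth, bounded clipping function instead of a hard clip. Details such as the exact scale and the handling of (near-)constant features differ in the actual implementation:

```python
import numpy as np

def robust_scale_smooth_clip(x: np.ndarray) -> np.ndarray:
    """Illustrative robust scaling + smooth clipping for an (n_samples, n_features) array."""
    median = np.median(x, axis=0, keepdims=True)
    q25, q75 = np.quantile(x, [0.25, 0.75], axis=0, keepdims=True)
    scale = np.where(q75 > q25, q75 - q25, 1.0)  # fall back to 1.0 to avoid division by zero
    z = (x - median) / scale
    # smooth clipping: approximately the identity near zero, bounded for large |z|
    return z / np.sqrt(1.0 + (z / 3.0) ** 2)
```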
## Citation If you use this repository for research purposes, please cite our [paper](https://arxiv.org/abs/2407.04491): ``` @inproceedings{holzmuller2024better, title={Better by default: {S}trong pre-tuned {MLPs} and boosted trees on tabular data}, author={Holzm{\"u}ller, David and Grinsztajn, Leo and Steinwart, Ingo}, booktitle = {Neural {Information} {Processing} {Systems}}, year={2024} } ``` ## Contributors - David Holzmüller (main developer) - Léo Grinsztajn (deep learning baselines, plotting) - Ingo Steinwart (UCI dataset download) - Katharina Strecker (PyTorch-Lightning interface) - Daniel Beaglehole (part of the xRFM implementation) - Lennart Purucker (some features/fixes) - Jérôme Dockès (deployment, continuous integration) ## Acknowledgements Code from other repositories is acknowledged as well as possible in code comments. In particular, we used code from https://github.com/yandex-research/rtdl and sub-packages (Apache 2.0 license), code from https://github.com/catboost/benchmarks/ (Apache 2.0 license), and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html (Apache 2.0 license). ## Releases (see git tags) - v1.7.3: - disabled RealMLP lightning log file creation that was accidentally introduced in predict() in >=v1.7.0. - removed pynvml dependency. - v1.7.2: - Added scikit-learn 1.8 compatibility. - Removed debug print in RealMLP. - fixed device memory estimation error in the scheduler when `CUDA_VISIBLE_DEVICES` was used. - v1.7.1: - LightGBM now processes the `extra_trees`, `max_cat_to_onehot`, and `min_data_per_group` parameters used in the `'tabarena'` search space, which should improve results. - Scikit-learn interfaces for RealMLP (TD, HPO) now support moving the model to a different device (e.g., before saving). This can be achieved using, e.g., `model.to('cpu')` (which is in-place). - Fixed an xRFM bug in handling binary categorical features. - v1.7.0: - added [xRFM](https://arxiv.org/abs/2508.10053) (D, HPO) - added new `'tabarena-new'` search space for RealMLP-HPO, including per-fold ensembling (more expensive) and tuning two more categorical hyperparameters (with [better results](https://github.com/autogluon/tabarena/pull/195)) - reduced RealMLP pickle size by not storing the dataset ([#33](https://github.com/dholzmueller/pytabkit/issues/33)) - fixed gradient clipping for TabM (it did nothing previously, see [#34](https://github.com/dholzmueller/pytabkit/issues/34)). To ensure backward compatibility, it is set to None in the HPO search spaces now (it was already None in the default parameters). - removed debug print in TabM training loop - v1.6.1: - For `n_ens>1`, changed the default behavior for classification to averaging probabilities instead of logits. This can be reverted by setting `ens_av_before_softmax=True`. - Implemented time limit for HPO/ensemble methods through the `time_limit_s` parameter. - Support `torch>=2.6` and Python 3.13. - v1.6.0: - Added support for other training losses in TabM through the `train_metric_name` parameter, for example, (multi)quantile regression via `train_metric_name='multi_pinball(0.05,0.95)'`. - RealMLP-TD now adds the `n_ens` hyperparameter, which can be set to values >1 to train ensembles per train-validation split (called PackedEnsemble in the TabM paper). This is especially useful when using holdout validation instead of cross-validation ensembles, and to get more reliable validation predictions and scores for tuning/ensembling.
- fixed RealMLP TabArena search space (`hpo_space_name='tabarena'`) for classification (allow no label smoothing through `use_ls=False` instead of `use_ls="auto"`). - v1.5.2: fixed more device bugs for HPO and ensembling - v1.5.1: fixed a device bug in TabM for GPU - v1.5.0: - added `n_repeats` parameter to scikit-learn interfaces for repeated cross-validation - HPO sklearn interfaces (the ones using random search) can now do weighted ensembling instead by setting `use_caruana_ensembling=True`. Removed the `RealMLP_Ensemble_Classifier` and `RealMLP_Ensemble_Regressor` from v1.4.2 since they are made redundant by this feature. - renamed `space` parameter of GBDT HPO interface to `hpo_space_name` so now it also works with non-TPE versions. - Added new [TabArena](https://tabarena.ai) search spaces for boosted trees (not TPE), which should be almost equivalent to the ones from TabArena except for the early stopping logic. - TabM now supports `val_metric_name` for early stopping on different metrics. - fixed issues #20 and #21 regarding HPO - small updates for the ["Rethinking Early Stopping" paper](https://arxiv.org/abs/2501.19195) - v1.4.2: - fixed handling of custom `val_metric_name` HPO models and `Ensemble_TD_Regressor`. - if `tmp_folder` is specified in HPO models, save each model to disk immediately instead of holding all of them in memory. This can considerably reduce RAM/VRAM usage. In this case, pickled HPO models will still rely on the models stored in the `tmp_folder`. - We now provide `RealMLP_Ensemble_Classifier` and `RealMLP_Ensemble_Regressor`, which will use weighted ensembling and usually perform better than HPO (but have slower inference time). We recommend using the new `hpo_space_name='tabarena'` for best results. - v1.4.1: - moved dill to optional dependencies - updated TabM code to a newer version: added the option share_training_batches=False (old version: True) and excluded certain parameters from weight decay. - added [documentation](https://pytabkit.readthedocs.io/en/latest/bench/using_the_scheduler.html) for using the scheduler with custom jobs. - fixed bug in RealMLP refitting. - updated process start method for scheduler to speed up benchmarking - v1.4.0: - moved some imports to the new `models` optional dependencies to have a more lightweight RealMLP installation - Added GPU support for CatBoost with help from [Maximilian Schambach](https://github.com/MaxSchambach) in #16 (not guaranteed to produce exactly the same results). - Ensembling now saves models after training if a path is supplied, to reduce memory usage - Added more search spaces - fixed error in multiquantile output when the passed y was one-dimensional instead of having shape `(n_samples, 1)` - Added some examples to the documentation - v1.3.0: - Added multiquantile regression for RealMLP: see the [documentation](https://pytabkit.readthedocs.io/en/latest/models/quantile_reg.html) - More hyperparameters for RealMLP - Added [TabICL](https://github.com/soda-inria/tabicl) wrapper - Small fixes - v1.2.1: avoid error for older skorch versions - v1.2.0: - Included post-hoc calibration and more metrics through [probmetrics](https://github.com/dholzmueller/probmetrics). - Added benchmarking code for [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195). - Updated the format for saving predictions, allowing stopping on multiple metrics during the same training in the benchmark.
- Better categorical handling, avoiding an error for string and object columns, not ignoring boolean columns by default but treating them as categorical. - Added Ensemble_HPO_Classifier and Ensemble_HPO_Regressor. - v1.1.3: - Fixed a bug where the categorical encoding was incorrect if categories were missing in the training or validation set. The bug affected XGBoost and potentially many other models except RealMLP. - Scikit-learn interfaces now accept and auto-detect categorical datatypes (category, string, object) in dataframes. - v1.1.2: - Some compatibility improvements for scikit-learn 1.6 (but disabled 1.6 since skorch is not compatible with it). - Improved documentation for the PyTorch Lightning interface. - Other small bugfixes and improvements. - v1.1.1: - Added parameters `weight_decay`, `tfms`, and `gradient_clipping_norm` to TabM. The updated default parameters now apply the RTDL quantile transform. - v1.1.0: - Included TabM - Replaced `__` by `_` in parameter names for MLP, MLP-PLR, ResNet, and FTT, to comply with scikit-learn interface requirements. - Fixed non-determinism in NN baselines by initializing the random state of quantile (and KDI) preprocessing transforms. - The n_threads parameter is no longer ignored by NNs. - Changes by [Lennart Purucker](https://github.com/LennartPurucker): added a time limit for RealMLP, added support for `lightning` (while still allowing `pytorch-lightning`), made skorch a lazy import, and removed the msgpack\_numpy dependency. - v1.0.0: Release for the NeurIPS version and arXiv v2+v3. - More baselines (MLP-PLR, FT-Transformer, TabR-HPO, RF-HPO), also some unpolished internal interfaces for other methods, esp. the ones in AutoGluon. - Updated benchmarking code (configurations, plots) including the new version of the Grinsztajn et al. benchmark - Updated fit() parameters in scikit-learn interfaces, etc. - v0.0.1: First release for arXiv v1. Code and data are archived at [DaRUS](https://doi.org/10.18419/darus-4255). ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo.
echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/requirements.txt ================================================ adjustText>=1.0 autorank>=1.0 catboost>=1.2 dask[dataframe]>=2023 dill fire lightgbm>=4.1 matplotlib>=3.0 msgpack>=1.0 myst_parser>=3.0 numba>=0.59.0 numpy>=1.25 openml>=0.14 openpyxl>=3.0 pandas>=2.0 patool>=1.0 probmetrics>=0.0.1 psutil>=5.0 pytest-cov>=4.0 pytest>=7.0 pytorch_lightning>=2.0 pyyaml>=5.0 ray>=2.8 requests>=2.0 scikit-learn>=1.3 seaborn>=0.0.13 skorch>=0.15 sphinx>=7.0 sphinx_rtd_theme>=2.0 torch>=2.0 torchmetrics>=1.2.1 tqdm tueplots>=0.0.12 xgboost>=2.0 xlrd>=2.0 xrfm>=0.4.3 ================================================ FILE: docs/source/bench/00_installation.md ================================================ # Overview and Installation of the Benchmarking code Our benchmarking code contains several features: - Automatic dataset download - Running models (parallelized) with automatic scheduling, trying to respect RAM constraints - Evaluation and plotting ## Installation Our code has been tested with Python 3.9 and 3.10. After cloning/forking the repo, the required libraries can be installed as follows: ```commandline # in the repo folder: pip3 install -e .[extra,hpo,bench] ``` Note that the version requirements in our `pyproject.toml` are somewhat restrictive to avoid problems; they can potentially be relaxed. To more closely reproduce the installation we used for running the benchmarks, we refer to the configuration files in the `original_requirements` folder: - The pip-only requirements in `requirements_2024_06_25.txt` were used to compute many of the older NN results (not TabR). - The conda requirements in `conda_env_2024_06_25.yml` and `conda_env_2024_10_28.yml` were used to compute GBDT-HPO results and TabR results as well as a few newer NN results. They can be installed as a new conda environment using `conda env create -f conda_env_2024_10_28.yml`. Note that the older of the two conda environments was very slow for TabR on some datasets since it uses an older torchmetrics version with slow implementations. ## Using Sphinx Documentation Go to the repo root dir and run ```commandline sphinx-apidoc -o docs/source/ pytabkit sphinx-build -M html docs/source/ docs/build/ ``` then open `docs/build/html/index.html`. ================================================ FILE: docs/source/bench/01_running_the_benchmark.md ================================================ # Running the benchmark ## Configuration of data paths The paths for storing data and results are configured through the `tab_bench.data.paths.Paths` class. There are several options to configure which folders are used, which will be automatically recognized by `Paths.from_env_variables()`: - **Through environment variables**: The base folder can be configured by setting the environment variable `TAB_BENCH_DATA_BASE_FOLDER`. Optionally, some sub-folders can be set separately (e.g. for moving them to another partition). These are `TAB_BENCH_DATA_TASKS_FOLDER`, `TAB_BENCH_DATA_RESULTS_FOLDER`, `TAB_BENCH_DATA_RESULT_SUMMARIES_FOLDER`, `TAB_BENCH_DATA_UCI_DOWNLOAD_FOLDER`.
- **Through a Python file**: If `TAB_BENCH_DATA_BASE_FOLDER` is not available, the code will try to get the base folder (as a string) from `scripts.custom_paths.get_base_folder()`. This can be implemented by copying `scripts/custom_paths.py.default` to `scripts/custom_paths.py` (ignored by git) and adjusting the path therein. - If neither of the two options above is used, all data will be stored in `./tab_bench_data`. ## Download datasets To download all datasets for the meta-train and meta-test benchmarks, run (optionally passing your desired OpenML cache directory) ```commandline python3 scripts/download_data.py openml_cache_dir --import_meta_train --import_meta_test --import_grinsztajn_medium ``` To run methods on the benchmarks, there are two options: ## Run experiments with slurm Our benchmarking code contains its own scheduling code that will start subprocesses for each algorithm-dataset-split combination. Therefore, it is in principle possible to run all experiments through a single slurm job, though experiments can be divided into smaller pieces by running them separately. First, in `scripts/ray_slurm_template.sh`, adjust the line `cd ~/git/pytabkit` to match your folder location. Also, make sure that the data path is specified there if you want to set it via an environment variable. Run the following command (replacing some of the parameters with your own values) on the login node: ```commandline python3 scripts/ray_slurm_launch.py --exp_name=my_exp_name --num_nodes=num_nodes --queue="queue_name" --time=24:00:00 --mail_user="my@address.edu" --log_folder=log_folder --command="python3 -u scripts/run_slurm.py" ``` This will submit a job to the configured queue that will run `scripts/run_slurm.py` and create logfiles. Your experiments then have to be configured in `scripts/run_slurm.py`; see below. Multi-node is supported: `ray` will start instances on each node and our benchmarking code will schedule the individual experiments on the nodes. ## Run experiments without slurm Run the file with the corresponding experiments directly. For example, many of our experiment configurations can be found in `scripts/run_experiments.py`. One possible way to run the experiments detached from the shell with log-files is ````commandline systemd-run --scope --user python3 -u scripts/run_experiments.py > ./out.log 2> ./err.log & ```` ## Time measurements For time measurements, simply run `scripts/run_time_measurement.py` (with or without slurm). Results can be printed using `scripts/print_runtimes.py` (but these are averaged total times, not averaged per 1K samples as in the paper). ## Evaluating the benchmark results Aggregated algorithm results can be printed using ````commandline python3 scripts/run_evaluation.py meta-train-class ```` where `meta-train-class` can be replaced by the name of any other task collection (that is stored in the `task_collections` folder in the configured data directory), or a single dataset such as `openml-class/Higgs`. This script also has many more command-line options; see the Python file. For example, one can print only those methods with a certain tag using the `--tag` option, print results on individual datasets, for different metrics, etc. The parameters are the same as the ones of the following method: ```{eval-rst} ..
autofunction:: scripts.run_evaluation.show_eval ``` ## Creating plots and tables Plots and tables can be created using ````commandline python3 scripts/create_plots_and_tables.py ```` The plots that exclude datasets with missing values require running ```commandline python3 scripts/check_missing_values.py ``` once beforehand. ## Single-task experiments You can also run a configuration on a single dataset, without saving the results, by adjusting and running `scripts/run_single_task.py`. ## Other utilities - Use `scripts/analyze_tasks.py` to print some dataset statistics. - You can rename a method using `python3 scripts/rename_alg.py old_name new_name`. - We used some code in `scripts/meta_hyperopt.py` to optimize the default parameters for GBDTs. - The code in `scripts/estimate_resource_params.py` has been used to get more precise estimates for RAM usage etc. for running methods on the benchmark. - `scripts/print_complete_results.py` can be used to check which methods have results available on all splits for all tasks in a given collection. ================================================ FILE: docs/source/bench/02_stored_data.md ================================================ # Data format Here, we describe how the data is stored inside the main data folder configured in the `tab_bench.data.paths.Paths` object (see the documentation on running the benchmark). As file formats, we mostly use `.yaml` (for small, human-readable files), `.msgpack.gz` (for efficiently storing dicts, lists, etc.), and `.npy` (the standard format for storing numpy arrays). ## Algs folder The following files are stored in `algs/<alg_name>/`; see `tab_bench.run.task_execution.TabBenchJobManager.add_jobs()` for details on how they are stored: - `tags.yaml` contains a list of tags, which can be used to only load results for algs with certain tags. - `extended_config.yaml` contains a dictionary with the wrapper parameters, as well as the alg_name and the wrapper class name. - `wrapper.pkl`: Optionally, a pickled version (using `dill`) of the wrapper. (However, our code does not load these as pickle is an unsafe format.) - `src`: A folder containing the source files at the time of execution, as a backup. ## Tasks folder We store datasets (tasks) in folders `tasks/<source_name>/<task_name>/`, where `source_name` and `task_name` are derived from how the tasks are imported (see also the `tab_bench.data.tasks.TaskDescription` class). In each of these folders, we store the following files: - `x_cont.npy`, `x_cat.npy`, `y.npy` store the three relevant tensors for the DictDataset (see the `tab_models` documentation). - `task_info.yaml` stores the information of a `TaskInfo` object. ## Task collections folder In `task_collections/<coll_name>.yaml`, we store the list of tasks that a task collection with name `coll_name` consists of. ## Results folder We store the results of experiments in the folder `results/<alg_name>/<source_name>/<task_name>/<k>-fold/<split_type>/<split_idx>/`. Here, - `alg_name` is the name given to the method, - `source_name` and `task_name` identify a task, - `k` refers to the number of cross-validation folds (training-validation, not test), - `split_type` is either `random-split` (usually the case) or `default-split` (not used in our benchmark), - `split_idx` is the index (starting from zero) of the trainval-test split. The results are stored in files `metrics.yaml` and `other.msgpack.gz`. The former contains only the errors for different metrics; the latter contains other things like predictions (if configured to be saved), best stopping epoch, and possibly optimized hyperparameters. These files are stored by `tab_bench.run.results.ResultManager`. The involved dictionaries are generated by `tab_models.alg_interfaces.alg_interfaces.AlgInterface.eval()`.
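For illustration, the files for one split could be read as follows (a hypothetical sketch with a made-up `alg_name`; the exact contents depend on the run configuration):

```python
import gzip

import msgpack
import yaml

# hypothetical result folder for one split, following the scheme above
result_dir = 'results/MyAlg/openml-class/Higgs/5-fold/random-split/0/'

with open(result_dir + 'metrics.yaml') as f:
    metrics = yaml.safe_load(f)  # dict with the errors for different metrics

with gzip.open(result_dir + 'other.msgpack.gz', 'rb') as f:
    other = msgpack.unpack(f, raw=False)  # predictions, best epoch, etc. (if saved)
```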
## Result summaries folder Since loading the results directly can be slow, we store aggregated versions of them in a more efficient format. Specifically, `tab_bench.run.task_execution.TabBenchJobManager.run_jobs()` will call `tab_bench.run.task_execution.results.save_summaries()`, which will generate files `result_summaries/<alg_name>/<source_name>/<task_name>/<k>-fold/metrics.msgpack.gz` that contain the metrics results for all splits. ## Other folders - Plots and LaTeX tables will be saved in the `plots` folder. - Results of estimating resource prediction parameters are saved in the `resources` folder. - Results of time measurements are saved in the `times` folder. - Downloaded datasets from the UCI repository are saved in the `uci_download` folder. They can be deleted after the data import in `download_data.py` is completed. - The `tmp` folder can be used for storing temporary files. When running experiments, methods can store intermediate results in a temporary folder in their respective results folder. ================================================ FILE: docs/source/bench/03_code.md ================================================ # Code structure ## Algorithm wrappers To run methods in `tab_bench`, one needs to provide them as a subclass of `tab_bench.alg_wrappers.general.AlgWrapper`. Generally, we use models from the `tab_models` library that implement the `AlgInterface` from there, and wrap them lightly as an `AlgInterfaceWrapper` in `tab_bench/alg_wrappers/interface_wrappers.py`; see the numerous classes there for examples. As in `tab_models`, we pass parameters to these models via `**kwargs`. The scikit-learn interfaces in `tab_models` list the most important hyperparameters in their constructors. ## Datasets We represent our datasets using the `DictDataset` class from `tab_models`. These datasets can be loaded as follows: ```python from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription paths = Paths.from_env_variables() task_desc = TaskDescription('openml-reg', 'fifa') task_info = task_desc.load_info(paths) # a TaskInfo object task = task_info.load_task(paths) ds = task.ds # this is the DictDataset object ``` We can convert `ds` to a Pandas DataFrame using `ds.to_df()`. It is also possible to load a list of all TaskInfo objects for an entire task collection: ```python from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection paths = Paths.from_env_variables() task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) ``` ## Scheduling code We implement general scheduling code in `tab_bench/scheduling`. This code can take a list of jobs with certain functionalities and run them in parallel in a single-node or multi-node setup, respecting the provided resource requirements (on RAM usage, number of threads, etc.).
It can be used independently as follows: ```python from typing import List from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler jobs: List[AbstractJob] = [] # create a list of jobs here scheduler = SimpleJobScheduler(RayJobManager()) scheduler.add_jobs(jobs) scheduler.run() ``` For our tabular benchmarking code, the `AbstractJob` objects will be created by the `tab_bench.run.task_execution.TabBenchJobManager`. Numerous examples for this can be found in `scripts/run_experiments.py`. ## Resource estimation ## Evaluation and plotting ================================================ FILE: docs/source/bench/adding_models.md ================================================ # Adding your own models to the benchmark To run your own models, - implement an `AlgInterface` subclass. There are numerous examples already implemented. For models that can only run a single train-validation-test split at a time, you might want to subclass or modify `SklearnSubSplitInterface` from `pytabkit/models/alg_interfaces/sub_split_interfaces.py`. Examples can be found in `pytabkit/models/alg_interfaces/other_interfaces.py` or `pytabkit/models/alg_interfaces/rtdl_interfaces.py`. - add an `AlgInterfaceWrapper` subclass. This is often just a three-liner that specifies which `AlgInterface` subclass to instantiate. See the numerous examples in `pytabkit/bench/alg_wrappers/interface_wrappers.py`, especially the later ones. - adjust the code to run your `AlgInterfaceWrapper` on the benchmark; see `scripts/run_experiments.py` for many examples. Note that `RunConfig` has an option to save the model predictions on the whole datasets, which can significantly increase the disk usage (can be up to 2 GB per model on the meta-test-class benchmark). ================================================ FILE: docs/source/bench/download_results.md ================================================ # Downloading the benchmark results The benchmark data (as well as the code) is archived at [DaRUS](https://doi.org/10.18419/darus-4555). To download the benchmark data, - create a folder for the data (which is then linked in the environment variable `TAB_BENCH_DATA_BASE_FOLDER` or in `custom_paths.py`) - in the folder, unpack `main_no_results.tar.gz`; this should create the folders `algs`, `result_summaries`, `times`, `plots`, `task_collections`, and `tasks_only_infos` (which should be renamed to `tasks` if no `tasks` folder has been created). Since `result_summaries` stores the main metrics of the results, this is already enough for plotting/evaluating the results. - If you want the non-summarized results, download and unpack `results_small.tar.gz`, which contains the `results` folder (you might need to rename it from `results_no_gz` to `results`). However, this does not contain the additional files storing the predictions and optimal hyperparameters. - If you want the full results, download and unpack `results_main.tar.gz` (180 GB!) into the results folder (overwriting/replacing the contents of `results_small.tar.gz`) Moreover, there are additional files containing the results of the individual random search steps for the different methods, which could be used for retrospectively optimizing on a different metric etc. The file `cv_refit.tar.gz` contains the results of the cross-validation/refitting experiments, which are also somewhat large.
- If you need the datasets (in the `tasks` folder), you can normally just obtain them by running `scripts/download_data.py`. However, there is the option to request access to download `tasks.tar.gz` directly. ================================================ FILE: docs/source/bench/refine_then_calibrate.md ================================================ # Reproducing results of "Rethinking Early Stopping: Refine, Then Calibrate" Here, we document how to reproduce results from our paper [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195). For general instructions on how to set data paths and use slurm, we refer to the installation page. The following are the parts specific to this paper. ## Installation ```bash pip install probmetrics[extra] # to get smECE pip install pytabkit[bench,dev] ``` ### Original environment The original conda environment for exact reproduction is stored in `original_requirements/conda_env_2025_01_15.yml`. ## Downloading datasets Download the zipped datasets (`dataset-latest.zip`) of the TALENT benchmark from [here](https://drive.google.com/drive/folders/1j1zt3zQIo8dO6vkO-K-WE6pSrl71bf0z). Extract them into a folder. Then, use ```commandline python3 scripts/download_data.py --import_talent_class_small --talent_folder=<data_folder> ``` where `<data_folder>` should be the `data` folder inside the unzipped archive. ## Running experiments Experiments can be run using `python3 scripts/run_probclass_experiments.py`; plots can then be generated using `python3 scripts/create_probclass_plots.py`. ================================================ FILE: docs/source/bench/using_the_scheduler.md ================================================ # Using the scheduler `pytabkit` includes a flexible scheduler that can schedule jobs within Python using `ray` and `multiprocessing`. Essentially, it is a much fancier version of `multiprocessing.Pool`. Custom jobs need to provide an estimate of their required resources. The scheduler will - run as many jobs in parallel as possible on the current hardware while respecting the RAM and resource constraints - try to run the slowest jobs first, to avoid waiting for a few slow jobs in the end - measure free CPU RAM in the beginning, and add the fixed RAM that a CPU process uses to the requested RAM. For processes requesting a GPU, the fixed RAM used by a process using torch CUDA will be added to the requested RAM. - print info including remaining time estimates after each newly started job, failed jobs etc. (unless the jobs run so fast that multiple ones are started at once). The time estimates are based on the time estimates provided by the jobs, adjusted by a factor learned from the actual time taken by already finished jobs. Hence, the time estimate is only accurate after a few jobs have finished. It often underestimates the time actually needed. (This is probably also due to selection bias, since the estimated longest jobs are run first.) The scheduler also works on multi-GPU systems, and it even works on multi-node systems thanks to `ray`'s multi-node support. See [`ray_slurm_launch.py`](https://github.com/dholzmueller/pytabkit/blob/main/scripts/ray_slurm_launch.py) and [`ray_slurm_template.sh`](https://github.com/dholzmueller/pytabkit/blob/main/scripts/ray_slurm_template.sh). To use the scheduler, install `pytabkit[models,bench]`.
Here is some example code: ```python from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler class CustomJob(AbstractJob): def get_group(self): # group name, for all jobs with the same group name # one joint time multiplier will be fitted in the scheduler return 'default' def get_desc(self) -> str: return 'CustomJob' # name for displaying def __call__(self, assigned_resources: NodeResources) -> bool: # the main job, should only use the assigned resources print(f'Running job with {assigned_resources.get_n_threads()} threads', flush=True) return True # job finished successfully def get_required_resources(self) -> RequiredResources: # Return the resources requested by this job (RAM should be upper bounds, time doesn't need to be) return RequiredResources(time_s=1.0, n_threads=1, cpu_ram_gb=0.1, n_gpus=0, gpu_ram_gb=0.0, gpu_usage=1.0) sched = SimpleJobScheduler(RayJobManager(available_gpu_ram_multiplier=0.7)) sched.add_jobs([CustomJob() for _ in range(1000)]) sched.run() ``` ================================================ FILE: docs/source/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information # following https://stackoverflow.com/questions/10324393/sphinx-build-fail-autodoc-cant-import-find-module import os import sys sys.path.insert(0, os.path.abspath('../..')) from pytabkit.__about__ import __version__ project = 'pytabkit' copyright = '2024, David Holzmüller, Léo Grinsztajn, Ingo Steinwart' author = 'David Holzmüller, Léo Grinsztajn, Ingo Steinwart' release = __version__ # release = "0.0.1" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = ['myst_parser', 'sphinx.ext.autodoc'] templates_path = ['_templates'] exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output # html_theme = 'alabaster' html_theme = 'sphinx_rtd_theme' # html_theme = 'default' html_static_path = ['_static'] # Automatically extract typehints when specified and place them in # descriptions of the relevant function/method. autodoc_typehints = "description" # python_maximum_signature_line_length = 88 # Don't show class signature with the class' name. autodoc_class_signature = "separated" ================================================ FILE: docs/source/index.rst ================================================ Welcome to PyTabKit's documentation! ====================================== .. toctree:: :maxdepth: 2 :caption: Contents: Tabular ML models in pytabkit.models ========================================= .. toctree:: models/00_overview models/01_sklearn_interfaces models/02_hpo models/examples models/nn_classes models/03_training_implementation models/quantile_reg Tabular benchmarking using pytabkit.bench ============================================= ..
.. toctree::

   bench/00_installation
   bench/01_running_the_benchmark
   bench/adding_models
   bench/02_stored_data
   bench/03_code
   bench/download_results
   bench/refine_then_calibrate
   bench/using_the_scheduler

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`



================================================
FILE: docs/source/models/00_overview.md
================================================
# Overview of the `models` part

## Scikit-learn interfaces

We provide scikit-learn interfaces for various methods in `sklearn/sklearn_interfaces.py`. These use the default parameter dictionaries defined in `sklearn/default_params.py`.

## AlgInterface: more fine-grained control

We implement all our methods through subclassing `AlgInterface` in `alg_interfaces/alg_interfaces.py`. `AlgInterface` provides more functionality than scikit-learn interfaces, which is crucial for our benchmarking in `pytabkit.bench`. All our scikit-learn interfaces are wrappers around `AlgInterface` classes, using the `sklearn.sklearn_base.AlgInterfaceEstimator` base class. Compared to scikit-learn interfaces, `AlgInterface` provides the following additional features:

- Vectorized evaluation on multiple train-validation-test splits (used by RealMLP-TD and RealMLP-TD-S).
- Specification of train-validation-test splits, random seeds, temporary folder, and custom loggers.
- Inclusion of required resource estimates (CPU RAM, GPU RAM, GPU usage, n_threads, time).
- Evaluation on a list of metrics.
- Refitting with the best found parameters.

## Hyperparameter handling

Hyperparameters are explicitly defined in scikit-learn constructors. Elsewhere, we generally pass all configuration parameters as `**kwargs`; the corresponding functions then pick out the parameters that they need and pass the rest on to nested function calls. This allows for very convenient coding, but one has to watch out for typos in parameter names, which will often not be caught. For example, one could have the following structure:

```python
def fit(**kwargs):
    model = build_model(**kwargs)
    train_model(model, **kwargs)

def build_model(n_layers=4, **kwargs):
    ...

def train_model(model, lr=4e-2, batch_size=256, **kwargs):
    ...
```

We usually write `**config` instead of `**kwargs`. We also generally try to give unique names to parameters. For example, the epsilon parameter of the optimizer is called `opt_eps` and the epsilon parameter of label smoothing is called `ls_eps`.

## Internal data representation

We represent datasets internally using the `DictDataset` class. It contains a dictionary of PyTorch tensors. In our case, there are usually three tensors: `'x_cont'` for continuous features, `'x_cat'` for categorical features (`dtype=torch.long`), and `'y'` for labels. A `DictDataset` also contains a dictionary `tensor_infos`, which for each of these keys contains a `TensorInfo` object. The latter describes the number of features and, if applicable, the number of categories for each feature (for categorical variables or classification labels). We reserve the category `0` as the category for missing values (and for values that were not known to exist at train time). Missing numerical values are currently not handled by the NN code, so they need to be encoded beforehand.

## Data preprocessing (also available for other models)

Most models allow customizing the data preprocessing through the `tfms` parameter. This is done using the NN preprocessing code in `nn_models.models.PreprocessingFactory` (see the corresponding documentation page for an explanation of the Factory classes).
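As a rough sketch, overriding the preprocessing could look like this (the transform names below are assumptions for illustration, not a verified list; see `PreprocessingFactory` for the actual options):

```python
from pytabkit import RealMLP_TD_Regressor

# hypothetical transform names; check PreprocessingFactory for the real ones
reg = RealMLP_TD_Regressor(tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip'])
```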
## NN implementation

For the implementation of RealMLP, we extend and alter the typical PyTorch structure; see the documentation page on NN classes.

## Vectorization

Due to the vectorization of NN models, we use different terms for similar things:

- `n_cv` refers to the number of training-validation splits in cross-validation (bagging)
- `n_refit` refers to the number of models that are refitted on training+validation data after the CV stage
- `n_tv_splits` (or `n_models`) refers to the number of training-validation splits used in the current training (could be `n_cv` or `n_refit`)
- `n_tt_splits` (or `n_parallel`) refers to the number of trainval-test splits used (this is normally 1 when used through the scikit-learn interface, but can be larger when using RealMLP through the benchmark)



================================================
FILE: docs/source/models/01_sklearn_interfaces.rst
================================================
Scikit-learn interfaces
=======================

We provide scikit-learn interfaces for numerous methods in ``pytabkit.models.sklearn.sklearn_interfaces``. Below, we provide an overview. All of our interfaces allow specifying the validation set(s) and categorical features in the ``fit`` method:

.. autofunction:: pytabkit.models.sklearn.sklearn_base.AlgInterfaceEstimator.fit

Important: For HPO and ensemble interfaces, it is recommended to set ``tmp_folder`` to allow these methods to store fitted models instead of holding them in RAM. This means that ``tmp_folder`` should not be deleted while the associated interface still exists (even when it is pickled).

RealMLP
-------

For RealMLP, we provide TD (tuned default), HPO (hyperparameter optimization with random search), and Ensemble (weighted ensembling of random search configurations) variants:

- RealMLP_TD_Classifier
- RealMLP_TD_Regressor
- RealMLP_HPO_Classifier
- RealMLP_HPO_Regressor
- RealMLP_Ensemble_Classifier
- RealMLP_Ensemble_Regressor

While the TD variants have good defaults, they provide the option to override any hyperparameters. The classifier and regressor have the same hyperparameters; therefore, we only show the constructor of the classifier here. The first parameters, up to and including ``verbosity``, are provided for every scikit-learn interface, although ``random_state``, ``n_threads``, ``tmp_folder``, and ``verbosity`` may be ignored by some of the methods.

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.RealMLP_TD_Classifier.__init__

For the HPO and Ensemble variants, we currently only provide a few options:

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.RealMLP_HPO_Classifier.__init__

Boosted Trees
-------------

For boosted trees, we provide the same interfaces as for RealMLP (TD, D, and HPO variants), but do not wrap the full parameter space from the respective libraries. Here are some representative examples:

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.XGB_TD_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.LGBM_TD_Classifier.__init__
.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.CatBoost_TD_Classifier.__init__

Other NN baselines
------------------

We offer interfaces (D and HPO variants) for

- MLP (from the RTDL code)
- ResNet (from the RTDL code)
- FTT (FT-Transformer from the RTDL code)
- MLP-PLR (from the RTDL code)
- TabR (requires installing faiss)
- TabM

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.MLP_RTDL_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.Resnet_RTDL_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.FTT_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.MLP_PLR_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.TabR_S_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.TabM_D_Classifier.__init__

xRFM
----

We offer D and HPO variants for xRFM.

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.XRFM_D_Classifier.__init__

Other methods
-------------

For convenience, we wrap the scikit-learn RF and MLP interfaces with our scikit-learn interfaces, although in this case the validation sets are not used. The respective classes are called ``RF_SKL_Classifier``, ``MLP_SKL_Classifier``, etc. We also provide our ``Ensemble_TD_Classifier`` and ``Ensemble_HPO_Classifier``, weighted ensembles of our TD / HPO models (and similar for regression).

.. autoclass:: pytabkit.models.sklearn.sklearn_interfaces.RealMLPConstructorMixin

.. automodule:: pytabkit.models.sklearn.sklearn_interfaces
   :members:
   :undoc-members:
   :show-inheritance:

Saving and loading
------------------

RealMLP and possibly other models (except probably TabR) can be saved using pickle-like modules. With standard pickling, a model trained on a GPU will be restored to use the same GPU, and will fail to load if the GPU is not present. (Note that dill fails to save torch models in newer torch versions, while pickle can still save them.) The following code allows loading GPU-trained models to the CPU, but fails to run predict() due to pytorch-lightning device issues.

.. code-block:: python

    import torch
    import dill  # might also work with pickle instead

    torch.save(model, 'model.pkl', pickle_module=dill, _use_new_zipfile_serialization=False)
    model = torch.load('model.pkl', map_location='cpu', pickle_module=dill)



================================================
FILE: docs/source/models/02_hpo.md
================================================
# Hyperparameter optimization

This is a guide on how to perform hyperparameter optimization (HPO) to get the best results out of RealMLP. We consider RealMLP for classification here, but most of the guide applies to regression and other baselines as well.
## Option 1: Using the HPO interface

The easiest option is to use the direct HPO interface:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_HPO_Classifier

X, y = make_classification(random_state=42, n_samples=200, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

clf = RealMLP_HPO_Classifier(n_hyperopt_steps=10, n_cv=1, verbosity=2, val_metric_name='brier')
clf.fit(X_train, y_train)
clf.predict(X_test)
```

The code above

- runs random search with 10 configurations from the HPO space in the paper (this should be increased to, say, 50 for better results)
- only uses one training-validation split (should be increased to, say, 5 for better results)
- prints validation results of each epoch and the best found parameters thanks to `verbosity=2`
- selects the best model and best epoch based on the Brier score (the default would be classification error)

While using the interface directly is convenient, it has certain drawbacks:

- It is not possible to change the search space, e.g. to reduce label smoothing for metrics other than classification error.
- It is not possible to save and resume from an intermediate state.
- It is not possible to use another HPO method than random search.
- It is not (easily) possible to access intermediate results.

Therefore, we now look at a more manual approach.

## Option 2: Performing your own HPO

The following code provides an example of how to do HPO manually.

```python
import numpy as np
import torch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold

from pytabkit.models.alg_interfaces.nn_interfaces import RealMLPParamSampler
from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier
from pytabkit.models.training.metrics import Metrics

n_hyperopt_steps = 10
n_cv = 1
is_classification = True

X, y = make_classification(random_state=42, n_samples=200, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# We compute train-validation splits here instead of letting the sklearn interface do it
# such that we can compute the validation error ourselves
if n_cv == 1:
    # we cannot do 1-fold CV, so we do an 80%-20% train-validation split
    _, val_idxs = train_test_split(np.arange(X_train.shape[0]), test_size=0.2, random_state=0)
    val_idxs = val_idxs[None, :]
else:
    skf = StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=0)
    val_idxs_list = [val_idxs for train_idxs, val_idxs in skf.split(X_train, y_train)]
    # make sure that each validation set has the same length, so we can exploit vectorization
    min_len = min([len(val_idxs) for val_idxs in val_idxs_list])
    val_idxs_list = [val_idxs[:min_len] for val_idxs in val_idxs_list]
    val_idxs = np.asarray(val_idxs_list)

best_val_loss = np.inf
best_clf = None
best_params = None

for hpo_step in range(n_hyperopt_steps):
    # sample random params according to the proposed search space,
    # but this can be replaced by a custom HPO method
    params = RealMLPParamSampler(is_classification=is_classification).sample_params(seed=hpo_step)
    # we only use one classifier that will fit n_cv sub-models, since RealMLP can vectorize the fitting,
    # but it would also be possible to use one classifier per cross-validation split.
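    # val_idxs has shape (n_cv, n_val): one row of validation indices per
    # training-validation split; fit() will train one sub-model per row.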
    clf = RealMLP_TD_Classifier(**params, n_cv=n_cv, verbosity=2, val_metric_name='brier')
    clf.fit(X_train, y_train, val_idxs=val_idxs)

    # evaluate validation loss
    # for n_cv >= 2, predict_proba() only outputs averaged predictions of the cross-validation models,
    # but we need separate predictions of each of the cross-validation members to extract the out-of-bag ones,
    # so we use predict_proba_ensemble().
    # There is also predict_ensemble(), which replaces predict().
    y_pred_prob = clf.predict_proba_ensemble(X_train)
    val_predictions = np.concatenate([y_pred_prob[i, val_idxs[i, :]] for i in range(n_cv)], axis=0)
    val_labels = np.concatenate([y_train[val_idxs[i, :]] for i in range(n_cv)], axis=0)
    val_logits = np.log(val_predictions + 1e-30)
    val_loss = Metrics.apply(torch.as_tensor(val_logits, dtype=torch.float32),
                             torch.as_tensor(val_labels), metric_name='brier').item()

    # update best model if the loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_clf = clf
        best_params = params

best_clf.predict(X_test)
print(f'best params: {best_params}')
```

Here is the equivalent search space for `hyperopt`:

```python
from hyperopt import hp
import numpy as np

space = {
    'num_emb_type': hp.choice('num_emb_type', ['none', 'pbld', 'pl', 'plr']),
    'add_front_scale': hp.pchoice('add_front_scale', [(0.6, True), (0.4, False)]),
    'lr': hp.loguniform('lr', np.log(2e-2), np.log(3e-1)),
    'p_drop': hp.pchoice('p_drop', [(0.3, 0.0), (0.5, 0.15), (0.2, 0.3)]),
    'wd': hp.choice('wd', [0.0, 2e-2]),
    'plr_sigma': hp.loguniform('plr_sigma', np.log(0.05), np.log(0.5)),
    'hidden_sizes': hp.pchoice('hidden_sizes', [(0.6, [256] * 3), (0.2, [64] * 5), (0.2, [512])]),
    'act': hp.choice('act', ['selu', 'mish', 'relu']),
    'ls_eps': hp.pchoice('ls_eps', [(0.3, 0.0), (0.7, 0.1)])
}
```
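To actually run the search with this space, here is a minimal sketch (assuming the `space` dictionary above; the simple holdout split and the Brier-score evaluation via scikit-learn are simplifications chosen for brevity, not the exact setup from the loop above):

```python
import numpy as np
from hyperopt import fmin, tpe, Trials
from sklearn.datasets import make_classification
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split

from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier

X, y = make_classification(random_state=42, n_samples=200, n_features=5)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.25, random_state=0)

def objective(params):
    # hyperopt passes one sampled configuration from `space` per call
    clf = RealMLP_TD_Classifier(**params, n_cv=1)
    clf.fit(X_tr, y_tr)  # uses an internal train-validation split for early stopping
    # binary task, so the Brier score of the positive-class probabilities suffices
    return brier_score_loss(y_val, clf.predict_proba(X_val)[:, 1])

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10,
            trials=Trials(), rstate=np.random.default_rng(0))
# note: for hp.choice/pchoice entries, fmin returns index-encoded values;
# hyperopt.space_eval(space, best) decodes them back into parameter values
print(best)
```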
================================================
FILE: docs/source/models/03_training_implementation.md
================================================
# Training directly with PyTorch Lightning

## Using PyTorch Lightning

The TabNN models are implemented using [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/). The training follows the basic principle described [here](https://lightning.ai/docs/pytorch/stable/model/train_model_basic.html), shown schematically below (not runnable as-is):

```python
# define DataLoaders (schematic)
train_loader = DataLoader(x_train, y_train)
val_loader = DataLoader(x_val, y_val)
test_loader = DataLoader(x_test, y_test)

# define model using a PyTorch LightningModule
nn_model = MyModel(hyper_param1, hyper_param2, ...)

# train model using the PyTorch Lightning Trainer
trainer = pl.Trainer()
trainer.fit(model=nn_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

# make predictions using the Trainer
pred = trainer.predict(nn_model, dataloaders=test_loader)
```

Adapted to our tabular NNs, the implementation looks like this:

``` { .python .annotate }
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources
from pytabkit.models.data.data import DictDataset, TensorInfo
from pytabkit.models.sklearn.default_params import DefaultParams
from pytabkit.models.training.lightning_modules import TabNNModule
import lightning.pytorch as pl  # or: import pytorch_lightning as pl
import numpy as np
import torch

n_epochs = 200

X, y = make_classification()
idxs = np.arange(len(X))
trainval_idxs, test_idxs = train_test_split(idxs, test_size=0.2)

n_trainval_splits = 5
train_idxs_list = []
val_idxs_list = []
for i in range(n_trainval_splits):
    train_idxs, val_idxs = train_test_split(trainval_idxs, test_size=0.2)
    train_idxs_list.append(train_idxs)
    val_idxs_list.append(val_idxs)

# define datasets
ds = DictDataset(tensors={'x_cont': torch.as_tensor(X, dtype=torch.float32),
                          # categorical tensors use dtype=torch.long (here: no categorical features)
                          'x_cat': torch.zeros(len(X), 0, dtype=torch.long),
                          'y': torch.as_tensor(y, dtype=torch.long)[:, None]},
                 tensor_infos={'x_cont': TensorInfo(feat_shape=[X.shape[1]]),
                               'x_cat': TensorInfo(cat_sizes=[]),
                               'y': TensorInfo(cat_sizes=[np.max(y) + 1])},
                 )  # (1)
train_val_splitting_idxs_list = [
    SplitIdxs(train_idxs=torch.as_tensor(np.stack(train_idxs_list, axis=0), dtype=torch.long),
              val_idxs=torch.as_tensor(np.stack(val_idxs_list, axis=0), dtype=torch.long),
              test_idxs=torch.as_tensor(test_idxs, dtype=torch.long),
              split_seed=0, sub_split_seeds=list(range(len(train_idxs_list))),
              split_id=0)]
test_ds = ds.get_sub_dataset(torch.as_tensor(test_idxs, dtype=torch.long))

# Create assigned resources
# interface_resources = InterfaceResources(n_threads=4, gpu_devices=['cuda:0'])  # (2)
interface_resources = InterfaceResources(n_threads=4, gpu_devices=[])  # (2)

# define the model using our LightningModule TabNNModule
nn_model = TabNNModule(**DefaultParams.RealMLP_TD_CLASS)
# build and 'compile' the model using the data, now it is ready to use
nn_model.compile_model(ds, train_val_splitting_idxs_list, interface_resources)

# train the model using the PyTorch Lightning Trainer
trainer = pl.Trainer(
    callbacks=nn_model.create_callbacks(),
    max_epochs=n_epochs,
    enable_checkpointing=False,
    enable_progress_bar=False,
    num_sanity_val_steps=0,
    logger=pl.loggers.logger.DummyLogger(),
)  # (3)
trainer.fit(
    model=nn_model,
    train_dataloaders=nn_model.train_dl,
    val_dataloaders=nn_model.val_dl
)

# make predictions using the Trainer
pred = trainer.predict(
    model=nn_model,
    dataloaders=nn_model.get_predict_dataloader(test_ds)
)
```

1. The NN models have special requirements for their dataloaders; therefore, we first need to use the `DictDataset` class to create a dataset for both training and validation.
2. We handle our resource management manually, not with Lightning; therefore, we need to create an `InterfaceResources` object.
3. We use the original [`Trainer`](https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api) class from Lightning. However, all of the parameters specified here are required for the `TabNNModule` to work properly.
================================================
FILE: docs/source/models/examples.md
================================================
# Examples

## Refitting RealMLP on train+val data using the best epoch from a previous run

You can refit RealMLP by simply using `n_refit=1` (or, better, larger values to ensemble multiple NNs). But in case you want more control, you can do it manually (e.g., if you only want to refit the best configuration from HPO, but you're not using the HPO within pytabkit).

```python
import numpy as np
from sklearn.model_selection import train_test_split

from pytabkit import RealMLP_TD_Regressor

np.random.seed(0)
X = np.random.randn(500, 5)
y = np.random.randn(500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

reg = RealMLP_TD_Regressor(verbosity=2, random_state=0)
reg.fit(X_train, y_train, X_val, y_val)

refit = RealMLP_TD_Regressor(verbosity=2,
                             stop_epoch=list(reg.fit_params_['stop_epoch'].values())[0],
                             val_fraction=0.0, random_state=0)
refit.fit(X, y)
```

## Fitting again after HPO on a smaller subset

Here is an example of how to run HPO on a smaller subset and then fit the best configuration again with validation. (It might be better to just use `n_refit` in the HPO classifier/regressor instead.)

```python
import numpy as np
from sklearn.model_selection import train_test_split

from pytabkit import LGBM_HPO_TPE_Regressor, LGBM_TD_Regressor

# This is an example on how to fit a HPO method on a smaller subset of the data,
# and then refit the best hyperparams on the full dataset
np.random.seed(0)
X = np.random.randn(500, 5)
y = np.random.randn(500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.9, random_state=0)
# use 90% for validation to train faster
# if there is too much validation data, evaluating on the validation data might become the bottleneck;
# then you should pass a smaller val_fraction
model = LGBM_HPO_TPE_Regressor(val_fraction=0.9, n_hyperopt_steps=5)
model.fit(X, y)

# unfortunately, params are not always called the same way, so we need to rename a few
params = model.fit_params_['hyper_fit_params']
params['subsample'] = params.pop('bagging_fraction')
params['colsample_bytree'] = params.pop('feature_fraction')
params['lr'] = params.pop('learning_rate')
# unfortunately, it is hard right now to check if this is exactly the same config,
# as this might set some default params that are not used in the HPO config
model_refit = LGBM_TD_Regressor(**params)
model_refit.fit(X, y)
```



================================================
FILE: docs/source/models/nn_classes.md
================================================
# NN implementation

While RealMLP is implemented in PyTorch, we extend the conventional `nn.Module` logic. Traditionally, one writes some PyTorch code to assemble an NN model, which is an `nn.Module` composed of building blocks that are also `nn.Module` objects (Composite design pattern). The `nn.Module` classes initialize the parameters in the constructor and are then callable objects providing the `forward()` transformation. Data preprocessing is done separately via different code/classes.

We use a different structure of classes that unifies preprocessing and NN layers, which is useful for vectorized NNs: the vectorized NNs can share a single non-preprocessed data set, loaded into GPU RAM, while having different preprocessing parameters (fitted on different training sets since different splits are used).
Individual preprocessed data sets are never fully instantiated in GPU RAM; instead, the vectorized NN models do the preprocessing on batches individually, which saves GPU RAM (we're talking about having, e.g., 50-100 NNs on the same GPU at the same time).

The class structure uses three base classes:

- `Layer` classes are similar to `nn.Module`, but they do not perform random initialization in the constructor. Instead, they simply take the already initialized parameters as input. There are some additional features: Layer objects of the same type can be combined into a vectorized Layer. The vectorized NN is not built directly; instead, NNs are first built and initialized sequentially for better reproducibility (random seed etc.) and RAM saving, and then vectorized after initialization using the `Layer.stack()` function. Additionally, Layer classes work with the `DictDataset` class, which usually contains 'x_cont' and 'x_cat' tensors for continuous and categorical variables. Moreover, during training, we also pass the labels 'y' through the Layer, which allows implementing mixup, label smoothing, and output standardization as Layer objects.
- `Fitter` classes initialize the NN based on a single forward pass on the (subsampled) training (and possibly validation) set. This is done using the `fit()` or `fit_transform()` functions, similar to scikit-learn preprocessing classes, which return a `Layer` object (and, in the case of `fit_transform()`, the transformed dataset). Initialization can be random or depend on the training set as transformed so far. Typically, parameters of preprocessing layers such as standardization depend on the training set, while NN parameters do not depend on the training set. However, we also use weight and bias initializations that depend on the training set, and the unification of NN and preprocessing makes this much more convenient.
- `FitterFactory` (could also be called ArchitectureBuilder) classes build the NN structure based on the input and output shape and type. Specifically, `FitterFactory` objects can build `Fitter` objects given the corresponding `tensor_infos` of the data set, which specifies the number of continuous variables, the number of categorical variables and the category sizes, and the same for the labels. For example, a `FitterFactory` can decide to use one-hot encoding for categorical variables with small category sizes, and Embedding layers for larger category sizes.

The `Layer`, `Fitter`, and `FitterFactory` classes are defined in `nn_models/base.py`. Other subclasses are also defined in the `nn_models` folder.

There are some more features:

- We introduce a class called `Variable` that inherits from `torch.nn.Parameter`. Variable has a parameter `trainable: bool`, and in the case `trainable==False`, the `Layer` class will register it using `register_buffer()`. One might also be able to just use `nn.Parameter(..., requires_grad=False)` for this, though we did not check whether it has the same effect (will it be saved when using `model.state_dict()`?). There is also the convenience function `Variable.stack()` used by `Layer.stack()`. Moreover, Variables can have names (to assign individual hyperparameter values to them), and they can have custom hyperparameter factors (e.g. to specify that the lr should be multiplied by a certain value for this Variable).
- The classes above can be given scope names, which are then prepended to variable names.
For example, using scope names, the weight of the first linear layer in an NN could be called 'net/first_layer/layer-0/weight', where 0 is the layer index and 'first_layer' is redundant information that can be useful when regex-matching variable names. One can assign an individual lr to this layer by passing `lr={'': global_lr, '.*first_layer.*weight': first_layer_weight_lr}` in the `**kwargs` to the `NNAlgInterface`. This works as follows: the `HyperparamManager`, which is available through a global context managed by the `TrainContext` class, stores the hyperparameter configurations obtained through `**kwargs`. Different classes can request getters for specific hyperparameters for specific variables. If multiple lr values are specified as above, the one from the last matching regex is taken. The scope names are passed on from FitterFactory to Fitter and then to Layer and Variable by a somewhat complicated context manager system, for which I didn't find a more elegant solution.
- Fitter objects can be split up into three parts using the `split_off_dynamic()` and `split_off_individual()` functions. The static part would typically be the one-hot encoding, since it does not depend on the data and is not trainable, which means that even in a vectorized context, it can be applied once to the single shared data set, since it does not depend on the train/val/test split. Then, there is the dynamic but not individual part, which can depend on the fitting data but is not trained or randomized, and can therefore be shared by models with the same trainval-test split. Finally, there is the individual (trainable/randomized) part, which is usually the NN part.
- `Fitter` classes should implement methods that allow estimating the RAM usage of the parameters and of a forward pass, which allows deciding how many NNs fit onto a GPU when running the benchmark.



================================================
FILE: docs/source/models/quantile_reg.md
================================================
# (Multi)quantile regression with RealMLP

RealMLP supports multiquantile regression, for example by using

```python
from pytabkit import RealMLP_TD_Regressor

reg = RealMLP_TD_Regressor(
    train_metric_name='multi_pinball(0.25,0.5,0.75)',
    val_metric_name='multi_pinball(0.25,0.5,0.75)'
)
```

This will adjust the training objective as well as the metric for best-epoch selection on the validation set. The quantiles can be specified in any format that Python can convert to a float. There must be no spaces around the commas, and the quantiles need to be in ascending order. The latter is relevant because RealMLP will by default sort the prediction outputs to always produce ascending quantile predictions. This can be deactivated by passing `sort_quantile_predictions=False`.
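As a minimal usage sketch (assuming that `predict` returns one column per requested quantile, which we have not verified here; check the output shape in your own setup):

```python
import numpy as np

from pytabkit import RealMLP_TD_Regressor

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 5))
y = X[:, 0] + 0.1 * rng.standard_normal(500)  # noisy target

reg = RealMLP_TD_Regressor(
    train_metric_name='multi_pinball(0.25,0.5,0.75)',
    val_metric_name='multi_pinball(0.25,0.5,0.75)'
)
reg.fit(X, y)
pred = reg.predict(X)  # assumption: one column per quantile, here shape (500, 3)
print(pred.shape)
```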
================================================ FILE: examples/tutorial_notebook.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "enZVuzCHCy1n" }, "source": [ "**To train neural networks faster, you need to enable GPUs for the notebook:**\n", "* Navigate to Edit→Notebook Settings\n", "* select GPU from the Hardware Accelerator drop-down" ] }, { "cell_type": "markdown", "metadata": { "id": "rtKFT1oSCy1p" }, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": { "id": "Sr0lfFYqCy1q" }, "source": [ "## Installation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "d-Zn1o8jCy1q" }, "outputs": [], "source": [ "!pip install pytabkit\n", "!pip install openml" ] }, { "cell_type": "markdown", "metadata": { "id": "V1Qo43ciCy1r" }, "source": [ "## Getting a dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "o-MpREHMCy1r" }, "outputs": [], "source": [ "import openml\n", "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "\n", "task = openml.tasks.get_task(361113) # covertype dataset\n", "dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)\n", "X, y, categorical_indicator, attribute_names = dataset.get_data(\n", " dataset_format='dataframe',\n", " target=task.target_name\n", ")\n", "# we restrict to 15K samples for demonstration purposes\n", "index = np.random.choice(range(len(X)), 15_000, replace=False)\n", "X = X.iloc[index]\n", "y = y.iloc[index]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" ] }, { "cell_type": "markdown", "metadata": { "id": "PeMtLz0ICy1s" }, "source": [ "# Using RealMLP" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CgSOr3l0Cy1s", "outputId": "d2b0ea97-45ac-4a9e-ff3d-291d72094615" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of RealMLP: 0.8770666666666667\n", "CPU times: user 1min 11s, sys: 192 ms, total: 1min 11s\n", "Wall time: 1min 11s\n" ] } ], "source": [ "%%time\n", "from pytabkit import RealMLP_TD_Classifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "model = RealMLP_TD_Classifier()\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of RealMLP: {acc}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "-G8Oblk5Cy1s" }, "source": [ "## With bagging\n", "It is possible to do bagging (ensembling of models on 5-fold cross-validation) simply by passing `n_cv=5` to the constructor. Note that it doesn't take 5x as long because of vectorized training." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i0NpWvjKCy1s", "outputId": "89c07496-fd0e-4f46-ea59-3457f8a35371" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of RealMLP with bagging: 0.8930666666666667\n", "CPU times: user 1min 8s, sys: 180 ms, total: 1min 9s\n", "Wall time: 1min 8s\n" ] } ], "source": [ "%%time\n", "from pytabkit import RealMLP_TD_Classifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "model = RealMLP_TD_Classifier(n_cv=5)\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of RealMLP with bagging: {acc}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "KHphiGKBCy1t" }, "source": [ "## With hyperparameter optimization\n", "It is possible to do hyperparameter optimization directly inside a sklearn interface by using the `RealMLP_HPO_Regressor` interface.\n", "This is also available for classification, and for other models, for instance `LGBM_HPO_Classifier` or `LGBM_HPO_TPE_Classifier` (to use the Tree-structured Parzen Estimator algorithm)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7e4wjdYJCy1t", "outputId": "a7ed7867-c808-4ed9-dbc2-badea992eae2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of RealMLP with 3 steps HPO: 0.8605333333333334\n", "CPU times: user 2min 27s, sys: 442 ms, total: 2min 28s\n", "Wall time: 2min 28s\n" ] } ], "source": [ "%%time\n", "from pytabkit import RealMLP_HPO_Classifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "n_hyperopt_steps = 3 # small number for demonstration purposes\n", "model = RealMLP_HPO_Classifier(n_hyperopt_steps=n_hyperopt_steps)\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of RealMLP with {n_hyperopt_steps} steps HPO: {acc}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "SB0D5MnbCy1t" }, "source": [ "# Using improved defaults for tree-based models" ] }, { "cell_type": "markdown", "metadata": { "id": "OLulH2rGCy1t" }, "source": [ "`TD` stands for *tuned defaults*, which are the improved defaults we propose. `D` stands for *defaults*, which are the libraries' defaults."
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UEZU3kaDCy1t", "outputId": "1c5bd06f-caf6-499c-8f84-5496db9d0ce6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of CatBoost_TD_Classifier: 0.8685333333333334\n", "Accuracy of CatBoost_D_Classifier: 0.8464\n", "Accuracy of LGBM_TD_Classifier: 0.8602666666666666\n", "Accuracy of LGBM_D_Classifier: 0.8344\n", "Accuracy of XGB_TD_Classifier: 0.8544\n", "Accuracy of XGB_D_Classifier: 0.8472\n", "CPU times: user 1min 55s, sys: 44.3 s, total: 2min 40s\n", "Wall time: 24 s\n" ] } ], "source": [ "%%time\n", "from pytabkit import CatBoost_TD_Classifier, CatBoost_D_Classifier, LGBM_TD_Classifier, LGBM_D_Classifier, XGB_TD_Classifier, XGB_D_Classifier\n", "\n", "for model in [CatBoost_TD_Classifier(), CatBoost_D_Classifier(), LGBM_TD_Classifier(), LGBM_D_Classifier(), XGB_TD_Classifier(), XGB_D_Classifier()]:\n", " model.fit(X_train, y_train)\n", " y_pred = model.predict(X_test)\n", " acc = accuracy_score(y_test, y_pred)\n", " print(f\"Accuracy of {model.__class__.__name__}: {acc}\")\n" ] }, { "cell_type": "markdown", "metadata": { "id": "tMzbmtJMCy1t" }, "source": [ "# Ensembling tuned defaults of tree-based methods and RealMLP: a very good baseline" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JZJH1sWfCy1t", "outputId": "8d059418-5236-4a84-b55a-6829200bb330" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Ensemble_TD_Classifier: 0.8834666666666666\n", "CPU times: user 2min 34s, sys: 38 s, total: 3min 12s\n", "Wall time: 1min 30s\n" ] } ], "source": [ "%%time\n", "from pytabkit import Ensemble_TD_Classifier\n", "\n", "model = Ensemble_TD_Classifier()\n", "model.fit(X_train, y_train)\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of Ensemble_TD_Classifier: {acc}\")" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "undefined.undefined.undefined" } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: original_requirements/conda_env_2024_06_25.yml ================================================ name: tab_bench_venv_3 channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=5.1 - _py-xgboost-mutex=2.0 - abseil-cpp=20211102.0 - arrow-cpp=11.0.0 - atk-1.0=2.36.0 - aws-c-common=0.6.8 - aws-c-event-stream=0.1.6 - aws-checksums=0.1.11 - aws-sdk-cpp=1.8.185 - blas=1.0 - boost-cpp=1.82.0 - bottleneck=1.3.5 - brotli=1.0.9 - brotli-bin=1.0.9 - brotlipy=0.7.0 - bzip2=1.0.8 - c-ares=1.19.1 - ca-certificates=2024.3.11 - cairo=1.16.0 - catboost=1.2 - certifi=2024.6.2 - cffi=1.16.0 - charset-normalizer=2.0.4 - configparser=5.0.2 - contourpy=1.2.0 - coverage=7.2.2 - cryptography=41.0.7 - cuda-cudart=11.7.99 - cuda-cupti=11.7.101 - cuda-libraries=11.7.1 - cuda-nvrtc=11.7.99 - cuda-nvtx=11.7.91 - cuda-runtime=11.7.1 - cudatoolkit=11.4.1 - cycler=0.11.0 - cyrus-sasl=2.1.28 - cython=3.0.6 - dbus=1.13.18 - dill=0.3.7 - et_xmlfile=1.1.0 - exceptiongroup=1.2.0 - expat=2.5.0 - 
faiss-gpu=1.7.4 - filelock=3.13.1 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=2.001 - font-ttf-source-code-pro=2.030 - font-ttf-ubuntu=0.83 - fontconfig=2.14.1 - fonts-anaconda=1 - fonts-conda-ecosystem=1 - fonttools=4.25.0 - freetype=2.12.1 - fribidi=1.0.10 - fsspec=2023.10.0 - future=0.18.3 - gdk-pixbuf=2.42.10 - gflags=2.2.2 - giflib=5.2.1 - glib=2.69.1 - glog=0.5.0 - gmp=6.2.1 - gmpy2=2.1.2 - gobject-introspection=1.72.0 - graphite2=1.3.14 - graphviz=2.50.0 - grpc-cpp=1.48.2 - gst-plugins-base=1.14.1 - gstreamer=1.14.1 - gtk2=2.24.33 - gts=0.7.6 - harfbuzz=4.3.0 - icu=73.1 - idna=3.4 - iniconfig=1.1.1 - intel-openmp=2021.4.0 - jinja2=3.1.2 - joblib=1.2.0 - jpeg=9e - kiwisolver=1.4.4 - krb5=1.20.1 - lcms2=2.12 - ld_impl_linux-64=2.38 - lerc=3.0 - liac-arff=2.5.0 - libboost=1.82.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - libclang=14.0.6 - libclang13=14.0.6 - libcublas=11.10.3.66 - libcufft=10.7.2.124 - libcufile=1.8.1.2 - libcups=2.4.2 - libcurand=10.3.4.107 - libcurl=8.5.0 - libcusolver=11.4.0.1 - libcusparse=11.7.4.91 - libdeflate=1.17 - libedit=3.1.20230828 - libev=4.33 - libevent=2.1.12 - libfaiss=1.7.4 - libffi=3.4.4 - libgcc-ng=11.2.0 - libgd=2.3.3 - libgfortran-ng=11.2.0 - libgfortran5=11.2.0 - libgomp=11.2.0 - libiconv=1.16 - libllvm14=14.0.6 - libnghttp2=1.57.0 - libnpp=11.7.4.75 - libnvjpeg=11.8.0.2 - libpng=1.6.39 - libpq=12.17 - libprotobuf=3.20.3 - librsvg=2.54.4 - libssh2=1.10.0 - libstdcxx-ng=11.2.0 - libthrift=0.15.0 - libtiff=4.5.1 - libtool=2.4.6 - libuuid=1.41.5 - libwebp=1.3.2 - libwebp-base=1.3.2 - libxcb=1.15 - libxgboost=1.7.3 - libxkbcommon=1.0.1 - libxml2=2.10.4 - lightgbm=4.1.0 - lightning-utilities=0.9.0 - llvm-openmp=14.0.6 - lz4-c=1.9.4 - markupsafe=2.1.3 - matplotlib=3.8.0 - matplotlib-base=3.8.0 - minio=7.1.0 - mkl=2021.4.0 - mkl-service=2.4.0 - mkl_fft=1.3.1 - mkl_random=1.2.2 - mpc=1.1.0 - mpfr=4.0.2 - mpmath=1.3.0 - munkres=1.1.4 - mysql=5.7.24 - ncurses=6.4 - networkx=3.1 - ninja=1.10.2 - ninja-base=1.10.2 - nspr=4.35 - nss=3.89.1 - numexpr=2.8.4 - openjpeg=2.4.0 - openml=0.12.2 - openpyxl=3.0.10 - openssl=3.0.13 - orc=1.7.4 - packaging=23.1 - pandas=2.1.4 - pango=1.50.7 - pcre=8.45 - pigz=2.6 - pillow=10.0.1 - pip=23.3.1 - pixman=0.40.0 - platformdirs=3.10.0 - plotly=5.9.0 - pluggy=1.0.0 - ply=3.11 - pooch=1.7.0 - poppler=22.12.0 - poppler-data=0.4.11 - psutil=5.9.0 - py-xgboost=1.7.3 - pyarrow=11.0.0 - pycparser=2.21 - pyopenssl=23.2.0 - pyparsing=3.0.9 - pyqt=5.15.10 - pyqt5-sip=12.13.0 - pysocks=1.7.1 - pytest=7.4.0 - pytest-cov=4.1.0 - python=3.10.13 - python-dateutil=2.8.2 - python-graphviz=0.20.1 - python-tzdata=2023.3 - pytorch=2.0.1 - pytorch-cuda=11.7 - pytorch-lightning=2.0.3 - pytorch-mutex=1.0 - pytz=2023.3.post1 - pyyaml=6.0.1 - qt-main=5.15.2 - re2=2022.04.01 - readline=8.2 - requests=2.31.0 - scikit-learn=1.3.0 - scipy=1.10.1 - seaborn=0.12.2 - setuptools=68.2.2 - sip=6.7.12 - six=1.16.0 - snappy=1.1.10 - sqlite=3.41.2 - swig=4.0.2 - sympy=1.12 - tbb=2021.8.0 - tenacity=8.2.2 - threadpoolctl=2.2.0 - tk=8.6.12 - toml=0.10.2 - tomli=2.0.1 - torchmetrics=1.1.2 - torchtriton=2.0.0 - tornado=6.3.3 - tqdm=4.65.0 - typing-extensions=4.9.0 - typing_extensions=4.9.0 - tzdata=2023d - urllib3=1.26.16 - utf8proc=2.6.1 - wheel=0.41.2 - xgboost=1.7.3 - xlrd=2.0.1 - xmltodict=0.13.0 - xz=5.4.5 - yaml=0.2.5 - zlib=1.2.13 - zstd=1.5.5 - pip: - adjusttext==1.0.4 - aiosignal==1.3.1 - annotated-types==0.6.0 - attrs==23.2.0 - babel==2.14.0 - blis==0.7.11 - catalogue==2.0.10 - cir-model==0.2.0 - click==8.1.7 - 
cloudpathlib==0.16.0 - cloudpickle==3.0.0 - colorama==0.4.6 - confection==0.1.4 - configspace==0.7.1 - cramjam==2.8.1 - cymem==2.0.8 - dask==2024.1.1 - dask-jobqueue==0.8.2 - distributed==2024.1.1 - einops==0.7.0 - emcee==3.1.4 - fastparquet==2023.10.1 - fire==0.5.0 - frozenlist==1.4.1 - gensim==4.3.2 - ghp-import==2.1.0 - griffe==0.39.1 - hyperopt==0.2.7 - importlib-metadata==7.0.1 - imutils==0.5.4 - jsonschema==4.21.1 - jsonschema-specifications==2023.12.1 - kditransform==0.2.0 - langcodes==3.3.0 - llvmlite==0.41.1 - locket==1.0.0 - markdown==3.5.2 - mergedeep==1.3.4 - mkdocs==1.5.3 - mkdocs-autorefs==0.5.0 - mkdocs-material==9.5.6 - mkdocs-material-extensions==1.3.1 - mkdocstrings==0.24.0 - mkdocstrings-python==1.8.0 - more-itertools==10.2.0 - msgpack==1.0.7 - msgpack-numpy==0.4.8 - murmurhash==1.0.10 - numba==0.58.1 - numpy==1.26.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==8.9.2.26 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.18.1 - nvidia-nvjitlink-cu12==12.3.101 - nvidia-nvtx-cu12==12.1.105 - opencv-contrib-python==4.9.0.80 - paginate==0.5.6 - partd==1.4.1 - pathspec==0.12.1 - patool==2.1.1 - preshed==3.0.9 - protobuf==4.25.2 - py4j==0.10.9.7 - pydantic==2.5.3 - pydantic-core==2.14.6 - pygments==2.17.2 - pymdown-extensions==10.7 - pynisher==1.0.10 - pynvml==11.5.0 - pyrfr==0.9.0 - pytorch-widedeep==1.4.0 - pyyaml-env-tag==0.1 - ray==2.9.1 - referencing==0.32.1 - regex==2023.12.25 - rpds-py==0.17.1 - skorch==0.15.0 - smac==2.0.2 - smart-open==6.4.0 - sortedcontainers==2.4.0 - spacy==3.7.2 - spacy-legacy==3.0.12 - spacy-loggers==1.0.5 - srsly==2.4.8 - tabulate==0.9.0 - tblib==3.0.0 - termcolor==2.4.0 - thinc==8.2.2 - toolz==0.12.1 - torch==2.1.2 - torchvision==0.16.2 - triton==2.1.0 - tueplots==0.0.13 - typer==0.9.0 - venn-abers==1.4.1 - wasabi==1.1.2 - watchdog==3.0.0 - weasel==0.3.4 - wrapt==1.16.0 - zict==3.0.0 - zipp==3.17.0 ================================================ FILE: original_requirements/conda_env_2024_10_28.yml ================================================ name: tab_bench_conda channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=5.1 - _py-xgboost-mutex=2.0 - abseil-cpp=20211102.0 - arrow-cpp=11.0.0 - atk-1.0=2.36.0 - aws-c-common=0.6.8 - aws-c-event-stream=0.1.6 - aws-checksums=0.1.11 - aws-sdk-cpp=1.8.185 - blas=1.0 - boost-cpp=1.82.0 - bottleneck=1.3.5 - brotli=1.0.9 - brotli-bin=1.0.9 - brotlipy=0.7.0 - bzip2=1.0.8 - c-ares=1.19.1 - ca-certificates=2024.7.2 - cairo=1.16.0 - catboost=1.2.3 - certifi=2024.8.30 - cffi=1.16.0 - charset-normalizer=2.0.4 - configparser=5.0.2 - contourpy=1.2.0 - coverage=7.2.2 - cryptography=41.0.7 - cuda-cudart=11.7.99 - cuda-cupti=11.7.101 - cuda-libraries=11.7.1 - cuda-nvrtc=11.7.99 - cuda-nvtx=11.7.91 - cuda-runtime=11.7.1 - cudatoolkit=11.4.1 - cycler=0.11.0 - cyrus-sasl=2.1.28 - cython=3.0.6 - dbus=1.13.18 - dill=0.3.7 - et_xmlfile=1.1.0 - exceptiongroup=1.2.0 - expat=2.5.0 - faiss-gpu=1.7.4 - filelock=3.13.1 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=2.001 - font-ttf-source-code-pro=2.030 - font-ttf-ubuntu=0.83 - fontconfig=2.14.1 - fonts-anaconda=1 - fonts-conda-ecosystem=1 - fonttools=4.25.0 - freetype=2.12.1 - fribidi=1.0.10 - fsspec=2023.10.0 - future=0.18.3 - gdk-pixbuf=2.42.10 - gflags=2.2.2 - giflib=5.2.1 - glib=2.69.1 - glog=0.5.0 - gmp=6.2.1 - 
gmpy2=2.1.2 - gobject-introspection=1.72.0 - graphite2=1.3.14 - graphviz=2.50.0 - grpc-cpp=1.48.2 - gst-plugins-base=1.14.1 - gstreamer=1.14.1 - gtk2=2.24.33 - gts=0.7.6 - harfbuzz=4.3.0 - icu=73.1 - idna=3.4 - iniconfig=1.1.1 - intel-openmp=2021.4.0 - jinja2=3.1.2 - joblib=1.2.0 - jpeg=9e - kiwisolver=1.4.4 - krb5=1.20.1 - lcms2=2.12 - ld_impl_linux-64=2.38 - lerc=3.0 - liac-arff=2.5.0 - libboost=1.82.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - libclang=14.0.6 - libclang13=14.0.6 - libcublas=11.10.3.66 - libcufft=10.7.2.124 - libcufile=1.8.1.2 - libcups=2.4.2 - libcurand=10.3.4.107 - libcurl=8.5.0 - libcusolver=11.4.0.1 - libcusparse=11.7.4.91 - libdeflate=1.17 - libedit=3.1.20230828 - libev=4.33 - libevent=2.1.12 - libfaiss=1.7.4 - libffi=3.4.4 - libgcc-ng=11.2.0 - libgd=2.3.3 - libgfortran-ng=11.2.0 - libgfortran5=11.2.0 - libgomp=11.2.0 - libiconv=1.16 - libllvm14=14.0.6 - libnghttp2=1.57.0 - libnpp=11.7.4.75 - libnvjpeg=11.8.0.2 - libpng=1.6.39 - libpq=12.17 - libprotobuf=3.20.3 - librsvg=2.54.4 - libssh2=1.10.0 - libstdcxx-ng=11.2.0 - libthrift=0.15.0 - libtiff=4.5.1 - libtool=2.4.6 - libuuid=1.41.5 - libwebp=1.3.2 - libwebp-base=1.3.2 - libxcb=1.15 - libxgboost=1.7.3 - libxkbcommon=1.0.1 - libxml2=2.10.4 - lightgbm=4.1.0 - lightning-utilities=0.9.0 - llvm-openmp=14.0.6 - lz4-c=1.9.4 - markupsafe=2.1.3 - matplotlib=3.8.0 - matplotlib-base=3.8.0 - minio=7.1.0 - mkl=2021.4.0 - mkl-service=2.4.0 - mkl_fft=1.3.1 - mkl_random=1.2.2 - mpc=1.1.0 - mpfr=4.0.2 - mpmath=1.3.0 - munkres=1.1.4 - mysql=5.7.24 - ncurses=6.4 - networkx=3.1 - ninja=1.10.2 - ninja-base=1.10.2 - nspr=4.35 - nss=3.89.1 - numexpr=2.8.4 - numpy-base=1.24.3 - openjpeg=2.4.0 - openml=0.12.2 - openpyxl=3.0.10 - openssl=3.0.15 - orc=1.7.4 - packaging=23.1 - pandas=2.1.4 - pango=1.50.7 - pcre=8.45 - pigz=2.6 - pillow=10.0.1 - pip=23.3.1 - pixman=0.40.0 - platformdirs=3.10.0 - plotly=5.9.0 - pluggy=1.0.0 - ply=3.11 - pooch=1.7.0 - poppler=22.12.0 - poppler-data=0.4.11 - psutil=5.9.0 - py-xgboost=1.7.3 - pyarrow=11.0.0 - pycparser=2.21 - pyopenssl=23.2.0 - pyparsing=3.0.9 - pyqt=5.15.10 - pyqt5-sip=12.13.0 - pysocks=1.7.1 - pytest=7.4.0 - pytest-cov=4.1.0 - python=3.10.13 - python-dateutil=2.8.2 - python-graphviz=0.20.1 - python-tzdata=2023.3 - pytorch=2.0.1 - pytorch-cuda=11.7 - pytorch-lightning=2.0.3 - pytorch-mutex=1.0 - pytz=2023.3.post1 - pyyaml=6.0.1 - qt-main=5.15.2 - re2=2022.04.01 - readline=8.2 - requests=2.31.0 - scikit-learn=1.3.0 - scipy=1.10.1 - setuptools=68.2.2 - sip=6.7.12 - six=1.16.0 - snappy=1.1.10 - sqlite=3.41.2 - swig=4.0.2 - sympy=1.12 - tbb=2021.8.0 - tenacity=8.2.2 - threadpoolctl=2.2.0 - tk=8.6.12 - toml=0.10.2 - tomli=2.0.1 - torchmetrics=1.4.0.post0 - torchtriton=2.0.0 - tornado=6.3.3 - tqdm=4.65.0 - typing-extensions=4.9.0 - typing_extensions=4.9.0 - tzdata=2023d - urllib3=1.26.16 - utf8proc=2.6.1 - wheel=0.41.2 - xgboost=1.7.3 - xlrd=2.0.1 - xmltodict=0.13.0 - xz=5.4.5 - yaml=0.2.5 - zlib=1.2.13 - zstd=1.5.5 - pip: - adjusttext==1.0.4 - aiosignal==1.3.1 - annotated-types==0.6.0 - attrs==23.2.0 - autorank==1.1.3 - babel==2.14.0 - baycomp==1.0.3 - blis==0.7.11 - catalogue==2.0.10 - cir-model==0.2.0 - click==8.1.7 - cloudpathlib==0.16.0 - cloudpickle==3.0.0 - colorama==0.4.6 - confection==0.1.4 - configspace==0.7.1 - cramjam==2.8.1 - cymem==2.0.8 - dask==2024.1.1 - dask-jobqueue==0.8.2 - distributed==2024.1.1 - einops==0.7.0 - emcee==3.1.4 - fastparquet==2023.10.1 - fire==0.5.0 - frozenlist==1.4.1 - gensim==4.3.2 - ghp-import==2.1.0 - griffe==0.39.1 - hyperopt==0.2.7 - 
importlib-metadata==7.0.1 - imutils==0.5.4 - jsonschema==4.21.1 - jsonschema-specifications==2023.12.1 - kditransform==0.2.0 - langcodes==3.3.0 - llvmlite==0.41.1 - locket==1.0.0 - markdown==3.5.2 - mergedeep==1.3.4 - mkdocs==1.5.3 - mkdocs-autorefs==0.5.0 - mkdocs-material==9.5.6 - mkdocs-material-extensions==1.3.1 - mkdocstrings==0.24.0 - mkdocstrings-python==1.8.0 - more-itertools==10.2.0 - msgpack==1.0.7 - msgpack-numpy==0.4.8 - murmurhash==1.0.10 - numba==0.58.1 - numpy==1.26.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==8.9.2.26 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.18.1 - nvidia-nvjitlink-cu12==12.3.101 - nvidia-nvtx-cu12==12.1.105 - opencv-contrib-python==4.9.0.80 - paginate==0.5.6 - partd==1.4.1 - pathspec==0.12.1 - patool==2.1.1 - patsy==0.5.6 - preshed==3.0.9 - protobuf==4.25.2 - py4j==0.10.9.7 - pydantic==2.5.3 - pydantic-core==2.14.6 - pygments==2.17.2 - pymdown-extensions==10.7 - pynisher==1.0.10 - pynvml==11.5.0 - pyrfr==0.9.0 - pytorch-widedeep==1.4.0 - pyyaml-env-tag==0.1 - ray==2.9.1 - referencing==0.32.1 - regex==2023.12.25 - rpds-py==0.17.1 - rtdl-revisiting-models==0.0.2 - seaborn==0.13.2 - skorch==0.15.0 - smac==2.0.2 - smart-open==6.4.0 - sortedcontainers==2.4.0 - spacy==3.7.2 - spacy-legacy==3.0.12 - spacy-loggers==1.0.5 - srsly==2.4.8 - statsmodels==0.14.3 - tabulate==0.9.0 - tblib==3.0.0 - termcolor==2.4.0 - thinc==8.2.2 - toolz==0.12.1 - torch==2.1.2 - torchvision==0.16.2 - triton==2.1.0 - tueplots==0.0.13 - typer==0.9.0 - venn-abers==1.4.1 - wasabi==1.1.2 - watchdog==3.0.0 - weasel==0.3.4 - wrapt==1.16.0 - zict==3.0.0 - zipp==3.17.0 ================================================ FILE: original_requirements/conda_env_2025_01_15.yml ================================================ name: probclass channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=5.1 - blas=1.0 - brotli-python=1.0.9 - bzip2=1.0.8 - ca-certificates=2024.12.31 - certifi=2024.12.14 - charset-normalizer=3.3.2 - cuda-cudart=11.8.89 - cuda-cupti=11.8.87 - cuda-libraries=11.8.0 - cuda-nvrtc=11.8.89 - cuda-nvtx=11.8.86 - cuda-runtime=11.8.0 - cuda-version=12.6 - expat=2.6.4 - ffmpeg=4.3 - filelock=3.13.1 - freetype=2.12.1 - giflib=5.2.2 - gmp=6.2.1 - gnutls=3.6.15 - idna=3.7 - intel-openmp=2023.1.0 - jinja2=3.1.4 - jpeg=9e - lame=3.100 - lcms2=2.16 - ld_impl_linux-64=2.40 - lerc=4.0.0 - libcublas=11.11.3.6 - libcufft=10.9.0.58 - libcufile=1.11.1.6 - libcurand=10.3.7.77 - libcusolver=11.4.1.48 - libcusparse=11.7.5.86 - libdeflate=1.22 - libffi=3.4.4 - libgcc-ng=11.2.0 - libgomp=11.2.0 - libiconv=1.16 - libidn2=2.3.4 - libjpeg-turbo=2.0.0 - libnpp=11.8.0.86 - libnvjpeg=11.9.0.86 - libpng=1.6.39 - libstdcxx-ng=11.2.0 - libtasn1=4.19.0 - libtiff=4.5.1 - libunistring=0.9.10 - libuuid=1.41.5 - libwebp=1.3.2 - libwebp-base=1.3.2 - llvm-openmp=14.0.6 - lz4-c=1.9.4 - markupsafe=2.1.3 - mkl=2023.1.0 - mkl-service=2.4.0 - mkl_fft=1.3.11 - mkl_random=1.2.8 - mpmath=1.3.0 - ncurses=6.4 - nettle=3.7.3 - networkx=3.2.1 - openh264=2.1.1 - openjpeg=2.5.2 - openssl=3.0.15 - pillow=11.0.0 - pip=24.2 - pysocks=1.7.1 - python=3.12.8 - pytorch=2.5.1 - pytorch-cuda=11.8 - pytorch-mutex=1.0 - pyyaml=6.0.2 - readline=8.2 - requests=2.32.3 - setuptools=72.1.0 - sqlite=3.45.3 - tbb=2021.8.0 - tk=8.6.14 - torchtriton=3.1.0 - torchvision=0.20.1 - 
typing_extensions=4.12.2 - urllib3=2.2.3 - wheel=0.44.0 - xz=5.4.6 - yaml=0.2.5 - zlib=1.2.13 - zstd=1.5.6 - pip: - absl-py==2.1.0 - adjusttext==1.3.0 - aiohappyeyeballs==2.4.4 - aiohttp==3.11.11 - aiosignal==1.3.2 - alabaster==1.0.0 - argon2-cffi==23.1.0 - argon2-cffi-bindings==21.2.0 - attrs==24.3.0 - autorank==1.2.1 - babel==2.16.0 - baycomp==1.0.3 - catboost==1.2.7 - cffi==1.17.1 - cir-model==0.2.0 - click==8.1.8 - cloudpickle==3.1.0 - contourpy==1.3.1 - coverage==7.6.10 - cycler==0.12.1 - dask==2024.12.1 - dask-expr==1.1.21 - deprecation==2.1.0 - dill==0.3.9 - docutils==0.21.2 - et-xmlfile==2.0.0 - fire==0.7.0 - fonttools==4.55.3 - frozenlist==1.5.0 - fsspec==2024.12.0 - gpytorch==1.13 - grpcio==1.69.0 - imagesize==1.4.1 - iniconfig==2.0.0 - jaxtyping==0.2.19 - joblib==1.4.2 - jsonschema==4.23.0 - jsonschema-specifications==2024.10.1 - kiwisolver==1.4.8 - liac-arff==2.5.0 - lightgbm==4.5.0 - lightning-utilities==0.11.9 - linear-operator==0.5.3 - locket==1.0.0 - markdown==3.7 - markdown-it-py==3.0.0 - matplotlib==3.7.5 - mdit-py-plugins==0.4.2 - mdurl==0.1.2 - minio==7.2.14 - msgpack==1.1.0 - msgpack-numpy==0.4.8 - multidict==6.1.0 - myst-parser==4.0.0 - netcal==1.3.6 - numpy==1.26.4 - nvidia-ml-py==12.560.30 - nvidia-nccl-cu12==2.24.3 - openml==0.15.0 - openpyxl==3.1.5 - opt-einsum==3.4.0 - packaging==24.2 - pandas==2.2.3 - partd==1.4.2 - patool==3.1.0 - patsy==1.0.1 - plotly==5.24.1 - pluggy==1.5.0 - probmetrics==0.0.1 - propcache==0.2.1 - protobuf==5.29.3 - psutil==6.1.1 - pyarrow==18.1.0 - pycparser==2.22 - pycryptodome==3.21.0 - pygments==2.19.1 - pynvml==12.0.0 - pyparsing==3.2.1 - pyro-api==0.1.2 - pyro-ppl==1.9.1 - pytabkit==1.1.3 - pytest==8.3.4 - pytest-cov==6.0.0 - python-dateutil==2.9.0.post0 - python-graphviz==0.20.3 - pytorch-lightning==2.5.0.post0 - pytz==2024.2 - ray==2.40.0 - referencing==0.35.1 - relplot==1.0 - rpds-py==0.22.3 - scikit-learn==1.5.2 - scipy==1.15.1 - seaborn==0.13.2 - six==1.17.0 - skorch==1.1.0 - snowballstemmer==2.2.0 - sphinx==8.1.3 - sphinx-rtd-theme==3.0.2 - sphinxcontrib-applehelp==2.0.0 - sphinxcontrib-devhelp==2.0.0 - sphinxcontrib-htmlhelp==2.1.0 - sphinxcontrib-jquery==4.1 - sphinxcontrib-jsmath==1.0.1 - sphinxcontrib-qthelp==2.0.0 - sphinxcontrib-serializinghtml==2.0.0 - statsmodels==0.14.4 - swig==4.3.0 - sympy==1.13.1 - tabulate==0.9.0 - tenacity==9.0.0 - tensorboard==2.18.0 - tensorboard-data-server==0.7.2 - termcolor==2.5.0 - threadpoolctl==3.5.0 - tikzplotlib==0.9.8 - toolz==1.0.0 - torchmetrics==1.6.1 - tqdm==4.67.1 - tueplots==0.0.17 - typeguard==4.4.1 - tzdata==2024.2 - venn-abers==1.4.6 - werkzeug==3.1.3 - xgboost==2.1.3 - xlrd==2.0.1 - xmltodict==0.14.2 - yarl==1.18.3 ================================================ FILE: original_requirements/requirements_2024_06_25.txt ================================================ adjustText==1.0.4 aiohttp==3.9.1 aiosignal==1.3.1 annotated-types==0.6.0 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asttokens==2.4.1 async-timeout==4.0.3 attrs==23.1.0 autorank==1.1.3 Babel==2.14.0 baycomp==1.0.3 blis==0.7.11 boltons==23.0.0 brotlipy==0.7.0 catalogue==2.0.10 catboost==1.2.2 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==2.0.4 cir-model==0.2.0 click==8.1.7 cloudpathlib==0.16.0 cloudpickle==3.0.0 cmake==3.28.1 colorama==0.4.6 comm==0.2.0 confection==0.1.4 ConfigSpace==0.7.1 contourpy==1.2.0 coverage==7.3.3 cramjam==2.7.0 cryptography==41.0.3 cycler==0.12.1 cymem==2.0.8 dask==2023.12.1 dask-jobqueue==0.8.2 debugpy==1.8.0 decorator==5.1.1 dill==0.3.7 distinctipy==1.3.4 distributed==2023.12.1 
einops==0.7.0 emcee==3.1.4 et-xmlfile==1.1.0 exceptiongroup==1.1.3 executing==2.0.1 fastparquet==2023.10.1 filelock==3.13.1 fire==0.5.0 fonttools==4.46.0 frozenlist==1.4.1 fsspec==2023.12.2 future==0.18.3 gensim==4.3.2 ghp-import==2.1.0 graphviz==0.20.1 griffe==0.38.1 hyperopt==0.2.7 idna==3.4 importlib-metadata==6.8.0 importlib-resources==6.1.1 imutils==0.5.4 iniconfig==2.0.0 ipykernel==6.26.0 ipython==8.17.2 jedi==0.19.1 Jinja2==3.1.2 joblib==1.3.2 jsonpatch==1.32 jsonpointer==2.1 jsonschema==4.20.0 jsonschema-specifications==2023.11.2 jupyter_client==8.6.0 jupyter_core==5.5.0 kditransform==0.2.0 kiwisolver==1.4.5 langcodes==3.3.0 liac-arff==2.5.0 lightgbm==4.1.0 lightning-utilities==0.10.0 lit==17.0.6 llvmlite==0.41.1 locket==1.0.0 Markdown==3.5.1 MarkupSafe==2.1.3 matplotlib==3.8.2 matplotlib-inline==0.1.6 mergedeep==1.3.4 minio==7.2.0 mkdocs==1.5.3 mkdocs-autorefs==0.5.0 mkdocs-material==9.5.2 mkdocs-material-extensions==1.3.1 mkdocstrings==0.24.0 mkdocstrings-python==1.7.5 more-itertools==10.1.0 mpmath==1.3.0 msgpack==1.0.7 msgpack-numpy==0.4.8 multidict==6.0.4 murmurhash==1.0.10 nest-asyncio==1.5.8 networkx==3.2.1 numba==0.58.1 numpy==1.26.2 nvidia-cublas-cu11==11.10.3.66 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu11==11.7.101 nvidia-cuda-cupti-cu12==12.1.105 nvidia-cuda-nvrtc-cu11==11.7.99 nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu11==11.7.99 nvidia-cuda-runtime-cu12==12.1.105 nvidia-cudnn-cu11==8.5.0.96 nvidia-cudnn-cu12==8.9.2.26 nvidia-cufft-cu11==10.9.0.58 nvidia-cufft-cu12==11.0.2.54 nvidia-curand-cu11==10.2.10.91 nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu11==11.4.0.1 nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu11==11.7.4.91 nvidia-cusparse-cu12==12.1.0.106 nvidia-nccl-cu11==2.14.3 nvidia-nccl-cu12==2.18.1 nvidia-nvjitlink-cu12==12.3.101 nvidia-nvtx-cu11==11.7.91 nvidia-nvtx-cu12==12.1.105 opencv-contrib-python==4.8.1.78 openml==0.14.1 openpyxl==3.1.2 packaging==23.1 paginate==0.5.6 pandas==2.1.4 parso==0.8.3 partd==1.4.1 pathspec==0.12.1 patool==1.15.0 patsy==0.5.6 pexpect==4.8.0 Pillow==10.1.0 pkg_resources==0.0.0 platformdirs==3.11.0 plotly==5.18.0 pluggy==1.0.0 preshed==3.0.9 prompt-toolkit==3.0.39 protobuf==4.25.1 psutil==5.9.6 ptyprocess==0.7.0 pure-eval==0.2.2 py4j==0.10.9.7 pyarrow==14.0.2 pycosat==0.6.6 pycparser==2.21 pycryptodome==3.19.0 pydantic==1.10.13 pydantic_core==2.14.5 Pygments==2.16.1 pymdown-extensions==10.5 pynisher==1.0.10 pynvml==11.5.0 pyOpenSSL==23.2.0 pyparsing==3.1.1 pyrfr==0.9.0 PySocks==1.7.1 pytest==7.4.3 pytest-cov==4.1.0 python-dateutil==2.8.2 pytorch-lightning==2.1.2 pytorch-widedeep==1.4.0 pytz==2023.3.post1 PyYAML==6.0.1 pyyaml_env_tag==0.1 pyzmq==25.1.1 ray==2.8.1 referencing==0.32.0 regex==2023.10.3 requests==2.31.0 rpds-py==0.15.2 ruamel.yaml==0.17.21 ruamel.yaml.clib==0.2.6 scikit-learn==1.3.2 scipy==1.11.4 seaborn==0.13.1 six==1.16.0 skorch==0.15.0 smac==2.0.2 smart-open==6.4.0 sortedcontainers==2.4.0 spacy==3.7.2 spacy-legacy==3.0.12 spacy-loggers==1.0.5 srsly==2.4.8 stack-data==0.6.3 statsmodels==0.14.2 sympy==1.12 tabulate==0.9.0 tblib==3.0.0 tenacity==8.2.3 termcolor==2.4.0 textalloc==0.0.7 thinc==8.2.2 threadpoolctl==3.2.0 tomli==2.0.1 toolz==0.12.0 torch==2.0.0 torchmetrics==1.2.1 torchvision==0.16.2 tornado==6.3.3 tqdm==4.65.0 traitlets==5.13.0 triton==2.0.0 tueplots==0.0.12 typer==0.9.0 typing_extensions==4.8.0 tzdata==2023.3 urllib3==1.26.16 venn-abers==1.4.1 wasabi==1.1.2 watchdog==3.0.0 wcwidth==0.2.9 weasel==0.3.4 wrapt==1.16.0 xgboost==2.0.2 xlrd==2.0.1 xmltodict==0.13.0 yarl==1.9.4 
zict==3.0.0 zipp==3.17.0 zstandard==0.19.0 ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["hatchling>=1.26.1"] # https://github.com/pypa/hatch/issues/1818 build-backend = "hatchling.build" [project] name = "pytabkit" dynamic = ["version"] description = 'ML models + benchmark for tabular data classification and regression' readme = "README.md" requires-python = ">=3.9" license = "Apache-2.0" keywords = ['tabular data', 'scikit-learn', 'deep learning', 'gradient boosting', 'RealMLP'] authors = [ { name = "David Holzmüller" }, #, email = "a@b.org" }, { name = "Léo Grinsztajn" }, #, email = "a@b.org" }, { name = "Ingo Steinwart" }, #, email = "a@b.org" }, ] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "License :: OSI Approved :: Apache Software License", ] dependencies = [ "torch>=2.0", "numpy>=1.25", # hopefully we don't need <2.0 anymore? "pandas>=2.0", "scikit-learn>=1.3", # these could be made optional with lazy imports # older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics used in TabR slow: # https://github.com/Lightning-AI/torchmetrics/pull/2184 "torchmetrics>=1.2.1", # can also install the newer lightning package with more dependencies instead; it will be prioritized "pytorch_lightning>=2.0", "psutil>=5.0", # used for getting the logical CPU count in the sklearn base and for getting process RAM usage ] [project.optional-dependencies] models = [ # use <2.6 for now since it can run into pickling issues with skorch if the skorch version is too old # see https://github.com/skorch-dev/skorch/commit/be93b7769d61aa22fb928d2e89e258c629bfeaf9 "torch>=2.0", "xgboost>=2.0", "catboost>=1.2", "lightgbm>=4.1", "xrfm>=0.4.3", # lower bound is not checked extensively # for rtdl models (MLP, ResNet) but also lightly used in TabR # note that scikit-learn 1.6 needs skorch >= 1.1.0 "skorch>=0.15", "dask[dataframe]>=2023", # this is here because of a pandas warning: # "Dask dataframe query planning is disabled because dask-expr is not installed" # "packaging", # unclear why this is here? "tqdm", # for TabM with verbosity >= 1 # more classification metrics and post-hoc calibrators # not necessary unless these things are actually used "probmetrics>=0.0.1", # more powerful pickle, used for file-saving and multiprocessing. # Unfortunately it can't save certain torch objects "dill", # saving objects in yaml/msgpack # needed if used in utils.serialize() / deserialize() "pyyaml>=5.0", "msgpack>=1.0", # apparently msgpack_numpy fixed some bug in using numpy arrays in msgpack? # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions. # In theory we shouldn't be using it for numpy arrays at the moment; not sure why the need for this occurred # maybe it occurred because we tried to save hyperparameters that were numpy scalars instead of python scalars # "msgpack_numpy>=0.4", # this is needed because probmetrics uses unpinned numba, # but for some reason the github actions CI wants to install 0.53.1 # which is incompatible with Python 3.11 and 3.12.
# 0.59.0 is the lowest version that is compatible with 3.12 "numba>=0.59.0", ] autogluon = [ "autogluon.tabular[all]>=1.0", "autogluon.multimodal>=1.0", ] extra = [ "kditransform>=0.2", ] hpo = [ "ConfigSpace>=0.7", "smac>=2.0", "hyperopt>=0.2", ] bench = [ "fire", # argparse utilities "ray>=2.8", # parallelization "openml>=0.14", # OpenML data download # ----- UCI import ------ "requests>=2.0", "patool>=1.0", "openpyxl>=3.0", "xlrd>=2.0", # ----- plotting ----- "matplotlib>=3.0", "tueplots>=0.0.12", "seaborn>=0.0.13", "adjustText>=1.0", "autorank>=1.0", ] dev = [ "pytest>=7.0", "pytest-cov>=4.0", "sphinx>=7.0", "myst_parser>=3.0", "sphinx_rtd_theme>=2.0", ] [tool.hatch.version] path = "pytabkit/__about__.py" [tool.hatch.envs.default] installer = "uv" features = ["models", "bench", "autogluon", "extra", "hpo", "dev"] [tool.hatch.envs.hatch-test] installer = "uv" features = ["models", "bench", "dev", "hpo"] #features = ["models","bench","autogluon","extra","hpo","dev"] [tool.hatch.build.targets.sdist] package = ['pytabkit'] only-include = ['pytabkit'] [tool.hatch.build.targets.wheel] package = ['pytabkit'] only-include = ['pytabkit'] [project.urls] Documentation = "https://github.com/dholzmueller/pytabkit#readme" Issues = "https://github.com/dholzmueller/pytabkit/issues" Source = "https://github.com/dholzmueller/pytabkit" [tool.hatch.envs.types] extra-dependencies = [ "mypy>=1.0.0", ] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive {args:pytabkit tests}" [tool.coverage.run] source_pkgs = ["pytabkit", "tests"] branch = true parallel = true omit = [ "pytabkit/__about__.py", ] [tool.coverage.paths] models = ["pytabkit/models", "*/pytabkit/pytabkit/models"] bench = ["pytabkit/bench", "*/pytabkit/pytabkit/bench"] tests = ["tests", "*/pytabkit/tests"] [tool.coverage.report] exclude_lines = [ "no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] ================================================ FILE: pytabkit/__about__.py ================================================ # SPDX-FileCopyrightText: 2024-present David Holzmüller # # SPDX-License-Identifier: Apache-2.0 __version__ = "1.7.3" ================================================ FILE: pytabkit/__init__.py ================================================ from .models.sklearn.sklearn_interfaces import * ================================================ FILE: pytabkit/bench/__init__.py ================================================ ================================================ FILE: pytabkit/bench/alg_wrappers/__init__.py ================================================ ================================================ FILE: pytabkit/bench/alg_wrappers/general.py ================================================ from pathlib import Path from typing import List, Dict, Optional from pytabkit.bench.data.tasks import TaskPackage, TaskInfo from pytabkit.bench.run.results import ResultManager from pytabkit.models.training.logging import Logger from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.training.metrics import Metrics class AlgWrapper: """ Base class for ML methods that can be run in the benchmarking code. """ def __init__(self, **config): """ Constructor. :param config: Configuration parameters. 
""" self.config = config def run(self, task_package: TaskPackage, logger: Logger, assigned_resources: NodeResources, tmp_folders: List[Path], metrics: Optional[Metrics] = None) -> Dict[str, List[ResultManager]]: """ Run the ML method on the given task. Should be overridden in subclasses. :param task_package: Information about the task to be run. :param logger: Logger. :param assigned_resources: Assigned resources (e.g. number of threads). :param tmp_folders: Temporary folders, one for each train/test split, to save temporary data to. :return: A dictionary of lists of ResultManager objects. The dict key is the predict params name, which is used as a suffix for the alg_name, and each list contains ResultManagers for each train/test split. """ raise NotImplementedError() def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: """ Should be overridden in subclasses. :param task_package: Information about the task that should be executed. :return: Information about the estimated required resources that will be needed to run this task. """ raise NotImplementedError() def get_max_n_vectorized(self, task_info: TaskInfo) -> int: """ Returns 1 by default, should be overridden in subclasses if they benefit from vectorization. :param task_info: Information about the task that this method should run on. :return: Maximum number of train/test splits that this method can be run on at once. """ return 1 def get_pred_param_names(self, task_package: TaskPackage) -> List[str]: """ Return the possible prediction parameter names, used as suffixes for alg names :param task_package: Task package. :return: List of the possible names. """ raise NotImplementedError() # want to have: # - more general / easy ResourceComputation # - generic thread-allocation parameters for such a ResourceComputation # that allow to allocate more threads for larger workloads # - better NodeResources class that supports mps or perhaps a new class that summarizes the allocated resources # - should the resource estimation be moved to AlgInterface? # Then, we would need to instantiate an AlgInterface in the wrapper to do the estimation # - maybe a code that estimates RAM (and time) constants? With fake data sets? # better ResourceComputation: # have identical components for CPU and GPU, and maybe also for RAM and time # components: # - dataset size # - factory (model) size # - RAM for forward (and backward) pass # - generic calculation (constant, per-tree, per-class, per-sample), # for the NN we might also need to include the batch size, number of epochs, etc. # what about the number of threads etc.? # want to have one per device? # better NodeResources: # maybe just have a dict with the devices that are being referred to by the array? 
================================================ FILE: pytabkit/bench/alg_wrappers/interface_wrappers.py ================================================ import shutil from pathlib import Path from typing import Callable, List, Optional, Dict import torch from pytabkit.bench.data.paths import Paths from pytabkit.models import utils from pytabkit.models.alg_interfaces.autogluon_model_interfaces import AutoGluonModelAlgInterface from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface, CatBoostHyperoptAlgInterface, \ CatBoostSklearnSubSplitInterface, RandomParamsCatBoostAlgInterface from pytabkit.models.alg_interfaces.ensemble_interfaces import PrecomputedPredictionsAlgInterface, \ CaruanaEnsembleAlgInterface, AlgorithmSelectionAlgInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface, LGBMHyperoptAlgInterface, \ LGBMSklearnSubSplitInterface, RandomParamsLGBMAlgInterface from pytabkit.bench.alg_wrappers.general import AlgWrapper from pytabkit.bench.data.tasks import TaskPackage, TaskInfo from pytabkit.bench.run.results import ResultManager from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface, SklearnMLPSubSplitInterface, \ KANSubSplitInterface, GrandeSubSplitInterface, GBTSubSplitInterface, RandomParamsRFAlgInterface, \ TabPFN2SubSplitInterface, TabICLSubSplitInterface, RandomParamsExtraTreesAlgInterface, RandomParamsKNNAlgInterface, \ ExtraTreesSubSplitInterface, KNNSubSplitInterface, RandomParamsLinearModelAlgInterface, \ LinearModelSubSplitInterface from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, MultiSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, RequiredResources from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface, ResnetSubSplitInterface, \ FTTransformerSubSplitInterface, RandomParamsResnetAlgInterface, RandomParamsRTDLMLPAlgInterface, \ RandomParamsFTTransformerAlgInterface from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.tabm_interface import TabMSubSplitInterface from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface, \ RandomParamsTabRAlgInterface from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface, RandomParamsNNAlgInterface, \ NNHyperoptAlgInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface, XGBHyperoptAlgInterface, \ XGBSklearnSubSplitInterface, RandomParamsXGBAlgInterface from pytabkit.models.alg_interfaces.xrfm_interfaces import xRFMSubSplitInterface, RandomParamsxRFMAlgInterface from pytabkit.models.data.data import TaskType, DictDataset from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.torch_utils import TorchTimer from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics # what is the value of wrappers around AlgInterface? # - it has a create-function that can create multiple instances, # and can wrap with MultiSplitAlgInterface and SingleSplitAlgInterface # - there is some wrapping code in run(), but this could be moved to where the wrapper is used # - it provides get_max_n_vectorized() # perhaps we should generalize TreeResourceComputation to also work for NNs? # But this would require extra functionality for backprop, GPU RAM, etc. 
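# Example (illustration only): the simplest use of the AlgInterfaceWrapper class defined below is to
# pass a factory function, which create_alg_interface() then calls with the stored config, e.g.:
#     wrapper = AlgInterfaceWrapper(NNAlgInterface, lr=0.01)  # lr is a hypothetical hyperparameter
#     alg_interface = wrapper.create_alg_interface(task_package)  # -> NNAlgInterface(lr=0.01)
# This is exactly what NNInterfaceWrapper does via super().__init__(NNAlgInterface, **config).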
def get_prep_factory(**config): return config.get('factory', None) or PreprocessingFactory(**config) class AlgInterfaceWrapper(AlgWrapper): """ Base class for wrapping AlgInterface classes for benchmarking. """ def __init__(self, create_alg_interface_fn: Optional[Callable[[...], AlgInterface]], **config): """ Constructor. :param create_alg_interface_fn: Function to create an AlgInterface via create_alg_interface_fn(**config). :param config: Configuration parameters. """ super().__init__(**config) self.create_alg_interface_fn = create_alg_interface_fn # def _create_alg_interface_impl(self, n_cv: int, n_splits: int, task_type: TaskType) -> AlgInterface: def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: """ Factory method to create an AlgInterface. Should be overridden unless ``create_alg_interface_fn`` has been provided in the constructor. This method should not be used directly, instead create_alg_interface() should be used. :param task_package: Task information. :return: An AlgInterface corresponding to an ML method. """ if self.create_alg_interface_fn is not None: return self.create_alg_interface_fn(**self.config) else: raise NotImplementedError() def create_alg_interface(self, task_package: TaskPackage) -> AlgInterface: """ Method to create an AlgInterface. :param task_package: Task information. :return: An AlgInterface corresponding to an ML method. """ alg_interface = self._create_alg_interface_impl(task_package) if 'calibration_method' in self.config: try: from pytabkit.models.alg_interfaces.calibration import PostHocCalibrationAlgInterface alg_interface = PostHocCalibrationAlgInterface(alg_interface, **self.config) except ImportError: raise ValueError('Calibration methods are not implemented') if 'quantile_calib_alpha' in self.config: try: from pytabkit.models.alg_interfaces.custom_interfaces import QuantileCalibrationAlgInterface alg_interface = QuantileCalibrationAlgInterface(alg_interface, **self.config) except ImportError: raise ValueError('Quantile Calibration methods are not implemented') return alg_interface def run(self, task_package: TaskPackage, logger: Logger, assigned_resources: NodeResources, tmp_folders: List[Path], metrics: Optional[Metrics] = None) -> Dict[str, List[ResultManager]]: task = task_package.task_info.load_task(task_package.paths) task_desc = task_package.task_info.task_desc n_cv = task_package.n_cv n_refit = task_package.n_refit interface_resources = assigned_resources.get_interface_resources() old_torch_n_threads = torch.get_num_threads() old_torch_n_interop_threads = torch.get_num_interop_threads() torch.set_num_threads(interface_resources.n_threads) # don't set this because it can throw # Error: cannot set number of interop threads after parallel work has started or set_num_interop_threads called # torch.set_num_interop_threads(interface_resources.n_threads) ds = task.ds name = 'alg ' + task_package.alg_name + ' on task ' + str(task_desc) # return_preds = self.config.get(f'save_y_pred', False) return_preds = task_package.save_y_pred if metrics is None: metrics = Metrics.defaults(ds.tensor_infos['y'].cat_sizes, val_metric_name=self.config.get('val_metric_name', None)) cv_idxs_list = [] refit_idxs_list = [] n_splits = len(task_package.split_infos) if n_splits == 1: logger.log(1, f'Running on split {task_package.split_infos[0].id} of task {task_package.task_info.task_desc}') else: logger.log(1, f'Running on {n_splits} splits of task {task_package.task_info.task_desc}') for split_id, split_info in 
enumerate(task_package.split_infos): # this will usually be called with len(task_package.split_infos) == 1, but do a loop for safety test_split = split_info.splitter.split_ds(task.ds) trainval_idxs, test_idxs = test_split.idxs[0], test_split.idxs[1] trainval_ds = test_split.get_sub_ds(0) cv_sub_splits = split_info.get_sub_splits(trainval_ds, n_splits=n_cv, is_cv=True) cv_train_idxs = [] cv_val_idxs = [] for sub_idx, sub_split in enumerate(cv_sub_splits): cv_train_idxs.append(trainval_idxs[sub_split.idxs[0]]) cv_val_idxs.append(trainval_idxs[sub_split.idxs[1]]) cv_train_idxs = torch.stack(cv_train_idxs, dim=0) cv_val_idxs = torch.stack(cv_val_idxs, dim=0) cv_alg_seeds = [split_info.get_sub_seed(split_idx, is_cv=True) for split_idx in range(n_cv)] cv_idxs_list.append(SplitIdxs(cv_train_idxs, cv_val_idxs, test_idxs, split_seed=split_info.alg_seed, sub_split_seeds=cv_alg_seeds, split_id=split_id)) if n_refit > 0: refit_train_idxs = torch.stack([trainval_idxs] * n_refit, dim=0) refit_alg_seeds = [split_info.get_sub_seed(split_idx, is_cv=False) for split_idx in range(n_refit)] refit_idxs_list.append(SplitIdxs(refit_train_idxs, None, test_idxs, split_seed=split_info.alg_seed, sub_split_seeds=refit_alg_seeds, split_id=split_id)) if task_package.rerun: for tmp_folder in tmp_folders: if utils.existsDir(tmp_folder): # delete the folder such that the method doesn't load old results from the tmp folder shutil.rmtree(tmp_folder) cv_tmp_folders = [tmp_folder / 'cv' for tmp_folder in tmp_folders] refit_tmp_folders = [tmp_folder / 'refit' for tmp_folder in tmp_folders] cv_alg_interface = self.create_alg_interface(task_package) pred_param_names = list(cv_alg_interface.get_available_predict_params().keys()) if n_refit > 0 and len(pred_param_names) > 1: raise NotImplementedError('Refitting with multiple prediction parameters is currently not implemented') rms = {name: [ResultManager() for _ in task_package.split_infos] for name in pred_param_names} with TorchTimer() as cv_fit_timer: cv_alg_interface.fit(ds, cv_idxs_list, interface_resources, logger, cv_tmp_folders, name) for pred_param_name in pred_param_names: cv_alg_interface.set_current_predict_params(pred_param_name) with TorchTimer() as cv_eval_timer: cv_results_list = cv_alg_interface.eval(ds, cv_idxs_list, metrics, return_preds) for rm, cv_results in zip(rms[pred_param_name], cv_results_list): rm.add_results(is_cv=True, results_dict=cv_results.get_dict() | dict(fit_time_s=cv_fit_timer.elapsed, eval_time_s=cv_eval_timer.elapsed)) if n_refit > 0: refit_alg_interface = cv_alg_interface.get_refit_interface(n_refit) with TorchTimer() as refit_fit_timer: refit_alg_interface.fit(ds, refit_idxs_list, interface_resources, logger, refit_tmp_folders, name) with TorchTimer() as refit_eval_timer: refit_results_list = refit_alg_interface.eval(ds, refit_idxs_list, metrics, return_preds) for rm, refit_results in zip(rms[pred_param_name], refit_results_list): rm.add_results(is_cv=False, results_dict=refit_results.get_dict() | dict(fit_time_s=refit_fit_timer.elapsed, eval_time_s=refit_eval_timer.elapsed)) torch.set_num_threads(old_torch_n_threads) # torch.set_num_interop_threads(old_torch_n_interop_threads) return rms def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: ds = DictDataset(tensors=None, tensor_infos=task_package.task_info.tensor_infos, device='cpu', n_samples=task_package.task_info.n_samples) alg_interface = self.create_alg_interface(task_package) n_train, n_val = 
task_package.split_infos[0].get_train_and_val_size(n_samples=task_package.task_info.n_samples, n_splits=len(task_package.split_infos), is_cv=True) # n_train = split_info.get_sub_splits(trainval_ds, n_splits=n_cv, is_cv=True) return alg_interface.get_required_resources(ds=ds, n_cv=task_package.n_cv, n_refit=task_package.n_refit, n_splits=len(task_package.split_infos), split_seeds=[si.alg_seed for si in task_package.split_infos], n_train=n_train) def get_pred_param_names(self, task_package: TaskPackage) -> List[str]: return list(self.create_alg_interface(task_package).get_available_predict_params().keys()) class LoadResultsWrapper(AlgInterfaceWrapper): def __init__(self, alg_name: str, **config): super().__init__(create_alg_interface_fn=None, **config) self.alg_name = alg_name def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: assert len(task_package.split_infos) == 1 # only support single-split paths = self.config.get('paths', Paths.from_env_variables()) task_info = task_package.task_info split_info = task_package.split_infos[0] split_id = split_info.id results_path = paths.results_alg_task_split(task_desc=task_info.task_desc, alg_name=self.alg_name, n_cv=task_package.n_cv, split_type=split_info.split_type, split_id=split_id) rm = ResultManager.load(results_path) y_preds_cv = rm.y_preds_cv if rm.y_preds_cv is not None else rm.other_dict['cv']['y_preds'] y_preds_cv = torch.as_tensor(y_preds_cv, dtype=torch.float32) y_preds_refit = None if rm.y_preds_refit is not None: y_preds_refit = torch.as_tensor(rm.y_preds_refit, dtype=torch.float32) elif 'refit' in rm.other_dict: y_preds_refit = torch.as_tensor(rm.other_dict['refit']['y_preds'], dtype=torch.float32) fit_params_cv = rm.other_dict['cv']['fit_params'] fit_params_refit = None if 'refit' not in rm.other_dict else rm.other_dict['refit']['fit_params'] return PrecomputedPredictionsAlgInterface(y_preds_cv=y_preds_cv, y_preds_refit=y_preds_refit, fit_params_cv=fit_params_cv, fit_params_refit=fit_params_refit) def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: # do this here such that we don't have to load the results for computing the required resources return RequiredResources(time_s=1e-5 * task_package.task_info.n_samples, cpu_ram_gb=1.5, n_threads=1) class CaruanaEnsembleWrapper(AlgInterfaceWrapper): def __init__(self, sub_wrappers: List[AlgInterfaceWrapper], **config): super().__init__(create_alg_interface_fn=None, **config) self.sub_wrappers = sub_wrappers def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: single_split_alg_interfaces = [] for split_info in task_package.split_infos: single_alg_interfaces = [] for sub_wrapper in self.sub_wrappers: sub_tp = TaskPackage(task_info=task_package.task_info, split_infos=[split_info], n_cv=task_package.n_cv, n_refit=task_package.n_refit, paths=task_package.paths, rerun=task_package.rerun, alg_name=task_package.alg_name, save_y_pred=task_package.save_y_pred) single_alg_interfaces.append(sub_wrapper.create_alg_interface(sub_tp)) single_split_alg_interfaces.append(CaruanaEnsembleAlgInterface(single_alg_interfaces, **self.config)) return MultiSplitWrapperAlgInterface(single_split_alg_interfaces) def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: single_resources = [sub_wrapper.get_required_resources(task_package) for sub_wrapper in self.sub_wrappers] return RequiredResources.combine_sequential(single_resources) class AlgorithmSelectionWrapper(AlgInterfaceWrapper): def __init__(self, 
sub_wrappers: List[AlgInterfaceWrapper], **config): super().__init__(create_alg_interface_fn=None, **config) self.sub_wrappers = sub_wrappers def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: single_split_alg_interfaces = [] for split_info in task_package.split_infos: single_alg_interfaces = [] for sub_wrapper in self.sub_wrappers: sub_tp = TaskPackage(task_info=task_package.task_info, split_infos=[split_info], n_cv=task_package.n_cv, n_refit=task_package.n_refit, paths=task_package.paths, rerun=task_package.rerun, alg_name=task_package.alg_name, save_y_pred=task_package.save_y_pred) single_alg_interfaces.append(sub_wrapper.create_alg_interface(sub_tp)) single_split_alg_interfaces.append(AlgorithmSelectionAlgInterface(single_alg_interfaces, **self.config)) return MultiSplitWrapperAlgInterface(single_split_alg_interfaces) def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: # too pessimistic for refit... single_resources = [sub_wrapper.get_required_resources(task_package) for sub_wrapper in self.sub_wrappers] return RequiredResources.combine_sequential(single_resources) class MultiSplitAlgInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(create_alg_interface_fn=None, **config) def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: raise NotImplementedError() def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: n_cv = task_package.n_cv task_type = task_package.task_info.task_type n_splits = len(task_package.split_infos) return MultiSplitWrapperAlgInterface( single_split_interfaces=[self.create_single_alg_interface(n_cv, task_type) for i in range(n_splits)], **self.config) class SubSplitInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def __init__(self, create_sub_split_learner_fn: Optional[Callable[[...], AlgInterface]] = None, **config): super().__init__(**config) self.create_sub_split_learner_fn = create_sub_split_learner_fn def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: if self.create_sub_split_learner_fn is not None: return self.create_sub_split_learner_fn(**self.config) raise NotImplementedError() def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return SingleSplitWrapperAlgInterface([self.create_sub_split_interface(task_type) for i in range(n_cv)], **self.config) class NNInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(NNAlgInterface, **config) def get_max_n_vectorized(self, task_info: TaskInfo) -> int: ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) max_ram_gb = 8.0 max_n_vectorized = self.config.get('max_n_vectorized', 50) alg_interface = NNAlgInterface(**self.config) while max_n_vectorized > 1: required_resources = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=max_n_vectorized, split_seeds=[0] * max_n_vectorized, n_train=task_info.n_samples) if required_resources.gpu_ram_gb <= max_ram_gb and required_resources.cpu_ram_gb <= max_ram_gb: return max_n_vectorized max_n_vectorized -= 1 return 1 class NNHyperoptInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(NNHyperoptAlgInterface, **config) def get_max_n_vectorized(self, task_info: TaskInfo) -> int: ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) max_ram_gb = 8.0 max_n_vectorized = 
self.config.get('max_n_vectorized', 50) alg_interface = NNHyperoptAlgInterface(**self.config) while max_n_vectorized > 1: required_resources = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=max_n_vectorized, split_seeds=[0] * max_n_vectorized, n_train=task_info.n_samples) if required_resources.gpu_ram_gb <= max_ram_gb and required_resources.cpu_ram_gb <= max_ram_gb: return max_n_vectorized max_n_vectorized -= 1 return 1 class RandomParamsNNInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsNNAlgInterface, model_idx=model_idx, **config) class LGBMSklearnInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType): return LGBMSklearnSubSplitInterface(**self.config) class LGBMInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return LGBMSubSplitInterface(**self.config) class LGBMHyperoptInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return LGBMHyperoptAlgInterface(**self.config) class RandomParamsLGBMInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsLGBMAlgInterface(**self.config) class XGBSklearnInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return XGBSklearnSubSplitInterface(**self.config) class XGBInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return XGBSubSplitInterface(**self.config) class RandomParamsXGBInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsXGBAlgInterface(**self.config) class XGBHyperoptInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return XGBHyperoptAlgInterface(**self.config) class CatBoostSklearnInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return CatBoostSklearnSubSplitInterface(**self.config) class CatBoostInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return CatBoostSubSplitInterface(**self.config) class CatBoostHyperoptInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return CatBoostHyperoptAlgInterface(**self.config) class RandomParamsCatBoostInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsCatBoostAlgInterface(**self.config) class RFInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return RFSubSplitInterface(**self.config) class ExtraTreesInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return ExtraTreesSubSplitInterface(**self.config) class KNNInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return KNNSubSplitInterface(**self.config) 
class LinearModelInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return LinearModelSubSplitInterface(**self.config) class GBTInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return GBTSubSplitInterface(**self.config) class SklearnMLPInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return SklearnMLPSubSplitInterface(**self.config) class KANInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return KANSubSplitInterface(**self.config) class GrandeInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return GrandeSubSplitInterface(**self.config) class TabPFN2InterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabPFN2SubSplitInterface(**self.config) class TabICLInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabICLSubSplitInterface(**self.config) class MLPRTDLInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return RTDL_MLPSubSplitInterface(**self.config) class ResNetRTDLInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return ResnetSubSplitInterface(**self.config) class FTTransformerInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return FTTransformerSubSplitInterface(**self.config) class TabRInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabRSubSplitInterface(**self.config) class TabMInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabMSubSplitInterface(**self.config) class RandomParamsResnetInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsResnetAlgInterface, model_idx=model_idx, **config) class RandomParamsRTDLMLPInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsRTDLMLPAlgInterface, model_idx=model_idx, **config) class RandomParamsFTTransformerInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsFTTransformerAlgInterface, model_idx=model_idx, **config) class AutoGluonModelInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(AutoGluonModelAlgInterface, **config) class RandomParamsTabRInterfaceWrapper(SubSplitInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsTabRAlgInterface(**self.config) class RandomParamsRFInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e.
start from zero) super().__init__(RandomParamsRFAlgInterface, model_idx=model_idx, **config) class RandomParamsExtraTreesInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsExtraTreesAlgInterface, model_idx=model_idx, **config) class RandomParamsKNNInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsKNNAlgInterface, model_idx=model_idx, **config) class RandomParamsLinearModelInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsLinearModelAlgInterface, model_idx=model_idx, **config) class xRFMInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return xRFMSubSplitInterface(**self.config) class RandomParamsxRFMInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsxRFMAlgInterface(**self.config) ================================================ FILE: pytabkit/bench/data/__init__.py ================================================ ================================================ FILE: pytabkit/bench/data/common.py ================================================ class TaskSource: UCI_BIN_CLASS = 'uci-bin-class' UCI_MULTI_CLASS = 'uci-multi-class' UCI_REGRESSION = 'uci-reg' OPENML_CLASS = 'openml-class' OPENML_CLASS_BIN_EXTRA = 'openml-class-bin-extra' OPENML_REGRESSION = 'openml-reg' AUTOML_CLASS_SMALL = 'automl-class-small' TABARENA_CLASS = 'tabarena-class' TABARENA_REG = 'tabarena-reg' CUSTOM = 'custom' class SplitType: RANDOM = 'random-split' DEFAULT = 'default-split' ================================================ FILE: pytabkit/bench/data/get_uci.py ================================================ #!/usr/bin/python3 import os import shutil import ssl import pandas from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.uci_file_ops import prepare_new_data_set_group_id, download_and_save, replace_chars_in_file, \ load_raw_data, remove_columns, save_data_to_file, unzip_raw_data, concat_files, remove_files, UCIVars, \ move_label_in_front, remove_rows_with_label, ungz_raw_data, load_mixed_raw_data, \ auto_replace_categories_in_mixed_data, write_mixed_raw_data, replace_ordinals_in_mixed_data, \ replace_isodate_by_day_in_mixed_data, replace_circulars_in_mixed_data, get_categories_in_mixed_data, \ replace_time_by_seconds_in_mixed_data, unrar_raw_data, unarff_raw_data, un_z_raw_data, untar_raw_data, \ replace_categories_in_mixed_data, replace_bin_cats_in_mixed_data, auto_replace_missing_in_mixed_data, \ replace_manual_in_mixed_data from pytabkit.models import utils import numpy import sklearn.datasets as datasets import re as re #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- 
#--------------------------------------------------------------------------------------------------- def get_skill_craft(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00272/SkillCraft1_Dataset.csv', 'skill_craft.data') replace_chars_in_file('skill_craft.data', '"', '') data = load_raw_data('skill_craft.data', sep = ',') data = remove_columns(data, [0]) save_data_to_file(data, 'skill_craft', is_classification = True) #--------------------------------------------------------------------------------------------------- def get_cargo_2000(): prepare_new_data_set_group_id() print("Cargo 2000 data set is currently not processed since:") print(" - from the description it is completely unclear how this data set can be used") #--------------------------------------------------------------------------------------------------- def get_KDC_4007(): prepare_new_data_set_group_id() print("KDC 4007 data set is currently not processed since:") print(" - from the description it is completely unclear how this data set can be used") #--------------------------------------------------------------------------------------------------- def get_sml2010(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00274/NEW-DATA.zip', 'sml2010.zip') unzip_raw_data('sml2010.zip') concat_files(UCIVars.raw_data_folder + 'NEW-DATA*.txt', UCIVars.raw_data_folder + 'sml2010.data') remove_files(UCIVars.raw_data_folder, 'NEW-DATA*.txt') replace_chars_in_file('sml2010.data', '#', '') data = load_raw_data('sml2010.data', sep = ' ', description_columns = 2) data_dining = remove_columns(data, [1]) save_data_to_file(data_dining, 'sml2010_dining', is_classification = False) data_room = remove_columns(data, [0]) save_data_to_file(data_room, 'sml2010_room', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_wine_quality(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', 'wine_quality_red.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', 'wine_quality_white.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names', 'wine_quality.description') # The first task is to create data sets in which the quality is the label. # To this end, we add a column at the right, which indicates whether the wine is white or red. data_white = load_raw_data('wine_quality_white.data', sep = ';', header = True) data_white = move_label_in_front(data_white, 11) white_label = numpy.ones((numpy.shape(data_white)[0], 1)) data_white = numpy.concatenate((data_white, white_label), axis = 1) save_data_to_file(data_white, 'wine_quality_white', is_classification = True) data_red = load_raw_data('wine_quality_red.data', sep = ';', header = True) data_red = move_label_in_front(data_red, 11) red_label = numpy.zeros((numpy.shape(data_red)[0], 1)) data_red = numpy.concatenate((data_red, red_label), axis = 1) data_all = numpy.concatenate((data_red, data_white), axis = 0) save_data_to_file(data_all, 'wine_quality_all', is_classification = True) # The next task is to combine the white and red wine data sets and # to add a label describing the color of the wine.
We further remove # the quality of the wine, since this may give too much information # about the color. data_white = load_raw_data('wine_quality_white.data', sep = ';', header = True) data_white = remove_columns(data_white, [11]) white_label = numpy.ones((numpy.shape(data_white)[0], 1)) data_white = numpy.concatenate((white_label, data_white), axis = 1) data_red = load_raw_data('wine_quality_red.data', sep = ';', header = True) data_red = remove_columns(data_red, [11]) red_label = numpy.zeros((numpy.shape(data_red)[0], 1)) data_red = numpy.concatenate((red_label, data_red), axis = 1) data_all = numpy.concatenate((data_red, data_white), axis = 0) save_data_to_file(data_all, 'wine_quality_type', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_parkinson(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data', 'parkinson_updrs.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.names', 'parkinson_updrs.description') data = load_raw_data('parkinson_updrs.data', sep = ',', description_columns = 1) # The data has two variables that can be predicted, namely updrs_motor and updrs_total. # For both prediction tasks, the other target variable needs to be removed from the data data_motor = remove_columns(data, [4]) data_motor = move_label_in_front(data_motor, 3) save_data_to_file(data_motor, 'parkinson_motor', is_classification = False) data_total = remove_columns(data, [3]) data_total = move_label_in_front(data_total, 3) save_data_to_file(data_total, 'parkinson_total', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_insurance_benchmark(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt', 'insurance_benchmark.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticeval2000.txt', 'insurance_benchmark.test.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/tictgts2000.txt', 'insurance_benchmark.test.labels.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/TicDataDescr.txt', 'insurance_benchmark.description') train_data = load_raw_data('insurance_benchmark.train.data', sep = '\t') test_data = load_raw_data('insurance_benchmark.test.data', sep = '\t') test_label = load_raw_data('insurance_benchmark.test.labels.data', sep = '\t') test_data = numpy.concatenate((test_data, test_label), axis = 1) data = numpy.concatenate((train_data, test_data), axis = 0) data = move_label_in_front(data, 85) save_data_to_file(data, 'insurance_benchmark', is_classification = True) #--------------------------------------------------------------------------------------------------- def get_EEG_steady_state(): prepare_new_data_set_group_id() print("EEG Steady State Visual data set is currently not processed since:") print(" - the description indicates that it is time series data") #--------------------------------------------------------------------------------------------------- def get_air_quality(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip', 
'air_quality.zip') unzip_raw_data('air_quality.zip') os.rename(UCIVars.raw_data_folder + 'AirQualityUCI.csv', UCIVars.raw_data_folder + 'air_quality.data') os.remove(UCIVars.raw_data_folder + 'AirQualityUCI.xlsx') data = load_raw_data('air_quality.data', sep = ';', date_column = 0, date_sep = '/', date_order = 'dmY', time_column = 1, time_sep = '.', german_decimal = True) # The data has five variables that can be predicted, # namely those in columns 2, 4, 5, 7, and 9 (C++ like). # For these prediction tasks, the other target variables # need to be removed from the data. data_co2 = remove_columns(data, [4, 5, 7, 9]) data_co2 = move_label_in_front(data_co2, 2) data_co2 = remove_rows_with_label(data_co2, -200.0) save_data_to_file(data_co2, 'air_quality_co2', is_classification = False) # The hydrocarbon reference measurements have only been taken 914 times # For this reason, they are not included in the constructed data sets. data_bc = remove_columns(data, [2, 4, 7, 9]) data_bc = move_label_in_front(data_bc, 3) data_bc = remove_rows_with_label(data_bc, -200.0) save_data_to_file(data_bc, 'air_quality_bc', is_classification = False) data_nox = remove_columns(data, [2, 4, 5, 9]) data_nox = move_label_in_front(data_nox, 4) data_nox = remove_rows_with_label(data_nox, -200.0) save_data_to_file(data_nox, 'air_quality_nox', is_classification = False) data_no2 = remove_columns(data, [2, 4, 5, 7]) data_no2 = move_label_in_front(data_no2, 5) data_no2 = remove_rows_with_label(data_no2, -200.0) save_data_to_file(data_no2, 'air_quality_no2', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_cycle_power_plant(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip', 'cycle_power_plant.zip') unzip_raw_data('cycle_power_plant.zip') # The zip file contains some junk and in addition, the data is in EXCEL format. 
This is addressed now: excel_data = pandas.read_excel(UCIVars.raw_data_folder + 'CCPP/Folds5x2_pp.xlsx', engine = 'openpyxl') excel_data.to_csv(UCIVars.raw_data_folder + 'cycle_power_plant.data') shutil.rmtree(UCIVars.raw_data_folder + 'CCPP') # The response variable is in the last column data = load_raw_data('cycle_power_plant.data', sep = ',', description_columns = 1) data = move_label_in_front(data, 4) save_data_to_file(data, 'cycle_power_plant', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_carbon_nanotubes(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00448/carbon_nanotubes.csv', 'carbon_nanotubes.data') data = load_raw_data('carbon_nanotubes.data', sep = ';', german_decimal = True) data_u = remove_columns(data, [6, 7]) data_u = move_label_in_front(data_u, 5) save_data_to_file(data_u, 'carbon_nanotubes_u', is_classification = False) data_v = remove_columns(data, [5, 7]) data_v = move_label_in_front(data_v, 5) save_data_to_file(data_v, 'carbon_nanotubes_v', is_classification = False) data_w = remove_columns(data, [5, 6]) data_w = move_label_in_front(data_w, 5) save_data_to_file(data_w, 'carbon_nanotubes_w', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_naval_propulsion(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00316/UCI%20CBM%20Dataset.zip', 'naval_propulsion.zip') unzip_raw_data('naval_propulsion.zip') # The zip file contains quite a bit of junk, which is removed in the following shutil.copy(UCIVars.raw_data_folder + 'UCI CBM Dataset/data.txt', UCIVars.raw_data_folder + 'naval_propulsion.data') shutil.copy(UCIVars.raw_data_folder + 'UCI CBM Dataset/Features.txt', UCIVars.raw_data_folder + 'naval_propulsion.features.txt') shutil.copy(UCIVars.raw_data_folder + 'UCI CBM Dataset/README.txt', UCIVars.raw_data_folder + 'naval_propulsion.description') shutil.rmtree(UCIVars.raw_data_folder + 'UCI CBM Dataset/') shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX') data = load_raw_data('naval_propulsion.data', sep = ' ') # The data has actually three response variables, but one of those, namely the ship speed # is affine linear in the lever position, which is also recorded in the data. For this # reason, only the other two response variables are considered. 
data_comp = remove_columns(data, [17]) data_comp = move_label_in_front(data_comp, 16) save_data_to_file(data_comp, 'naval_propulsion_comp', is_classification = False) data_turb = remove_columns(data, [16]) data_turb = move_label_in_front(data_turb, 16) save_data_to_file(data_turb, 'naval_propulsion_turb', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_blood_pressure(): prepare_new_data_set_group_id() print("Cuff-Less Blood pressure Estimation is currently not processed since:") print(" - the zip file is about 3.1GB large") print(" - the description indicates that each of the three features is actually a times series") print(" - the file is in matlab format") #print('The following download may take a while, since the .zip file is about 3.1GB large.') #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00340/data.zip', 'blood_pressure.zip') #unzip_raw_data('blood_pressure.zip') #--------------------------------------------------------------------------------------------------- def get_gas_sensor_drift(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00270/driftdataset.zip', 'gas_sensor_drift.zip') unzip_raw_data('gas_sensor_drift.zip') concat_files(UCIVars.raw_data_folder + 'batch*.dat', UCIVars.raw_data_folder + 'gas_sensor_drift.data') remove_files(UCIVars.raw_data_folder, 'batch*.dat') # Next we need to replace ; by , in .data file, since otherwise the routines for libsvm-like formats won't work. # Also, the first label is multiplied by 10000 since the routine for libsvm-like formats seem to sort the # labels. By multiplying the label by 10000, we actually can guarantee that the first label is always the larger # one, so that the routine places it at the second position in the list of labels. # Then we read a libsvm like file with multiple labels and convert it from Compressed Sparse Row format to normal format replace_chars_in_file('gas_sensor_drift.data', ';', '0000,') data = datasets.load_svmlight_file(UCIVars.raw_data_folder + 'gas_sensor_drift.data', multilabel = True) x_data = data[0].toarray() all_labels = numpy.reshape(data[1], newshape = (-1, 2)) ## The data has two response variables, one indicating which chemical is measured ## and one reporting its concentration. We simply take both as being of interest ... 
class_labels = numpy.reshape(all_labels[ :, 1], newshape = (-1, 1)) / 10000.0 data_class = numpy.concatenate((class_labels, x_data), axis = 1) save_data_to_file(data_class, 'gas_sensor_drift_class', is_classification = True) conc_labels = numpy.reshape(all_labels[ :, 0], newshape = (-1, 1)) data_conc = numpy.concatenate((conc_labels, x_data), axis = 1) save_data_to_file(data_conc, 'gas_sensor_drift_conc', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_bike_sharing(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip', 'bike_sharing.zip') unzip_raw_data('bike_sharing.zip') os.remove(UCIVars.raw_data_folder + 'day.csv') os.rename(UCIVars.raw_data_folder + 'hour.csv', UCIVars.raw_data_folder + 'bike_sharing.data') os.rename(UCIVars.raw_data_folder + 'Readme.txt', UCIVars.raw_data_folder + 'bike_sharing.description') data = load_raw_data('bike_sharing.data', sep = ',', description_columns = 2) data_casual = remove_columns(data, [13, 14]) data_casual = move_label_in_front(data_casual, 12) save_data_to_file(data_casual, 'bike_sharing_casual', is_classification = False) data_total = remove_columns(data, [12, 13]) data_total = move_label_in_front(data_total, 12) save_data_to_file(data_total, 'bike_sharing_total', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_appliances_energy(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv', 'appliances_energy.data') # The data entries are saved as strings, that is as "...". In addition, date and time are not separated by commas. # The following lines cure this. 
replace_chars_in_file('appliances_energy.data', '"', '') replace_chars_in_file('appliances_energy.data', ', ', ',') replace_chars_in_file('appliances_energy.data', ', ', ',') replace_chars_in_file('appliances_energy.data', ' ', ',') data = load_raw_data('appliances_energy.data', sep = ',', date_column = 0, date_sep = '-', date_order = 'Ymd', time_column = 1, time_sep = ':') data = move_label_in_front(data, 2) save_data_to_file(data, 'appliances_energy', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_indoor_loc(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00310/UJIndoorLoc.zip', 'indoor_loc.zip') unzip_raw_data('indoor_loc.zip') os.rename(UCIVars.raw_data_folder + 'UJIndoorLoc/trainingData.csv', UCIVars.raw_data_folder + 'indoor_loc.train.csv') os.rename(UCIVars.raw_data_folder + 'UJIndoorLoc/validationData.csv', UCIVars.raw_data_folder + 'indoor_loc.val.csv') shutil.rmtree(UCIVars.raw_data_folder + 'UJIndoorLoc') concat_files(UCIVars.raw_data_folder + 'indoor*.csv', UCIVars.raw_data_folder + 'indoor_loc.data') remove_files(UCIVars.raw_data_folder, 'indoor*.csv') # --- Regression part ------ data = load_raw_data('indoor_loc.data', sep = ',') data = remove_columns(data, range(523, 529)) data_long = remove_columns(data, [521, 522]) data_long = move_label_in_front(data_long, 520) save_data_to_file(data_long, 'indoor_loc_long', is_classification = False) data_lat = remove_columns(data, [520, 522]) data_lat = move_label_in_front(data_lat, 520) save_data_to_file(data_lat, 'indoor_loc_lat', is_classification = False) data_alt = remove_columns(data, [520, 521]) data_alt = move_label_in_front(data_alt, 520) save_data_to_file(data_alt, 'indoor_loc_alt', is_classification = False) # --- Classification part ----- data = load_raw_data('indoor_loc.data', sep = ',') data = remove_columns(data, range(526, 529)) data_relative = move_label_in_front(data, 525) data_relative = remove_columns(data_relative, range(521, 526)) save_data_to_file(data_relative, 'indoor_loc_relative', is_classification = True, is_regression = False) data_building = move_label_in_front(data, 523) data_building = remove_columns(data_building, range(521, 526)) save_data_to_file(data_building, 'indoor_loc_building', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_online_news_popularity(): prepare_new_data_set_group_id() download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip', 'online_news_popularity.zip') unzip_raw_data('online_news_popularity.zip') os.rename(UCIVars.raw_data_folder + 'OnlineNewsPopularity/OnlineNewsPopularity.csv', UCIVars.raw_data_folder + 'online_news_popularity.data') os.rename(UCIVars.raw_data_folder + 'OnlineNewsPopularity/OnlineNewsPopularity.names', UCIVars.raw_data_folder + 'online_news_popularity.description') shutil.rmtree(UCIVars.raw_data_folder + 'OnlineNewsPopularity') data = load_raw_data('online_news_popularity.data', sep = ', ', description_columns = 2) data = move_label_in_front(data, 58) save_data_to_file(data, 'online_news_popularity', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_facebook_comment_volume(): prepare_new_data_set_group_id() 
def get_facebook_comment_volume():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip', 'facebook_comment_volume.zip')
    unzip_raw_data('facebook_comment_volume.zip')
    os.rename(UCIVars.raw_data_folder + 'Dataset/Training/Features_Variant_1.csv', UCIVars.raw_data_folder + 'facebook_comment_volume.data')
    shutil.rmtree(UCIVars.raw_data_folder + 'Dataset')
    shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX')

    data = load_raw_data('facebook_comment_volume.data', sep = ',')
    data = move_label_in_front(data, 53)
    save_data_to_file(data, 'facebook_comment_volume', is_classification = False)


#---------------------------------------------------------------------------------------------------

def get_bejing_pm25():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv', 'bejing_pm25.data')

    replace_chars_in_file('bejing_pm25.data', 'cv', '0,0')
    replace_chars_in_file('bejing_pm25.data', 'NW', '1,2')
    replace_chars_in_file('bejing_pm25.data', 'NE', '1,1')
    replace_chars_in_file('bejing_pm25.data', 'SE', '2,1')
    replace_chars_in_file('bejing_pm25.data', 'SW', '2,2')

    data = load_raw_data('bejing_pm25.data', sep = ',', description_columns = 1)
    data = move_label_in_front(data, 4)
    save_data_to_file(data, 'bejing_pm25', is_classification = False)


#---------------------------------------------------------------------------------------------------

def get_protein_tertiary_structure():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv', 'protein_tertiary_structure.data')

    data = load_raw_data('protein_tertiary_structure.data', sep = ',')
    save_data_to_file(data, 'protein_tertiary_structure', is_classification = False)


#---------------------------------------------------------------------------------------------------

def get_tamilnadu_electricity():
    prepare_new_data_set_group_id()

    print("Tamilnadu Electricity data set is currently not processed since:")
    print("  - from the description it is completely unclear how this data set can be used")


#---------------------------------------------------------------------------------------------------
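# In get_bejing_pm25 above, the combined wind direction is expanded into two coordinate-like
# features purely by string replacement ('NW' -> '1,2', etc.). An equivalent, more explicit
# mapping (illustration only; the table is taken from the replacements above):
_WIND_TO_GRID = {'cv': (0, 0), 'NE': (1, 1), 'NW': (1, 2), 'SE': (2, 1), 'SW': (2, 2)}

def _sketch_encode_wind(directions):
    # map a list of direction strings to a (n, 2) float array
    return numpy.array([_WIND_TO_GRID[d] for d in directions], dtype = float)

#---------------------------------------------------------------------------------------------------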
def get_metro_interstate_traffic_volume():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz', 'metro_interstate_traffic_volume.zip')
    ungz_raw_data('metro_interstate_traffic_volume.zip')
    os.rename(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume.zip.data', UCIVars.raw_data_folder + 'metro_interstate_traffic_volume.data')

    data = load_mixed_raw_data('metro_interstate_traffic_volume.data', sep = ',', header = True)
    size = data.shape[0]
    data[0:size, 7] = [date_and_time.replace(' ', ',') for date_and_time in data[0:size, 7]]

    # Deal with the holidays: we put all holidays in one category, and all non-holidays in the other.
    # There are 11 holidays and 'None'. The latter receives the value 0, while all holidays receive the
    # value 1. The following code is based on string replacement and the particular form of the entries.
    data[0:size, 0] = [re.sub(r" ", '', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"None", '0', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"D", '1', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"WashingtonsBirthday", '1', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"StateFair", '1', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"[a-zA-Z]", '', holiday) for holiday in data[0:size, 0]]

    # The weather is briefly described in column 5 and in more detail in column 6.
    # We create two data sets, one for each type of description.
    data_short = auto_replace_categories_in_mixed_data(data, 5, ',')
    data_short = remove_columns(data_short, 6)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume_short.data', data_short, sep = ",")

    data_long = auto_replace_categories_in_mixed_data(data, 6, ',')
    data_long = remove_columns(data_long, 5)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume_long.data', data_long, sep = ",")

    write_mixed_raw_data(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume.data', data, sep = ",")
    replace_chars_in_file('metro_interstate_traffic_volume.data', '  ', ' ')    # collapse double spaces

    # Now we are in the position to read the data, convert the time and date, and move the labels
    data = load_raw_data('metro_interstate_traffic_volume_short.data', ',', description_columns = 0, date_column = 16, date_sep = '-', date_order = 'Ymd', time_column = 17, time_sep = ':')
    data = move_label_in_front(data, 18)
    save_data_to_file(data, 'metro_interstate_traffic_volume_short', is_classification = False, is_regression = True)

    data = load_raw_data('metro_interstate_traffic_volume_long.data', ',', description_columns = 0, date_column = 43, date_sep = '-', date_order = 'Ymd', time_column = 44, time_sep = ':')
    data = move_label_in_front(data, 45)
    save_data_to_file(data, 'metro_interstate_traffic_volume_long', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------
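# The regex chain in get_metro_interstate_traffic_volume above only has to distinguish 'None'
# from the eleven holiday names. A sketch of the same binarization done directly:
def _sketch_binarize_holiday(column):
    # column: iterable of holiday strings; 'None' -> 0.0, any holiday name -> 1.0
    return numpy.array([0.0 if h.strip() == 'None' else 1.0 for h in column])

#---------------------------------------------------------------------------------------------------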
def get_facebook_live_sellers_thailand():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00488/Live_20210128.csv', 'facebook_live_sellers_thailand.data')

    data = load_mixed_raw_data('facebook_live_sellers_thailand.data', sep = ",", header = True)

    # Columns 0 and 2 contain id and time information. These are deleted. The last 4 columns are empty,
    # and thus deleted, too.
    data = remove_columns(data, [0, 2, 12, 13, 14, 15])

    # Next we replace the status_type by some numbers
    categories = [u'link', u'photo', u'status', u'video']
    data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'facebook_live_sellers_thailand.data', data, sep = ",")

    data = load_raw_data('facebook_live_sellers_thailand.data', ',')

    # The classes 1 and 3 contain only 63 and 365 samples, respectively. We remove them for the
    # classification data set.
    data_class = remove_rows_with_label(data, 1)
    data_class = remove_rows_with_label(data_class, 3)
    save_data_to_file(data_class, 'facebook_live_sellers_thailand_status', is_classification = True, is_regression = False)

    # For the regression data set, we pick the 'shares' column as label
    data_regr = move_label_in_front(data, 3)
    save_data_to_file(data_regr, 'facebook_live_sellers_thailand_shares', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------
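# remove_rows_with_label is used just above; its assumed semantics (drop all rows whose label,
# stored in column 0, equals the given value) would roughly be:
def _sketch_remove_rows_with_label(data, label_value):
    return data[data[:, 0] != label_value]

#---------------------------------------------------------------------------------------------------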
def get_parking_birmingham():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00482/dataset.zip', 'parking_birmingham.zip')
    unzip_raw_data('parking_birmingham.zip')
    os.rename(UCIVars.raw_data_folder + 'dataset.csv', UCIVars.raw_data_folder + 'parking_birmingham.data')

    # One could also convert the name of the parking spot into a binary vector. However, this vector
    # would be of dimension 30 and would therefore dominate the remaining features. We thus use a
    # one-dimensional representation instead.
    data = load_mixed_raw_data('parking_birmingham.data', sep = ',', header = True)
    categories = ['BHMEURBRD01', 'BHMEURBRD02', 'Bull Ring', 'BHMBRCBRG02', 'BHMBRCBRG03', 'BHMBRCBRG01',
                  'Shopping', 'BHMNCPLDH01', 'BHMBCCSNH01', 'BHMNCPRAN01', 'BHMBCCPST01', 'Others-CCCPS133',
                  'BHMBRTARC01', 'Others-CCCPS98', 'NIA North', 'BHMNCPHST01', 'BHMNCPNST01', 'BHMNCPNHS01',
                  'BHMBCCTHL01', 'Others-CCCPS119a', 'Others-CCCPS8', 'Others-CCCPS105a', 'Broad Street',
                  'NIA South', 'NIA Car Parks', 'BHMBCCMKT01', 'BHMMBMMBX01', 'Others-CCCPS202',
                  'Others-CCCPS135a', 'BHMNCPPLS01']
    data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'parking_birmingham.data', data, sep = ",")

    # Next we split date-time into two features
    replace_chars_in_file('parking_birmingham.data', ' ', ',')

    # Now, we convert the date into a weekday and then into a point on the circle.
    # Furthermore, we create a second data set with rounded times for possible future time series
    # treatment.
    data = load_mixed_raw_data('parking_birmingham.data', sep = ",", header = False)
    data = replace_isodate_by_day_in_mixed_data(data, 3)
    data = replace_circulars_in_mixed_data(data, get_categories_in_mixed_data(data, 3), 3, ",")
    write_mixed_raw_data(UCIVars.raw_data_folder + 'parking_birmingham.data', data, sep = ",")

    data = replace_time_by_seconds_in_mixed_data(data, 4, sep = ':', rounded = 1800)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'parking_birmingham.rounded.data', data, sep = ",")

    # Now we compute the relative occupancy and use it as label.
    # Note that we keep both the parking spot number and its capacity.
    data = load_raw_data('parking_birmingham.data', ',', time_column = 5, time_sep = ':')
    data[:, 2] = data[:, 2] / data[:, 1]
    data = move_label_in_front(data, 2)
    save_data_to_file(data, 'parking_birmingham', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------

def get_tarvel_review_ratings():
    prepare_new_data_set_group_id()

    # Download the data and correct the misspelling of its name
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00485/google_review_ratings.csv', 'travel_review_ratings.data')

    # Remove the commas at the end of each row and clean a few messy lines
    replace_chars_in_file('travel_review_ratings.data', ',\r', '\r')
    replace_chars_in_file('travel_review_ratings.data', '"', '')
    replace_chars_in_file('travel_review_ratings.data', ',,', ',')
    replace_chars_in_file('travel_review_ratings.data', '\t', '')

    data = load_raw_data('travel_review_ratings.data', ',', description_columns = 1, header = True)

    # Determine the first column that contains the most ratings, use it as label, and remove possible
    # rows with label = 0
    ratings_counts = data.astype(bool).sum(axis=0)
    most_rated_column = numpy.argmax(ratings_counts)
    data = move_label_in_front(data, most_rated_column)
    data = remove_rows_with_label(data, 0.0)
    save_data_to_file(data, 'travel_review_ratings', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------
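# get_parking_birmingham above maps the weekday onto the circle via replace_circulars_in_mixed_data.
# The usual trick, sketched here under that assumption, is a sine/cosine pair so that the last and
# first weekday end up adjacent:
def _sketch_circular_encoding(weekday_index, period = 7):
    angle = 2.0 * numpy.pi * weekday_index / period
    return numpy.sin(angle), numpy.cos(angle)

#---------------------------------------------------------------------------------------------------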
def get_superconductivity():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip', 'superconductivity.zip')
    unzip_raw_data('superconductivity.zip')
    os.rename(UCIVars.raw_data_folder + 'train.csv', UCIVars.raw_data_folder + 'superconductivity.data')
    os.remove(UCIVars.raw_data_folder + 'unique_m.csv')

    data = load_raw_data('superconductivity.data', ',', header = True)

    data_regr = move_label_in_front(data, 81)
    save_data_to_file(data_regr, 'superconductivity', is_classification = False, is_regression = True)

    # We also create a classification data set, in which we try to identify materials with critical
    # temperature above 77K. We refer to https://en.wikipedia.org/wiki/Superconductivity for the
    # importance of this threshold in view of liquid nitrogen.
    data_class = move_label_in_front(data, 81)
    temperature_above_77K = data_class[:, 0] > 77
    data_class[:, 0] = temperature_above_77K.astype(float)
    save_data_to_file(data_class, 'superconductivity_class', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_gnfuv_unmanned_surface_vehicles():
    prepare_new_data_set_group_id()

    print("GNFUV Unmanned Surface Vehicles is currently not processed since:")
    print("  - the description indicates that it is actually very complicated time series data")


#---------------------------------------------------------------------------------------------------

def get_five_cities_pm25():
    prepare_new_data_set_group_id()

    print("PM2.5 of Five Chinese Cities is used since:")
    print("  - it actually contains 5 data sets of around 20,000 samples each.")

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00394/FiveCitiePMData.rar', 'five_cities_pm25.rar')
    unrar_raw_data('five_cities_pm25.rar')

    cities = {}
    pm_locs = {}
    cities[0] = 'ShenyangPM20100101_20151231.csv'
    cities[1] = 'ChengduPM20100101_20151231.csv'
    cities[2] = 'BeijingPM20100101_20151231.csv'
    cities[3] = 'GuangzhouPM20100101_20151231.csv'
    cities[4] = 'ShanghaiPM20100101_20151231.csv'
    pm_locs[0] = (5, 6, 7)
    pm_locs[1] = (5, 6, 7)
    pm_locs[2] = (5, 6, 7, 8)
    pm_locs[3] = (5, 6, 7)
    pm_locs[4] = (5, 6, 7)

    for i in range(0, 5):
        new_city_name = 'five_cities_' + cities[i][:-23].lower() + '_pm25.data'
        os.rename(UCIVars.raw_data_folder + cities[i], UCIVars.raw_data_folder + new_city_name)
        cities[i] = new_city_name

        replace_chars_in_file(cities[i], 'cv', '0,0')
        replace_chars_in_file(cities[i], 'NW', '1,2')
        replace_chars_in_file(cities[i], 'NE', '1,1')
        replace_chars_in_file(cities[i], 'SE', '2,1')
        replace_chars_in_file(cities[i], 'SW', '2,2')

        data = load_raw_data(cities[i], sep = ',', description_columns = 1)
        number_of_rows = numpy.shape(data)[0]
        pm_concs = data[0:number_of_rows, pm_locs[i]]
        pm_concs = numpy.mean(pm_concs, axis = 1)
        pm_concs = numpy.reshape(pm_concs, newshape = (number_of_rows, 1))
        data = remove_columns(data, pm_locs[i])
        data = numpy.concatenate((pm_concs, data), axis = 1)
        save_data_to_file(data, new_city_name[:-5], is_classification = False)


#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
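# get_five_cities_pm25 above averages the available PM2.5 monitoring stations into a single
# regression target. Sketch of that step for an arbitrary set of columns:
def _sketch_mean_target(data, pm_columns):
    target = numpy.mean(data[:, pm_columns], axis = 1, keepdims = True)
    features = numpy.delete(data, pm_columns, axis = 1)
    return numpy.concatenate((target, features), axis = 1)

#---------------------------------------------------------------------------------------------------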
def get_phishing():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00327/Training%20Dataset.arff', 'phishing.arff')

    replace_chars_in_file('phishing.arff', ' -1', '-1')
    replace_chars_in_file('phishing.arff', ' 1', '1')
    replace_chars_in_file('phishing.arff', '1 ', '1')
    replace_chars_in_file('phishing.arff', '-1 ', '-1')
    replace_chars_in_file('phishing.arff', '0 ', '0')
    replace_chars_in_file('phishing.arff', ' 0', '0')
    unarff_raw_data('phishing')

    data = load_raw_data('phishing.data', sep = ',', description_columns = 0)
    data = move_label_in_front(data, 30)
    save_data_to_file(data, 'phishing', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_ozone_level():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/eighthr.data', 'ozone_level_8hr.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/eighthr.names', 'ozone_level_8hr.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.data', 'ozone_level_1hr.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.names', 'ozone_level_1hr.description')

    data = load_raw_data('ozone_level_8hr.data', sep = ',', description_columns = 1, na_string = '?')
    data = move_label_in_front(data, 72)
    save_data_to_file(data, 'ozone_level_8hr', is_classification = True, is_regression = False)

    data = load_raw_data('ozone_level_1hr.data', sep = ',', description_columns = 1, na_string = '?')
    data = move_label_in_front(data, 72)
    save_data_to_file(data, 'ozone_level_1hr', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_opportunity_activity():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00226/OpportunityUCIDataset.zip', 'opportunity_activity.zip')
    #unzip_raw_data('opportunity_activity.zip')
    print("Opportunity Activity Recognition is currently not processed since:")
    print("  - the zip file is about 292 MB in size")
    print("  - the description indicates that it is actually time series data")


#---------------------------------------------------------------------------------------------------

def get_australian_sign_language():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/auslan2-mld/tctodd.tar.gz', 'australian_sign_language.tar.gz')
    print("Australian Sign Language is currently not processed since:")
    print("  - each sign only has 27 samples")


#---------------------------------------------------------------------------------------------------

def get_seismic_bumps():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff', 'seismic_bumps.arff')
    unarff_raw_data('seismic_bumps')

    replace_chars_in_file('seismic_bumps.data', 'a', '1')
    replace_chars_in_file('seismic_bumps.data', 'b', '2')
    replace_chars_in_file('seismic_bumps.data', 'c', '3')
    replace_chars_in_file('seismic_bumps.data', 'd', '4')
    replace_chars_in_file('seismic_bumps.data', 'N', '0')
    replace_chars_in_file('seismic_bumps.data', 'W', '1')

    data = load_raw_data('seismic_bumps.data', sep = ',')
    data = move_label_in_front(data, 18)
    save_data_to_file(data, 'seismic_bumps', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_meu_mobile_ksd():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00399/MEU-Mobile%20KSD%202016.xlsx', 'meu_mobile_ksd.xlsx')
    print("MEU-Mobile KSD is currently not processed since:")
    print("  - according to the description it seems to be an anomaly detection data set")


#---------------------------------------------------------------------------------------------------
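# The single-character replacements in get_seismic_bumps above (and get_chess below) work only
# because the affected letters never occur elsewhere in those files, and longer tokens must be
# replaced before shorter ones (e.g. 'nowin' before 'won'). A safer per-token variant, sketched:
def _sketch_map_tokens(line, mapping):
    # mapping: e.g. {'a': '1', 'b': '2', 'N': '0', 'W': '1'}; unknown tokens pass through
    return ','.join(mapping.get(tok, tok) for tok in line.strip().split(','))

#---------------------------------------------------------------------------------------------------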
print(" - the file is in matlab format") #--------------------------------------------------------------------------------------------------- def get_vicon_physical_action(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00214/Vicon%20Physical%20Action%20Data%20Set.rar', 'vicon_physical_action.rar') print("Vicon Physical Action is currently not processed since:") print(" - according to the description and an follow-up inspection it seems to be a time series data set") #--------------------------------------------------------------------------------------------------- def get_simulated_falls(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00455/Tests.rar', 'simulated_falls.rar') print("Simulated Falls and Daily Living Activities is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the data set size is 1.2GB") #--------------------------------------------------------------------------------------------------- def get_chess(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data', 'chess.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.names', 'chess.description') replace_chars_in_file('chess.data', 'nowin', '-1') replace_chars_in_file('chess.data', 'won', '1') replace_chars_in_file('chess.data', 'b', '0') replace_chars_in_file('chess.data', 'f', '1') replace_chars_in_file('chess.data', 'g', '2') replace_chars_in_file('chess.data', 'l', '3') replace_chars_in_file('chess.data', 'n', '4') replace_chars_in_file('chess.data', 't', '5') replace_chars_in_file('chess.data', 'w', '6') data = load_raw_data('chess.data', sep = ',') data = move_label_in_front(data, 36) save_data_to_file(data, 'chess', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_abalone(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', 'abalone.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names', 'abalone.description') replace_chars_in_file('abalone.data', 'F', '-1') replace_chars_in_file('abalone.data', 'I', '0') replace_chars_in_file('abalone.data', 'M', '1') data = load_raw_data('abalone.data', sep = ',') save_data_to_file(data, 'abalone', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_madelon(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data', 'madelon.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels', 'madelon.train.labels.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_valid.data', 'madelon.valid.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/madelon_valid.labels', 'madelon.valid.labels.data') # I could not find the test labels, so the test data set is not included. 
def get_madelon():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data', 'madelon.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels', 'madelon.train.labels.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_valid.data', 'madelon.valid.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/madelon_valid.labels', 'madelon.valid.labels.data')

    # I could not find the test labels, so the test data set is not included. LIBSVM's version of
    # the data set does not contain the test part, either.
    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_test.data', 'madelon.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/Dataset.pdf', 'madelon.description.pdf')

    train_data = load_raw_data('madelon.train.data', sep = ' ')
    train_label = load_raw_data('madelon.train.labels.data', sep = ' ')
    train_data = numpy.concatenate((train_label, train_data), axis = 1)

    valid_data = load_raw_data('madelon.valid.data', sep = ' ')
    valid_label = load_raw_data('madelon.valid.labels.data', sep = ' ')
    valid_data = numpy.concatenate((valid_label, valid_data), axis = 1)

    data = numpy.concatenate((train_data, valid_data), axis = 0)
    save_data_to_file(data, 'madelon', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_spambase():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.zip', 'spambase.zip')
    unzip_raw_data('spambase.zip')
    os.rename(UCIVars.raw_data_folder + 'spambase.names', UCIVars.raw_data_folder + 'spambase.feature.txt')
    os.rename(UCIVars.raw_data_folder + 'spambase.DOCUMENTATION', UCIVars.raw_data_folder + 'spambase.description')

    data = load_raw_data('spambase.data', sep = ',')
    data = move_label_in_front(data, 57)
    save_data_to_file(data, 'spambase', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_wilt():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00285/wilt.zip', 'wilt.zip')
    unzip_raw_data('wilt.zip')
    os.rename(UCIVars.raw_data_folder + 'training.csv', UCIVars.raw_data_folder + 'wilt.train.data')
    os.rename(UCIVars.raw_data_folder + 'testing.csv', UCIVars.raw_data_folder + 'wilt.test.data')
    concat_files(UCIVars.raw_data_folder + 'wilt.t*.data', UCIVars.raw_data_folder + 'wilt.data')

    replace_chars_in_file('wilt.data', 'n', '-1')
    replace_chars_in_file('wilt.data', 'w', '1')

    data = load_raw_data('wilt.data', sep = ',')
    save_data_to_file(data, 'wilt', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------
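# get_madelon above (and get_smartphone_human_activity further below) follow the same pattern:
# load features and labels from separate files, put the label in front, and stack the given
# splits. The label-joining step, sketched:
def _sketch_join_split(features, labels):
    # works for labels of shape (n,) or (n, 1)
    return numpy.concatenate((labels.reshape(-1, 1), features), axis = 1)

#---------------------------------------------------------------------------------------------------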
def get_waveform():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform.data.Z', 'waveform.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform.names', 'waveform.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform-+noise.data.Z', 'waveform_noise.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform-+noise.names', 'waveform_noise.description')

    success = un_z_raw_data('waveform.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'waveform', UCIVars.raw_data_folder + 'waveform.data')
        data = load_raw_data('waveform.data', sep = ',')
        data = move_label_in_front(data, 21)
        save_data_to_file(data, 'waveform', is_classification = True, is_regression = False)
    else:
        print("The waveform data set could not be built.")

    success = un_z_raw_data('waveform_noise.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'waveform_noise', UCIVars.raw_data_folder + 'waveform_noise.data')
        data = load_raw_data('waveform_noise.data', sep = ',')
        data = move_label_in_front(data, 40)
        save_data_to_file(data, 'waveform_noise', is_classification = True, is_regression = False)
    else:
        print("The waveform_noise data set could not be built.")


#---------------------------------------------------------------------------------------------------

def get_wall_following_robot():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00194/AllData.zip', 'wall_follow_robot.zip')
    unzip_raw_data('wall_follow_robot.zip')
    os.rename(UCIVars.raw_data_folder + 'Wall-following.names', UCIVars.raw_data_folder + 'wall_follow_robot.description')
    os.rename(UCIVars.raw_data_folder + 'sensor_readings_2.data', UCIVars.raw_data_folder + 'wall_follow_robot_2.data')
    os.rename(UCIVars.raw_data_folder + 'sensor_readings_4.data', UCIVars.raw_data_folder + 'wall_follow_robot_4.data')
    os.rename(UCIVars.raw_data_folder + 'sensor_readings_24.data', UCIVars.raw_data_folder + 'wall_follow_robot_24.data')

    categories = ['Slight-Left-Turn', 'Move-Forward', 'Slight-Right-Turn', 'Sharp-Right-Turn']

    data = load_mixed_raw_data('wall_follow_robot_2.data', sep = ',', header = False)
    data = replace_ordinals_in_mixed_data(data, categories, 2, ',', unknown_string = '')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'wall_follow_robot_2.trafo.data', data, sep = ',')

    data = load_mixed_raw_data('wall_follow_robot_4.data', sep = ',', header = False)
    data = replace_ordinals_in_mixed_data(data, categories, 4, ',', unknown_string = '')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'wall_follow_robot_4.trafo.data', data, sep = ',')

    data = load_mixed_raw_data('wall_follow_robot_24.data', sep = ',', header = False)
    data = replace_ordinals_in_mixed_data(data, categories, 24, ',', unknown_string = '')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'wall_follow_robot_24.trafo.data', data, sep = ',')

    data = load_raw_data('wall_follow_robot_2.trafo.data', sep = ',')
    data = move_label_in_front(data, 2)
    save_data_to_file(data, 'wall_follow_robot_2', is_classification = True, is_regression = True)

    data = load_raw_data('wall_follow_robot_4.trafo.data', sep = ',')
    data = move_label_in_front(data, 4)
    save_data_to_file(data, 'wall_follow_robot_4', is_classification = True, is_regression = True)

    data = load_raw_data('wall_follow_robot_24.trafo.data', sep = ',')
    data = move_label_in_front(data, 24)
    save_data_to_file(data, 'wall_follow_robot_24', is_classification = True, is_regression = True)


#---------------------------------------------------------------------------------------------------
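# un_z_raw_data (used in get_waveform above and get_page_blocks and get_musk below) returns a
# success flag because the old UNIX .Z format cannot be read with the Python standard library.
# One plausible implementation (an assumption, not necessarily the one used here) shells out to
# 'uncompress' and reports whether it worked:
def _sketch_un_z(filename):
    import subprocess
    result = subprocess.run(['uncompress', UCIVars.raw_data_folder + filename])
    return result.returncode == 0

#---------------------------------------------------------------------------------------------------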
def get_page_blocks():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/page-blocks/page-blocks.data.Z', 'page_blocks.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/page-blocks/page-blocks.names', 'page_blocks.description')

    success = un_z_raw_data('page_blocks.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'page_blocks', UCIVars.raw_data_folder + 'page_blocks.data')

        # collapse runs of spaces, then turn the remaining single spaces into commas
        replace_chars_in_file('page_blocks.data', '      ', ' ')
        replace_chars_in_file('page_blocks.data', '     ', ' ')
        replace_chars_in_file('page_blocks.data', '    ', ' ')
        replace_chars_in_file('page_blocks.data', '   ', ' ')
        replace_chars_in_file('page_blocks.data', '  ', ' ')
        replace_chars_in_file('page_blocks.data', ' ', ',')

        data = load_raw_data('page_blocks.data', sep = ',', description_columns = 1)
        data = move_label_in_front(data, 10)
        save_data_to_file(data, 'page_blocks', is_classification = True, is_regression = False)
    else:
        print("The page_blocks data set could not be built.")


#---------------------------------------------------------------------------------------------------

def get_optical_recognition_handwritten_digits():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', 'optical_recognition_handwritten_digits.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra', 'optical_recognition_handwritten_digits.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.names', 'optical_recognition_handwritten_digits.description')

    # The additional 'original' data sets contain the bitmaps of the handwritten digits in a strange format.
    # For this reason, they are not further considered.
    concat_files(UCIVars.raw_data_folder + 'optical_recognition_handwritten_digits.*.data', UCIVars.raw_data_folder + 'optical_recognition_handwritten_digits.data')

    data = load_raw_data('optical_recognition_handwritten_digits.data', sep = ',')
    data = move_label_in_front(data, 64)
    save_data_to_file(data, 'optical_recognition_handwritten_digits', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_bach_chorals_harmony():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00298/jsbach_chorals_harmony.zip', 'bach_chorals_harmony.zip')
    print("Bach Chorals Harmony is currently not processed since:")
    print("  - it contains a lot of classes with only a handful of samples")


#---------------------------------------------------------------------------------------------------

def get_turkiye_student_evaluation():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00262/turkiye-student-evaluation_generic.csv', 'turkiye_student_evaluation.data')

    # Without an explicit target variable, we decided to use the instructor id as target variable
    data = load_raw_data('turkiye_student_evaluation.data', sep = ',')
    save_data_to_file(data, 'turkiye_student_evaluation', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------
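# The cascade of replace_chars_in_file calls in get_page_blocks above repeatedly collapses runs
# of spaces before turning the remaining single spaces into commas. A single regex does the same,
# sketched here:
def _sketch_spaces_to_commas(filename):
    path = UCIVars.raw_data_folder + filename
    with open(path, 'r') as f:
        lines = [re.sub(r' +', ',', line.strip()) for line in f]
    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')

#---------------------------------------------------------------------------------------------------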
def get_smartphone_human_activity():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00364/dataset_uci.zip', 'smartphone_human_activity.zip')
    unzip_raw_data('smartphone_human_activity.zip')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_X_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.train.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_X_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.test.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_y_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.train.labels.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_y_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.test.labels.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/features_info.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.features.txt')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/README.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.description')
    shutil.rmtree(UCIVars.raw_data_folder + 'dataset_uci')

    train_data = load_raw_data('smartphone_human_activity.train.data', sep = ',')
    train_label = load_raw_data('smartphone_human_activity.train.labels.data', sep = ',')
    train_data = numpy.concatenate((train_label, train_data), axis = 1)

    test_data = load_raw_data('smartphone_human_activity.test.data', sep = ',')
    test_label = load_raw_data('smartphone_human_activity.test.labels.data', sep = ',')
    test_data = numpy.concatenate((test_label, test_data), axis = 1)

    data = numpy.concatenate((train_data, test_data), axis = 0)
    save_data_to_file(data, 'smartphone_human_activity', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_artificial_characters():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/artificial-characters/character.tar.Z', 'artificial_characters.tar.Z')
    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/artificial-characters/character.names', 'artificial_characters.description')
    print("Artificial Characters is currently not processed since:")
    print("  - the data comes in a rather convoluted form")


#---------------------------------------------------------------------------------------------------
def get_first_order_theorem_proving():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00249/ml-prove.tar.gz', 'first_order_theorem_proving.tar.gz')
    untar_raw_data('first_order_theorem_proving.tar.gz')
    os.rename(UCIVars.raw_data_folder + 'ml-prove/all-data-raw.csv', UCIVars.raw_data_folder + 'first_order_theorem_proving.data')
    os.rename(UCIVars.raw_data_folder + 'ml-prove/bridge-holden-paulson-details.txt', UCIVars.raw_data_folder + 'first_order_theorem_proving.description')
    shutil.rmtree(UCIVars.raw_data_folder + 'ml-prove')

    data = load_raw_data('first_order_theorem_proving.data', sep = ',')
    rows = numpy.shape(data)[0]
    columns = numpy.shape(data)[1]
    times_of_heuristics = data[0:rows, columns - 5:columns]
    data_features = data[0:rows, 0:columns - 5]

    # Create class labels, where -1 encodes the "decline" option, which occurs if none of the
    # five considered heuristics finished within 100 secs. Also, there are 13 samples in which
    # the heuristics appear to have finished instantaneously; these get a positive label.
    # One could also create regression tasks for each of the heuristics, but for now, we
    # don't do this.
    class_labels = numpy.reshape(numpy.sign(numpy.amax(times_of_heuristics, axis = 1)), newshape = (rows, 1))
    class_labels[numpy.where(class_labels[0:rows, 0] == 0)] = 1.0
    class_data = numpy.concatenate((class_labels, data_features), axis = 1)
    save_data_to_file(class_data, 'first_order_theorem_proving', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_landsat_satimage():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn', 'landsat_satimage.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst', 'landsat_satimage.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.doc', 'landsat_satimage.description')
    concat_files(UCIVars.raw_data_folder + 'landsat_satimage.*.data', UCIVars.raw_data_folder + 'landsat_satimage.data')

    data = load_raw_data('landsat_satimage.data', sep = ' ')
    data = move_label_in_front(data, 36)
    save_data_to_file(data, 'landsat_satimage', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_hiv_1_protease():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00330/newHIV-1_data.zip', 'hiv_1_protease.zip')
    print("HIV-1 protease is currently not processed since:")
    print("  - the 1D data comes in a rather convoluted form")


#---------------------------------------------------------------------------------------------------

def get_musk():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z', 'musk.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.info', 'musk.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.names', 'musk.features.txt')

    success = un_z_raw_data('musk.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'musk', UCIVars.raw_data_folder + 'musk.data')
        data = load_raw_data('musk.data', description_columns = 2, sep = ',')
        data = move_label_in_front(data, 166)
        save_data_to_file(data, 'musk', is_classification = True, is_regression = False)
    else:
        print("The musk data set could not be built.")


#---------------------------------------------------------------------------------------------------

def get_ble_rssi_indoor_location():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00435/BLE_RSSI_dataset.zip', 'ble_rssi_indoor_location.zip')
    print("BLE RSSI indoor location is currently not processed since:")
    print("  - it only has 1420 labeled samples")


#---------------------------------------------------------------------------------------------------

# NOTE: this redefines the get_australian_sign_language stub from further above; Python keeps
# only this second definition at import time.
def get_australian_sign_language():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/auslan-mld/allsigns.tar.gz', 'australian_sign_language.zip')
    print("Australian sign language is currently not processed since:")
    print("  - the 1D data comes in a rather convoluted form")
    print("  - it truly seems to be time series data")


#---------------------------------------------------------------------------------------------------
def get_anuran_calls():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00406/Anuran%20Calls%20(MFCCs).zip', 'anuran_calls.zip')
    unzip_raw_data('anuran_calls.zip')
    os.rename(UCIVars.raw_data_folder + 'Frogs_MFCCs.csv', UCIVars.raw_data_folder + 'anuran_calls.data')
    os.rename(UCIVars.raw_data_folder + 'Readme.txt', UCIVars.raw_data_folder + 'anuran_calls.description')

    data = load_mixed_raw_data('anuran_calls.data', sep = ',', header = True)

    categories = sorted(get_categories_in_mixed_data(data, 22))
    data = replace_ordinals_in_mixed_data(data, categories, 22, separator = ',', unknown_replacement_value = 0, begin_value = 1)
    categories = get_categories_in_mixed_data(data, 23)
    data = replace_ordinals_in_mixed_data(data, categories, 23, separator = ',', unknown_replacement_value = 0, begin_value = 1)
    categories = get_categories_in_mixed_data(data, 24)
    data = replace_ordinals_in_mixed_data(data, categories, 24, separator = ',', unknown_replacement_value = 0, begin_value = 1)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'anuran_calls.data', data, sep = ',')

    data = load_raw_data('anuran_calls.data', sep = ',')
    data = remove_columns(data, [25])

    # There are three different classification problems, each having a few classes with fewer
    # than 250 samples. The following lines build these three problems and remove the small classes.
    data_species = remove_columns(data, [22, 23])
    data_species = move_label_in_front(data_species, 22)
    rows = numpy.shape(data_species)[0]
    data_species = data_species[numpy.where(data_species[0:rows, 0] != 3)[0], 0:24]
    rows = numpy.shape(data_species)[0]
    data_species = data_species[numpy.where(data_species[0:rows, 0] != 6)[0], 0:24]
    rows = numpy.shape(data_species)[0]
    data_species = data_species[numpy.where(data_species[0:rows, 0] != 10)[0], 0:24]
    save_data_to_file(data_species, 'anuran_calls_species', is_classification = True, is_regression = False)

    data_genus = remove_columns(data, [22, 24])
    data_genus = move_label_in_front(data_genus, 22)
    rows = numpy.shape(data_genus)[0]
    data_genus = data_genus[numpy.where(data_genus[0:rows, 0] != 1)[0], 0:24]
    rows = numpy.shape(data_genus)[0]
    data_genus = data_genus[numpy.where(data_genus[0:rows, 0] != 4)[0], 0:24]
    rows = numpy.shape(data_genus)[0]
    data_genus = data_genus[numpy.where(data_genus[0:rows, 0] != 5)[0], 0:24]
    save_data_to_file(data_genus, 'anuran_calls_genus', is_classification = True, is_regression = False)

    data_families = remove_columns(data, [23, 24])
    data_families = move_label_in_front(data_families, 22)
    rows = numpy.shape(data_families)[0]
    data_families = data_families[numpy.where(data_families[0:rows, 0] != 1)[0], 0:24]
    save_data_to_file(data_families, 'anuran_calls_families', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------
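# replace_ordinals_in_mixed_data (used in get_anuran_calls above and elsewhere) is assumed to
# map the category strings in one column of a string/object array to consecutive integers.
# Roughly:
def _sketch_replace_ordinals(data, categories, column, begin_value = 0):
    lookup = {cat: str(i + begin_value) for i, cat in enumerate(categories)}
    data[:, column] = [lookup.get(entry, entry) for entry in data[:, column]]
    return data

#---------------------------------------------------------------------------------------------------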
def get_thyroids():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick-euthyroid.data', 'thyroid_sick_eu.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick-euthyroid.names', 'thyroid_sick_eu.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick.data', 'thyroid_sick.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick.test', 'thyroid_sick.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick.names', 'thyroid_sick.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/dis.data', 'thyroid_dis.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/dis.test', 'thyroid_dis.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/dis.names', 'thyroid_dis.description')

    # new-thyroid.data only contains 215 samples and is thus omitted
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.data', 'thyroid_hypo.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.names', 'thyroid_hypo.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-train.data', 'thyroid_ann.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-test.data', 'thyroid_ann.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-thyroid.names', 'thyroid_ann.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-Readme', 'thyroid_ann.more_description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allbp.data', 'thyroid_all_bp.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allbp.test', 'thyroid_all_bp.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allbp.names', 'thyroid_all_bp.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allrep.data', 'thyroid_all_rep.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allrep.test', 'thyroid_all_rep.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allrep.names', 'thyroid_all_rep.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhypo.data', 'thyroid_all_hypo.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhypo.test', 'thyroid_all_hypo.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhypo.names', 'thyroid_all_hypo.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhyper.data', 'thyroid_all_hyper.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhyper.test', 'thyroid_all_hyper.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhyper.names', 'thyroid_all_hyper.description')

    #--------------------------------------------------

    data = load_mixed_raw_data('thyroid_sick_eu.data', sep = ',', header = False)
    categories = [u'sick-euthyroid', u'negative']
    data = replace_categories_in_mixed_data(data, categories, 0, separator = ',')
    for col in range(2, 15):
        categories = get_categories_in_mixed_data(data, col)
        data = replace_bin_cats_in_mixed_data(data, categories, col, separator = ',')
    columns = [16, 18, 20, 22, 24]
    for col in columns:
        categories = get_categories_in_mixed_data(data, col)
        data = replace_bin_cats_in_mixed_data(data, categories, col, separator = ',')

    # The last column is still in bad shape. The next two lines fix this problem by a little dirty trick.
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_sick_eu.data', data, sep = ',')
    data = load_mixed_raw_data('thyroid_sick_eu.data', sep = ',', header = False)
    data = auto_replace_missing_in_mixed_data(data, unknown_string = '?')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_sick_eu.data', data, sep = ',')

    data = load_raw_data('thyroid_sick_eu.data', sep = ',', na_string = '?')
    save_data_to_file(data, 'thyroid_sick_eu', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_sick.t*', UCIVars.raw_data_folder + 'thyroid_sick.data')
    replace_chars_in_file('thyroid_sick.data', '.|', ',')
    replace_chars_in_file('thyroid_sick.data', 'F', '0')
    replace_chars_in_file('thyroid_sick.data', 'M', '1')
    replace_chars_in_file('thyroid_sick.data', 'f', '0')
    replace_chars_in_file('thyroid_sick.data', 't', '1')
    replace_chars_in_file('thyroid_sick.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_sick.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')
    data = auto_replace_categories_in_mixed_data(data, 29, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_sick.data', data, sep = ',')

    data = load_raw_data('thyroid_sick.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_sick', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_dis.t*', UCIVars.raw_data_folder + 'thyroid_dis.data')
    replace_chars_in_file('thyroid_dis.data', '.|', ',')
    replace_chars_in_file('thyroid_dis.data', 'F', '0')
    replace_chars_in_file('thyroid_dis.data', 'M', '1')
    replace_chars_in_file('thyroid_dis.data', 'f', '0')
    replace_chars_in_file('thyroid_dis.data', 't', '1')
    replace_chars_in_file('thyroid_dis.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_dis.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')
    data = auto_replace_categories_in_mixed_data(data, 29, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_dis.data', data, sep = ',')

    data = load_raw_data('thyroid_dis.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_dis', is_classification = True, is_regression = False)

    #--------------------------------------------------

    replace_chars_in_file('thyroid_hypo.data', 'F', '0')
    replace_chars_in_file('thyroid_hypo.data', 'M', '1')
    replace_chars_in_file('thyroid_hypo.data', 'f', '0')
    replace_chars_in_file('thyroid_hypo.data', 't', '1')
    replace_chars_in_file('thyroid_hypo.data', 'n', '0')
    replace_chars_in_file('thyroid_hypo.data', 'y', '1')
    replace_chars_in_file('thyroid_hypo.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_hypo.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 0, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_hypo.data', data, sep = ',')

    data = load_raw_data('thyroid_hypo.data', sep = ',', na_string = '?')
    save_data_to_file(data, 'thyroid_hypo', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_ann.t*', UCIVars.raw_data_folder + 'thyroid_ann.data')
    data = load_raw_data('thyroid_ann.data', sep = ' ', na_string = '?')
    data = move_label_in_front(data, 21)
    save_data_to_file(data, 'thyroid_ann', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_bp.t*', UCIVars.raw_data_folder + 'thyroid_all_bp.data')
    replace_chars_in_file('thyroid_all_bp.data', '.|', ',')
    replace_chars_in_file('thyroid_all_bp.data', 'F', '0')
    replace_chars_in_file('thyroid_all_bp.data', 'M', '1')
    replace_chars_in_file('thyroid_all_bp.data', 'f', '0')
    replace_chars_in_file('thyroid_all_bp.data', 't', '1')
    replace_chars_in_file('thyroid_all_bp.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_bp.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine the 2 non-negative classes into one; they are both very small
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 1, 2), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_bp.data', data, sep = ',')

    data = load_raw_data('thyroid_all_bp.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_bp', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_rep.t*', UCIVars.raw_data_folder + 'thyroid_all_rep.data')
    replace_chars_in_file('thyroid_all_rep.data', '.|', ',')
    replace_chars_in_file('thyroid_all_rep.data', 'F', '0')
    replace_chars_in_file('thyroid_all_rep.data', 'M', '1')
    replace_chars_in_file('thyroid_all_rep.data', 'f', '0')
    replace_chars_in_file('thyroid_all_rep.data', 't', '1')
    replace_chars_in_file('thyroid_all_rep.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_rep.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine the 3 non-negative classes into one; they are all very small
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 2, 2, 2), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_rep.data', data, sep = ',')

    data = load_raw_data('thyroid_all_rep.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_rep', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_hypo.t*', UCIVars.raw_data_folder + 'thyroid_all_hypo.data')
    replace_chars_in_file('thyroid_all_hypo.data', '.|', ',')
    replace_chars_in_file('thyroid_all_hypo.data', 'F', '0')
    replace_chars_in_file('thyroid_all_hypo.data', 'M', '1')
    replace_chars_in_file('thyroid_all_hypo.data', 'f', '0')
    replace_chars_in_file('thyroid_all_hypo.data', 't', '1')
    replace_chars_in_file('thyroid_all_hypo.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_hypo.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine 'primary' and 'secondary' into a new class since 'secondary' only has 2 samples
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 2, 3, 3), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_hypo.data', data, sep = ',')

    data = load_raw_data('thyroid_all_hypo.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_hypo', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_hyper.t*', UCIVars.raw_data_folder + 'thyroid_all_hyper.data')
    replace_chars_in_file('thyroid_all_hyper.data', '.|', ',')
    replace_chars_in_file('thyroid_all_hyper.data', 'F', '0')
    replace_chars_in_file('thyroid_all_hyper.data', 'M', '1')
    replace_chars_in_file('thyroid_all_hyper.data', 'f', '0')
    replace_chars_in_file('thyroid_all_hyper.data', 't', '1')
    replace_chars_in_file('thyroid_all_hyper.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_hyper.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine the 4 non-negative classes into one; they are all very small
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 1, 1, 2, 1), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_hyper.data', data, sep = ',')

    data = load_raw_data('thyroid_all_hyper.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_hyper', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_isolet():
    prepare_new_data_set_group_id()

    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/isolet/isolet1+2+3+4.data.Z', 'isolet.train.Z')
    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/isolet/isolet5.data.Z', 'isolet.test.Z')
    print("ISOLET is currently not processed since:")
    print("  - all classes are rather small (around 300 each)")


#---------------------------------------------------------------------------------------------------

def get_mushroom():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 'mushroom.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names', 'mushroom.description')

    data = load_mixed_raw_data('mushroom.data', sep = ',', header = False)
    columns = numpy.shape(data)[1]
    for col in range(0, columns):
        data = auto_replace_categories_in_mixed_data(data, col, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'mushroom.data', data, sep = ',')

    data = load_raw_data('mushroom.data', sep = ',')
    save_data_to_file(data, 'mushroom', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_assamese_characters():
    prepare_new_data_set_group_id()

    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00208/Online%20Handwritten%20Assamese%20Characters%20Dataset.rar', 'assamese_characters.rar')
    print("Assamese Characters is currently not processed since:")
    print("  - all classes are rather small (around 45 each)")


#---------------------------------------------------------------------------------------------------
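# The (1, 1, 2)-style tuples passed to replace_manual_in_mixed_data in get_thyroids above assign
# a new class id to each sorted category, which is how the tiny non-negative classes get merged.
# A sketch of that assumed behavior:
def _sketch_replace_manual(data, categories, column, new_ids):
    lookup = {cat: str(new_id) for cat, new_id in zip(categories, new_ids)}
    data[:, column] = [lookup.get(entry, entry) for entry in data[:, column]]
    return data

#---------------------------------------------------------------------------------------------------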
def get_arabic_digit():
    prepare_new_data_set_group_id()

    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00195/Test_Arabic_Digit.txt', 'arabic_digit.test.data')
    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00195/Train_Arabic_Digit.txt', 'arabic_digit.train.data')
    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00195/documentation.html', 'arabic_digit.html')
    print("Arabic Digits is currently not processed since:")
    print("  - I could not find the time to figure out the format")


#---------------------------------------------------------------------------------------------------

def get_eeg_steady_state_visual():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00457/BCI-SSVEP_Database_Aceves.zip', 'eeg_steady_state_visual.zip')
    print("EEG Steady-State Visual is currently not processed since:")
    print("  - the data comes in a rather convoluted form")
    print("  - it truly seems to be time series data")


#---------------------------------------------------------------------------------------------------

def get_gesture_phase_segmentation():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00302/gesture_phase_dataset.zip', 'gesture_phase_segmentation.zip')
    unzip_raw_data('gesture_phase_segmentation.zip')
    os.rename(UCIVars.raw_data_folder + 'data_description.txt', UCIVars.raw_data_folder + 'gesture_phase_segmentation.description')

    letters = ['a', 'b', 'c']
    versions = ['raw', 'va3']
    for version in versions:
        for letter in letters:
            concat_files(UCIVars.raw_data_folder + letter + '?_' + version + '.csv', UCIVars.raw_data_folder + 'gesture_phase_segmentation.' + letter + version + '.data')
            remove_files(UCIVars.raw_data_folder, letter + '?_' + version + '.csv')

        tmp_filename = 'gesture_phase_segmentation.?' + version + '.data'
        version_filename = 'gesture_phase_segmentation_' + version + '.data'
        concat_files(UCIVars.raw_data_folder + tmp_filename, UCIVars.raw_data_folder + version_filename)
        remove_files(UCIVars.raw_data_folder, tmp_filename)

        if version == 'raw':
            replace_chars_in_file(version_filename, 'Rest', '1')
            replace_chars_in_file(version_filename, 'Preparation', '2')
            replace_chars_in_file(version_filename, 'Stroke', '3')
            replace_chars_in_file(version_filename, 'Hold', '4')
            replace_chars_in_file(version_filename, 'Retraction', '5')
        else:
            replace_chars_in_file(version_filename, 'D', '1')
            replace_chars_in_file(version_filename, 'P', '2')
            replace_chars_in_file(version_filename, 'S', '3')
            replace_chars_in_file(version_filename, 'H', '4')
            replace_chars_in_file(version_filename, 'R', '5')

        data = load_raw_data(version_filename, sep = ',')
        columns = numpy.shape(data)[1]
        data = move_label_in_front(data, columns - 1)
        save_data_to_file(data, 'gesture_phase_segmentation_' + version, is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_emg_physical_action():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00213/EMG%20Physical%20Action%20Data%20Set.rar', 'emg_physical_action.rar')
    #unrar_raw_data('emg_physical_action.rar')
    print("EMG Physical Action is currently not processed since:")
    print("  - the data comes in a rather convoluted form")


#---------------------------------------------------------------------------------------------------
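# concat_files is used above with shell-style wildcards. A sketch of its assumed behavior based
# on glob (pattern and output are full paths here):
def _sketch_concat_files(pattern, out_path):
    import glob
    with open(out_path, 'w') as out:
        for path in sorted(glob.glob(pattern)):
            with open(path, 'r') as f:
                out.write(f.read())

#---------------------------------------------------------------------------------------------------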
def get_human_activity_smartphone():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip', 'human_activity_smartphone.zip')
    unzip_raw_data('human_activity_smartphone.zip')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/train/X_train.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.train.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/test/X_test.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.test.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/train/y_train.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.train.labels.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/test/y_test.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.test.labels.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/features_info.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.features.txt')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/README.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.description')
    shutil.rmtree(UCIVars.raw_data_folder + 'UCI HAR Dataset')
    shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX')
    replace_chars_in_file('human_activity_smartphone.train.data', '  ', ' ')
    replace_chars_in_file('human_activity_smartphone.test.data', '  ', ' ')
    train_data = load_raw_data('human_activity_smartphone.train.data', sep = ' ')
    train_label = load_raw_data('human_activity_smartphone.train.labels.data', sep = ',')
    train_data = numpy.concatenate((train_label, train_data), axis = 1)
    test_data = load_raw_data('human_activity_smartphone.test.data', sep = ' ')
    test_label = load_raw_data('human_activity_smartphone.test.labels.data', sep = ',')
    test_data = numpy.concatenate((test_label, test_data), axis = 1)
    data = numpy.concatenate((train_data, test_data), axis = 0)
    save_data_to_file(data, 'human_activity_smartphone', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_polish_companies_bankruptcy():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip', 'polish_companies_bankruptcy.zip')
    unzip_raw_data('polish_companies_bankruptcy.zip')
    for i in range(1, 6):
        unarff_raw_data(str(i) + 'year')
        remove_files(UCIVars.raw_data_folder, str(i) + 'year.arff')
        os.rename(UCIVars.raw_data_folder + str(i) + 'year.data', UCIVars.raw_data_folder + 'polish_companies_bankruptcy_' + str(i) + 'year.data')
        replace_chars_in_file('polish_companies_bankruptcy_' + str(i) + 'year.data', 'nan', '?')
        data = load_mixed_raw_data('polish_companies_bankruptcy_' + str(i) + 'year.data', sep = ',')
        data = auto_replace_missing_in_mixed_data(data, unknown_string = '?')
        write_mixed_raw_data(UCIVars.raw_data_folder + 'polish_companies_bankruptcy_' + str(i) + 'year.trafo.data', data, sep = ',')
        data = load_raw_data('polish_companies_bankruptcy_' + str(i) + 'year.trafo.data', sep = ',')
        data = move_label_in_front(data, 64)
        save_data_to_file(data, 'polish_companies_bankruptcy_' + str(i) + 'year', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
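# Editor's aside: auto_replace_missing_in_mixed_data above fills the '?' entries before the data
# is re-read numerically. As an illustration only (the actual helper may apply a different rule),
# a self-contained column-mean imputation could look like this:
def _example_impute_column_means(x):
    # Replace NaN entries by the mean of the non-missing values of their column.
    x = numpy.array(x, dtype = float)
    for j in range(x.shape[1]):
        mask = numpy.isnan(x[:, j])
        if mask.any() and not mask.all():
            x[mask, j] = x[~mask, j].mean()
    return x
#---------------------------------------------------------------------------------------------------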
def get_crowd_sourced_mapping():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00400/Crowdsourced%20Mapping.zip', 'crowd_sourced_mapping.zip')
    unzip_raw_data('crowd_sourced_mapping.zip')
    os.rename(UCIVars.raw_data_folder + 'training.csv', UCIVars.raw_data_folder + 'crowd_sourced_mapping.train.data')
    os.rename(UCIVars.raw_data_folder + 'testing.csv', UCIVars.raw_data_folder + 'crowd_sourced_mapping.test.data')
    # Get rid of the headers ...
    train_data = load_mixed_raw_data('crowd_sourced_mapping.train.data', sep = ',', header = True)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'crowd_sourced_mapping.train.data', train_data, sep = ',')
    test_data = load_mixed_raw_data('crowd_sourced_mapping.test.data', sep = ',', header = True)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'crowd_sourced_mapping.test.data', test_data, sep = ',')
    concat_files(UCIVars.raw_data_folder + 'crowd_sourced_mapping.*.data', UCIVars.raw_data_folder + 'crowd_sourced_mapping.data')
    # The data set actually has the following classes: ['impervious', 'orchard', 'farm', 'water', 'forest', 'grass']
    # However, 'orchard' and 'water' only occur 100 and 250 times, respectively. Ignoring them during the
    # replacement below eventually leads to a 4-class problem with the remaining classes.
    data = load_mixed_raw_data('crowd_sourced_mapping.data', sep = ',', header = True)
    categories = ['impervious', 'farm', 'forest', 'grass']
    data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'crowd_sourced_mapping.data', data, sep = ',')
    data = load_raw_data('crowd_sourced_mapping.data', sep = ',')
    save_data_to_file(data, 'crowd_sourced_mapping', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_firm_teacher_clave():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00324/ClaveVectors_Firm-Teacher_Model.txt', 'firm_teacher_clave.data')
    replace_chars_in_file('firm_teacher_clave.data', ' ', ',')
    replace_chars_in_file('firm_teacher_clave.data', 'error,fixed', '')
    replace_chars_in_file('firm_teacher_clave.data', ', ', '')
    data = load_mixed_raw_data('firm_teacher_clave.data', sep = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'firm_teacher_clave.data', data, sep = ',')
    data = load_raw_data('firm_teacher_clave.data', sep = ',')
    # The data set has four classes, and their labels are stored as a four-dimensional
    # 'categorical' vector. The following lines convert this format to the usual one.
rows = numpy.shape(data)[0] columns = numpy.shape(data)[1] label_vectors = data[0:rows, columns - 4:columns] data_features = data[0:rows, 0:columns - 4] labels = numpy.zeros(shape = (rows, 1)) for i in range(0, 4): labels[numpy.where(label_vectors[0:rows, i] == 1)] = i data = numpy.concatenate((labels, data_features), axis = 1) save_data_to_file(data, 'firm_teacher_clave', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_smartphone_human_activity_postural(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00341/HAPT%20Data%20Set.zip', 'smartphone_human_activity_postural.zip') unzip_raw_data('smartphone_human_activity_postural.zip') os.rename(UCIVars.raw_data_folder + 'Train/X_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.train.data') os.rename(UCIVars.raw_data_folder + 'Test/X_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.test.data') os.rename(UCIVars.raw_data_folder + 'Train/y_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.train.labels.data') os.rename(UCIVars.raw_data_folder + 'Test/y_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.test.labels.data') os.rename(UCIVars.raw_data_folder + 'features_info.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.features.txt') os.rename(UCIVars.raw_data_folder + 'README.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.description') shutil.rmtree(UCIVars.raw_data_folder + 'Train') shutil.rmtree(UCIVars.raw_data_folder + 'Test') shutil.rmtree(UCIVars.raw_data_folder + 'RawData') os.remove(UCIVars.raw_data_folder + 'features.txt') os.remove(UCIVars.raw_data_folder + 'activity_labels.txt') train_data = load_raw_data('smartphone_human_activity_postural.train.data', sep = ' ') train_label = load_raw_data('smartphone_human_activity_postural.train.labels.data', sep = ',') train_data = numpy.concatenate((train_label, train_data), axis = 1) test_data = load_raw_data('smartphone_human_activity_postural.test.data', sep = ' ') test_label = load_raw_data('smartphone_human_activity_postural.test.labels.data', sep = ',') test_data = numpy.concatenate((test_label, test_data), axis = 1) data = numpy.concatenate((train_data, test_data), axis = 0) # The transitional classes 7 to 12 are very small compared to the first 6 classes. Since # we are mostly interested in data sets for which no extra care is needed, we remove these # six classes. 
rows = numpy.shape(data)[0] columns = numpy.shape(data)[1] data = data[numpy.where(data[0:rows, 0] <= 6)[0], 0:columns] save_data_to_file(data, 'smartphone_human_activity_postural', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_pen_recognition_handwritten_characters(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra', 'pen_recognition_handwritten_characters.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes', 'pen_recognition_handwritten_characters.test.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.names', 'pen_recognition_handwritten_characters.description') concat_files(UCIVars.raw_data_folder + 'pen_recognition_handwritten_characters.*.data', UCIVars.raw_data_folder + 'pen_recognition_handwritten_characters.data') replace_chars_in_file('pen_recognition_handwritten_characters.data', ' ', '') replace_chars_in_file('pen_recognition_handwritten_characters.data', ' ', '') data = load_raw_data('pen_recognition_handwritten_characters.data', sep = ',') data = move_label_in_front(data, 16) save_data_to_file(data, 'pen_recognition_handwritten_characters', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_epileptic_seizure_recognition(): print("Epileptic seizure recognition is currently not processed since:") print(" - it was removed from the UCI repository") #prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00388/data.csv', 'epileptic_seizure_recognition.data') #data = load_raw_data('epileptic_seizure_recognition.data', description_columns = 1, sep = ',') #data = move_label_in_front(data, 178) #save_data_to_file(data, 'epileptic_seizure_recognition', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_nursery(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data', 'nursery.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.names', 'nursery.description') data = load_mixed_raw_data('nursery.data', ',') categories = [u'usual', u'pretentious', u'great_pret'] data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',') categories = [u'proper', u'less_proper', u'improper', u'critical', u'very_crit'] data = replace_ordinals_in_mixed_data(data, categories, 1, separator = ',') categories = [u'complete', u'completed', u'incomplete', u'foster'] data = replace_ordinals_in_mixed_data(data, categories, 2, separator = ',') categories = [u'1', u'3', u'2', u'more'] data = replace_ordinals_in_mixed_data(data, categories, 3, separator = ',') categories = [u'convenient', u'less_conv', u'critical'] data = replace_ordinals_in_mixed_data(data, categories, 4, separator = ',') categories = [u'convenient', u'inconv'] data = replace_ordinals_in_mixed_data(data, categories, 5, separator = ',') categories = [u'nonprob', u'slightly_prob', u'problematic'] data = replace_ordinals_in_mixed_data(data, categories, 6, separator = ',') categories = [u'not_recom', u'recommended', u'priority'] data = 
replace_ordinals_in_mixed_data(data, categories, 7, separator = ',') # We combine the classes 'not_recom' and 'recommend', since the latter only has two instances categories = [u'recommend'] data = replace_ordinals_in_mixed_data(data, categories, 8, separator = ',') categories = [u'not_recom', u'very_recom', u'priority', u'spec_prior'] data = replace_ordinals_in_mixed_data(data, categories, 8, separator = ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'nursery.trafo.data', data, sep = ',') data = load_raw_data('nursery.trafo.data', sep = ',') data = move_label_in_front(data, 8) save_data_to_file(data, 'nursery', is_classification = True, is_regression = True) #--------------------------------------------------------------------------------------------------- def get_indoor_user_movement_prediction(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00348/MovementAAL.zip', 'indoor_user_movement_prediction.zip') print("Indoor User Movement Prediction is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the number of time series samples is small, namely a few hundreds") #--------------------------------------------------------------------------------------------------- def get_eeg_eye_state(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff', 'eeg_eye_state.arff') unarff_raw_data('eeg_eye_state') data = load_raw_data('eeg_eye_state.data', sep = ',') data = move_label_in_front(data, 14) save_data_to_file(data, 'eeg_eye_state', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_htru2(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip', 'htru2.zip') unzip_raw_data('htru2.zip') os.rename(UCIVars.raw_data_folder + 'HTRU_2.csv', UCIVars.raw_data_folder + 'htru2.data') os.rename(UCIVars.raw_data_folder + 'Readme.txt', UCIVars.raw_data_folder + 'htru2.description') os.remove(UCIVars.raw_data_folder + 'HTRU_2.arff') # Somehow, the original htru2.data file has a strange format, so that all data is # viewed to be as a single row. Probably, the endofline characters are messed up. # In any case, the following two lines cure this. 
data = load_mixed_raw_data('htru2.data', ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'htru2.data', data, sep = ',') data = load_raw_data('htru2.data', ',') data = move_label_in_front(data, 8) save_data_to_file(data, 'htru2', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_magic_gamma_telescope(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data', 'magic_gamma_telescope.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.names', 'magic_gamma_telescope.description') replace_chars_in_file('magic_gamma_telescope.data', 'g', '1') replace_chars_in_file('magic_gamma_telescope.data', 'h', '-1') data = load_raw_data('magic_gamma_telescope.data', ',') data = move_label_in_front(data, 10) save_data_to_file(data, 'magic_gamma_telescope', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_letter_recognition(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data', 'letter_recognition.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.names', 'letter_recognition.description') data = load_mixed_raw_data('letter_recognition.data', sep = ',') categories = get_categories_in_mixed_data(data, 0) data = replace_ordinals_in_mixed_data(data, sorted(categories), 0, separator = ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'letter_recognition.data', data, sep = ',') data = load_raw_data('letter_recognition.data', ',') save_data_to_file(data, 'letter_recognition', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_occupancy_detection(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip', 'occupancy_detection.zip') unzip_raw_data('occupancy_detection.zip') os.rename(UCIVars.raw_data_folder + 'datatraining.txt', UCIVars.raw_data_folder + 'occupancy_detection.train.data') os.rename(UCIVars.raw_data_folder + 'datatest.txt', UCIVars.raw_data_folder + 'occupancy_detection.val.data') os.rename(UCIVars.raw_data_folder + 'datatest2.txt', UCIVars.raw_data_folder + 'occupancy_detection.test.data') concat_files(UCIVars.raw_data_folder + 'occupancy_detection.*.data', UCIVars.raw_data_folder + 'occupancy_detection.data') replace_chars_in_file('occupancy_detection.data', ' ', ',') replace_chars_in_file('occupancy_detection.data', '"', '') data = load_raw_data('occupancy_detection.data', ',', description_columns = 1, date_column = 1, date_sep = '-', date_order = 'Ymd', time_column = 2, time_sep = ':') data = move_label_in_front(data, 7) save_data_to_file(data, 'occupancy_detection', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_avila(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip', 'avila.zip') unzip_raw_data('avila.zip') os.rename(UCIVars.raw_data_folder + 'avila/avila-description.txt', UCIVars.raw_data_folder + 'avila.description') 
    os.rename(UCIVars.raw_data_folder + 'avila/avila-tr.txt', UCIVars.raw_data_folder + 'avila.train.data')
    os.rename(UCIVars.raw_data_folder + 'avila/avila-ts.txt', UCIVars.raw_data_folder + 'avila.test.data')
    shutil.rmtree(UCIVars.raw_data_folder + 'avila')
    concat_files(UCIVars.raw_data_folder + 'avila.*.data', UCIVars.raw_data_folder + 'avila.data')
    replace_chars_in_file('avila.data', 'A', '1')
    replace_chars_in_file('avila.data', 'B', '2')
    replace_chars_in_file('avila.data', 'C', '3')
    replace_chars_in_file('avila.data', 'D', '4')
    replace_chars_in_file('avila.data', 'E', '5')
    replace_chars_in_file('avila.data', 'F', '6')
    replace_chars_in_file('avila.data', 'G', '7')
    replace_chars_in_file('avila.data', 'H', '8')
    replace_chars_in_file('avila.data', 'I', '9')
    replace_chars_in_file('avila.data', 'W', '10')
    replace_chars_in_file('avila.data', 'X', '11')
    replace_chars_in_file('avila.data', 'Y', '12')
    data = load_raw_data('avila.data', ',')
    data = move_label_in_front(data, 10)
    save_data_to_file(data, 'avila', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_grammatical_facial_expressions():
    prepare_new_data_set_group_id()
    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00317/grammatical_facial_expression.zip', 'grammatical_facial_expression.zip')
    print("Grammatical Facial Expressions is currently not processed since:")
    print(" - according to the description it seems to be a time series data set")
    print(" - the number of time series samples is very low, namely 36")

#---------------------------------------------------------------------------------------------------
def get_chess_krvk():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.data', 'chess_krvk.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.info', 'chess_krvk.description')
    data = load_mixed_raw_data('chess_krvk.data', sep = ',')
    data = auto_replace_categories_in_mixed_data(data, 0, separator = ',')
    data = auto_replace_categories_in_mixed_data(data, 2, separator = ',')
    data = auto_replace_categories_in_mixed_data(data, 4, separator = ',')
    categories = ['draw', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen']
    data = replace_ordinals_in_mixed_data(data, categories, 6, separator = ',', begin_value = -1)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'chess_krvk.trafo.data', data, sep = ',')
    data = load_raw_data('chess_krvk.trafo.data', sep = ',')
    data = move_label_in_front(data, 23)
    save_data_to_file(data, 'chess_krvk', is_classification = True, is_regression = True)

#---------------------------------------------------------------------------------------------------
def get_default_credit_card():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', 'default_credit_card.xls')
    excel_data = pandas.read_excel(UCIVars.raw_data_folder + 'default_credit_card.xls', engine = 'xlrd')
    excel_data.to_csv(UCIVars.raw_data_folder + 'default_credit_card.data')
    data = load_raw_data('default_credit_card.data', sep = ',', description_columns = 1)
    data = move_label_in_front(data, 24)
    save_data_to_file(data, 'default_credit_card', is_classification = True, is_regression =
False) #--------------------------------------------------------------------------------------------------- def get_nomao(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00227/Nomao.zip', 'nomao.zip') unzip_raw_data('nomao.zip') os.rename(UCIVars.raw_data_folder + 'Nomao/Nomao.data', UCIVars.raw_data_folder + 'nomao.data') os.rename(UCIVars.raw_data_folder + 'Nomao/Nomao.names', UCIVars.raw_data_folder + 'nomao.description') shutil.rmtree(UCIVars.raw_data_folder + 'Nomao') replace_chars_in_file('nomao.data', '#', ',') data = load_mixed_raw_data('nomao.data', sep = ',', header = False) categories = ['s', 'm', 'n'] columns = [8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 93, 97, 101, 105, 109, 113, 117] for i in range(len(columns)): data = replace_ordinals_in_mixed_data(data, categories, columns[i], ',', unknown_string = '') data = auto_replace_missing_in_mixed_data(data, unknown_string = '?') write_mixed_raw_data(UCIVars.raw_data_folder + 'nomao.trafo.data', data, sep = ',') data = load_raw_data('nomao.trafo.data', sep = ',') data = move_label_in_front(data, 120) save_data_to_file(data, 'nomao', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_indoor_loc_mag(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00343/UJIIndoorLoc-Mag-forUCI.zip', 'indoor_loc_mag.zip') print("Indoor Location Mag is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the number of time series samples is too low") #--------------------------------------------------------------------------------------------------- def get_activity_recognition(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00366/AReM.zip', 'activity_recognition.zip') print("Activity Recognition is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the number of time series samples is too low") #--------------------------------------------------------------------------------------------------- def get_bank_marketing(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip', 'bank_marketing.zip') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip', 'bank_marketing_additional.zip') unzip_raw_data('bank_marketing.zip') os.rename(UCIVars.raw_data_folder + 'bank-full.csv', UCIVars.raw_data_folder + 'bank_marketing.data') os.rename(UCIVars.raw_data_folder + 'bank-names.txt', UCIVars.raw_data_folder + 'bank_marketing.description') os.remove(UCIVars.raw_data_folder + 'bank.csv') replace_chars_in_file('bank_marketing.data', '"', '') data = load_mixed_raw_data('bank_marketing.data', sep = ';', header = True) categories = ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed'] data = replace_categories_in_mixed_data(data, categories, 1, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['divorced', 'married', 'single'] data = replace_categories_in_mixed_data(data, categories, 2, ';', unknown_string = 'unknown', unknown_replacement_value = 0) 
categories = ['primary', 'secondary', 'tertiary'] data = replace_ordinals_in_mixed_data(data, categories, 3, ';', unknown_string = '') categories = ['no', 'yes'] data = replace_bin_cats_in_mixed_data(data, categories, 4, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 6, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 7, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 16, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['cellular', 'telephone'] data = replace_bin_cats_in_mixed_data(data, categories, 8, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] data = replace_circulars_in_mixed_data(data, categories, 10, ';', unknown_string = 'unknown') categories = ['failure', 'success'] data = replace_bin_cats_in_mixed_data(data, categories, 15, ';', unknown_string = 'unknown', unknown_replacement_value = 0) write_mixed_raw_data(UCIVars.raw_data_folder + 'bank_marketing.trafo.data', data, sep = ';') data = load_raw_data('bank_marketing.trafo.data', sep = ';', na_string = 'unknown') data = move_label_in_front(data, 29) save_data_to_file(data, 'bank_marketing', is_classification = True, is_regression = False) #------------------------------------------------ unzip_raw_data('bank_marketing_additional.zip') shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX') os.rename(UCIVars.raw_data_folder + 'bank-additional/bank-additional-full.csv', UCIVars.raw_data_folder + 'bank_marketing_additional.data') os.rename(UCIVars.raw_data_folder + 'bank-additional/bank-additional-names.txt', UCIVars.raw_data_folder + 'bank_marketing_additional.description') shutil.rmtree(UCIVars.raw_data_folder + 'bank-additional') replace_chars_in_file('bank_marketing_additional.data', '"', '') data = load_mixed_raw_data('bank_marketing_additional.data', sep = ';', header = True) categories = ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed'] data = replace_categories_in_mixed_data(data, categories, 1, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['divorced', 'married', 'single'] data = replace_categories_in_mixed_data(data, categories, 2, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree'] data = replace_ordinals_in_mixed_data(data, categories, 3, ';', unknown_string = '') categories = ['no', 'yes'] data = replace_bin_cats_in_mixed_data(data, categories, 4, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 5, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 6, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 20, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['cellular', 'telephone'] data = replace_bin_cats_in_mixed_data(data, categories, 7, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 
'nov', 'dec'] data = replace_circulars_in_mixed_data(data, categories, 8, ';', unknown_string = 'unknown') categories = ['mon', 'tue', 'wed', 'thu', 'fri'] data = replace_circulars_in_mixed_data(data, categories, 9, ';', unknown_string = 'unknown') categories = ['failure', 'success'] data = replace_bin_cats_in_mixed_data(data, categories, 14, ';', unknown_string = 'nonexistent', unknown_replacement_value = 0) write_mixed_raw_data(UCIVars.raw_data_folder + 'bank_marketing_additional.trafo.data', data, sep = ';') data = load_raw_data('bank_marketing_additional.trafo.data', sep = ';') data = move_label_in_front(data, 34) save_data_to_file(data, 'bank_marketing_additional', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_census_income(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', 'adult.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', 'adult.test.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names', 'adult.description') if os.path.exists(UCIVars.raw_data_folder + 'adult.trafo.data'): os.remove(UCIVars.raw_data_folder + 'adult.trafo.data') concat_files(UCIVars.raw_data_folder + 'adult.t*.data', UCIVars.raw_data_folder + 'adult.data') replace_chars_in_file('adult.data', '>50K.', '>50K') replace_chars_in_file('adult.data', '<=50K.', '<=50K') replace_chars_in_file('adult.data', '|1x3 Cross validator', '') replace_chars_in_file('adult.data', ', ', ',') data = load_mixed_raw_data('adult.data', sep = ',', header = False) categories = ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'] data = replace_categories_in_mixed_data(data, categories, 1, ',', unknown_string = '') categories = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Assoc-acdm', 'Assoc-voc', 'Some-college', 'Bachelors', 'Prof-school', 'Masters', 'Doctorate'] data = replace_ordinals_in_mixed_data(data, categories, 3, ',', unknown_string = '') categories = ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'] data = replace_categories_in_mixed_data(data, categories, 5, ',', unknown_string = '') categories = ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'] data = replace_categories_in_mixed_data(data, categories, 6, ',', unknown_string = '') categories = ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'] data = replace_categories_in_mixed_data(data, categories, 7, ',', unknown_string = '') categories = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'] data = replace_categories_in_mixed_data(data, categories, 8, ',', unknown_string = '') categories = ['Female', 'Male'] data = replace_bin_cats_in_mixed_data(data, categories, 9, ',', unknown_string = '') categories = ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 
'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'] data = replace_categories_in_mixed_data(data, categories, 13, ',', unknown_string = '') categories = ['<=50K', '>50K'] data = replace_bin_cats_in_mixed_data(data, categories, 14, ',', unknown_string = '') write_mixed_raw_data(UCIVars.raw_data_folder + 'adult.trafo.data', data, sep = ',') data = load_raw_data('adult.trafo.data', sep = ',') data = move_label_in_front(data, 89) save_data_to_file(data, 'adult', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_emg_for_gestures(): prepare_new_data_set_group_id() print("EMG for Gestures is currently not processed since:") print(" - according to the description it seems to be a time series data set") #--------------------------------------------------------------------------------------------------- def get_indoor_channel_measurements(): prepare_new_data_set_group_id() print("Indoor Channel Measurements is currently not processed since:") print(" - according to the description it seems to be a complicated time series data set") #--------------------------------------------------------------------------------------------------- def get_electrical_grid_stability_simulated(): prepare_new_data_set_group_id() download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv', 'electrical_grid_stability_simulated.data') data = load_mixed_raw_data('electrical_grid_stability_simulated.data', sep = ',', header = True) categories = get_categories_in_mixed_data(data, 13) data = replace_bin_cats_in_mixed_data(data, categories, 13, ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'electrical_grid_stability_simulated.data', data, sep = ',') data = load_raw_data('electrical_grid_stability_simulated.data', ',') data_class = move_label_in_front(data, 13) data_class = remove_columns(data_class, 13) save_data_to_file(data_class, 'electrical_grid_stability_simulated', is_classification = True, is_regression = False) data_regr = move_label_in_front(data, 12) data_regr = remove_columns(data_regr, 13) save_data_to_file(data_regr, 'electrical_grid_stability_simulated', is_classification = False, is_regression = True) #--------------------------------------------------------------------------------------------------- def get_online_shoppers_attention(): prepare_new_data_set_group_id() download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv', 'online_shoppers_attention.data') data = load_mixed_raw_data('online_shoppers_attention.data', sep = ',', header = True) data = auto_replace_categories_in_mixed_data(data, 16, ',') data = auto_replace_categories_in_mixed_data(data, 17, ',') categories = get_categories_in_mixed_data(data, 15) data = replace_categories_in_mixed_data(data, categories, 15, ',') categories = [u'Jan', u'Feb', u'Mar', u'Apr', u'May', u'June', u'Jul', u'Aug', u'Sep', u'Oct', u'Nov', u'Dec'] data = replace_circulars_in_mixed_data(data, categories, 10, ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'online_shoppers_attention.data', data, sep = ',') data = load_raw_data('online_shoppers_attention.data', ',') data = move_label_in_front(data, 20) save_data_to_file(data, 'online_shoppers_attention', is_classification = 
True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_pmu_ud():
    prepare_new_data_set_group_id()
    print("PMU-UD is currently not processed since:")
    print(" - the data consists of .jpg images")

#---------------------------------------------------------------------------------------------------
def get_seoul_bike_data():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv', 'seoul_bike_data.data')
    # The purpose of the following two lines is to remove the header, which gives an annoying encoding error...
    data = pandas.read_csv(UCIVars.raw_data_folder + 'seoul_bike_data.data', encoding = 'unicode_escape')
    data.to_csv(UCIVars.raw_data_folder + 'seoul_bike_data.data', header = False, index = False)
    data = load_mixed_raw_data('seoul_bike_data.data', sep = ',', header = False)
    categories = ['No', 'Yes']
    data = replace_bin_cats_in_mixed_data(data, categories, column = 13, separator = ',')
    categories = ['No Holiday', 'Holiday']
    data = replace_bin_cats_in_mixed_data(data, categories, column = 12, separator = ',')
    categories = ['Winter', 'Spring', 'Summer', 'Autumn']
    data = replace_circulars_in_mixed_data(data, categories, 11, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'seoul_bike_data.data', data, sep = ',')
    data = load_raw_data('seoul_bike_data.data', sep=',', date_column=0, date_sep='/', date_order=['d','m','Y'], header=False)
    data = move_label_in_front(data, 1)
    save_data_to_file(data, 'seoul_bike_data', is_classification = False, is_regression = True)

#---------------------------------------------------------------------------------------------------
def get_south_german_credit():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00573/SouthGermanCredit.zip', 'south_german_credit.zip')
    unzip_raw_data('south_german_credit.zip')
    remove_files(UCIVars.raw_data_folder, 'read_SouthGermanCredit.R')
    remove_files(UCIVars.raw_data_folder, 'codetable.txt')
    remove_files(UCIVars.raw_data_folder, 'south_german_credit.zip')
    data = load_raw_data('SouthGermanCredit.asc', sep = ' ', header = True)
    data = move_label_in_front(data, 20)
    save_data_to_file(data, 'south_german_credit', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_shill_bidding():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00562/Shill%20Bidding%20Dataset.csv', 'shill_bidding.data')
    data = load_mixed_raw_data('shill_bidding.data', sep = ',', header = True)
    # Remove Record ID, Auction ID, Bidder ID
    data = remove_columns(data, [0, 1, 2])
    write_mixed_raw_data(UCIVars.raw_data_folder + 'shill_bidding.data', data, sep = ',')
    data = load_raw_data('shill_bidding.data', sep = ',')
    data = move_label_in_front(data, 9)
    save_data_to_file(data, 'shill_bidding', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_gas_turbine():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00551/pp_gas_emission.zip', 'gas_turbine.zip')
    unzip_raw_data('gas_turbine.zip')
    remove_files(UCIVars.raw_data_folder, 'gas_turbine.zip')
    concat_files(UCIVars.raw_data_folder + 'gt_201*.csv', UCIVars.raw_data_folder + 'gt.data')
remove_files(UCIVars.raw_data_folder, 'gt_201*.csv') data = load_raw_data('gt.data', sep = ',', header = True) # Will report 4 errors because of headers in the middle of the data data_co = remove_columns(data, [10]) data_co = move_label_in_front(data_co, 9) save_data_to_file(data_co, 'gas_turbine_co', is_classification=False, is_regression=True) data_nox = remove_columns(data, [9]) data_nox = move_label_in_front(data_nox, 9) save_data_to_file(data_nox, 'gas_turbine_nox', is_classification=False, is_regression=True) #--------------------------------------------------------------------------------------------------- def get_oral_toxicity(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00508/qsar_oral_toxicity.zip', 'oral_toxicity.zip') unzip_raw_data('oral_toxicity.zip') remove_files(UCIVars.raw_data_folder, 'oral_toxicity.zip') data = load_mixed_raw_data('qsar_oral_toxicity.csv', sep = ';', header = False) categories = ['negative', 'positive'] data = replace_bin_cats_in_mixed_data(data, categories, column = 1024, separator = ';') write_mixed_raw_data(UCIVars.raw_data_folder + 'oral_toxicity.data', data, sep = ',') remove_files(UCIVars.raw_data_folder, 'qsar_oral_toxicity.csv') data = load_raw_data('oral_toxicity.data', sep = ',', header = False) data = move_label_in_front(data, 1024) save_data_to_file(data, 'oral_toxicity', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_wave_energy(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00494/WECs_DataSet.zip', 'wave_energy.zip') unzip_raw_data('wave_energy.zip') remove_files(UCIVars.raw_data_folder, 'wave_energy.zip') # For each of the 4 data sets, the last column contains the sum of columns 32 to 47. # I assume the last column is the label and columns 32 to 47 are intermediate results # and that only the first 32 columns should be used as features. 
indices = range(32, 48) data_adelaide = load_raw_data('WECs_DataSet/Adelaide_Data.csv', sep=',') data_adelaide = remove_columns(data_adelaide, indices) data_adelaide = move_label_in_front(data_adelaide, 32) save_data_to_file(data_adelaide, 'wave_energy_adelaide', is_classification=False, is_regression=True) data_perth = load_raw_data('WECs_DataSet/Perth_Data.csv', sep=',') data_perth = remove_columns(data_perth, indices) data_perth = move_label_in_front(data_perth, 32) save_data_to_file(data_perth, 'wave_energy_perth', is_classification=False, is_regression=True) data_sydney = load_raw_data('WECs_DataSet/Sydney_Data.csv', sep=',') data_sydney = remove_columns(data_sydney, indices) data_sydney = move_label_in_front(data_sydney, 32) save_data_to_file(data_sydney, 'wave_energy_sydney', is_classification=False, is_regression=True) data_tasmania = load_raw_data('WECs_DataSet/Tasmania_Data.csv', sep=',') data_tasmania = remove_columns(data_tasmania, indices) data_tasmania = move_label_in_front(data_tasmania, 32) save_data_to_file(data_tasmania, 'wave_energy_tasmania', is_classification=False, is_regression=True) #--------------------------------------------------------------------------------------------------- def get_firewall(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00542/log2.csv', 'firewall.data') data = load_mixed_raw_data('firewall.data', sep = ',', header = True) categories = ['allow', 'drop', 'deny', 'reset-both'] data = replace_ordinals_in_mixed_data(data, categories, column = 4, separator = ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'firewall.data', data, sep = ',') data = load_raw_data('firewall.data', sep = ',', header = False) data = move_label_in_front(data, 4) save_data_to_file(data, 'firewall', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_real_estate_value(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx', 'real_estate_value.xlsx') excel_data = pandas.read_excel(UCIVars.raw_data_folder + 'real_estate_value.xlsx', engine = 'openpyxl') excel_data.to_csv(UCIVars.raw_data_folder + 'real_estate_value.data', index = False) remove_files(UCIVars.raw_data_folder, 'real_estate_value.xlsx') data = load_raw_data('real_estate_value.data', sep = ',', header = True) data = remove_columns(data, [0]) data = move_label_in_front(data, 6) save_data_to_file(data, 'real_estate_value', is_classification = False, is_regression = True) #--------------------------------------------------------------------------------------------------- def get_crop_mapping(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00525/data.zip', 'crop_mapping.zip') unzip_raw_data('crop_mapping.zip') remove_files(UCIVars.raw_data_folder, 'crop_mapping.zip') data = load_raw_data('WinnipegDataset.txt', sep=',', header=True) save_data_to_file(data, 'crop_mapping', is_classification=True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_bitcoin_heist(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip', 'bitcoin_heist.zip') unzip_raw_data('bitcoin_heist.zip') remove_files(UCIVars.raw_data_folder, 'bitcoin_heist.zip') 
    data = load_mixed_raw_data('BitcoinHeistData.csv', sep = ',', header = True)
    data = remove_columns(data, [0])
    # The labels consist of 28 ransomware types and the label 'white' for non-ransomware.
    # We merge every ransomware type into one class. The resulting data set still only has 1.4% positive labels.
    categories = sorted(get_categories_in_mixed_data(data, 8))
    new_cats = [1]*(len(categories)-1) + [2]
    data = replace_manual_in_mixed_data(data, categories, 8, new_cats, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'bitcoin_heist.data', data, sep = ',')
    remove_files(UCIVars.raw_data_folder, 'BitcoinHeistData.csv')
    data = load_raw_data('bitcoin_heist.data', sep = ',', header = False)
    data = move_label_in_front(data, 8)
    save_data_to_file(data, 'bitcoin_heist', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_query_analytics():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00493/datasets.zip', 'query_analytics.zip')
    unzip_raw_data('query_analytics.zip')
    remove_files(UCIVars.raw_data_folder, 'query_analytics.zip')
    remove_files(UCIVars.raw_data_folder + 'Datasets/', 'Radius-Queries.csv')
    data_radius = load_raw_data('Datasets/Radius-Queries-Count.csv', sep = ',', header = False)
    data_radius = move_label_in_front(data_radius, 3)
    save_data_to_file(data_radius, 'radius_query', is_classification = False, is_regression = True)
    data_range = load_raw_data('Datasets/Range-Queries-Aggregates.csv', sep = ',', header = True)
    data_range = remove_columns(data_range, [0])
    data_range_incidents = remove_columns(data_range, [5, 6])
    data_range_incidents = move_label_in_front(data_range_incidents, 4)
    save_data_to_file(data_range_incidents, 'range_query_incidents', is_classification = False, is_regression = True)
    data_range_arrests = remove_columns(data_range, [4, 6])
    data_range_arrests = move_label_in_front(data_range_arrests, 4)
    save_data_to_file(data_range_arrests, 'range_query_arrests', is_classification = False, is_regression = True)
    data_range_beat = remove_columns(data_range, [4, 5])
    data_range_beat = move_label_in_front(data_range_beat, 4)
    save_data_to_file(data_range_beat, 'range_query_beat', is_classification = False, is_regression = True)

#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
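# Editor's aside: download_all_uci below wires up the UCIVars.* output folders and then runs the
# get_* functions defined above. A minimal sketch of running one getter stand-alone (the base
# path here is hypothetical, and this bypasses the Paths object used below):
def _example_run_single_getter():
    base_folder = '/tmp/uci_example'
    UCIVars.data_folder = base_folder + '/data/'
    UCIVars.raw_data_folder = base_folder + '/raw_data/'
    UCIVars.regression_data_folder = base_folder + '/regression-data/'
    UCIVars.binary_classification_data_folder = base_folder + '/bin-class-data/'
    UCIVars.multiclass_classification_data_folder = base_folder + '/multi-class-data/'
    UCIVars.statistics_filename = base_folder + '/data_statistics.csv'
    for folder in [UCIVars.data_folder, UCIVars.raw_data_folder, UCIVars.regression_data_folder,
                   UCIVars.binary_classification_data_folder, UCIVars.multiclass_classification_data_folder]:
        utils.ensureDir(folder)
    get_htru2()  # downloads and converts a single, small data set
#---------------------------------------------------------------------------------------------------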
def download_all_uci(paths: Paths):
    # preparation
    # code was written with global variables, so we set the global variable values here for the paths
    base_folder = str(paths.uci_download())
    #global data_folder
    #global UCIVars.raw_data_folder
    #global regression_data_folder
    #global binary_classification_data_folder
    #global multiclass_classification_data_folder
    #global statistics_filename
    UCIVars.data_folder = base_folder + '/data/'
    UCIVars.raw_data_folder = base_folder + '/raw_data/'
    UCIVars.regression_data_folder = base_folder + '/regression-data/'
    UCIVars.binary_classification_data_folder = base_folder + '/bin-class-data/'
    UCIVars.multiclass_classification_data_folder = base_folder + '/multi-class-data/'
    UCIVars.statistics_filename = base_folder + '/data_statistics.csv'
    utils.ensureDir(UCIVars.data_folder)
    utils.ensureDir(UCIVars.raw_data_folder)
    utils.ensureDir(UCIVars.regression_data_folder)
    utils.ensureDir(UCIVars.binary_classification_data_folder)
    utils.ensureDir(UCIVars.multiclass_classification_data_folder)
    # this was also a global statement
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    #if os.path.exists(statistics_filename):
    #os.remove(statistics_filename)

    #---------------------------------------------------------------------------------------------------
    # Data sets that are (primarily) of regression type
    #---------------------------------------------------------------------------------------------------
    get_skill_craft()
    get_cargo_2000()
    get_KDC_4007()
    get_sml2010()
    get_wine_quality()
    get_parkinson()
    get_insurance_benchmark()
    get_air_quality()
    get_EEG_steady_state()
    get_cycle_power_plant()
    get_carbon_nanotubes()
    get_naval_propulsion()
    get_blood_pressure()
    get_gas_sensor_drift()
    get_bike_sharing()
    get_appliances_energy()
    get_indoor_loc()
    get_online_news_popularity()
    get_facebook_comment_volume()
    get_bejing_pm25()
    get_protein_tertiary_structure()
    get_five_cities_pm25()
    get_tamilnadu_electricity()
    # Additional data sets added after mid 2018
    get_metro_interstate_traffic_volume()
    get_facebook_live_sellers_thailand()
    get_parking_birmingham()
    get_tarvel_review_ratings()
    get_superconductivity()
    get_gnfuv_unmanned_surface_vehicles()
    # Additional data sets added February 2021
    #get_seoul_bike_data()
    #get_gas_turbine()
    #get_wave_energy()
    #get_real_estate_value()
    #get_query_analytics()

    #---------------------------------------------------------------------------------------------------
    # Data sets that are (primarily) of classification type
    #---------------------------------------------------------------------------------------------------
    get_phishing()
    get_ozone_level()
    get_opportunity_activity()
    get_australian_sign_language()
    get_seismic_bumps()
    get_meu_mobile_ksd()
    get_character_trajectories()
    get_vicon_physical_action()
    get_simulated_falls()
    get_chess()
    get_abalone()
    get_madelon()
    get_spambase()
    get_wilt()
    get_waveform()
    get_wall_following_robot()
    get_page_blocks()
    get_optical_recognition_handwritten_digits()
    get_bach_chorals_harmony()
    get_smartphone_human_activity()
    get_turkiye_student_evaluation()
    get_artificial_characters()
    get_first_order_theorem_proving()
    get_landsat_satimage()
    get_hiv_1_protease()
    get_musk()
    get_ble_rssi_indoor_location()
    get_anuran_calls()
    get_thyroids()
    get_isolet()
    get_mushroom()
    get_assamese_characters()
    get_arabic_digit()
    get_eeg_steady_state_visual()
    get_gesture_phase_segmentation()
    get_emg_physical_action()
    get_human_activity_smartphone()
    get_polish_companies_bankruptcy()
    get_crowd_sourced_mapping()
    get_firm_teacher_clave()
    get_smartphone_human_activity_postural()
    get_pen_recognition_handwritten_characters()
    get_epileptic_seizure_recognition()
    get_nursery()
    get_indoor_user_movement_prediction()
    get_eeg_eye_state()
    get_htru2()
    get_magic_gamma_telescope()
    get_letter_recognition()
    get_occupancy_detection()
    get_avila()
    get_grammatical_facial_expressions()
    get_chess_krvk()
    get_default_credit_card()
    get_nomao()
    get_indoor_loc_mag()
    get_activity_recognition()
    get_bank_marketing()
    get_census_income()
    ## Additional data sets added after mid 2018
    get_emg_for_gestures()
    get_indoor_channel_measurements()
    get_electrical_grid_stability_simulated()
    get_online_shoppers_attention()
    get_pmu_ud()
    # Additional data sets added February 2021
    #get_south_german_credit()
    #get_shill_bidding()
    #get_oral_toxicity()
    #get_firewall()
    #get_crop_mapping()
#get_bitcoin_heist() ================================================ FILE: pytabkit/bench/data/import_talent_benchmark.py ================================================ from pathlib import Path from typing import Optional import numpy as np import pandas as pd from pytabkit.bench.data.import_tasks import PandasTask from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.models import utils from pytabkit.models.data.data import TaskType def import_talent_benchmark(paths: Paths, talent_folder: str, source_name: str, allow_regression: bool = True, allow_classification: bool = True, normalize_y: bool = False, min_n_samples: int = 1, max_n_classes: int = 100000, min_n_classes: int = 0, remove_missing_cont: bool = True, remove_duplicates: bool = False, max_n_samples: Optional[int] = None, ignore_above_n_classes: int = 100000, dry_run: bool = False): talent_folder = Path(talent_folder) dataset_folders = [dataset_folder for dataset_folder in talent_folder.iterdir()] for i, dataset_folder in enumerate(dataset_folders): dataset_name = dataset_folder.name info = utils.deserialize(dataset_folder / 'info.json', use_json=True) if dry_run: train_size = info.get("train_size", None) n_samples = info['train_size'] + info['val_size'] + info['test_size'] if train_size >= 100_000: print(f'{dataset_name}: {train_size=}') if n_samples >= 100_000: print(f'{dataset_name}: {n_samples=}') continue print(f'Importing dataset {dataset_name} [{i + 1}/{len(dataset_folders)}]') # can be 'regression', 'multiclass', 'binclass' task_type = info['task_type'] print(f'{task_type=}') assert task_type in ['regression', 'multiclass', 'binclass'] if task_type == 'regression' and not allow_regression: print(f'Skipping regression datasets') continue elif task_type != 'regression' and not allow_classification: print(f'Skipping classification datasets') continue # can be 1 for regression n_classes = info.get('n_classes', info.get('num_classes', None)) print(f'{n_classes=}') y = np.concatenate( [np.load(dataset_folder / f'y_{part}.npy', allow_pickle=True) for part in ['train', 'val', 'test']], axis=0) n_samples = y.shape[0] # print(f'{y[:5]=}') # print(f'{y.shape=}, {y.dtype=}') if len(y.shape) == 2 and y.shape[1] == 1: y = y[:, 0] y_df = pd.Series(y) if task_type == 'regression': y_df = y_df.astype(np.float32) else: y_df = y_df.astype('category') if np.any(y_df.isnull()): raise ValueError(f'Missing values in class labels not allowed') x_dfs = [] if utils.existsFile(dataset_folder / 'N_train.npy'): N = np.concatenate( [np.load(dataset_folder / f'N_{part}.npy', allow_pickle=True) for part in ['train', 'val', 'test']], axis=0) # print(f'{N.shape=}, {N.dtype=}') df = pd.DataFrame(N, columns=[f'cont_{i}' for i in range(N.shape[1])]).astype(np.float32) # print(df.head()) # print(f'{df.columns=}') x_dfs.append(df) # print(N.flatten()[0]) # if np.any(np.isnan(N)): if np.any(df.isnull()): print(f'Contains missing numerical values! ##########################################') else: N = np.zeros(shape=(n_samples, 0), dtype=np.float32) if utils.existsFile(dataset_folder / 'C_train.npy'): C = np.concatenate( [np.load(dataset_folder / f'C_{part}.npy', allow_pickle=True) for part in ['train', 'val', 'test']], axis=0) # print(f'{C.shape=}, {C.dtype=}') df = pd.DataFrame(C, columns=[f'cat_{i}' for i in range(C.shape[1])]).astype('category') # print(f'{df.columns=}') x_dfs.append(df) if np.any(df.isnull()): print(f'Contains missing categorical values! 
##########################################') else: C = np.zeros(shape=(n_samples, 0), dtype=np.int32) if len(x_dfs) == 1: x_df = x_dfs[0] elif len(x_dfs) == 2: x_df = pd.concat(x_dfs, axis='columns') else: raise ValueError(f'Expected len(x_dfs) in [1, 2], but got {len(x_dfs)=}') cat_columns = x_df.select_dtypes(include='category').columns.tolist() cat_indicator = [column in cat_columns for column in x_df.columns] task_type = TaskType.REGRESSION if task_type == 'regression' else TaskType.CLASSIFICATION # task_source_name = 'talent-reg' if task_type == TaskType.REGRESSION else 'talent-class' task_desc = TaskDescription(source_name, dataset_name) pd_task = PandasTask(x_df, y_df, cat_indicator, task_type, more_info=info) if remove_missing_cont: pd_task.remove_missing_cont() if remove_duplicates: pd_task.deduplicate() if max_n_samples is not None: pd_task.subsample(max_n_samples) if normalize_y: pd_task.normalize_regression_y() if pd_task.get_n_classes() > ignore_above_n_classes: print(f'Ignoring task with {pd_task.get_n_classes()} > {ignore_above_n_classes} classes') continue if pd_task.get_n_classes() > max_n_classes: print(f'Only keeping the most frequent {max_n_classes} out of {pd_task.get_n_classes()} classes') pd_task.limit_n_classes(max_n_classes) if pd_task.get_n_samples() < min_n_samples: print(f'Too few samples ({pd_task.get_n_samples()} < {min_n_samples}), ignoring task') continue if pd_task.get_n_classes() < min_n_classes: print(f'Too few classes, ignoring task') continue pd_task.get_task(task_desc).save(paths) if not dry_run: TaskCollection.from_source(source_name, paths).save(paths) ================================================ FILE: pytabkit/bench/data/import_tasks.py ================================================ from typing import Union, Optional, List, Dict import sklearn.model_selection import torch from pathlib import Path import numpy as np import pandas as pd from pytabkit.bench.data.common import TaskSource from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskInfo, Task, TaskCollection from pytabkit.models import utils from pytabkit.models.data.data import TaskType, DictDataset, TensorInfo def download_if_not_exists(url: str, dest: str): import requests """ Simple function for downloading a file from an url if no file at the destination path exists. :param url: URL of the file to download. :param dest: Path where to save the downloaded file. 
""" # following https://dzone.com/articles/simple-examples-of-downloading-files-using-python utils.ensureDir(dest) if not utils.existsFile(dest): print('Downloading ' + url, flush=True) # file = requests.get(url) # open(dest, 'wb').write(file.content) r = requests.get(url, stream=True) with open(dest, 'wb') as f: print('Progress (dot = 1 MB): ', end='', flush=True) for ch in r.iter_content(chunk_size=1024**2): print('.', end='', flush=True) f.write(ch) print(flush=True) def extract_categories(X): n_cols = X.shape[1] n_samples = X.shape[0] is_categorical = np.asarray([np.allclose(np.abs(X[:, i]), 1.0) for i in range(n_cols)]) cat_idx_groups = [] i = 0 while i < n_cols: if not is_categorical[i]: i += 1 continue compat_signs = [] while i < n_cols: signs = X[:, i] > 0 if np.any([np.any(np.logical_and(signs, cs)) for cs in compat_signs]): break compat_signs.append(signs) i += 1 cat_idx_groups.append(list(np.arange(i - len(compat_signs), i))) cont_idxs = list(np.argwhere(~is_categorical)[:, 0]) X_conts = X[:, cont_idxs] if len(cont_idxs) > 0 else np.zeros(shape=(n_samples, 0), dtype=np.float32) signs = X > 0 # for binary categorical variables, shift by 1 since the category 0 is reserved for missing values X_cats = [np.sum(signs[:, g] * np.arange(1, len(g) + 1), axis=1) + (1 if len(g) == 1 else 0) for g in cat_idx_groups] X_cats = np.stack(X_cats, axis=1).astype(np.int32) if len(X_cats) > 0 else np.zeros(shape=(n_samples, 0), dtype=np.int32) # binary categorical variables need to be shifted one more since here # "-1" is not already the missing variable category cat_sizes = [len(group) + 1 + (1 if len(group) == 1 else 0) for group in cat_idx_groups] return X_conts, X_cats, cat_sizes def check_zero_hot(uci_base_path): uci_base = Path(uci_base_path) uci_paths = [uci_base / 'bin-class-data', uci_base / 'multi-class-data', uci_base / 'regression-data'] for path in uci_paths: ds_names = [file.stem for file in path.iterdir() if file.is_file()] ds_names.sort() for ds_name in ds_names: print('Processing dataset', ds_name) ds_path = path / (ds_name + '.csv') data = np.genfromtxt(ds_path, delimiter=',') X = data[:, 1:] X_cont, X_cat, cat_sizes = extract_categories(X) if np.any(np.logical_and(np.min(X_cat, axis=0) == 0, np.max(X_cat, axis=0) >= 2)): print('This dataset has a zero-hot encoding') def convert_to_class_numbers(y): y = np.rint(y) y_target = np.zeros(y.shape, dtype=np.int32) classes = np.unique(y) n_classes = len(classes) for i, c in enumerate(classes): y_target[y == c] = i return y_target, n_classes def import_from_csv(ds_path: Union[Path, str], task_type: TaskType, task_desc: TaskDescription, paths: Paths, default_split_idx: Optional[int] = None, remove_duplicates: bool = False): data = np.genfromtxt(ds_path, delimiter=',') X = data[:, 1:] y = data[:, 0] x_cont, x_cat, cat_sizes = extract_categories(X) n_classes = 0 if remove_duplicates: # check for duplicates df_cont = pd.DataFrame(x_cont) df_cat = pd.DataFrame(x_cat) df_combined = pd.concat([df_cont, df_cat], axis=1) # Concatenate the two DataFrames along the column axis is_duplicated = df_combined.duplicated() if is_duplicated.any(): print(f'Warning: Data set contains {is_duplicated.sum()} duplicate values! 


def check_zero_hot(uci_base_path):
    uci_base = Path(uci_base_path)
    uci_paths = [uci_base / 'bin-class-data', uci_base / 'multi-class-data', uci_base / 'regression-data']
    for path in uci_paths:
        ds_names = [file.stem for file in path.iterdir() if file.is_file()]
        ds_names.sort()
        for ds_name in ds_names:
            print('Processing dataset', ds_name)
            ds_path = path / (ds_name + '.csv')
            data = np.genfromtxt(ds_path, delimiter=',')
            X = data[:, 1:]
            X_cont, X_cat, cat_sizes = extract_categories(X)
            if np.any(np.logical_and(np.min(X_cat, axis=0) == 0, np.max(X_cat, axis=0) >= 2)):
                print('This dataset has a zero-hot encoding')


def convert_to_class_numbers(y):
    y = np.rint(y)
    y_target = np.zeros(y.shape, dtype=np.int32)
    classes = np.unique(y)
    n_classes = len(classes)
    for i, c in enumerate(classes):
        y_target[y == c] = i
    return y_target, n_classes


def import_from_csv(ds_path: Union[Path, str], task_type: TaskType, task_desc: TaskDescription, paths: Paths,
                    default_split_idx: Optional[int] = None, remove_duplicates: bool = False):
    data = np.genfromtxt(ds_path, delimiter=',')
    X = data[:, 1:]
    y = data[:, 0]
    x_cont, x_cat, cat_sizes = extract_categories(X)
    n_classes = 0

    if remove_duplicates:
        # check for duplicates
        df_cont = pd.DataFrame(x_cont)
        df_cat = pd.DataFrame(x_cat)
        df_combined = pd.concat([df_cont, df_cat], axis=1)  # Concatenate the two DataFrames along the column axis
        is_duplicated = df_combined.duplicated()
        if is_duplicated.any():
            print(f'Warning: Data set contains {is_duplicated.sum()} duplicate values! Removing duplicates...')
            not_duplicated_np = (~is_duplicated).values
            x_cont = x_cont[not_duplicated_np]
            x_cat = x_cat[not_duplicated_np]
            y = y[not_duplicated_np]

    # preprocess y
    if task_type == TaskType.CLASSIFICATION:
        y, n_classes = convert_to_class_numbers(y)
    elif task_type == TaskType.REGRESSION:
        # normalize y
        y = (y - np.mean(y, axis=-1)) / (np.std(y, axis=-1) + 1e-30)

    ds = DictDataset({'x_cont': torch.as_tensor(x_cont, dtype=torch.float32),
                      'x_cat': torch.as_tensor(x_cat, dtype=torch.long),
                      'y': torch.as_tensor(y[:, None])},
                     {'x_cont': TensorInfo(feat_shape=[x_cont.shape[-1]]),
                      'x_cat': TensorInfo(cat_sizes=cat_sizes),
                      'y': TensorInfo(cat_sizes=[n_classes])})
    task_info = TaskInfo.from_ds(task_desc, ds, default_split_idx=default_split_idx)
    task = Task(task_info, ds)
    task.save(paths)


def import_uci_tasks(paths: Paths, remove_duplicates: bool = False, rerun=False):
    uci_base = Path(paths.uci_download())
    uci_matches = [(TaskSource.UCI_BIN_CLASS, uci_base / 'bin-class-data'),
                   (TaskSource.UCI_MULTI_CLASS, uci_base / 'multi-class-data'),
                   (TaskSource.UCI_REGRESSION, uci_base / 'regression-data')]
    for src, path in uci_matches:
        print('Processing task source', src)
        ds_names = [file.stem for file in path.iterdir() if file.is_file()]
        ds_names.sort()
        task_type = TaskType.CLASSIFICATION if 'class' in src else TaskType.REGRESSION
        for ds_name in ds_names:
            task_desc = TaskDescription(task_source=src, task_name=ds_name)
            if (not rerun) and task_desc.exists_task(paths):
                continue
            print('Processing dataset', ds_name)
            ds_path = path / (ds_name + '.csv')
            import_from_csv(ds_path=ds_path, task_type=task_type, task_desc=task_desc, paths=paths,
                            remove_duplicates=remove_duplicates)
        TaskCollection.from_source(src, paths).save(paths)
        print()


def get_openml_task_ids(suite_id: Union[str, int]) -> List[int]:
    import openml
    suite = openml.study.get_suite(suite_id)
    return suite.tasks
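

# Illustrative sketch (not part of the original file). get_openml_task_ids needs the
# optional 'openml' dependency and network access, e.g.:
#   task_ids = get_openml_task_ids(271)   # suite id 271, as used in the __main__ block below
#   print(len(task_ids))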


class PandasTask:
    def __init__(self, x_df: pd.DataFrame, y_df: pd.Series, cat_indicator: List[bool], task_type: str,
                 more_info: Dict):
        if len(x_df.columns) != len(cat_indicator):
            raise ValueError('x.shape[1] != len(category_indicator)')
        self.x_df = x_df  # should be (sparse) pd.DataFrame
        # should be (sparse) pd.Series (i.e. a single column of a DataFrame)
        self.y_df = y_df if task_type == TaskType.REGRESSION else y_df.astype('category')
        # if pd.api.types.is_sparse(self.y_df):
        if isinstance(self.y_df.dtype, pd.SparseDtype):
            self.y_df = self.y_df.sparse.to_dense()
        # this is a fix because category_indicator[0] was False for the dataset MIP-2016-regression
        # despite the column being categorical (dtype=object)
        self.cat_indicator = [v or not pd.api.types.is_numeric_dtype(x_df[x_df.columns[i]])
                              for i, v in enumerate(cat_indicator)]
        self.cont_indicator = [not b for b in self.cat_indicator]
        self.task_type = task_type
        self.more_info_dict = more_info  # could be passed along to TaskInfo

    def get_n_classes(self):
        if self.task_type == TaskType.REGRESSION:
            return 0
        else:
            self.y_df = self.y_df.cat.remove_unused_categories()
            return len(self.y_df.cat.categories)

    def get_n_samples(self):
        return len(self.x_df)

    def deduplicate(self):
        is_duplicated = self.x_df.duplicated()
        if is_duplicated.any():
            print(f'Warning: Data set contains {is_duplicated.sum()} duplicate values! Removing duplicates...')
            self.x_df = self.x_df.loc[~is_duplicated]
            self.y_df = self.y_df[~is_duplicated]

    def limit_n_classes(self, max_n_classes: int):
        n_classes = self.get_n_classes()
        if n_classes <= max_n_classes:
            return
        vc = self.y_df.value_counts()
        # use mergesort to make it more deterministic
        perm = np.argsort(vc, kind='mergesort')
        cats = vc.axes[0]
        largest_classes = [cats[i] for i in perm[-max_n_classes:]]
        other_classes = [cats[i] for i in perm[:-max_n_classes]]
        to_keep = self.y_df.isin(largest_classes)
        self.x_df = self.x_df.loc[to_keep, :]
        self.y_df = self.y_df[to_keep]
        self.y_df = self.y_df.cat.remove_categories(other_classes)

    def subsample(self, max_size: int):
        if self.x_df.shape[0] > max_size:
            gen = np.random.default_rng(seed=0)
            perm = gen.permutation(self.x_df.shape[0])
            idxs = perm[:max_size]
            self.x_df = self.x_df.iloc[idxs]
            self.y_df = self.y_df.iloc[idxs]

    def remove_missing_cont(self):
        if not np.any(self.cont_indicator):
            return  # no continuous columns
        not_nan_rows = self.x_df.loc[:, self.cont_indicator].notna().all(axis=1)
        self.x_df = self.x_df.loc[not_nan_rows, :]
        self.y_df = self.y_df[not_nan_rows]

    def normalize_regression_y(self):
        if self.task_type == TaskType.REGRESSION and len(self.y_df) >= 2:
            y_np = np.asarray(self.y_df)
            self.y_df.loc[:] = (y_np - np.mean(y_np)) / (np.std(y_np) + 1e-30)

    def get_task(self, task_desc: TaskDescription) -> Task:
        x_cont = np.array(self.x_df.loc[:, self.cont_indicator], dtype=np.float32)
        x_cat_columns = []
        cat_sizes = []
        for i, is_cat in enumerate(self.cat_indicator):
            if is_cat:
                # this fails if column names are also row names,
                # but this is maybe a good check because this might otherwise cause problems in other places...
                col = self.x_df[self.x_df.columns[i]].astype('category')
                # print(f'{type(self.x_df.iloc[:, i])=}')
                # print(f'{type(col)=}')
                col = col.cat.remove_unused_categories()
                # detect missing values
                col = col.cat.remove_categories([s for s in ['', '?'] if s in col.cat.categories])
                # don't use asarray to make sure that the array is not read-only
                col = np.array(col.cat.codes, dtype=np.int32)
                col += 1  # category 0 is used for missing value
                x_cat_columns.append(col)
                cat_sizes.append(1 + np.max(col))
        if len(x_cat_columns) > 0:
            x_cat = np.stack(x_cat_columns, axis=1)
        else:
            x_cat = np.zeros(shape=(len(self.x_df), 0), dtype=np.int32)

        if self.task_type == TaskType.CLASSIFICATION:
            self.y_df = self.y_df.cat.remove_unused_categories()
            y = np.array(self.y_df.cat.codes, dtype=np.int32)
            # y, n_classes = convert_to_class_numbers(y)
        else:
            y = np.array(self.y_df, dtype=np.float32)

        ds = DictDataset({'x_cont': torch.as_tensor(x_cont),
                          'x_cat': torch.as_tensor(x_cat),
                          'y': torch.as_tensor(y[:, None])},
                         {'x_cont': TensorInfo(feat_shape=[x_cont.shape[-1]]),
                          'x_cat': TensorInfo(cat_sizes=cat_sizes),
                          'y': TensorInfo(cat_sizes=[self.get_n_classes()])})
        task_info = TaskInfo.from_ds(task_desc, ds, more_info_dict=self.more_info_dict)
        return Task(task_info, ds)
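
    # Illustrative sketch (not part of the original file): constructing a PandasTask by
    # hand for a toy frame with one numeric and one categorical column (names are made up):
    #   x_df = pd.DataFrame({'num': [1.0, 2.0, 3.0],
    #                        'cat': pd.Series(['a', 'b', 'a'], dtype='category')})
    #   pt = PandasTask(x_df, pd.Series([0, 1, 0]), cat_indicator=[False, True],
    #                   task_type=TaskType.CLASSIFICATION, more_info={})
    #   task = pt.get_task(TaskDescription('demo', 'toy'))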

    @staticmethod
    def from_openml_task_id(task_id: int):
        import openml
        task = openml.tasks.get_task(task_id, download_data=False)
        dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)
        x_df, y_df, cat_indicator, names = dataset.get_data(target=task.target_name, dataset_format='dataframe')
        if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
            task_type = TaskType.CLASSIFICATION
        elif task.task_type_id == openml.tasks.TaskType.SUPERVISED_REGRESSION:
            task_type = TaskType.REGRESSION
        else:
            raise RuntimeError(f'Unknown OpenML Task Type: {task.task_type}')
        more_info_dict = dict(openml_task_id=task_id, openml_dataset_id=task.dataset_id)
        return PandasTask(x_df, y_df, cat_indicator, task_type, more_info=more_info_dict)


def set_openml_cache_dir(dir_name: Union[str, Path]):
    import openml
    if 'set_root_cache_directory' in dir(openml.config):
        # newer openml versions
        openml.config.set_root_cache_directory(str(dir_name))
    elif 'set_cache_directory' in dir(openml.config):
        # older openml versions
        openml.config.set_cache_directory(str(dir_name))


def get_openml_ds_names(task_ids: List[int]):
    import openml
    names = []
    for i, task_id in enumerate(task_ids):
        task = openml.tasks.get_task(task_id, download_data=False)
        dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)
        names.append(dataset.name)
    return names


def import_openml(task_ids: List[int], task_source_name: str, paths: Paths, cache_dir: Union[str, Path] = None,
                  normalize_y: bool = False,
                  min_n_samples: int = 1, max_n_classes: int = 100000, min_n_classes: int = 0,
                  remove_missing_cont: bool = True, remove_duplicates: bool = False,
                  exclude_ds_names: Optional[List[str]] = None, max_n_samples: Optional[int] = None,
                  include_only_ds_names: Optional[List[str]] = None, rerun: bool = False,
                  ignore_above_n_classes: int = 100000):
    print(f'Processing task source {task_source_name}')
    import openml
    for i, task_id in enumerate(task_ids):
        with paths.new_tmp_folder() as tmp_folder:
            set_openml_cache_dir(cache_dir or tmp_folder)
            task = openml.tasks.get_task(task_id, download_data=False)
            dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)
            print(f'Processing task {dataset.name} for OpenML task source {task_source_name} [{i+1}/{len(task_ids)}]')
            if dataset.name in (exclude_ds_names or []) or \
                    (include_only_ds_names is not None and dataset.name not in include_only_ds_names):
                print('Task was manually excluded')
                continue
            task_desc = TaskDescription(task_source_name, dataset.name)
            if (not rerun) and task_desc.exists_task(paths):
                continue
            pd_task = PandasTask.from_openml_task_id(task_id)
            if remove_missing_cont:
                pd_task.remove_missing_cont()
            if remove_duplicates:
                pd_task.deduplicate()
            if max_n_samples is not None:
                pd_task.subsample(max_n_samples)
            if normalize_y:
                pd_task.normalize_regression_y()
            if pd_task.get_n_classes() > ignore_above_n_classes:
                print(f'Ignoring task with {pd_task.get_n_classes()} > {ignore_above_n_classes} classes')
                continue
            if pd_task.get_n_classes() > max_n_classes:
                print(f'Only keeping the most frequent {max_n_classes} out of {pd_task.get_n_classes()} classes')
                pd_task.limit_n_classes(max_n_classes)
            if pd_task.get_n_samples() < min_n_samples:
                print(f'Too few samples ({pd_task.get_n_samples()} < {min_n_samples}), ignoring task')
                continue
            if pd_task.get_n_classes() < min_n_classes:
                print(f'Too few classes, ignoring task')
                continue
            pd_task.get_task(task_desc).save(paths)

    TaskCollection.from_source(task_source_name, paths).save(paths)
    print(f'Finished importing OpenML tasks {task_source_name}')
    print()


if __name__ == '__main__':
    # import time
    # paths = Paths.from_env_variables()
    # start_time = time.time()
    # with paths.new_tmp_folder() as tmp_folder:
    #     pass
    # print(f'Time: {time.time() - start_time:g} s')
    task_ids = get_openml_task_ids(271)
    import_openml(task_ids[1:2], 'test', Paths('test'))
    pass
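

# Illustrative sketch (not part of the original file): a more realistic call than the
# __main__ block above, with a persistent cache directory and size filters. The source
# name 'openml-demo' and the cache path are made-up placeholders:
#   import_openml(get_openml_task_ids(271), 'openml-demo', Paths.from_env_variables(),
#                 cache_dir='./openml_cache', max_n_samples=500_000, max_n_classes=10)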


================================================
FILE: pytabkit/bench/data/paths.py
================================================
import os
import uuid
from pathlib import Path
from typing import Optional

from pytabkit.models import utils
import shutil


class TmpPathContextManager:
    """
    Helper class: Context manager for creating temporary paths.
    """
    def __init__(self, path: Path):
        self.path = path

    def __enter__(self) -> Path:
        if utils.existsDir(self.path):
            raise RuntimeError('Temporary path already exists:', self.path)
        utils.create_dir(self.path)
        return self.path

    def __exit__(self, type, value, traceback):
        shutil.rmtree(self.path)


class Paths:
    """
    This class provides paths where data can be stored. Its base path can be configured.
    It requires one base folder, which will have several subfolders:
    algs, tasks, task_collections, results, result_summaries, eval, plots, tmp, ...
    by subclassing this class, specific folders can be re-located (e.g. put data on SSD)
    """
    def __init__(self, base_folder: str, tasks_folder: Optional[str] = None, results_folder: Optional[str] = None,
                 result_summaries_folder: Optional[str] = None, uci_download_folder: Optional[str] = None):
        self.base_path = Path(base_folder)
        self.tasks_path = Path(tasks_folder) if tasks_folder is not None else self.base_path / 'tasks'
        self.results_path = Path(results_folder) if results_folder is not None else self.base_path / 'results'
        self.result_summaries_path = Path(
            result_summaries_folder) if result_summaries_folder is not None else self.base_path / 'result_summaries'
        self.uci_download_path = Path(
            uci_download_folder) if uci_download_folder is not None else self.base_path / 'uci_download'

    @staticmethod
    def from_env_variables() -> 'Paths':
        """
        Construct a Paths object that is constructed from environment variables if they are set.
        Otherwise, the base folder will either be taken from custom_paths.py, if available,
        or set to './tab_bench_data'.
        :return: Paths object.
        """
        base_folder = os.environ.get('TAB_BENCH_DATA_BASE_FOLDER', None)
        if base_folder is None:
            try:
                from scripts import custom_paths
                base_folder = custom_paths.get_base_folder()
            except:
                base_folder = './tab_bench_data'
        tasks_folder = os.environ.get('TAB_BENCH_DATA_TASKS_FOLDER', None)
        results_folder = os.environ.get('TAB_BENCH_DATA_RESULTS_FOLDER', None)
        result_summaries_folder = os.environ.get('TAB_BENCH_DATA_RESULT_SUMMARIES_FOLDER', None)
        uci_download_folder = os.environ.get('TAB_BENCH_DATA_UCI_DOWNLOAD_FOLDER', None)
        return Paths(base_folder=base_folder, tasks_folder=tasks_folder, results_folder=results_folder,
                     result_summaries_folder=result_summaries_folder, uci_download_folder=uci_download_folder)

    def base(self) -> Path:
        return self.base_path

    def algs(self) -> Path:
        return self.base() / 'algs'

    def tasks(self) -> Path:
        return self.tasks_path

    def task_collections(self) -> Path:
        return self.base() / 'task_collections'

    def results(self) -> Path:
        return self.results_path

    def result_summaries(self) -> Path:
        return self.result_summaries_path

    def eval(self) -> Path:
        return self.base() / 'eval'

    def plots(self) -> Path:
        return self.base() / 'plots'

    def tmp(self) -> Path:
        return self.base() / 'tmp'

    def uci_download(self) -> Path:
        return self.uci_download_path

    def resources(self):
        return self.base() / 'resources'

    def times(self) -> Path:
        return self.base() / 'times'

    def new_tmp_folder(self) -> TmpPathContextManager:
        # https://stackoverflow.com/questions/2759644/python-multiprocessing-doesnt-play-nicely-with-uuid-uuid4
        return TmpPathContextManager(self.tmp() / str(uuid.UUID(bytes=os.urandom(16), version=4)))

    def results_alg_task(self, task_desc: 'TaskDescription', alg_name: str, n_cv: int) -> Path:
        return self.results() / alg_name / task_desc.task_source / task_desc.task_name / f'{n_cv}-fold'
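
    # Illustrative sketch (not part of the original file): typical usage is to rely on
    # the environment variables / custom_paths.py via from_env_variables(), or to pass a
    # base folder directly:
    #   paths = Paths('./tab_bench_data')
    #   print(paths.tasks(), paths.results())
    #   with paths.new_tmp_folder() as tmp_path:
    #       ...  # the folder is deleted again on exit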

    def summary_alg_task(self, task_desc: 'TaskDescription', alg_name: str, n_cv: int) -> Path:
        return self.result_summaries() / alg_name / task_desc.task_source / task_desc.task_name \
               / f'{n_cv}-fold'

    def results_alg_task_split(self, task_desc: 'TaskDescription', alg_name: str, n_cv: int,
                               split_type: str, split_id: int) -> Path:
        return self.results_alg_task(task_desc, alg_name, n_cv) / split_type / str(split_id)

    def tasks_task(self, task_desc: 'TaskDescription') -> Path:
        return self.tasks() / task_desc.task_source / task_desc.task_name

    def results_task(self, task_desc: 'TaskDescription') -> Path:
        return self.results() / task_desc.task_source / task_desc.task_name

    def resources_exp_it(self, exp_name: str, iteration: int) -> Path:
        return self.resources() / exp_name / str(iteration)

    def task_source(self, task_source_name: str) -> Path:
        return self.tasks() / task_source_name

    def times_alg_task(self, alg_name: str, task_desc: 'TaskDescription'):
        return self.times() / alg_name / task_desc.task_source / task_desc.task_name


================================================
FILE: pytabkit/bench/data/tasks.py
================================================
from typing import Dict, List, Optional

from pytabkit.bench.data.common import SplitType
from pytabkit.bench.data.paths import Paths
from pytabkit.models import utils
import numpy as np
import torch

from pytabkit.models.data.data import TensorInfo, TaskType, DictDataset
from pytabkit.models.data.splits import SplitInfo, RandomSplitter, IndexSplitter


# Should a Task/TaskInfo allow to configure the sizes of train/val/test?
# Disadvantages:
# - Might want to compare different train sizes on the same test set
# - How do we distinguish them in a TaskDescription?
# current solution is instead to set this in RunConfig
# alternatively, could consider encoding this in the split type,
# but this would only concern the fraction of test samples
# make default split simply an int so it can be serialized more easily?
# Do we ever need something other than an IndexSplitter?


class TaskDescription:
    """
    The minimal necessary information to identify a task, consisting of a task source and a task name.
    A task is a dataset with a specific target variable.
    """
    def __init__(self, task_source: str, task_name: str):
        """
        :param task_source: Name of the source where the task was retrieved from (see ``data.common.TaskSource``)
        :param task_name: Name of the task (dataset).
        """
        self.task_source = task_source
        self.task_name = task_name

    def load_info(self, paths: Paths) -> 'TaskInfo':
        """
        Load the associated TaskInfo object.
        :param paths: Path configuration.
        :return: Task info object.
        """
        return TaskInfo.load(paths, self)

    def load_task(self, paths: Paths):
        """
        Load the associated Task object.
        :param paths: Path configuration.
        :return: Task object.
        """
        return self.load_info(paths).load_task(paths)

    def exists_task(self, paths: Paths):
        """
        Check if the task for this description is stored on disk.
        :param paths: Path configuration.
        :return: True iff it exists.
        """
        return utils.existsFile(paths.tasks_task(self) / 'info.yaml')

    def __str__(self):
        """
        :return: Description as a string ``f'{self.task_source}/{self.task_name}'``
        """
        return f'{self.task_source}/{self.task_name}'

    def to_dict(self) -> Dict:
        """
        Convert to a dictionary for saving.
        :return: Dictionary with 'task_source' and 'task_name' entries.
        """
        return {'task_source': self.task_source, 'task_name': self.task_name}
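
    # Illustrative sketch (not part of the original file). The source/name pair is the
    # identifier used throughout the benchmark, e.g.:
    #   td = TaskDescription('openml-class', 'adult')   # made-up source/name
    #   str(td)                                         # 'openml-class/adult'
    #   TaskDescription.from_dict(td.to_dict()) == td   # round-trips via __eq__ below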
""" return TaskDescription(task_source=data['task_source'], task_name=data['task_name']) def __hash__(self): return hash(str(self)) def __eq__(self, other): if not isinstance(other, TaskDescription): return False return self.task_source == other.task_source and self.task_name == other.task_name class TaskCollection: """ Collection (list) of TaskDescription objects with its own name (can be the name of the task source). """ # there should be a TaskCollection for every TaskSource with the same name # but there can be other collections with other names def __init__(self, coll_name: str, task_descs: List[TaskDescription]): """ :param coll_name: Name of the task collection. :param task_descs: Task descriptions. """ self.coll_name = coll_name self.task_descs = task_descs def save(self, paths: Paths): file = paths.task_collections() / f'{self.coll_name}.yaml' data = {'coll_name': self.coll_name, 'task_descs': [td.to_dict() for td in self.task_descs]} utils.serialize(file, data, use_yaml=True) def load_infos(self, paths: Paths) -> List['TaskInfo']: return [desc.load_info(paths) for desc in self.task_descs] @staticmethod def from_name(coll_name: str, paths: Paths) -> 'TaskCollection': file = paths.task_collections() / f'{coll_name}.yaml' data = utils.deserialize(file, use_yaml=True) task_descs = [TaskDescription.from_dict(d) for d in data['task_descs']] return TaskCollection(data['coll_name'], task_descs) @staticmethod def from_source(task_source: str, paths: Paths) -> 'TaskCollection': """ Create a task collection with all tasks from a given task source (that have been imported/saved with this task source name). The task collection will have the same name as the source. :param task_source: Name of the task source. :param paths: Path configuration. :return: TaskCollection object. """ path = paths.task_source(task_source) if not utils.existsDir(path): return TaskCollection(task_source, []) task_descs = [TaskDescription(task_source, p.name) for p in path.iterdir()] task_descs.sort(key=lambda task_desc: str(task_desc).lower()) # sort by name return TaskCollection(task_source, task_descs) class TaskInfo: """ Information about a task (without containing the dataset itself). """ def __init__(self, task_desc: TaskDescription, n_samples: int, tensor_infos: Dict[str, TensorInfo], default_split_idx: Optional[int], more_info_dict: Optional[Dict], max_n_trainval: Optional[int] = None): """ :param task_desc: Task description. :param n_samples: Number of samples. :param tensor_infos: Information about the tensors (x_cat, x_cont, y). :param default_split_idx: If the dataset has a default split, this is the index of the first test sample. We assume that in this case, the training part is stored before the test part. :param more_info_dict: Dictionary with more information that can be stored, for example about the original OpenML dataset id. :param max_n_trainval: maximum number of samples used for training+validation in random splits. If None (default value), no maximum is imposed. """ self.task_desc = task_desc self.n_samples = n_samples self.tensor_infos = tensor_infos self.task_type = TaskType.REGRESSION if tensor_infos['y'].is_cont() else TaskType.CLASSIFICATION self.default_split_idx = default_split_idx self.more_info_dict = more_info_dict or dict() self.max_n_trainval = max_n_trainval def get_n_classes(self) -> int: """ :return: Number of classes for classification, or 0 for regression. 
""" return self.tensor_infos['y'].get_cat_size_product() # we take the product, but it should only be 1 element def load_task(self, paths: Paths) -> 'Task': """ Load the associated task. :param paths: Path configuration. :return: Task object. """ path = paths.tasks_task(self.task_desc) tensors = {} tensors['x_cont'] = torch.as_tensor(np.load(str(path / 'x_cont.npy'))).type(torch.float32) tensors['x_cat'] = torch.as_tensor(np.load(str(path / 'x_cat.npy'))).type(torch.long) tensors['y'] = torch.as_tensor(np.load(str(path / 'y.npy'))).type( torch.long if self.task_type == TaskType.CLASSIFICATION else torch.float32) ds = DictDataset(tensors=tensors, tensor_infos=self.tensor_infos) return Task(task_info=self, ds=ds) def get_ds_size_gb(self) -> float: """ :return: Dataset size in gigabyte, when stored in torch Tensors (8 byte for categorical variables, 4 byte for continuous variables). """ # need 8 byte for categorical variables (torch.long) but only 4 for continuous (torch.float32) return self.n_samples * sum([ti.get_n_features() * (8 if ti.is_cat() else 4) for ti in self.tensor_infos.values()]) / (1024**3) def save(self, paths: Paths): path = paths.tasks_task(self.task_desc) info_dict = {'task_desc': self.task_desc.to_dict(), 'n_samples': self.n_samples, 'tensor_infos': {key: value.to_dict() for key, value in self.tensor_infos.items()}, 'default_split_idx': None if self.default_split_idx is None else int(self.default_split_idx), 'more_info_dict': self.more_info_dict, 'max_n_trainval': self.max_n_trainval} utils.serialize(path / 'info.yaml', info_dict, use_yaml=True) @staticmethod def load(paths: Paths, task_desc: TaskDescription): info_dict = utils.deserialize(paths.tasks_task(task_desc) / 'info.yaml', use_yaml=True) return TaskInfo(task_desc=TaskDescription.from_dict(info_dict['task_desc']), n_samples=info_dict['n_samples'], tensor_infos={key: TensorInfo.from_dict(value) for key, value in info_dict['tensor_infos'].items()}, default_split_idx=info_dict['default_split_idx'], more_info_dict=info_dict.get('more_info_dict', dict()), max_n_trainval=info_dict.get('max_n_trainval', None)) @staticmethod def from_ds(task_desc: TaskDescription, ds: DictDataset, default_split_idx: Optional[int] = None, more_info_dict: Optional[Dict] = None) -> 'TaskInfo': return TaskInfo(task_desc=task_desc, n_samples=ds.n_samples, tensor_infos=ds.tensor_infos, default_split_idx=default_split_idx, more_info_dict=more_info_dict) def get_random_splits(self, n_splits: int, trainval_fraction: float = 0.8, train_fraction: float = 0.75) -> List[SplitInfo]: # use n_samples to generate alg_seed # in order to have the randomness also depend on the data set and not only on the split index return [SplitInfo(RandomSplitter(seed=i, first_fraction=trainval_fraction, max_n_first=self.max_n_trainval), SplitType.RANDOM, id=i, alg_seed=utils.combine_seeds(self.n_samples, i), train_fraction=train_fraction) for i in range(n_splits)] def get_default_splits(self, n_splits) -> List[SplitInfo]: if self.default_split_idx is None: return [] else: return [SplitInfo(IndexSplitter(self.default_split_idx), SplitType.DEFAULT, id=i, alg_seed=utils.combine_seeds(self.n_samples, i)) for i in range(n_splits)] class Task: """ Task (dataset with defined target variable), consisting of a task info and a dataset. 
""" def __init__(self, task_info: TaskInfo, ds: DictDataset): self.task_info = task_info self.ds = ds # data is on CPU here def save(self, paths: Paths): path = paths.tasks_task(self.task_info.task_desc) utils.ensureDir(path / 'x_cont.npy') np.save(str(path / 'x_cont.npy'), self.ds.tensors['x_cont'].type(torch.float32).numpy()) np.save(str(path / 'x_cat.npy'), self.ds.tensors['x_cat'].type(torch.int32).numpy()) np.save(str(path / 'y.npy'), self.ds.tensors['y'].type( torch.int32 if self.task_info.task_type == TaskType.CLASSIFICATION else torch.float32).numpy()) self.task_info.save(paths) class TaskPackage: """ Combines information about how to run a task on a benchmark. """ def __init__(self, task_info: TaskInfo, split_infos: List[SplitInfo], n_cv: int, n_refit: int, paths: Paths, rerun: bool, alg_name: str, save_y_pred: bool): self.task_info = task_info self.split_infos = split_infos self.n_cv = n_cv self.n_refit = n_refit self.paths = paths self.rerun = rerun self.alg_name = alg_name self.save_y_pred = save_y_pred ================================================ FILE: pytabkit/bench/data/uci_file_ops.py ================================================ import os as os import re as re import csv as csv import math as math from pathlib import Path import pandas as pandas import numpy as numpy import os.path as path import glob as glob import shutil as shutil import zipfile as zipfile from scipy.io import arff import patoolib as patoolib import sklearn.preprocessing as preprocessing import sklearn.datasets as datasets import urllib.request as urllib2 import time import datetime import codecs import platform import tarfile import gzip import ssl from collections import Counter class UCIVars: # formerly global variables, will be re-set by get_uci.download_all_uci() data_folder = '../data/' raw_data_folder = '../raw-data/' regression_data_folder = '../regression-data/' binary_classification_data_folder = '../bin-class-data/' multiclass_classification_data_folder = '../multi-class-data/' statistics_filename = "../data_statistics.csv" data_group_id = 0 # if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)): # ssl._create_default_https_context = ssl._create_unverified_context #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- def prepare_new_data_set_group_id(): print("\n==================================================================") UCIVars.data_group_id = UCIVars.data_group_id + 1 #--------------------------------------------------------------------------------------------------- def make_folder(folder): if (os.path.exists(folder) == False): os.mkdir(folder) #--------------------------------------------------------------------------------------------------- def download_and_save(url, filename): data_link = urllib2.urlopen(url) print('Downloading: ' + filename) if os.path.exists(UCIVars.raw_data_folder + filename): os.remove(UCIVars.raw_data_folder + filename) with open(UCIVars.raw_data_folder + filename, 'wb') as output: output.write(data_link.read()) #--------------------------------------------------------------------------------------------------- def unzip_raw_data(filename): zip_ref = 
zipfile.ZipFile(UCIVars.raw_data_folder + filename, 'r') zip_ref.extractall(UCIVars.raw_data_folder) zip_ref.close() #--------------------------------------------------------------------------------------------------- def unrar_raw_data(filename): full_filename = UCIVars.raw_data_folder + filename patoolib.extract_archive(full_filename, outdir = UCIVars.raw_data_folder) #--------------------------------------------------------------------------------------------------- def my_decode(x): if isinstance(x, bytes): return x.decode('utf-8') else: return str(x) #--------------------------------------------------------------------------------------------------- def unarff_raw_data(filename): data = arff.loadarff(UCIVars.raw_data_folder + filename + '.arff')[0] target_filename = UCIVars.raw_data_folder + filename + '.data' data_cleaned = [] for row in data: data_cleaned.append([my_decode(entry) for entry in row]) with open(target_filename, "w") as target_file: writer = csv.writer(target_file, lineterminator = '\n') writer.writerows(data_cleaned) #--------------------------------------------------------------------------------------------------- def un_z_raw_data(filename): if platform.system() == "Linux": os.system('uncompress -f ' + UCIVars.raw_data_folder + filename) return True else: print("Could not decompress .Z file, since this requires Linux.") return False #--------------------------------------------------------------------------------------------------- def untar_raw_data(filename): full_filename = UCIVars.raw_data_folder + filename tar = tarfile.open(full_filename) tar.extractall(UCIVars.raw_data_folder) tar.close() #--------------------------------------------------------------------------------------------------- def ungz_raw_data(filename): full_filename = UCIVars.raw_data_folder + filename target_filename = UCIVars.raw_data_folder + filename + '.data' target_file = open(target_filename, "w") with gzip.open(full_filename, 'rt') as source_file: data = source_file.read() target_file.write(data) target_file.close() #--------------------------------------------------------------------------------------------------- def replace_chars_in_file(filename, old_char, new_char): fr = codecs.open(UCIVars.raw_data_folder + filename, encoding = 'utf-8') content = fr.read() fr.close() newcontent = content.replace(old_char, new_char) fw = codecs.open(UCIVars.raw_data_folder + filename, 'w', encoding = 'utf-8') fw.write(newcontent) fw.close() #--------------------------------------------------------------------------------------------------- def get_category_replace_string(category_size, position, separator): string = '' for i in range(position): string = string + '0' + separator string = string + '1' + separator for i in range(position + 1, category_size): string = string + '0' + separator string = string[0:len(string) - len(separator)] return string #--------------------------------------------------------------------------------------------------- def replace_categories_in_file(filename, categories, separator): for i in range(len(categories)): replace_chars_in_file(filename, categories[i], get_category_replace_string(len(categories), i, separator)) #--------------------------------------------------------------------------------------------------- def convert_replace_string_to_vector(string, separator): string_vector = string.split(separator) return list(numpy.float_(string_vector)) #--------------------------------------------------------------------------------------------------- def 
get_categories_in_mixed_data(data, column): rows = numpy.shape(data)[0] categories = list(set(data[0:rows, column])) return categories #--------------------------------------------------------------------------------------------------- def auto_replace_categories_in_mixed_data(data, column, separator, unknown_string = '', unknown_replacement_value = 0): categories = get_categories_in_mixed_data(data, column) if numpy.shape(categories)[0] == 2: new_data = replace_bin_cats_in_mixed_data(data, categories, column, separator, unknown_string = unknown_string, unknown_replacement_value = unknown_replacement_value) else: new_data = replace_categories_in_mixed_data(data, categories, column, separator, unknown_string = unknown_string, unknown_replacement_value = unknown_replacement_value) return new_data #--------------------------------------------------------------------------------------------------- def auto_replace_missing_in_mixed_data(data, unknown_string = '?'): rows = numpy.shape(data)[0] dim = numpy.shape(data)[1] columns = range(dim) for i in range(len(columns)): count_entries = Counter(data[0:rows, columns[i]]) weighted_sum = 0.0 entries_sum = 0.0 for key in count_entries: if key != unknown_string: weighted_sum = weighted_sum + float(key) * count_entries[key] entries_sum = entries_sum + count_entries[key] average = weighted_sum / float(entries_sum) data = replace_categories_in_mixed_data(data, [], columns[i], ',', unknown_string = '?', unknown_replacement_value = average) return data #--------------------------------------------------------------------------------------------------- def replace_categories_in_mixed_data(data, categories, column, separator, unknown_string = '', unknown_replacement_value = 0): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = len(categories) * max(1, len(str(unknown_replacement_value))) + (len(categories) - 1) * len(separator) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): replacement = get_category_replace_string(len(categories), i, separator) new_column = [replacement if word == categories[i] else word for word in new_column] if unknown_string != '': replacement = str(unknown_replacement_value) for i in range(len(categories) - 1): replacement = replacement + separator + str(unknown_replacement_value) new_column = [replacement if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_bin_cats_in_mixed_data(data, categories, column, separator, unknown_string = '', unknown_replacement_value = 0): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = max(2, len(str(unknown_replacement_value))) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] if unknown_string != '': replacement = str(unknown_replacement_value) new_column = [replacement if word == unknown_string else word for word in new_column] for i in range(len(categories)): replacement = str(2 * i - 1) new_column = [replacement if word == categories[i] else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], 
new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_ordinals_in_mixed_data(data, categories, column, separator, unknown_string = '', unknown_replacement_value = 0, begin_value = 1): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = max(len(str(unknown_replacement_value)), len(str(len(categories) + 1))) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): replacement = str(i + begin_value) new_column = [replacement if word == categories[i] else word for word in new_column] if unknown_string != '': replacement = str(unknown_replacement_value) new_column = [replacement if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_manual_in_mixed_data(data, categories, column, replacement, separator, unknown_string = '', unknown_replacement_value = 0): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = max(len(str(unknown_replacement_value)), len(str(len(categories) + 1))) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): new_column = [str(replacement[i]) if word == categories[i] else word for word in new_column] if unknown_string != '': replacement_tmp = str(unknown_replacement_value) new_column = [replacement_tmp if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_circulars_in_mixed_data(data, categories, column, separator, unknown_string = ''): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] decimals = 5 empty_string_length = 2 * (decimals + 3) + len(separator) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): radians = float(i) * 2.0 * math.pi / float(len(categories)) replacement = str(round(math.cos(radians), decimals)) + separator + str(round(math.sin(radians), decimals)) new_column = [replacement if word == categories[i] else word for word in new_column] if unknown_string != '': replacement = str(0.0) + separator + str(0.0) new_column = [replacement if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_isodate_by_day_in_mixed_data(data, column): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] old_column = [numpy.datetime64(date) for date in data[0:rows, column]] new_column = [str(date.astype(datetime.datetime).isoweekday()) for date in old_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = 
numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_time_by_seconds_in_mixed_data(data, column, sep, rounded = 1): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] new_column = [str(int(round(float(convert_time_to_seconds(time, sep)) / float(rounded))) * rounded) for time in data[0:rows, column]] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def remove_files(folder, filename_pattern): filenames = glob.glob(folder + filename_pattern) for name in filenames: os.remove(name) #--------------------------------------------------------------------------------------------------- def concat_files(source_filename_pattern, target_filename): filenames = glob.glob(source_filename_pattern) if os.path.exists(target_filename): os.remove(target_filename) with open(target_filename,'wb') as target_file: for name in filenames: with open(name,'rb') as source_file: shutil.copyfileobj(source_file, target_file, 1024*1024*10) #--------------------------------------------------------------------------------------------------- def load_mixed_raw_data(filename, sep, header = False): # Some Python versions issue a warning if 'encoding' is not set, while other versions do not know 'encoding' # Pick the one you prefer ... #data = numpy.genfromtxt(UCIVars.raw_data_folder + filename, dtype = None, delimiter = sep) data = numpy.genfromtxt(UCIVars.raw_data_folder + filename, dtype = str, delimiter = sep, encoding = None) if (header == True): data = numpy.delete(data, 0, 0) if len(numpy.shape(data)) == 1: dim = len(data[0]) rows = numpy.shape(data)[0] new_data = [None] * (dim * rows) new_data = numpy.reshape(new_data, newshape = (rows, dim)) for i in range(rows): new_data[i] = list(map(str, data[i])) data = new_data return data #--------------------------------------------------------------------------------------------------- def write_mixed_raw_data(filename, data, sep): with open(filename, mode = 'w') as write_file: writer = csv.writer(write_file, delimiter = sep, quotechar = '', quoting = csv.QUOTE_NONE, escapechar = ' ') writer.writerows(data) # replace_chars_in_file will add the raw_data_path, so we have to remove it from the filename replace_chars_in_file(Path(filename).name, ' ' + sep, sep) #--------------------------------------------------------------------------------------------------- def load_raw_data(filename, sep, description_columns = 0, date_column = -1, date_sep = '', date_order = '', time_column = -1, time_sep = '', german_decimal = False, na_string = '---', show_intermediate = False, header = False): fp = open(UCIVars.raw_data_folder + filename, 'r') number_of_rows = 0 number_of_lines = 0 max_number_of_columns = 0 rows_with_na_string = 0 rows_with_incorrect_date = 0 rows_with_incorrect_time = 0 rows_with_incorrect_number_of_columns = 0 rows_with_odd_error = 0 is_first_line = True for row in fp: if (is_first_line == True) and (header == True): is_first_line = False else: row = row.strip() raw_row = row.split(sep) number_of_columns = numpy.shape(raw_row)[0] max_number_of_columns = max(number_of_columns, max_number_of_columns) number_of_data_columns = number_of_columns - description_columns 
current_row = numpy.zeros(shape = (1, number_of_data_columns)) number_of_lines = number_of_lines + 1 if ((number_of_lines % 1000 == 0) and (show_intermediate == True)): print("Read %d lines" %number_of_lines) correct_row = True for c in range(description_columns, number_of_columns): if (raw_row[c] == na_string): correct_row = False rows_with_na_string = rows_with_na_string + 1 elif (c == date_column): date = raw_row[c].split(date_sep) if (len(date) != 3): correct_row = False rows_with_incorrect_date = rows_with_incorrect_date + 1 else: date_string = date[0] + '-' + date[1] + '-' + date[2] date_fmt = '%' + date_order[0] + '-%' + date_order[1] + '-%' + date_order[2] date_result = datetime.datetime.strptime(date_string, date_fmt) date_tuple = date_result.timetuple() current_row[0, c - description_columns] = float(date_tuple.tm_yday) elif (c == time_column): time = raw_row[c].split(time_sep) if (len(time) != 3): correct_row = False rows_with_incorrect_time = rows_with_incorrect_time + 1 else: current_row[0, c - description_columns] = 3600.0 * float(time[0]) + 60.0 * float(time[1]) + float(time[2]) elif (is_number(raw_row[c], german_decimal) == True): if (german_decimal == False): current_row[0, c - description_columns] = float(raw_row[c]) else: current_row[0, c - description_columns] = float(raw_row[c].replace(',', '.', 1)) elif (raw_row[c] == ''): current_row[0, c - description_columns] = 0.0 else: correct_row = False rows_with_odd_error = rows_with_odd_error + 1 if (number_of_columns != max_number_of_columns): correct_row = False rows_with_incorrect_number_of_columns = rows_with_incorrect_number_of_columns + 1 if (correct_row == False): break if (correct_row == True): number_of_rows = number_of_rows + 1 if (number_of_rows == 1): data = numpy.zeros(shape = (0, number_of_data_columns)) data_block = current_row else: data_block = numpy.concatenate((data_block, current_row), axis = 0) if (number_of_rows == 1000): data = data_block data_block = numpy.zeros(shape = (0, number_of_data_columns)) elif (number_of_rows % 1000 == 0): data = numpy.concatenate((data, data_block), axis = 0) data_block = numpy.zeros(shape = (0, number_of_data_columns)) # Make sure the last block is added if this has not just happened if (number_of_rows % 1000 != 0): data = numpy.concatenate((data, data_block), axis = 0) fp.close() if (number_of_lines - number_of_rows > 0): if (number_of_rows > 0): print("File %s has %d data columns and %d rows with complete data and %d rows with corrupted data" % (filename, numpy.shape(data)[1], number_of_rows, number_of_lines - number_of_rows)) print("Rows with na string: %d" % rows_with_na_string) print("Rows with incorrect date: %d" % rows_with_incorrect_date) print("Rows with incorrect time: %d" % rows_with_incorrect_time) print("Rows with incorrect number of columns: %d" % rows_with_incorrect_number_of_columns) print("Rows with odd error: %d" % rows_with_odd_error) else: print("Could not read a single row!!!\n") quit() else: print("File %s has %d data columns and %d rows" % (filename, numpy.shape(data)[1], number_of_rows)) return data #--------------------------------------------------------------------------------------------------- def remove_rows_with_label(data, label): bad_rows = numpy.where(data[:, 0] == label)[0] if (len(bad_rows) > 0): data = numpy.delete(data, bad_rows, axis = 0) print('Removing %d rows with label %1.3f' % (len(bad_rows), label)) return data #--------------------------------------------------------------------------------------------------- def 
remove_empty_columns(data): min_values = numpy.min(data, axis = 0) max_values = numpy.max(data, axis = 0) value_range = max_values - min_values empty_columns = numpy.where(value_range == 0.0)[0] if (len(empty_columns) > 0): print('Removing %d empty columns' % len(empty_columns)) data = remove_columns(data, empty_columns) return data #--------------------------------------------------------------------------------------------------- def save_data_to_file(data, filename, is_classification, is_regression = True, min_scale = -1.0, max_scale = 1.0): data_stats = {} data_stats['filename'] = filename data = remove_empty_columns(data) number_of_rows = numpy.shape(data)[0] number_of_columns = numpy.shape(data)[1] data_stats['rows'] = number_of_rows data_stats['columns'] = number_of_columns - 1 data_stats['binary columns'] = count_bin_columns(data) print("Writing file %s with dim = %d and %d rows" % (filename, number_of_columns - 1, number_of_rows)) numpy.savetxt(UCIVars.data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') min_values = numpy.min(data, axis = 0) max_values = numpy.max(data, axis = 0) value_range = max_values - min_values for c in range(1, number_of_columns): m = (max_scale - min_scale) / value_range[c] b = min_scale - m * min_values[c] data[:, c] = m * data[:, c] + b if (is_classification == False): min_scale = -1.0 max_scale = 1.0 m = (max_scale - min_scale) / value_range[0] b = min_scale - m * min_values[0] data[:, 0] = m * data[:, 0] + b if (is_regression == True): numpy.savetxt(UCIVars.regression_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') data_stats['classes'] = 0 data_stats['naive'] = numpy.var(data[:, 0]) save_data_stats(data_stats) if (is_classification == True): all_labels = data[:, 0].astype(int) labels, label_counts = numpy.unique(all_labels, return_counts = True) data_stats['classes'] = len(labels) highest_frequency = numpy.max(label_counts) data_stats['naive'] = float(number_of_rows - highest_frequency) / float(number_of_rows) if (len(labels) == 2): m = 2.0 / (labels[1] - labels[0]) b = - (labels[1] + labels[0]) / (labels[1] - labels[0]) data[:, 0] = numpy.floor(m * data[:, 0] + b + 0.5) numpy.savetxt(UCIVars.binary_classification_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') save_data_stats(data_stats) else: numpy.savetxt(UCIVars.multiclass_classification_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') save_data_stats(data_stats) second_highest_frequency = numpy.sort(label_counts)[len(labels) - 2] if (highest_frequency != second_highest_frequency): label_1 = labels[numpy.nonzero(label_counts == highest_frequency)[0][0]] label_2 = labels[numpy.nonzero(label_counts == second_highest_frequency)[0][0]] else: label_1 = labels[numpy.nonzero(label_counts == highest_frequency)[0][0]] label_2 = labels[numpy.nonzero(label_counts == highest_frequency)[0][1]] data_1 = data[data[:, 0] == label_1] data_1[:, 0] = -1.0 data_2 = data[data[:, 0] == label_2] data_2[:, 0] = 1.0 data = numpy.concatenate((data_1, data_2), axis = 0) data_stats['classes'] = 2 data_stats['rows'] = highest_frequency + second_highest_frequency data_stats['naive'] = float(second_highest_frequency) / float(data_stats['rows']) if (data_stats['rows'] >= 2500): numpy.savetxt(UCIVars.binary_classification_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter 
= ',', newline = '\n', header = '', footer = '') save_data_stats(data_stats) #--------------------------------------------------------------------------------------------------- def save_data_stats(data_stats): if os.path.exists(UCIVars.statistics_filename): string = '' else: string = 'Name, Rows, Columns, Binary Columns, Classes, Naive Error, Relative Weight\n' string = string + data_stats['filename'] + ', ' + str(data_stats['rows']) + ', ' + str(data_stats['columns']) + ', ' + str(data_stats['binary columns']) + ', ' + str(data_stats['classes']) + ', ' + str(data_stats['naive']) + ', ' + str(UCIVars.data_group_id) + '\n' with open(UCIVars.statistics_filename, "a") as fp: fp.write(string) #--------------------------------------------------------------------------------------------------- def is_number(string, german_decimal): # Idea of this code is taken from # https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float if (german_decimal == False): string = string.replace('.', '', 1) else: string = string.replace(',', '', 1) string = string.replace('e-', '', 1) string = string.replace('e+', '', 1) string = string.replace('E-', '', 1) string = string.replace('E+', '', 1) string = string.replace('-', '', 2) string = string.replace('+', '', 1) return string.isdigit() #--------------------------------------------------------------------------------------------------- def remove_columns(data, columns): return numpy.delete(data, columns, axis = 1) #--------------------------------------------------------------------------------------------------- def move_label_in_front(data, label_column): number_of_rows = numpy.shape(data)[0] labels = numpy.reshape(data[:, label_column], newshape = (number_of_rows, 1)) unlabeled_data = remove_columns(data, [label_column]) data = numpy.concatenate((labels, unlabeled_data), axis = 1) return data #--------------------------------------------------------------------------------------------------- def count_bin_columns(data): cols = numpy.shape(data)[1] count = 0 for i in range(1, cols): if len(set(data[:, i])) == 2: count = count + 1 return count #--------------------------------------------------------------------------------------------------- def convert_time_to_seconds(time, sep): time_tmp = time.split(sep) seconds = 3600 * int(time_tmp[0]) + 60 * int(time_tmp[1]) + int(time_tmp[2]) return seconds ================================================ FILE: pytabkit/bench/eval/__init__.py ================================================ ================================================ FILE: pytabkit/bench/eval/analysis.py ================================================ from typing import Optional, Callable, Tuple, Dict, List, Union import numpy as np import scipy from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.evaluation import FunctionAlgFilter, MultiResultsTable, DefaultEvalModeSelector, TaskWeighting, \ get_ranks from pytabkit.models import utils from pytabkit.models.data.nested_dict import NestedDict class ResultsTables: def __init__(self, paths: Paths): self.paths = paths self.tables = NestedDict() def get(self, coll_name: str, n_cv: int = 1, tag: str = 'paper') -> MultiResultsTable: idxs = (coll_name, n_cv, tag) if idxs in self.tables: return self.tables[idxs] else: # load table from disk task_collection = TaskCollection.from_name(coll_name, self.paths) alg_filter = FunctionAlgFilter(lambda an, tags, config, my_tag=tag: my_tag in tags) table = 
MultiResultsTable.load(task_collection, n_cv=n_cv, paths=self.paths, alg_filter=alg_filter) self.tables[idxs] = table return table def _get_t_mean_confidence_interval_single(values: np.ndarray) -> Tuple[float, float]: # following https://www.geeksforgeeks.org/how-to-calculate-confidence-intervals-in-python/ # see also https://stats.stackexchange.com/questions/358408/confidence-interval-for-the-mean-normal-distribution-or-students-t-distributi # and http://stla.github.io/stlapblog/posts/ModelReduction.html sem = scipy.stats.sem(values) if sem == 0.0: mean = np.mean(values) return mean, mean else: interval = scipy.stats.t.interval(confidence=0.95, df=len(values) - 1, loc=np.mean(values), scale=sem) return interval[0], interval[1] def get_t_mean_confidence_interval(values: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: # takes the confidence intervals across the last dimension, # the other dimensions are considered to be batch dimensions if len(values.shape) == 1: lower, upper = _get_t_mean_confidence_interval_single(values) return np.asarray(lower), np.asarray(upper) pairs = [get_t_mean_confidence_interval(values[i]) for i in range(values.shape[0])] lower = np.asarray([pair[0] for pair in pairs]) upper = np.asarray([pair[1] for pair in pairs]) return lower, upper def get_benchmark_results(paths: Paths, table: MultiResultsTable, coll_name: str, use_relative_score: bool = True, return_percentages: bool = True, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, rel_alg_name: str = 'BestModel', use_ranks: bool = False, use_normalized_errors: bool = False, use_grinnorm_errors: bool = False, use_task_mean: bool = True, use_geometric_mean: bool = True, shift_eps: float = 1e-2, filter_alg_names_list: Optional[List[str]] = None, simplify_name_fn: Optional[Callable[[str], str]] = None, n_splits: int = 10, use_validation_errors: bool = False) -> \ Tuple[ Dict[str, Union[float, np.ndarray]], Dict[str, Tuple[Union[float, np.ndarray], Union[float, np.ndarray]]]]: # returns means and confidence intervals for each alg_name (converted using get_display_name()) # relative confidence intervals for arithmetic mean are a bit wrong # because the uncertainty in the divisor is not incorporated f = (lambda x: np.log(x + shift_eps)) if use_geometric_mean else (lambda x: x) post_f = (lambda x: np.exp(x)) if use_geometric_mean else (lambda x: x) if simplify_name_fn is None: simplify_name_fn = get_simplified_name task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name, use_validation_errors=use_validation_errors) test_table = test_table.rename_algs(simplify_name_fn) # print(f'{test_table.alg_names=}') # print(f'{filter_alg_names_list=}') if filter_alg_names_list is not None: test_table = test_table.filter_algs(filter_alg_names_list) # new code test_table = test_table.filter_n_splits(n_splits) # shape: [n_algs, n_tasks, n_splits] errors = test_table.to_array() if len(errors) == 0: return dict(), dict() # print(f'{errors.shape=}, {errors=}') 
if use_ranks: errors = get_ranks(errors) elif use_normalized_errors: min_arr = np.min(errors, axis=0, keepdims=True) max_arr = np.max(errors, axis=0, keepdims=True) errors = (errors - min_arr) / (max_arr - min_arr + 1e-30) errors = np.clip(errors, 0.0, 1.0) elif use_grinnorm_errors: assert task_type_name in ['class', 'reg'] min_arr = np.min(errors, axis=0, keepdims=True) max_arr = np.quantile(errors, 1.0 if task_type_name == 'class' else 0.9, axis=0, keepdims=True) errors = (errors - min_arr) / (max_arr - min_arr + 1e-30) if task_type_name == 'reg': errors = np.clip(errors, 0.0, 1.0) else: errors = np.clip(errors, 0.0, np.inf) idx_best = test_table.alg_names.index(rel_alg_name) if use_relative_score else 0 use_task_weighting = coll_name.startswith('meta-train') or coll_name.startswith('uci') if use_task_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(test_table.task_infos, separate_task_names).get_task_weights() else: n_tasks = len(test_table.task_infos) task_weights = np.ones(n_tasks) / n_tasks f_errors = f(errors) mean_f_errors = np.mean(f_errors, axis=-1) # print(f'{f_errors.shape=}, {f_errors=}') if use_task_mean: mean_f_errors = mean_f_errors @ task_weights mean_scores = post_f(mean_f_errors) if not use_task_mean: assert not use_relative_score if return_percentages: assert use_relative_score base_f_errors = f_errors[idx_best, None] if use_relative_score else np.zeros_like(f_errors) rel_f_errors = f_errors - base_f_errors # print(f'{rel_f_errors.shape=}, {rel_f_errors=}') mean_rel_f_errors = np.mean(rel_f_errors, axis=-1) if use_task_mean: mean_rel_f_errors = mean_rel_f_errors @ task_weights if use_task_mean: # take the weighted mean over tasks first, then compute the t-based confidence interval across splits rel_f_errors = np.einsum('ats,t->as', rel_f_errors, task_weights) lower_rel_mean_f_errors, upper_rel_mean_f_errors = get_t_mean_confidence_interval(rel_f_errors) lower_rel_mean_scores = post_f(lower_rel_mean_f_errors) upper_rel_mean_scores = post_f(upper_rel_mean_f_errors) rel_mean_scores = post_f(mean_rel_f_errors)
def transform(scores: np.ndarray) -> np.ndarray: if use_relative_score and not use_geometric_mean: # we computed the arithmetic mean of the difference, so normalize and add 1 scores = scores / mean_scores[idx_best, None] + 1.0 if return_percentages: scores = 100 * (scores - 1.0) return scores # print(f'{rel_mean_scores=}') scores = transform(rel_mean_scores) lower_scores = transform(lower_rel_mean_scores) upper_scores = transform(upper_rel_mean_scores) # print(f'{scores=}') scores_dict = {alg_name: score for alg_name, score in zip(test_table.alg_names, scores)} intervals_dict = {alg_name: (lower, upper) for alg_name, lower, upper in zip(test_table.alg_names, lower_scores, upper_scores)} return scores_dict, intervals_dict
def get_opt_groups(task_type_name: str) -> Dict[str, List[str]]: """ Generates groups of methods that should be evaluated. :param task_type_name: 'class' or 'reg' :return: A dict of lists {alg_group_name: [alg_name_1, alg_name_2, ...]} """ opt_groups = utils.join_dicts(get_ensemble_groups(task_type_name), { '_LGBM-HPO+TD': ['LGBM-HPO', f'LGBM-TD-{task_type_name}'], '_XGB-HPO+TD': ['XGB-HPO', f'XGB-TD-{task_type_name}'], '_CatBoost-HPO+TD': ['CatBoost-HPO', f'CatBoost-TD-{task_type_name}'], '_RealMLP-HPO+TD': ['RealMLP-HPO', f'RealMLP-TD-{task_type_name}'], '_MLP-HPO+TD': ['MLP-HPO', f'MLP-TD-{task_type_name}'], '-TD_val-ce': [f'RealMLP-TD-{task_type_name}_val-ce_no-ls', f'XGB-TD-{task_type_name}_val-ce', f'LGBM-TD-{task_type_name}_val-ce', f'CatBoost-TD-{task_type_name}_val-ce'], '-D_val-ce': [f'MLP-PLR-D-{task_type_name}_val-ce', f'XGB-D-{task_type_name}_val-ce', f'LGBM-D-{task_type_name}_val-ce', f'CatBoost-D-{task_type_name}_val-ce'], }) for method in ['MLP-RTDL-D', 'ResNet-RTDL-D', 'MLP-PLR-D', 'FTT-D', 'TabR-S-D']: opt_groups[f'_{method}_prep'] = [f'{method}-{task_type_name}', f'{method}-{task_type_name}_rssc'] return opt_groups
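# Illustrative sketch (not exhaustive): for task_type_name='class', get_opt_groups returns entries like
#   {'_GBDTs-TD': ['XGB-TD-class', 'LGBM-TD-class', 'CatBoost-TD-class'],  # from get_ensemble_groups below
#    '_LGBM-HPO+TD': ['LGBM-HPO', 'LGBM-TD-class'],
#    '_TabR-S-D_prep': ['TabR-S-D-class', 'TabR-S-D-class_rssc'], ...}
# Each group is later turned into a 'BestModel<group_name>' entry in get_benchmark_results that picks,
# per task and split, the group member with the best validation error.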
def get_ensemble_groups(task_type_name: str) -> Dict[str, List[str]]: """ Generates groups of methods that should be evaluated. :param task_type_name: 'class' or 'reg' :return: A dict of lists {alg_group_name: [alg_name_1, alg_name_2, ...]} """ return { '_GBDTs-TD': [f'XGB-TD-{task_type_name}', f'LGBM-TD-{task_type_name}', f'CatBoost-TD-{task_type_name}'], '-TD': [f'XGB-TD-{task_type_name}', f'LGBM-TD-{task_type_name}', f'CatBoost-TD-{task_type_name}', f'RealMLP-TD-{task_type_name}'], '_GBDTs-HPO': ['XGB-HPO', 'LGBM-HPO', 'CatBoost-HPO'], # 'GBDTs-HPO_MLP-HPO': ['XGB-HPO', 'LGBM-HPO', 'CatBoost-HPO', 'MLP-HPO'], # todo: duplicate '-HPO': ['XGB-HPO', 'LGBM-HPO', 'CatBoost-HPO', 'RealMLP-HPO'], '_MLP-TD_MLP-TD-S': [f'RealMLP-TD-{task_type_name}', f'RealMLP-TD-S-{task_type_name}'], '-D': ['XGB-D', 'LGBM-D', 'CatBoost-D', f'MLP-PLR-D-{task_type_name}'], }
def get_simplified_name(alg_name: str): alg_name = alg_name.replace(' [bag-1]', '') alg_name = alg_name.replace('-class', '').replace('-reg', '') # the remaining replacements are left to get_display_name, which is applied after merging these names with the names from the runtime measurements # alg_name = alg_name.replace('RF-SKL', 'RF') # alg_name = alg_name.replace('-RTDL', '') # alg_name = alg_name.replace('_val-ce', '') # if alg_name == 'XGBoost-HPO': # return 'XGB-HPO' # elif alg_name == 'Ensemble_GBDTs-TD_MLP-TD': # return 'Ensemble_TD' # elif alg_name == 'Ensemble_GBDTs-HPO_MLP-HPO': # return 'Ensemble_HPO' return alg_name
def get_display_name(alg_name: str) -> str: alg_name = alg_name.replace('BestModel', 'Best') # alg_name = alg_name.replace('_rssc', '') alg_name = alg_name.replace('_rssc', ' (RS+SC)') alg_name = alg_name.replace('_no-ls', ' (no LS)') alg_name = alg_name.replace('_val-ce', '') alg_name = alg_name.replace('RF-SKL', 'RF') alg_name = alg_name.replace('-RTDL', '') alg_name = alg_name.replace('_best-1-auc-ovr', '') if alg_name.endswith('_prep') and alg_name.startswith('Best_'): alg_name = alg_name[len('Best_'):-len('_prep')] alg_name = alg_name + ' (best of both)' return alg_name
================================================ FILE: pytabkit/bench/eval/colors.py ================================================
from typing import List, Tuple, Callable
def bilin_int(x: float, values: List[Tuple[float, float]]) -> float: # integrates a piecewise-linear interpolation of the values from the first x-value up to x sum_of_integrals = 0.0 x0, y0 = values[0] for x1, y1 in values[1:]: if x <= x0: return sum_of_integrals if x <= x1: y1 = y0 + (x-x0)/(x1-x0)*(y1-y0) x1 = x sum_of_integrals += (x1-x0) * (y1+y0) / 2 x0, y0 = x1, y1 return sum_of_integrals
def bisection_find(f: Callable[[float], float], y: float, xmin: float, xmax: float, n=50) -> float: # finds x with f(x) = y via bisection, assuming f is increasing a = xmin b = xmax c = (a+b)/2 # middle fa = f(a) fb = f(b) fc = f(c) if fa >= y: return a if fb <= y: return b for _ in range(n): if fc >= y: b, fb = c, fc else: a, fa = c, fc c = (a+b)/2 fc = f(c) return c
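# Illustrative example of the two helpers above:
#   f = lambda x: x ** 3                           # increasing on [0, 2]
#   bisection_find(f, y=7.0, xmin=0.0, xmax=2.0)   # ~1.913, since 1.913**3 ~ 7
# more_percep_uniform_hue below uses bisection_find to invert the cumulative integral
# (computed by bilin_int) of an eye-balled perceptual rate-of-change curve, so that
# equal steps in x give roughly equal perceptual steps in hue.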
""" # eye-balled perceptual "rate of change" scores at different hues hue_percep_deriv = [(0, 0.3), (30, 0.6), (60, 1.0), (90, 0.3), (150, 0.3), (180, 0.8), (220, 0.4), (260, 0.4), (280, 0.8), (300, 0.6), (360, 0.3)] f = lambda val: bilin_int(val, hue_percep_deriv) fmax = f(360) return bisection_find(f, x*fmax, 0, 360)/360 ================================================ FILE: pytabkit/bench/eval/evaluation.py ================================================ import distutils.command.build_ext from typing import List, Dict, Any, Tuple, Optional, Callable, Union import numpy as np from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection, TaskInfo from pytabkit.models import utils from pytabkit.models.training.metrics import Metrics class AlgFilter: def __call__(self, alg_name: str, tags: List[str], alg_config: Dict[str, Any]) -> bool: raise NotImplementedError() class FunctionAlgFilter(AlgFilter): def __init__(self, f): self.f = f def __call__(self, alg_name: str, tags: List[str], alg_config: Dict[str, Any]) -> bool: return self.f(alg_name, tags, alg_config) class EvalModeSelector: # base class def select_eval_modes(self, eval_modes: List[Tuple[str, str, str]]) -> List[Tuple[str, Tuple[str, str, str]]]: # gets a list of (cv_type, n_models, start_idx) tuples, returns a sublist of them # but with a suffix-str in for each element raise NotImplementedError() def select(self, alg_name: str, task_results: List) -> Tuple[List[str], List[List]]: # task results should be indexed by [task_idx][split_idx]['cv'/'refit'][str(n_models)][str(start_idx)] # returns a list of alg names and a list new_alg_task_results indexed by [task_idx][split_idx] # determine all combinations that occur in all task results sets = [set((cv_type, n_models, start_idx) for cv_type, d1 in split_dict.items() for n_models, d2 in d1.items() for start_idx, d3 in d2.items()) for task_result in task_results for split_dict in task_result] eval_modes = list(set.intersection(*sets)) # select using function overridden in subclass selected = self.select_eval_modes(eval_modes) # select elements for selected eval modes new_alg_names = [] new_alg_task_results = [] for suffix, (cv_type, n_models, start_idx) in selected: new_alg_names.append(alg_name + suffix) new_alg_task_results.append([[split[cv_type][n_models][start_idx] for split in task_result] for task_result in task_results]) return new_alg_names, new_alg_task_results class DefaultEvalModeSelector(EvalModeSelector): def select_eval_modes(self, eval_modes: List[Tuple[str, str, str]]) -> List[Tuple[str, Tuple[str, str, str]]]: # out of different numbers of ensemble members, # select only the largest ensemble/bagging combinations and single ensemble member result = [] # if ('refit', '1', '0') in eval_modes: # # refit with 1 model, standard # result.append(('', ('refit', '1', '0'))) for name, val in [('bag', 'cv'), ('ens', 'refit')]: modes = [mode for mode in eval_modes if mode[0] == val] if len(modes) > 0: # maximize n_models bag_sizes = [int(mode[1]) for mode in modes] max_cv = np.max(bag_sizes) min_cv = np.min(bag_sizes) bag_sizes = list({max_cv, min_cv}) # only have one element if they're equal for bag_size in bag_sizes: # make sure to always select model '0' to avoid non-determinism result.append((f' [{name}-{bag_size}]', (val, str(bag_size), '0'))) # idx = np.argmax([int(mode[1]) for mode in modes]) # idx_min = np.argmin([int(mode[1]) for mode in modes]) # mode = modes[idx] # result.append((f' 
[{name}-{mode[1]}]', mode)) # if idx_min != idx: # result.append((f' [{name}-{modes[idx_min][1]}]', modes[idx_min])) return result class AlgTaskTable: def __init__(self, alg_names: List[str], task_infos: List[TaskInfo], alg_task_results: List[List[Any]]): self.alg_names = alg_names self.task_infos = task_infos self.alg_task_results = alg_task_results def map(self, f): return AlgTaskTable(self.alg_names, self.task_infos, [[[f(r) for r in splits] for splits in task_results] for task_results in self.alg_task_results]) def filter_n_splits(self, n_splits: int) -> 'AlgTaskTable': """ Limits the number of split results to n_splits and removes all algs where there exists a task with less than n_splits split results. :param n_splits: :return: """ alg_valid = [all(len(split_results) >= n_splits for split_results in task_results) for task_results in self.alg_task_results] alg_names = [alg_name for is_valid, alg_name in zip(alg_valid, self.alg_names) if is_valid] alg_task_results = [[split_results[:n_splits] for split_results in task_results] for is_valid, task_results in zip(alg_valid, self.alg_task_results) if is_valid] return AlgTaskTable(alg_names, self.task_infos, alg_task_results) def to_array(self) -> np.ndarray: return np.asarray(self.alg_task_results) def rename_algs(self, f: Callable[[str], str]) -> 'AlgTaskTable': return AlgTaskTable(alg_names=[f(an) for an in self.alg_names], task_infos=self.task_infos, alg_task_results=self.alg_task_results) def filter_algs(self, alg_names: List[str]) -> 'AlgTaskTable': return AlgTaskTable(alg_names=[an for an in self.alg_names if an in alg_names], task_infos=self.task_infos, alg_task_results=[tr for tr, an in zip(self.alg_task_results, self.alg_names) if an in alg_names]) class MultiResultsTable: def __init__(self, train_table: AlgTaskTable, val_table: AlgTaskTable, test_table: AlgTaskTable, alg_tags: List[List[str]], alg_configs: List[Dict[str, Any]]): # val_table.alg_task_table and test_table.alg_task_table are indexed by # [alg_idx][task_idx][split_idx]['cv'/'refit'][str(n_models)][str(start_idx)][metric_name] self.train_table = train_table self.val_table = val_table self.test_table = test_table self.alg_tags = alg_tags self.alg_configs = alg_configs def get_test_results_table(self, eval_mode_selector: EvalModeSelector, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, alg_group_dict: Optional[Dict[str, AlgFilter]] = None, val_test_groups: Optional[Dict[str, Dict[str, str]]] = None, use_validation_errors: bool = False, use_train_errors: bool = False) \ -> AlgTaskTable: """ :param eval_mode_selector: Decides how to select results from the different available ensembled/bagged results and how to name them :param val_metric_name: Name of the validation metric (used for optimizing over multiple algorithms) :param test_metric_name: Name of the test metric :param alg_group_dict: Optional dictionary of name: alg_filter. For each such pair, an additional algorithm with the given name will be added to the resulting table. Its results are computed as follows: On each split of each task, out of all the algorithms where the alg_filter returns True, the one with the best validation error is chosen, and then its test error is used. :param val_test_groups: Similar to alg_group_dict, but allows to use a different alg for the test score associated with the one with the best validation error. 
Specifically, for name: pairs in val_test_groups.items(), the best validation error among the keys of pairs will be determined, and then the test score of the value associated to this best key will be returned. :param use_validation_errors: If True, use validation errors instead of test errors. :param use_train_errors: If True, use train errors instead of test errors. :return: """ # the selector assigns new alg names (e.g. with [ens-5] for an ensemble) # but the alg_group selects based on configs and new names assert not (use_train_errors and use_validation_errors) # extract only default metric values from self.val_table val_metric_name = val_metric_name or Metrics.default_eval_metric_name(self.val_table.task_infos[0].task_type) test_metric_name = test_metric_name or Metrics.default_eval_metric_name(self.val_table.task_infos[0].task_type) if '1-r2' in [val_metric_name, test_metric_name]: for table in [self.val_table, self.test_table, self.train_table]: table.alg_task_results = utils.map_nested(table.alg_task_results, lambda metrics_dict: utils.join_dicts(metrics_dict, {'1-r2': metrics_dict['nrmse']**2}), dim=6) # tables indexed by [alg_idx][task_idx][split_idx]['cv'/'refit'][str(n_models)][str(start_idx)][metric_name] val_results = utils.select_nested(self.val_table.alg_task_results, val_metric_name, dim=6) if use_validation_errors: test_results = val_results elif use_train_errors: test_results = utils.select_nested(self.train_table.alg_task_results, val_metric_name, dim=6) else: test_results = utils.select_nested(self.test_table.alg_task_results, test_metric_name, dim=6) # take mean over all single model validation scores in cross-validation # now indexed by [alg_idx][task_idx][split_idx] # print(np.asarray(val_results[0][0][0]['cv']['1'].values())) val_results = utils.map_nested(val_results, lambda dct: np.mean(np.asarray(list(dct['cv']['1'].values()))), dim=3) # create new test table by selecting for eval modes (multiple eval modes can be selected for an alg_name) # hence the table can get longer new_alg_names = [] new_alg_task_results = [] # Meaning: new_alg_names[new_alg_idxs[i]] is first algorithm corresponding to self.val_table.alg_names[i] new_alg_idxs = [] for alg_name, task_results in zip(self.test_table.alg_names, test_results): # generates a list of alg names and of alg_task_results an, atr = eval_mode_selector.select(alg_name, task_results) if len(an) == 0: raise RuntimeError(f'No eval mode selected from alg {alg_name}') new_alg_idxs.append(len(new_alg_names)) new_alg_names.extend(an) new_alg_task_results.extend(atr) # test_results_table.alg_task_results is indexed by [alg_idx][task_idx][split_idx] test_results_table = AlgTaskTable(new_alg_names, self.test_table.task_infos, new_alg_task_results) if val_test_groups is None: val_test_groups = dict() if alg_group_dict is not None: more_val_test_groups = {key: {alg_name: alg_name for alg_name, alg_tags, alg_config in zip(self.val_table.alg_names, self.alg_tags, self.alg_configs) if filter(alg_name, alg_tags, alg_config)} for key, filter in alg_group_dict.items()} val_test_groups = utils.join_dicts(val_test_groups, more_val_test_groups) # add algorithms optimized over a group, selecting the one with the best validation score # (or one associated to the best one) group_names = [] group_task_results = [] for group_name, val_test_dict in val_test_groups.items(): if len(val_test_dict) == 0: continue # could happen if the alg_filter does not apply to anything all_alg_names = self.val_table.alg_names val_alg_names = 
list(val_test_dict.keys()) val_alg_idxs = [all_alg_names.index(alg_name) if alg_name in all_alg_names else None for alg_name in val_alg_names] test_alg_idxs = [all_alg_names.index(val_test_dict[alg_name]) if val_test_dict[alg_name] in all_alg_names else None for alg_name in val_alg_names] # print(f'{group_name=}, {val_alg_idxs=}, {test_alg_idxs=}') if None in (val_alg_idxs + test_alg_idxs): continue # not all algs found max_n_splits = np.min([len(splits) for i in (val_alg_idxs + test_alg_idxs) for splits in val_results[i]]) # shape: n_algs x n_tasks x max_n_splits cut_splits = [[splits[:max_n_splits] for splits in val_results[i]] for i in val_alg_idxs] # shape: n_tasks x max_n_splits best_idxs = np.argmin(np.asarray(cut_splits), axis=0) test_atr = test_results_table.alg_task_results group_names.append(group_name) group_task_results.append( [[test_atr[new_alg_idxs[test_alg_idxs[best_idxs[task_idx, split_idx]]]][task_idx][split_idx] for split_idx in range(best_idxs.shape[1])] for task_idx in range(best_idxs.shape[0])]) test_results_table = AlgTaskTable(test_results_table.alg_names + group_names, test_results_table.task_infos, test_results_table.alg_task_results + group_task_results) # # add alg groups - on each task, alg groups take the alg from the group with the best val error # # (val error is always minimized here, not maximized) # if alg_group_dict is not None: # group_names = [] # group_task_results = [] # for key, alg_filter in alg_group_dict.items(): # alg_idxs = [i for i in range(len(self.val_table.alg_names)) # if alg_filter(self.val_table.alg_names[i], self.alg_tags[i], self.alg_configs[i])] # if len(alg_idxs) == 0: # continue # max_n_splits = np.min([len(splits) # for i in alg_idxs # for splits in val_results[i]]) # # shape: n_algs x n_tasks x max_n_splits # cut_splits = [[splits[:max_n_splits] for splits in val_results[i]] # for i in alg_idxs] # # shape: n_tasks x max_n_splits # best_idxs = np.argmin(np.asarray(cut_splits), axis=0) # test_atr = test_results_table.alg_task_results # # group_names.append(key) # group_task_results.append( # [[test_atr[new_alg_idxs[alg_idxs[best_idxs[task_idx, split_idx]]]][task_idx][split_idx] # for split_idx in range(best_idxs.shape[1])] # for task_idx in range(best_idxs.shape[0])]) # test_results_table = AlgTaskTable(test_results_table.alg_names + group_names, test_results_table.task_infos, # test_results_table.alg_task_results + group_task_results) return test_results_table @staticmethod def load(task_collection: TaskCollection, n_cv: int, paths: Paths, alg_filter: Optional[AlgFilter] = None, split_type=SplitType.RANDOM, max_n_splits: Optional[int] = None, max_n_algs: Optional[int] = None): # load only summaries (faster) alg_names = [alg_path.name for alg_path in paths.result_summaries().iterdir()] # now only keep algs where all tasks from task_collection have been evaluated alg_names = [an for an in alg_names if np.all([utils.existsDir(paths.summary_alg_task(task_desc, an, n_cv)) for task_desc in task_collection.task_descs])] print('computed alg names') alg_tags = [utils.deserialize(paths.algs() / alg_name / 'tags.yaml', use_yaml=True) for alg_name in alg_names] alg_configs = [utils.deserialize(paths.algs() / alg_name / 'extended_config.yaml', use_yaml=True) for alg_name in alg_names] if alg_filter is None: alg_filter = lambda an, tags, aw: True alg_dict = {an: (tags, config) for an, tags, config in zip(alg_names, alg_tags, alg_configs) if alg_filter(an, tags, config)} if max_n_algs is not None and max_n_algs >= 0: alg_dict = {key: value for i, 
(key, value) in enumerate(alg_dict.items()) if i < max_n_algs} alg_names = list(alg_dict.keys()) alg_tags = [alg_dict[an][0] for an in alg_names] alg_configs = [alg_dict[an][1] for an in alg_names] task_infos = task_collection.load_infos(paths) # val_metric_name = Metrics.default_metric_name(task_infos[0].task_type) # indexed by # [alg_idx][task_idx]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name][split_idx] alg_task_results = [[utils.deserialize(paths.summary_alg_task(task_desc, alg_name, n_cv) / f'metrics.msgpack.gz', use_msgpack=True, compressed=True)[split_type] for task_desc in task_collection.task_descs] for alg_name in alg_names] # swap split_idx dimension to after task_idx, now indexed by # [alg_idx][task_idx][split_idx]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name] alg_task_results = utils.shift_dim_nested(alg_task_results, 7, 2) if max_n_splits is not None and max_n_splits >= 1: alg_task_results = utils.map_nested(alg_task_results, lambda lst: lst[:max_n_splits] if len(lst) > max_n_splits else lst, 2) def select_valtest(dct: Dict, name: str): # helper function because for the 'refit' results, # we have to take the validation results from the 'cv' part # because 'refit' did not have a validation set if name != 'val': return {key: value[name] for key, value in dct.items()} else: return {key: dct['cv']['val'] for key in dct} tables = {name: AlgTaskTable(alg_names=alg_names, task_infos=task_infos, alg_task_results=utils.map_nested(alg_task_results, lambda dct: select_valtest(dct, name), dim=3)) for name in ['train', 'val', 'test']} # does not work since 'refit' does not have 'val' # tables = [AlgTaskTable(alg_names=alg_names, task_infos=task_infos, # alg_task_results=utils.select_nested(alg_task_results, name, dim=4)) # for name in ['val', 'test']] return MultiResultsTable(train_table=tables['train'], val_table=tables['val'], test_table=tables['test'], alg_tags=alg_tags, alg_configs=alg_configs) class TableAnalyzer: def __init__(self, post_f: Optional[Callable[[float], float]] = None): self.post_f = post_f or (lambda x: x) def _print_table(self, alg_names: List[str], means, stds=None, is_higher_better: bool = False, perm: Optional[np.ndarray] = None): means = np.asarray(means) if perm is None: perm = np.argsort(means) if is_higher_better: perm = perm[::-1] means = means[perm] alg_names = [alg_names[i] for i in perm] if stds is None: str_table = [[an + ': ', f'{self.post_f(m):6.4f}'] for an, m in zip(alg_names, means)] else: stds = np.asarray(stds)[perm] str_table = [[an + ': ', f'{self.post_f(m):6.4f} ', f'[{self.post_f(m - 2 * s):6.4f}, {self.post_f(m + 2 * s):6.4f}]'] for an, m, s in zip(alg_names, means, stds)] print(utils.pretty_table_str(str_table)) def print_analysis(self, alg_task_table: AlgTaskTable): raise NotImplementedError() class TaskWeighting: def __init__(self, task_infos: List[TaskInfo], separate_task_names: Optional[List[str]]): """ Computes a weighting of tasks, downweighting tasks that have similar tasks. :param task_infos: Task infos. 
:param separate_task_names: Names of tasks that should not be grouped together with other tasks. """ self.task_infos = task_infos separate_task_names = separate_task_names or [] # keep the full task names here so that the separate_task_names check below can match them task_names = [task_info.task_desc.task_name for task_info in task_infos] task_prefixes = [task_name if task_name in separate_task_names else task_name.split('_')[0] for task_name in task_names] self.prefix_counts = {} for prefix in task_prefixes: if prefix in self.prefix_counts: self.prefix_counts[prefix] += 1 else: self.prefix_counts[prefix] = 1 self.task_weights = np.asarray([1.0 / self.prefix_counts[prefix] for prefix in task_prefixes]) self.task_weights /= np.sum(self.task_weights) def get_n_groups(self) -> int: return len(self.prefix_counts) def get_task_weights(self) -> np.ndarray: return self.task_weights
class MeanTableAnalyzer(TableAnalyzer): def __init__(self, f=None, use_weighting=False, separate_task_names: Optional[List[str]] = None, post_f=None): super().__init__(post_f=post_f) self.f = f self.use_weighting = use_weighting self.separate_task_names = separate_task_names def print_analysis(self, alg_task_table: AlgTaskTable) -> None: if self.use_weighting: task_weights = TaskWeighting(alg_task_table.task_infos, self.separate_task_names).get_task_weights() # task_weights = get_task_weights(alg_task_table.task_infos) else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) alg_task_results = alg_task_table.alg_task_results # if self.f is not None: # alg_task_results = [[[self.f(x) for x in c] for c in b] for b in alg_task_results] means = [np.dot(task_weights, [np.mean(splits) for splits in task_results]) for task_results in alg_task_results] stds = [np.sqrt(np.dot(task_weights ** 2, [np.std(splits) ** 2 / len(splits) for splits in task_results])) for task_results in alg_task_results] self._print_table(alg_task_table.alg_names, means, stds) def get_means(self, alg_task_table: AlgTaskTable) -> List[float]: if self.use_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(alg_task_table.task_infos, separate_task_names).get_task_weights() else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) alg_task_results = alg_task_table.alg_task_results return [self.post_f(np.dot(task_weights, [np.mean(splits) for splits in task_results])) for task_results in alg_task_results] def get_intervals(self, alg_task_table: AlgTaskTable, std_factor: float = 2.0) -> List[Tuple[float, float]]: # e.g. 
if std_factor=2, then the +-2 sigma interval will be used if self.use_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(alg_task_table.task_infos, separate_task_names).get_task_weights() else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) alg_task_results = alg_task_table.alg_task_results means = [np.dot(task_weights, [np.mean(splits) for splits in task_results]) for task_results in alg_task_results] stds = [np.sqrt(np.dot(task_weights ** 2, [np.std(splits) ** 2 / len(splits) for splits in task_results])) for task_results in alg_task_results] post_intervals = [(self.post_f(mean - std_factor * std), self.post_f(mean + std_factor * std)) for mean, std in zip(means, stds)] return post_intervals class ArrayTableAnalyzer(TableAnalyzer): """ Intermediate class that analyzes using the same number of splits for each method """ def __init__(self, f=None, use_weighting=False, separate_task_names: Optional[List[str]] = None, post_f=None): super().__init__(post_f=post_f) self.f = f self.use_weighting = use_weighting self.separate_task_names = separate_task_names def _is_higher_better(self) -> bool: # can be overridden if necessary return False def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # optional second tuple can be the permutation of configurations that should be used for displaying them raise NotImplementedError() def print_analysis(self, alg_task_table: AlgTaskTable, val_table: Optional[AlgTaskTable] = None) -> None: if self.use_weighting: task_weights = TaskWeighting(alg_task_table.task_infos, self.separate_task_names).get_task_weights() # task_weights = get_task_weights(alg_task_table.task_infos) else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) if val_table is not None: val_table = val_table.map(self.f) alg_task_results = alg_task_table.alg_task_results # if self.f is not None: # alg_task_results = [[[self.f(x) for x in c] for c in b] for b in alg_task_results] min_n_splits = np.min([len(splits) for task_results in alg_task_results for splits in task_results]) loss_arr = np.asarray([[splits[:min_n_splits] for splits in task_results] for task_results in alg_task_results]) val_loss_arr = None if val_table is not None: val_loss_arr = np.asarray( [[splits[:min_n_splits] for splits in task_results] for task_results in val_table.alg_task_results]) results_arr = self._process_losses(loss_arr, val_loss_arr) perm = None if isinstance(results_arr, Tuple): results_arr, perm = results_arr means = np.mean(results_arr, axis=-1) @ task_weights # todo: could implement better confidence intervals from plotting code stds = np.sqrt((np.std(results_arr, axis=-1) ** 2 / results_arr.shape[-1]) @ (task_weights ** 2)) self._print_table(alg_task_table.alg_names, means, stds, is_higher_better=self._is_higher_better(), perm=perm) class WinsTableAnalyzer(ArrayTableAnalyzer): def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: return (loss_arr == np.min(loss_arr, axis=0, keepdims=True)).astype(np.float32) def _is_higher_better(self) -> bool: return True def get_ranks(values: np.ndarray) -> np.ndarray: # computes ranks across the first axis return np.sum(values[:, None] > values[None, :], 
axis=1) + 1 # ranks_per_method = [] # for i in range(values.shape[0]): # ranks_per_method.append(np.sum((values[i, None] > values).astype(np.int32), axis=0) + 1) # return np.stack(ranks_per_method, axis=0) class RankTableAnalyzer(ArrayTableAnalyzer): def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: return get_ranks(loss_arr) class NormalizedLossTableAnalyzer(ArrayTableAnalyzer): def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: min_arr = np.min(loss_arr, axis=0, keepdims=True) max_arr = np.max(loss_arr, axis=0, keepdims=True) return (loss_arr - min_arr) / (max_arr - min_arr + 1e-30) class GreedyAlgSelectionTableAnalyzer(ArrayTableAnalyzer): """ Greedy selection of a portfolio of methods such that the addition improves the best performance in the portfolio the most """ def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # val_loss_arr = loss_arr # todo assert val_loss_arr is not None n_algs = loss_arr.shape[0] non_selected_algs = np.arange(n_algs) # alg_selected = np.zeros(loss_arr.shape[0], dtype=np.bool_) perm = [] for i in range(loss_arr.shape[0]): # losses are updated, tracking the loss of the alg after optimizing over best models and the given one # find best model best_non_selected_idx = np.argmin(np.mean(val_loss_arr, axis=(1, 2))[non_selected_algs]) best_idx = non_selected_algs[best_non_selected_idx] perm.append(best_idx) non_selected_algs = np.concatenate( [non_selected_algs[:best_non_selected_idx], non_selected_algs[best_non_selected_idx + 1:]], axis=0) for alg_idx in non_selected_algs: is_better = val_loss_arr[best_idx] <= val_loss_arr[alg_idx] val_loss_arr[alg_idx] = np.where(is_better, val_loss_arr[best_idx], val_loss_arr[alg_idx]) loss_arr[alg_idx] = np.where(is_better, loss_arr[best_idx], loss_arr[alg_idx]) return loss_arr, np.asarray(perm, dtype=np.int32) def alg_results_str(alg_task_table: AlgTaskTable, alg_name: str): alg_task_results = alg_task_table.alg_task_results if alg_name not in alg_task_table.alg_names: alg_name = alg_name + ' [bag-1]' # todo: could throw an exception alg_idx = alg_task_table.alg_names.index(alg_name) task_results = alg_task_results[alg_idx] means = [np.mean(splits) for splits in task_results] stds = [np.std(splits) / np.sqrt(len(splits)) for splits in task_results] task_names = [str(task_info.task_desc) for task_info in alg_task_table.task_infos] str_table = [[f'Task ', 'Error', 'Interval']] for name, mean, std in zip(task_names, means, stds): str_table.append([f'{name}: ', f'{mean:6.4f} ', f'[{mean - 2 * std:6.4f}, {mean + 2 * std:6.4f}]']) return utils.pretty_table_str(str_table) def alg_comparison_str(alg_task_table: AlgTaskTable, alg_names: List[str]): alg_task_results = alg_task_table.alg_task_results alg_names = [an if an in alg_task_table.alg_names else an + ' [bag-1]' for an in alg_names] # todo: could throw an exception alg_idxs = [alg_task_table.alg_names.index(alg_name) for alg_name in alg_names] means = [[np.mean(splits) for splits in alg_task_results[alg_idx]] for alg_idx in alg_idxs] task_names = [str(task_info.task_desc) for task_info in alg_task_table.task_infos] str_table = [[f'Task '] + [f'Alg {i + 1} ' for i in range(len(alg_names))]] for i, name in enumerate(task_names): str_table.append([f'{name}: '] + [f'{alg_means[i]:6.4f} ' for alg_means in means]) 
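# add an empty separator row, then count wins per algorithm: an algorithm wins a task
# if it attains the minimal mean error there; ties count as a win for every tied algorithm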
str_table.append([''] * 3) min_means = [np.min([alg_means[i] for alg_means in means]) for i in range(len(task_names))] n_wins_list = [sum([int(alg_means[i] == min_means[i]) for i in range(len(task_names))]) for alg_means in means] str_table.append(['Wins:'] + [str(n_wins) for n_wins in n_wins_list]) return utils.pretty_table_str(str_table) # CLI: # task collection # n_cv (default=1?) # preference regarding is_cv and ensembling? # optionally whether default splits should be used or not? # tags (connect by and or or?) ================================================ FILE: pytabkit/bench/eval/plotting.py ================================================ import copy from pathlib import Path from typing import List, Dict, Optional, Tuple, Callable import matplotlib import numpy as np import pandas as pd from matplotlib.pyplot import arrow from pytabkit.bench.eval.analysis import get_opt_groups, get_simplified_name, ResultsTables, \ get_benchmark_results, get_display_name from pytabkit.bench.eval.colors import more_percep_uniform_hue matplotlib.use('agg') # matplotlib.use('pdf') matplotlib.rcParams.update({ "pgf.texsystem": "pdflatex", 'font.family': 'serif', 'font.size': 10.95, 'text.usetex': True, 'pgf.rcfonts': False, # 'legend.framealpha': 0.5, 'text.latex.preamble': r'\usepackage{times} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{xcolor}' }) from tueplots import bundles, fonts, fontsizes, figsizes matplotlib.rcParams.update(bundles.icml2022()) matplotlib.rcParams.update(fonts.icml2022_tex()) matplotlib.rcParams.update(fontsizes.icml2022()) matplotlib.rcParams['text.latex.preamble'] = matplotlib.rcParams['text.latex.preamble'] + r'\usepackage{xcolor}' import matplotlib.pyplot as plt import matplotlib.colors as mcolors from matplotlib import patches as mpatches import seaborn as sns from adjustText import adjust_text import matplotlib.patheffects as PathEffects from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.evaluation import MultiResultsTable, DefaultEvalModeSelector, FunctionAlgFilter, TaskWeighting from pytabkit.bench.eval.runtimes import get_avg_train_times from pytabkit.models import utils from pytabkit.models.training.scheduling import get_schedule # import distinctipy # class CustomPalette: # default = distinctipy.get_colors(n_colors=14, # exclude_colors=[(a, b, c) for a in [1, 0.8] for b in [1, 0.8] for c in [1, 0.8]], # pastel_factor=0.5, rng=0) def get_plot_color_idx(alg_name: str): parts = ['BestModel', 'Ensemble', 'MLP-RTDL', 'MLP-PLR', 'RealMLP', 'ResNet', 'FTT', ['TabR', 'RealTabR'], # 'SAINT', 'XGB', 'LGBM', 'CatBoost', # 'GBT', 'RF'] # don't use prefixes and reverse to get better colors for BestModel_FTT-D_prep etc. 
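# matching in reverse order lets the more specific entries later in the list take precedence:
# e.g. 'BestModel_FTT-D_prep' contains both 'BestModel' (index 0) and 'FTT' (index 6),
# and the reversed loop assigns it the 'FTT' color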
for i, part_or_list in reversed(list(enumerate(parts))): lst = part_or_list if isinstance(part_or_list, list) else [part_or_list] for part in lst: if part in alg_name: return i raise ValueError(f'Unknown method: {alg_name}') def gg_color_hue(n: int, saturation: float = 1.0, value: float = 0.65): # hues = np.linspace(13, 375, num=n + 1)[:-1] # exclude the last element to avoid a duplicate of the first color # return [tuple(matplotlib.colors.hsv_to_rgb((h / 360.0, saturation, value)).tolist()) for h in hues] hues = np.linspace(0.0, 1.0, n + 1)[:-1] hues = [more_percep_uniform_hue(hue) for hue in hues] return [tuple(matplotlib.colors.hsv_to_rgb((h, saturation, value)).tolist()) for h in hues] def get_plot_color(alg_name: str): idx = get_plot_color_idx(alg_name) special = ('rssc' in alg_name or 'TPE' in alg_name or 'no-ls' in alg_name) half_special = '_prep' in alg_name colors = gg_color_hue(12, saturation=0.6 if special else (0.8 if half_special else 1.0), value=0.9 if special else (0.775 if half_special else 0.65)) return colors[idx] def coll_name_to_title(coll_name: str) -> str: if coll_name == 'meta-train-class': title = r'Meta-train classification benchmark' elif coll_name == 'meta-train-reg': title = r'Meta-train regression benchmark' elif coll_name == 'meta-test-class': title = r'Meta-test classification benchmark' elif coll_name == 'meta-test-reg': title = r'Meta-test regression benchmark' elif coll_name == 'meta-test-class-no-missing': title = r'$\mathcal{B}^{\mathrm{test}}_{\mathrm{class}}$ without missing value datasets' elif coll_name == 'meta-test-reg-no-missing': title = r'$\mathcal{B}^{\mathrm{test}}_{\mathrm{reg}}$ without missing value datasets' elif coll_name == 'grinsztajn-class-filtered': title = r'Grinsztajn et al.\ (2022) classification benchmark' elif coll_name == 'grinsztajn-reg': title = r'Grinsztajn et al.\ (2022) regression benchmark' else: title = coll_name title = r'\textbf{' + title + r'}' return title def plot_schedule(paths: Paths, filename: str, sched_name: str) -> None: with plt.rc_context(figsizes.icml2022_half()): plt.figure() ts = np.linspace(0.0, 1.0, 400) sched = get_schedule(sched_name) sched_values = [sched.call_time_(t) for t in ts] plt.plot(ts, sched_values, 'tab:blue') plt.xlabel('$t$') plt.ylabel('$f(t)$') # plt.tight_layout() plot_name = paths.plots() / filename utils.ensureDir(plot_name) plt.savefig(plot_name) plt.close() def plot_schedules(paths: Paths, filename: str, sched_names: List[str], sched_labels: List[str]) -> None: with plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=0.4)): plt.figure() ts = np.linspace(0.0, 1.0, 400) for sched_name, sched_label in zip(sched_names, sched_labels): sched = get_schedule(sched_name) sched_values = [sched.call_time_(t) for t in ts] plt.plot(ts, sched_values, label=sched_label) plt.legend(loc='best') plt.xlabel('$t$') plt.ylabel('$f(t)$') # plt.tight_layout() plot_name = paths.plots() / filename utils.ensureDir(plot_name) plt.savefig(plot_name) plt.close() def _create_benchmark_result_plot(file_path: Path, benchmark_results: Dict[str, Dict[str, float]], alg_names: List[str], colors: List): # generated mostly using ChatGPT df = pd.DataFrame(benchmark_results) # Reorder DataFrame based on alg_names df = df.reindex(alg_names) # Plotting # todo: use ICML compatible size fig, axs = plt.subplots(nrows=1, ncols=len(df.columns), figsize=(10, 7), sharey=True) for i, col in enumerate(df.columns): ax = axs[i] values = df[col].values bar_height = 1.0 bar_positions = np.arange(len(df), dtype=np.float64)[::-1] 
* bar_height # Handle empty strings in alg_names to create gaps between bars mask = df.index != '' non_empty_indices = np.where(mask)[0] ax.xaxis.grid(True) # Plot only if the method name is not an empty string non_empty_values = values[mask] non_empty_bar_positions = bar_positions[non_empty_indices] # ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=colors[:len(non_empty_bar_positions)], alpha=0.8, height=bar_height) ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=[colors[j] for j in non_empty_indices], alpha=0.8, height=bar_height) # Add method names on the y-axis # ax.invert_yaxis() # Invert y-axis to have Method A on top ax.tick_params(left=False) ax.set_yticks(bar_positions + 0.5 * bar_height) ax.set_yticklabels(df.index) ax.set_xlabel(r'Error increase in \% vs best') ax.set_title(col) # Remove frame around plot ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Set x-axis ticks and gridlines ax.xaxis.set_ticks_position('bottom') # Highlight x=0 tick and corresponding gridline ax.axvline(x=0, color='black', linewidth=1.5) # Set common labels and adjust layout # fig.text(0.5, 0.04, 'Performance', ha='center') # fig.suptitle('Method Performance Comparison', y=1.05) plt.tight_layout() utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def _create_benchmark_result_plot_with_intervals(file_path: Path, benchmark_results: Dict[str, Dict[str, float]], benchmark_intervals: Dict[str, Dict[str, Tuple[float, float]]], alg_names: List[str], colors: List): n_benchmarks = len(benchmark_results) with plt.rc_context(figsizes.icml2022_full(height_to_width_ratio=1.3)): # Plotting fig, axs = plt.subplots(nrows=1, ncols=n_benchmarks, sharey=True) # for i, col in enumerate(df.columns): for i, (col, results) in enumerate(benchmark_results.items()): ax = axs[i] # values = df[col].values bar_height = 1.0 bar_positions = np.arange(len(alg_names), dtype=np.float64)[::-1] * bar_height # Handle empty strings in alg_names to create gaps between bars # mask = df.index != '' mask = [alg_name != '' for alg_name in alg_names] non_empty_indices = np.where(mask)[0] non_empty_alg_names = [alg_name for alg_name in alg_names if alg_name != ''] values = [results[alg_name] if alg_name in results else 0.0 for alg_name in non_empty_alg_names] ax.xaxis.grid(True) # Plot only if the method name is not an empty string non_empty_values = values non_empty_bar_positions = bar_positions[non_empty_indices] intervals = np.array([benchmark_intervals[col][alg_name] if alg_name in results else (0.0, 0.0) for alg_name in non_empty_alg_names]).transpose() rel_intervals = intervals - non_empty_values errors = np.array([-rel_intervals[0], rel_intervals[1]]) # turn them into (absolute) errors # ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=colors[:len(non_empty_bar_positions)], alpha=0.8, height=bar_height) ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=[colors[j] for j in non_empty_indices], alpha=0.8, height=bar_height) ax.errorbar(non_empty_values, non_empty_bar_positions + 0.5 * bar_height, xerr=errors, fmt='none', color='black') # Add method names on the y-axis # ax.invert_yaxis() # Invert y-axis to have Method A on top ax.tick_params(left=False) ax.set_yticks(bar_positions + 0.5 * bar_height) ax.set_yticklabels(alg_names) # ax.set_xlabel(r'Error increase in \% vs best ($\downarrow$)') ax.set_title(col) # Remove frame around plot 
ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Set x-axis ticks and gridlines ax.xaxis.set_ticks_position('bottom') # Highlight x=0 tick and corresponding gridline ax.axvline(x=0, color='black', linewidth=1.5) # Set common labels and adjust layout fig.text(0.6, -0.02, r'Error increase in \% vs best ($\downarrow$)', ha='center') # fig.suptitle('Method Performance Comparison', y=1.05) plt.tight_layout() utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def get_equidistant_colors(n: int): cmap = plt.get_cmap('viridis') norm = matplotlib.colors.Normalize(vmin=0, vmax=n - 1) colors = [cmap(norm(i)) for i in range(n)] return colors def plot_benchmark_bars(paths: Paths, tables: ResultsTables, filename: str = None, coll_names: Optional[List[str]] = None, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, alg_names: Optional[List[str]] = None, simplify_name_fn: Optional[Callable[[str], str]] = None, use_geometric_mean: bool = True, shift_eps: float = 1e-2): benchmark_results = {} benchmark_intervals = {} if coll_names is None: coll_names = ['meta-train-class', 'meta-test-class', 'meta-train-reg', 'meta-test-reg'] for coll_name in coll_names: table = tables.get(coll_name) rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps, simplify_name_fn=simplify_name_fn) benchmark_results[coll_name] = rel_means_dict benchmark_intervals[coll_name] = rel_intervals_dict # ens_group_names = ['GBDTs-TD_MLP-TD', 'MLP-TD_MLP-TD-S', 'GBDTs-HPO', 'GBDTs-TD'] ens_group_names = ['-HPO', '-TD'] ens_alg_names = sum([[f'Ensemble{gn}', f'BestModel{gn}', ''] for gn in ens_group_names], []) # ens_alg_names = ['BestModel_GBDTs-HPO_MLP-HPO', ''] + ens_alg_names # todo # ens_alg_names = ['HPO', ''] + ens_alg_names # todo # single_alg_names = [ # # 'MLP-TD', 'MLP-TD-S', 'MLP-SKLD', '', # 'BestModel_MLP-HPO+TD', 'MLP-HPO', 'MLP-TD', 'MLP-TD-S', '', # 'BestModel_CatBoost-HPO+TD', 'CatBoost-HPO', 'CatBoost-TD', 'CatBoost-D', '', # 'BestModel_LGBM-HPO+TD', 'LGBM-HPO', 'LGBM-TD', 'LGBM-D', '', # 'BestModel_XGB-HPO+TD', 'XGB-HPO', 'XGB-TD', 'XGB-D', '', # 'RF-SKLD', # ] single_alg_names = [ # 'MLP-TD', 'MLP-TD-S', 'MLP-SKLD', '', 'MLP-HPO', 'MLP-TD', 'MLP-TD-S', '', 'CatBoost-HPO', 'CatBoost-TD', 'CatBoost-D', '', 'LGBM-HPO', 'LGBM-TD', 'LGBM-D', '', 'XGB-HPO', 'XGB-TD', 'XGB-D', 'XGB-PBB-D', '', 'RF-SKL-D', ] if alg_names is None: alg_names = ens_alg_names + single_alg_names mean_name = f'geometric_eps-{shift_eps:g}' if use_geometric_mean else 'arithmetic' if filename is None: filename = f'benchmarks_bars_{mean_name}.pdf' file_path = paths.plots() / filename # todo # colors = ['b'] * len(alg_names) colors = get_equidistant_colors(len(alg_names)) _create_benchmark_result_plot_with_intervals(file_path=file_path, benchmark_results=benchmark_results, benchmark_intervals=benchmark_intervals, alg_names=alg_names, colors=colors) # _create_benchmark_result_plot(file_path=file_path, benchmark_results=benchmark_results, alg_names=alg_names, # colors=colors) def plot_scatter_ax(paths: Paths, tables: ResultsTables, ax: matplotlib.axes.Axes, coll_name: str, alg_name_1: str, alg_name_2: str, test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, use_validation_errors: bool = False): task_collection = 
TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' table = tables.get(coll_name=coll_name, n_cv=1, tag='paper') opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: True), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} val_test_groups = {'HPO-on-BestModel-TD': {f'{family}-TD-{task_type_name}': f'{family}-HPO' for family in ['XGB', 'LGBM', 'CatBoost', 'MLP']}} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=use_validation_errors) test_table = test_table.filter_n_splits(n_splits=10) test_table.alg_names = [get_simplified_name(alg_name) for alg_name in test_table.alg_names] test_arr = test_table.to_array() mean_results = np.mean(test_arr, axis=-1) alg_1_results = mean_results[test_table.alg_names.index(alg_name_1)] alg_2_results = mean_results[test_table.alg_names.index(alg_name_2)] with plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=1)): max_err = max(np.max(alg_1_results), np.max(alg_2_results)) lim_err = max_err * 1.02 ax.set_xlim(0.0, lim_err) ax.set_ylim(0.0, lim_err) # ax.set_xscale('symlog') # ax.set_yscale('symlog') ax.plot([0.0, lim_err], [0.0, lim_err], 'k-') ax.scatter(alg_1_results, alg_2_results, color='tab:blue', s=8.0, zorder=3) display_name_1 = get_display_name(alg_name_1) display_name_2 = get_display_name(alg_name_2) if test_metric_name is not None: raise NotImplementedError(f'Correct label for custom test metric name is not implemented') metric = 'Classification error' if task_type_name == 'class' else 'nRMSE' ax.set_xlabel(f'{metric} for {display_name_1}' + r' ($\downarrow$)') ax.set_ylabel(f'{metric} for {display_name_2}' + r' ($\downarrow$)') ax.set_title(coll_name_to_title(coll_name)) # diagonal text version # eps = 0.3 # # upper left text # ax.text(eps*lim_err, (1-eps)*lim_err, f'{alg_name_1} better', # ha="center", va="center", rotation=45, size=11, zorder=-2) # # bottom right text # ax.text((1-eps) * lim_err, eps * lim_err, f'{alg_name_2} better', # ha="center", va="center", rotation=45, size=11, zorder=-2) eps = 0.05 # upper left text ax.text(eps * lim_err, (1 - eps) * lim_err, f'{display_name_1} better', ha="left", va="top", rotation=0, size=11, zorder=-2) # bottom right text ax.text((1 - eps) * lim_err, eps * lim_err, f'{display_name_2} better', ha="right", va="bottom", rotation=0, size=11, zorder=-2) def plot_scatter(paths: Paths, filename: str, tables: ResultsTables, coll_names: List[str], alg_name_1: str, alg_name_2: str, test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, use_validation_errors: bool = False): print(f'Creating scatterplot: {filename}') context_mgr = plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=1)) if len(coll_names) == 1 \ else plt.rc_context(figsizes.icml2022_full( height_to_width_ratio=3 if len(coll_names) == 6 else (2 if len(coll_names) == 4 else 0.5))) with context_mgr: if len(coll_names) == 1: fig, ax = plt.subplots(1, 1) axs_list = [ax] elif len(coll_names) == 2: fig, axs = plt.subplots(1, 2) axs_list = [axs[0], axs[1]] elif len(coll_names) == 4: fig, axs = plt.subplots(2, 2) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] elif len(coll_names) == 6: fig, axs 
= plt.subplots(3, 2) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 0], axs[2, 1]] else: raise ValueError(f'{len(coll_names)=} is not in [1, 2, 4, 6]') for coll_name, ax in zip(coll_names, axs_list): plot_scatter_ax(ax=ax, paths=paths, tables=tables, coll_name=coll_name, alg_name_1=alg_name_1, alg_name_2=alg_name_2, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_validation_errors=use_validation_errors) file_path = paths.plots() / filename utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def _plot_scatter_with_labels(x_dict: Dict[str, float], y_dict: Dict[str, float], y_intervals: Optional[Dict[str, Tuple[float, float]]], ax: matplotlib.axes.Axes, xlabel: str, ylabel: str, title: Optional[str] = None, name_tfm_func: Optional[Callable[[str], str]] = None, plot_pareto_frontier: bool = True, arrow_alg_names: Optional[List[Tuple[str, str]]] = None, pareto_frontier_width: float = 2., alg_names_to_hide: Optional[List[str]] = None): if alg_names_to_hide is None: alg_names_to_hide = [] # First, convert dictionaries to a format suitable for seaborn # take shared models models = list(set(x_dict.keys()).intersection(set(y_dict.keys()))) models.sort() print(f'{models=}') # show models not in both # print("Models not in both x and y dicts") # print(set(x_dict.keys()).symmetric_difference(set(y_dict.keys()))) x_vals = [x_dict[model] for model in models] y_vals = [y_dict[model] for model in models] # Now, create a DataFrame from the dictionaries for easy plotting import pandas as pd df = pd.DataFrame({'model': models, 'x_value': x_vals, 'y_value': y_vals}) # split model into model_name and model_type # replace underscores with - # df['model'] = df['model'].str.replace('_', '-') # df['model_name'] = df['model'].str.split('-', expand=True)[0] def get_model_type(alg_name: str) -> str: if '-HPO' in alg_name: return 'HPO' elif '-TD' in alg_name: return 'TD' # elif '-PBB-D' in alg_name: # return 'PBB-D' elif '-D' in alg_name: return 'D' else: return 'unknown' # df['model_type'] = df['model'].str.split('-', expand=True)[1].str.split('(', expand=True)[0] df['model_type'] = [get_model_type(alg_name) for alg_name in df['model']] df['color'] = [get_plot_color(alg_name) for alg_name in models] df['alpha'] = [1.0 if alg_name not in alg_names_to_hide else 0.0 for alg_name in models] # Set up the figure size and style # fig = plt.figure(figsize=(10, 10)) # fig, ax = plt.subplots(1, 1, figsize=(10, 10)) # sns.set_theme(style="whitegrid", font_scale=2) print(f'{df=}') color_mapping = {color: color for color in df['color'].unique()} # Create the scatter plot ax = sns.scatterplot( x="x_value", y="y_value", hue="color", style="model_type", data=df, s=400, # size of the points palette=color_mapping, markers={'D': 'o', 'TD': 's', 'HPO': 'X', 'PBB-D': 'P'}, # palette='tab10', # palette can be changed as needed legend=False, # No need to draw legend at this point ax=ax, alpha=df['alpha'], ) ax.set_xscale('log') # ax.set_yscale('log') # Get the color of each point to set the color of the text point_colors = ax.collections[0].get_facecolor() if y_intervals is not None: y_intervals_arr = np.array([y_intervals[model] for model in models]) y_errors_arr = np.stack([np.array(y_vals) - y_intervals_arr[:, 0], y_intervals_arr[:, 1] - np.array(y_vals)], axis=1) for x, y, errors, color in zip(x_vals, y_vals, y_errors_arr, point_colors): ax.errorbar(x, y, elinewidth=4, yerr=errors[:, None], fmt='none', color=color) # Prepare to annotate the points texts = [] for i, point in 
enumerate(ax.collections[0].get_offsets()): model_name = df.iloc[i]['model'] if model_name in alg_names_to_hide: continue x, y = point text_color = point_colors[i] # Annotate the model names display_name = model_name if name_tfm_func is not None: display_name = name_tfm_func(display_name) # bold if it's an arrow end is_arrow_end = False if arrow_alg_names is None else any( model_name == end_name for _, end_name in arrow_alg_names) if is_arrow_end: display_name = rf'\textbf{{{display_name}}}' text = ax.text(x, y, display_name, color=text_color, fontsize=20, ha='center', va='center') text.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='white')]) texts.append(text) # Use adjust_text to repel the labels # Use adjust_text to repel the labels from each other and the points adjust_text(texts, x=df['x_value'].values, y=df['y_value'].values, avoid_self=False, expand=(1.15, 1.3), ax=ax, ) x_min, x_max = ax.get_xlim() y_min, y_max = ax.get_ylim() eps = 0.12 text_x = x_min ** (1 - eps) * x_max ** eps text_y = y_min + eps * (y_max - y_min) ax.set_axisbelow(True) # scatter.annotate('lower is better', xy=(text_x, text_y), rotation=) # ax.text(text_x, text_y, "lower is better", # ha="center", va="center", rotation=-45, size=30) ax.text(text_x, text_y, "better", ha="center", va="center", rotation=45, size=30, bbox=dict(boxstyle="larrow,pad=0.5", fc="lightgreen", ec="forestgreen", lw=4), zorder=50) # Set arrow coordinates based on the plot limits # arrow_x = x_min ** 0.1 * x_max ** 0.9 # Adjust 0.1 as needed # arrow_y = y_min + 0.1 * (y_max - y_min) # Adjust 0.1 as needed # # # Set the corrected arrow properties # arrow_props = dict(facecolor='red', edgecolor='red', shrink=0.05, width=2, headwidth=10) # # # Add the arrow to the plot # ax.annotate('', xy=(arrow_x, arrow_y), xytext=(x_min, y_min), # arrowprops=arrow_props, annotation_clip=False) if plot_pareto_frontier: xs = np.array(x_vals) ys = np.array(y_vals) perm = np.argsort(xs) xs = xs[perm] ys = ys[perm] xs_pareto = [xs[0], xs[0]] ys_pareto = [ax.get_ylim()[1], ys[0]] for i in range(1, len(xs)): if ys[i] < ys_pareto[-1]: xs_pareto.append(xs[i]) ys_pareto.append(ys_pareto[-1]) xs_pareto.append(xs[i]) ys_pareto.append(ys[i]) xs_pareto.append(ax.get_xlim()[1]) ys_pareto.append(ys_pareto[-1]) ax.plot(xs_pareto, ys_pareto, '--', color='k', linewidth=pareto_frontier_width, zorder=0.8) if arrow_alg_names is not None: # arrow_head_length = for first, second in arrow_alg_names: if first in alg_names_to_hide or second in alg_names_to_hide: continue x1 = x_dict[first] y1 = y_dict[first] x2 = x_dict[second] y2 = y_dict[second] # plt.arrow(x1, y1, x2-x1, y2-y1, length_includes_head=True, # head_width=0.08, head_length=0.00002) color = get_plot_color(second) color = tuple(list(color) + [0.5]) # add alpha channel # color = tuple(0.5 + 0.5*v for v in color) ax.annotate("", xy=(x2, y2), xytext=(x1, y1), zorder=5, # arrowprops=dict(arrowstyle="->"), arrowprops=dict( # facecolor='#444444', facecolor=color, # width=3.0, headwidth=10.0, headlength=8.0, shrink=0.01, edgecolor='none')) # Set the axis labels ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) if title is not None: ax.set_title(title) # sns.reset_orig() def extend_runtimes(times: Dict[str, float], task_type_name: str, keep_gpu: bool = True) -> Dict[str, float]: times = copy.copy(times) opt_groups = get_opt_groups(task_type_name) # for device in ['CPU', 'GPU']: for device in ['CPU']: # compute HPO times for method_name in ['RealMLP', 'MLP-RTDL', 'MLP-PLR', 'ResNet-RTDL', 'XGB', 'LGBM', 'CatBoost', 
'TabR', 'RF', 'FTT']: if f'{method_name}-HPO-2_{device}' in times: times[f'{method_name}-HPO_{device}'] = (50. / 2.) * times[f'{method_name}-HPO-2_{device}'] elif f'{method_name}-HPO-1_{device}' in times: times[f'{method_name}-HPO_{device}'] = 50. * times[f'{method_name}-HPO-1_{device}'] elif f'{method_name}-TD_{device}' in times: # simple surrogate time print(f'Warning: Guessing HPO time for {method_name} on device {device} from TD time') times[f'{method_name}-HPO_{device}'] = 50 * times[f'{method_name}-TD_{device}'] elif f'{method_name}-S-D_{device}' in times: # simple surrogate time print(f'Warning: Guessing HPO time for {method_name} on device {device} from S-D time') times[f'{method_name}-HPO_{device}'] = 50 * times[f'{method_name}-S-D_{device}'] elif f'{method_name}-D_{device}' in times: # simple surrogate time print(f'Warning: Guessing HPO time for {method_name} on device {device} from D time') times[f'{method_name}-HPO_{device}'] = 50 * times[f'{method_name}-D_{device}'] if f'{method_name}-HPO_{device}' in times: times[f'{method_name}-HPO_best-1-auc-ovr_{device}'] = times[f'{method_name}-HPO_{device}'] # print(f'Warning: Guessing no-ls time for RealMLP on device {device} from ls time') # times[f'RealMLP-TD_no-ls_{device}'] = times[f'RealMLP-TD_{device}'] # times[f'RealMLP-TD-S_no-ls_{device}'] = times[f'RealMLP-TD-S_{device}'] for model in ['XGB', 'LGBM', 'CatBoost']: # simple surrogate times if f'{model}-HPO_{device}' not in times and f'{model}-TD_{device}' in times: print(f'Warning: Guessing HPO time for {model} on device {device} from TD time') times[f'{model}-HPO_{device}'] = 50 * times[f'{model}-TD_{device}'] # raw_names = list(set('_'.join(name.split('_')[:-1]) for name in times)) # print(f'Warning: Guessing additional times') # for name in raw_names: # for new_suffix in ['_no-ls', '_val-ce', '_val-ce_no-ls', '_rssc']: # old_name = f'{name}_CPU' # new_name = f'{name}{new_suffix}_CPU' # if new_name not in times and old_name in times: # times[new_name] = times[old_name] for group_name, alg_names in opt_groups.items(): if group_name not in ['-D', '-TD', '-HPO', '-D_val-ce', '-TD_val-ce'] and not group_name.endswith('_prep'): continue # exclude the other ones for now alg_names = [ alg_name.replace('-class', '').replace('-reg', '') for alg_name in alg_names] alg_device_names = [f'{alg_name}_{device}' for alg_name in alg_names] if all(alg_device_name in times for alg_device_name in alg_device_names): sum_time = sum([times[alg_device_name] for alg_device_name in alg_device_names]) times[f'BestModel{group_name}_{device}'] = sum_time times[f'Ensemble{group_name}_{device}'] = sum_time if not keep_gpu: times = {key: value for key, value in times.items() if not 'GPU' in key} times = {get_simplified_name(key): value for key, value in times.items()} return times def plot_pareto_ax(ax: matplotlib.axes.Axes, paths: Paths, tables: ResultsTables, coll_name: str, alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, use_ranks: bool = False, use_normalized_errors: bool = False, tag: Optional[str] = None, use_geometric_mean: bool = True, use_grinnorm_errors: bool = False, shift_eps: float = 1e-2, use_validation_errors: bool = False, arrow_alg_names: Optional[List[Tuple[str, str]]] = None, plot_pareto_frontier: bool = True, alg_names_to_hide: Optional[List[str]] = None, pareto_frontier_width: float = 2.): print(f'Creating plot for {coll_name}') is_reg = TaskCollection.from_name(coll_name, paths).load_infos(paths)[0].tensor_infos[ 
'y'].get_cat_size_product() == 0 default_metric_name = ('1-r2' if use_grinnorm_errors else 'nrmse') if is_reg else 'class_error' if val_metric_name is None: val_metric_name = default_metric_name if test_metric_name is None: test_metric_name = default_metric_name table = tables.get(coll_name, n_cv=1, tag=tag or 'paper') rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, use_relative_score=False, return_percentages=False, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, filter_alg_names_list=alg_names, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps, use_validation_errors=use_validation_errors) task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' time_coll_name = f'meta-train-{task_type_name}' # get runtimes avg_train_times = get_avg_train_times(paths, time_coll_name, per_1k_samples=True) # print(f'{avg_train_times=}') avg_train_times = extend_runtimes(avg_train_times, task_type_name=task_type_name, keep_gpu=False) # print(f'After extending: {avg_train_times=}') # def tfm_key(key: str) -> str: # return key.replace('_CPU', ' (CPU)').replace('_GPU', ' (GPU)') def tfm_key(key: str) -> str: return key.replace('_CPU', '') avg_train_times = {tfm_key(key): value for key, value in avg_train_times.items()} # remove sklearn MLP if 'MLP-SKL-D' in avg_train_times: del avg_train_times['MLP-SKL-D'] # if 'ResNet-RTDL-D' in avg_train_times: # del avg_train_times['ResNet-RTDL-D'] # convert MLP-HPO runtime # get simplified associated names (without the CPU/GPU thing) # generate ensemble/BestModel runtimes? 
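# Illustrative sketch with made-up numbers (not executed): extend_runtimes above fills in
# missing HPO runtimes from cheaper surrogates. For a hypothetical input {'XGB-TD_CPU': 2.0},
# it would print a warning and add 'XGB-HPO_CPU' = 50 * 2.0 = 100.0 (guessing 50 HPO steps
# from a single TD fit); keys are then simplified via get_simplified_name inside
# extend_runtimes, and tfm_key above strips the remaining '_CPU' suffixes.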
# add CPU/GPU to rel_means_dict keys extended_means_dict = rel_means_dict extended_intervals_dict = rel_intervals_dict print(f'{list(avg_train_times.keys())=}') print(f'{list(extended_means_dict.keys())=}') common_keys = set(avg_train_times.keys()).intersection(set(extended_means_dict.keys())) # avg_train_times = {tfm_alg_name(key): value for key, value in avg_train_times.items() if key in common_keys} # extended_means_dict = {tfm_alg_name(key): value for key, value in extended_means_dict.items() if key in common_keys} # extended_intervals_dict = {tfm_alg_name(key): value for key, value in extended_intervals_dict.items() if # key in common_keys} # extended_means_dict = utils.join_dicts( # *[{f'{key} ({device})': value for key, value in rel_means_dict.items()} for device in ['CPU', 'GPU']] # ) # extended_intervals_dict = utils.join_dicts( # *[{f'{key} ({device})': value for key, value in rel_intervals_dict.items()} for device in ['CPU', 'GPU']] # ) # print('times keys:', sorted(list(avg_train_times.keys()))) # print('means keys:', sorted(list(extended_means_dict.keys()))) # # print(f'x_dict = {avg_train_times}') # print(f'y_dict = {extended_means_dict}') title = coll_name_to_title(coll_name) # coll_name_latex = coll_name # for split_name in ['train', 'test']: # if coll_name == f'meta-{split_name}-{task_type_name}': # coll_name_latex = r'$\mathcal{B}^{\mathrm{' + split_name + r'}}_{\mathrm{' + task_type_name + r'}}$' ylabel = ('Shifted geometric mean' if use_geometric_mean else 'Arithmetic mean') + ' of ' if use_ranks: ylabel = ylabel + r'\textbf{ranks}' else: if use_normalized_errors: ylabel = ylabel + r'\textbf{normalized} ' elif use_grinnorm_errors: ylabel = ylabel + r'\textbf{custom-normalized} ' if task_type_name == 'class': if test_metric_name is None or test_metric_name == 'class_error': ylabel = ylabel + r'\textbf{classification errors}' elif test_metric_name == '1-auc_ovr': ylabel = ylabel + r'\textbf{1-AUC(one-vs-rest)}' elif test_metric_name == 'cross_entropy': ylabel = ylabel + r'\textbf{cross-entropies}' else: raise ValueError(f'Test metric {test_metric_name} not implemented') else: if test_metric_name is None or test_metric_name == 'rmse': ylabel = ylabel + r'\textbf{RMSEs}' elif test_metric_name == 'nrmse': ylabel = ylabel + r'\textbf{nRMSEs}' elif test_metric_name == '1-r2': ylabel = ylabel + r'$1-R^2$' else: raise ValueError(f'Test metric {test_metric_name} not implemented') _plot_scatter_with_labels(avg_train_times, extended_means_dict, y_intervals=extended_intervals_dict, xlabel=r'Average training \textbf{time (CPU)} per 1K samples [s]', # + r' ($\downarrow$)', ylabel=ylabel, ax=ax, title=title, name_tfm_func=get_display_name, arrow_alg_names=arrow_alg_names, plot_pareto_frontier=plot_pareto_frontier, alg_names_to_hide=alg_names_to_hide, pareto_frontier_width=pareto_frontier_width, # ylabel=r'Benchmark score relative to best model', # ylabel=r'Error increase in \% vs best ($\downarrow$)', # title=f'Benchmark scores on {coll_name_latex} vs train time', ) def shorten_coll_names(coll_names: List[str]) -> List[str]: coll_name_dict = {'meta-train-class': 'mtrc', 'meta-train-reg': 'mtrr', 'meta-test-class': 'mtec', 'meta-test-reg': 'mter', 'grinsztajn-class-filtered': 'gcf', 'grinsztajn-reg': 'gr'} short_coll_names = [coll_name if coll_name not in coll_name_dict else coll_name_dict[coll_name] for coll_name in coll_names] return short_coll_names def plot_pareto(paths: Paths, tables: ResultsTables, coll_names: List[str], alg_names: List[str], val_metric_name: Optional[str] = None, 
test_metric_name: Optional[str] = None, use_ranks: bool = False, use_normalized_errors: bool = False, filename: Optional[str] = None, filename_suffix: Optional[str] = None, tag: Optional[str] = None, use_grinnorm_errors: bool = False, use_geometric_mean: bool = True, shift_eps: float = 1e-2, use_validation_errors: bool = False, arrow_alg_names: Optional[List[Tuple[str, str]]] = None, plot_pareto_frontier: bool = True, alg_names_to_hide: Optional[List[str]] = None, subfolder: Optional[str] = None, pareto_frontier_width: float = 2., use_2x3: bool = False): print(f'Plotting pareto plot for {coll_names}') sns.set_theme(style="whitegrid", font_scale=2) if len(coll_names) == 1: fig, ax = plt.subplots(1, 1, figsize=(10, 10)) axs_list = [ax] elif len(coll_names) == 2: fig, axs = plt.subplots(1, 2, figsize=(20, 10)) axs_list = [axs[0], axs[1]] elif len(coll_names) == 3: fig, axs = plt.subplots(1, 3, figsize=(30, 10)) axs_list = [axs[0], axs[1], axs[2]] elif len(coll_names) == 4: fig, axs = plt.subplots(2, 2, figsize=(20, 20)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] elif len(coll_names) == 6: if use_2x3: fig, axs = plt.subplots(2, 3, figsize=(30, 20)) axs_list = [axs[0, 0], axs[1, 0], axs[0, 1], axs[1, 1], axs[0, 2], axs[1, 2]] else: fig, axs = plt.subplots(3, 2, figsize=(20, 30)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 0], axs[2, 1]] else: raise ValueError(f'{len(coll_names)=} is not in [1, 2, 3, 4, 6]') for coll_name, ax in zip(coll_names, axs_list): # print(f'{val_metric_name=}, {test_metric_name=}, {coll_name=}') plot_pareto_ax(ax=ax, paths=paths, tables=tables, coll_name=coll_name, alg_names=alg_names, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, tag=tag, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps, use_validation_errors=use_validation_errors, arrow_alg_names=arrow_alg_names, plot_pareto_frontier=plot_pareto_frontier, alg_names_to_hide=alg_names_to_hide, pareto_frontier_width=pareto_frontier_width) mean_name = f'geometric_eps-{shift_eps:g}' if use_geometric_mean else 'arithmetic' if use_ranks: mean_name = 'ranks_' + mean_name elif use_normalized_errors: mean_name = 'normerrors_' + mean_name elif use_grinnorm_errors: mean_name = 'grinnormerrors_' + mean_name name_parts = shorten_coll_names(coll_names) + [mean_name] if use_validation_errors: name_parts = ['validation'] + name_parts if use_2x3: name_parts = ['2x3'] + name_parts plots_path = paths.plots() if subfolder is not None: plots_path = plots_path / subfolder if filename is None: file_path = plots_path / f'pareto_{"_".join(name_parts)}.pdf' else: file_path = plots_path / filename if filename_suffix is not None: file_path = file_path.with_stem(f'{file_path.stem}{filename_suffix}') if len(coll_names) in [4, 6]: labels = ['D = defaults {} {} {} {} {} TD = tuned defaults {} {} {} {} {} HPO = hyperparameter optimization', 'Best/Ensemble: out of XGB, LGBM, CatBoost, (Real)MLP'] r = matplotlib.patches.Rectangle((0, 0), 1, 1, fill=False, edgecolor='none', visible=False) fig.legend(handles=[r] * len(labels), labels=labels, fontsize=30, handlelength=0, handletextpad=0, loc='upper center', bbox_to_anchor=(0.5, 0.0), ncol=1) # plt.tight_layout(rect=[0, 0.09, 1.0, 1.0]) utils.ensureDir(file_path) plt.savefig(file_path, bbox_inches='tight') plt.close(fig) sns.reset_orig() print(f'Created plot {file_path}') def plot_winrates(paths: Paths, tables: ResultsTables, coll_name: 
str, alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None): print(f'Plotting winrate matrix plot for {coll_name}') table = tables.get(coll_name) task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name) simplify_name_fn = get_simplified_name test_table = test_table.rename_algs(simplify_name_fn) test_table = test_table.filter_algs(alg_names) use_task_weighting = coll_name.startswith('meta-train') or coll_name.startswith('uci') if use_task_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(test_table.task_infos, separate_task_names).get_task_weights() else: n_tasks = len(test_table.task_infos) task_weights = np.ones(n_tasks) / n_tasks n_splits = 10 test_table = test_table.filter_n_splits(n_splits) # shape: [n_algs, n_tasks, n_splits] errors = test_table.to_array() # do it once with < and once with <= to make sure that ties count as half a win wins_tensor = 0.5 * ((errors[:, None] <= errors[None, :]).astype(np.float32) + (errors[:, None] < errors[None, :]).astype(np.float32)) avg_wins_per_task = np.mean(wins_tensor, axis=-1) # average over splits # average wins by task weights winrate_matrix = np.einsum('ijt,t->ij', avg_wins_per_task, task_weights) win_percentage_matrix = 100.0 * winrate_matrix perm = np.argsort(np.mean(win_percentage_matrix, axis=-1)) # sort by average winrate win_percentage_matrix = win_percentage_matrix[perm, :][:, perm] alg_names = [test_table.alg_names[i] for i in perm] alg_names = [alg_name.replace('_', r'\_') for alg_name in alg_names] # with matplotlib.rc_context(): # Create a heatmap using seaborn fig = plt.figure(figsize=(10, 8)) sns.set_theme(style="white", font_scale=0.6) mask = np.eye(win_percentage_matrix.shape[0], dtype=bool) heatmap = sns.heatmap(win_percentage_matrix, annot=True, fmt=".1f", cmap="YlGnBu", vmin=0, vmax=100, linewidths=0.5, mask=mask, square=True, cbar_kws={"shrink": 0.8}) display_alg_names = [get_display_name(an) for an in alg_names] # Set labels for rows and columns heatmap.set_xticklabels(display_alg_names, rotation=90, fontsize=8) # Adjust font size heatmap.set_yticklabels(display_alg_names, rotation=0, fontsize=8) # Adjust font size # Remove x and y labels heatmap.set_xlabel('') heatmap.set_ylabel('') # Add a label to the color scale cbar = heatmap.collections[0].colorbar # cbar.set_label("Percentage of row wins", fontsize=10) heatmap.set_title(coll_name_to_title(coll_name) + ', percentage of row wins', fontsize=15) file_path = paths.plots() / f'winrate_matrix_{coll_name}.pdf' utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) sns.reset_orig() def plot_stopping_ax(ax: plt.Axes, paths: Paths, tables: ResultsTables, method: str, classification: bool): esr_list = [10, 20, 50, 100, 300, 1000] ax.set_xscale('log') ax.plot([10, 1000], [0.0, 0.0], 'k--') if classification: combinations = [('meta-train-class', 'stopped on classification error', '', 
'tab:blue'), ('meta-train-class', 'stopped on Brier loss', '_val-brier', 'tab:orange'), ('meta-train-class', 'stopped on cross-entropy loss', '_val-ce', 'tab:green')] else: combinations = [('meta-train-reg', 'stopped on RMSE', '', 'tab:blue')] for coll_name, label, suffix, color in combinations: table = tables.get(coll_name, n_cv=1, tag='paper_early_stopping') # print(f'{table.test_table.alg_names=}') rel_alg_name = method + '_esr-1000' # stopped on standard metric rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [method + suffix + f'_esr-{esr}' for esr in esr_list] results_list = [rel_results[alg_name] for alg_name in alg_names] lower_list = [rel_intervals[alg_name][0] for alg_name in alg_names] upper_list = [rel_intervals[alg_name][1] for alg_name in alg_names] ax.plot(esr_list, results_list, '.-', color=color, label=label) ax.fill_between(esr_list, lower_list, upper_list, color=color, alpha=0.3) ax.set_xlabel('Stopping patience') ax.set_xticks(esr_list, labels=[str(esr) for esr in esr_list]) ax.grid(True) def plot_stopping(paths: Paths, tables: ResultsTables, classification: bool): print(f'Generating stopping plot') with plt.rc_context(figsizes.icml2022_full(height_to_width_ratio=0.9)): fig, axs = plt.subplots(1, 3, sharey='all') for i, method in enumerate(['XGB-TD', 'LGBM-TD', 'CatBoost-TD']): ax = axs[i] ax.set_title(method) plot_stopping_ax(ax, paths, tables, method=method, classification=classification) # axs[1].legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3) axs[0].set_ylabel(r'Error increase in \%') fig.legend(*axs[0].get_legend_handles_labels(), loc='upper center', bbox_to_anchor=(0.5, 0.15), ncol=3) task_type_name = 'class' if classification else 'reg' file_path = paths.plots() / f'stopping_{task_type_name}.pdf' plt.tight_layout(rect=[0, 0.15, 1.0, 1.0]) if classification: y_min, y_max = axs[0].get_ylim() y_max = min(y_max, 15) axs[0].set_ylim(y_min, y_max) utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def get_equidistant_blue_colors(n: int): # cmap = plt.get_cmap('viridis') cmap = sns.color_palette("ch:s=.25,rot=-.25", n) # cmap = sns.color_palette("viridis", n) # norm = matplotlib.colors.Normalize(vmin=0, vmax=n - 1) # colors = [cmap(norm(i)) for i in range(n)] colors = [cmap[i] for i in range(n)] return colors def _create_cumul_abl_plot(file_path: Path, benchmark_results: Dict[str, Dict[str, float]], benchmark_intervals: Dict[str, Dict[str, Tuple[float, float]]], alg_names: List[str], colors: List, contribs: List[str], improv_groups: List[str]): n_benchmarks = len(benchmark_results) n_improvements = len(list(benchmark_results.values())[0]) start_color = mcolors.to_rgb('tab:blue') # Color for vanilla MLP end_color = mcolors.to_rgb('tab:green') # Color for final MLP gradient_colors = [mcolors.to_hex(c) for c in np.linspace(start_color, end_color, n_improvements)] start_alpha = 0.3 end_alpha = 0.6 alpha_cumulative_list = np.linspace(start_alpha, end_alpha, n_improvements) start_alpha_improvement = 0.65 end_alpha_improvement = 1. 
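# Note on the fade scheme used below: the bar colors interpolate from tab:blue (vanilla MLP)
# to tab:green (final RealMLP), and the newly added segment of each improvement is drawn with
# a higher alpha (alpha_improvement_list) than its cumulative-baseline part
# (alpha_cumulative_list), so the per-step change stands out against the running total.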
alpha_improvement_list = np.linspace(start_alpha_improvement, end_alpha_improvement, n_improvements) with plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=1.5)): # Plotting fig, axs = plt.subplots(nrows=1, ncols=n_benchmarks, sharey=True) # for i, col in enumerate(df.columns): for i, (col, results) in enumerate(benchmark_results.items()): ax = axs[i] # values = df[col].values # bar_height = 1.0 # bar_positions = np.arange(len(alg_names), dtype=np.float64)[::-1] * bar_height bar_height = 1.0 bar_positions = np.arange(len(alg_names), dtype=np.float64)[::-1] * (bar_height + 0.1) # Handle empty strings in alg_names to create gaps between bars # mask = df.index != '' mask = [alg_name != '' for alg_name in alg_names] non_empty_indices = np.where(mask)[0] non_empty_alg_names = [alg_name for alg_name in alg_names if alg_name != ''] values = [results[alg_name] if alg_name in results else 0.0 for alg_name in non_empty_alg_names] ax.xaxis.grid(True) # Plot only if the method name is not an empty string non_empty_values = values non_empty_bar_positions = bar_positions[non_empty_indices] print(non_empty_bar_positions) intervals = np.array([benchmark_intervals[col][alg_name] if alg_name in results else (0.0, 0.0) for alg_name in non_empty_alg_names]).transpose() rel_intervals = intervals - non_empty_values errors = np.array([-rel_intervals[0], rel_intervals[1]]) # turn them into (absolute) errors for j in range(len(non_empty_values)): value = non_empty_values[j] last_value = value if j == 0 else non_empty_values[j - 1] ax.barh(non_empty_bar_positions[j], min(value, last_value), align='edge', color=gradient_colors[j], alpha=alpha_cumulative_list[j], height=bar_height) if value > last_value: ax.barh(non_empty_bar_positions[j], value - last_value, left=last_value, align='edge', color=gradient_colors[j], alpha=alpha_improvement_list[j], height=bar_height) elif value < last_value: ax.barh(non_empty_bar_positions[j], last_value - value, left=value, align='edge', color=gradient_colors[j], # color='white', edgecolor='red', hatch='/', linewidth=2, # color='red', fill=False, # color='white', edgecolor='tab:green', hatch='//' * 3, facecolor='none', linewidth=0, alpha=alpha_improvement_list[j], height=bar_height) ax.errorbar(non_empty_values, non_empty_bar_positions + 0.5 * bar_height, xerr=errors, fmt='none', color='gray', linewidth=0.8) # Add method names on the y-axis ax.tick_params(left=False) ax.set_yticks(bar_positions + 0.5 * bar_height) # Get the default font size for y-tick labels default_fontsize = plt.rcParams['ytick.labelsize'] font_properties = {'family': 'sans-serif', 'size': default_fontsize + 1} ax.set_yticklabels(alg_names, fontdict=font_properties) # ax.set_xlabel(r'Error increase in \% vs best ($\downarrow$)') ax.set_title(col) # Remove frame around plot ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Set x-axis ticks and gridlines ax.xaxis.set_ticks_position('bottom') # Highlight x=0 tick and corresponding gridline # ax.axvline(x=-0.09, color='black', linewidth=1.5) #FIXME it'd be better but right now it's a bit off compared to the grid lines color_map = {'New': '#ff7f0e', 'Unusual': '#2ca02c', 'default': (0.35, 0.35, 0.35)} colors_contrib = [color_map.get(key, 'black') for key in contribs] for label, color in zip(ax.get_yticklabels(), colors_contrib): label.set_color(color) max_value = max(max(results.values()) for results in benchmark_results.values()) for ax in axs: 
ax.set_xlim(0, max_value * 1.1) # Add some padding # Identify the unique categories and their y-coordinates # unique values with the right order unique_groups = list(dict.fromkeys(improv_groups)) group_indices = {group: [] for group in unique_groups} # Loop over improvements and store the indices for each group for i, group in enumerate(improv_groups): if group in group_indices: group_indices[group].append(i) # Calculate the bracket positions bracket_positions = {} bracket_widths = {} for group, indices_ in group_indices.items(): # add 1 to the indices to take into account the first bar indices = [i + 1 for i in indices_] start_pos = non_empty_bar_positions[min(indices)] - 0.0 end_pos = non_empty_bar_positions[max(indices)] bracket_positions[group] = (start_pos + end_pos) / 2 + bar_height / 2 + 0.0 bracket_widths[group] = (start_pos - end_pos) * 0.9 + bar_height - 0.4 # Call the draw_bracket function for each unique group text_offset = 0.3 # Offset for the text annotation from the bracket # find the very left of the figure in ax[0] coordinates left = -31.5 # TODO for group in unique_groups: y = bracket_positions[group] width = bracket_widths[group] # show text on the left side of the figure axs[0].annotate(group, xy=(left, y), xytext=(left - text_offset, y), ha='right', va='center', color='black', fontsize='small', rotation=90, arrowprops=dict(arrowstyle=f'-[, widthB={width}, lengthB=0.5', lw=1., color='black'), annotation_clip=False, font_properties={'family': 'sans-serif', 'size': default_fontsize - 1}) # make a legend legend_elements = [mpatches.Patch(facecolor=color_map[key], edgecolor='black', label=key) for key in color_map \ if key != "default"] font_properties = {'family': 'sans-serif', 'size': default_fontsize} fig.legend(handles=legend_elements, loc='lower left', bbox_to_anchor=(0.09, 0.00), prop=font_properties) # # Set common labels and adjust layout fig.text(0.65, -0.02, r'Benchmark score improvement (\%) vs. vanilla', ha='center') # fig.suptitle('Method Performance Comparison', y=1.05) # plt.show() # plt.tight_layout() # break the annotations # utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def plot_cumulative_ablations(paths: Paths, tables: ResultsTables, filename: str = None, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, use_geometric_mean: bool = True, shift_eps: float = 1e-2): print(f'Creating cumulative ablations plot') improvements = { 'vanilla': (r'\textbf{Vanilla MLP}', 'default'), 'robust-scale-smooth-clip': ('Robust scale + smooth clip', 'New', "Preprocessing"), 'one-hot-small-cat': ('One-hot for small cat.', 'default', "Preprocessing"), 'no-early-stop': ('No early stopping', 'default', "Hyperparameters"), 'last-best-epoch': ('Last best epoch', 'Unusual', "Hyperparameters"), 'lr-multi-cycle': (r'$\mathrm{coslog}_4$ lr sched', 'Unusual', "Hyperparameters"), 'beta2-0.95': (r'Adam $\beta_2 = 0.95$', 'Unusual', "Hyperparameters"), 'label-smoothing': (r'Label smoothing (class.)', 'Unusual', "Hyperparameters"), 'output-clipping': (r'Output clipping (reg.)', 'Unusual', "Hyperparameters"), 'ntp': (r'NT parametrization', 'Unusual', "Architecture"), 'different-act': (r'Act. fn. SELU / Mish', 'default', "Architecture"), 'param-act': (r'Parametric act. fn.', 'Unusual', "Architecture"), 'front-scale': (r'Scaling layer', 'New', "Architecture"), 'num-emb-pl': (r'Num. 
embeddings: PL', 'default', "Architecture"), 'num-emb-pbld': (r'PL emb.\ $\to$ PBLD emb.', 'New', "Architecture"), 'alt-pdrop-0.15': (r'Dropout $p=0.15$', 'default', "Regularization"), 'alt-pdrop-flat-cos': (r'Dropout sched: $\mathrm{flat\_cos}$', 'New', "Regularization"), 'alt-wd-0.02': (r'Weight decay wd $= 0.02$', 'default', "Regularization"), 'alt-wd-flat-cos': (r'wd sched: $\mathrm{flat\_cos}$', 'New', "Regularization"), 'alt-bias-init-he+5': (r'Bias init: he+5', 'Unusual', "Initialization"), 'alt-weight-init-std': (r'Weight init: data-driven', 'New', "Initialization"), 'final': (r'\textbf{= RealMLP}', "default") } group_labels = {key: value[0] for key, value in improvements.items()} contribs = [value[1] for key, value in improvements.items()] improv_groups = [value[2] for key, value in improvements.items() if len(value) > 2] coll_names = ['meta-train-class', 'meta-train-reg'] benchmark_results = {} benchmark_intervals = {} for coll_name in coll_names: table = tables.get(coll_name, tag='paper_cumulative_ablations_new') rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_relative_score=False, return_percentages=False, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps) alg_names = list(rel_means_dict.keys()) vanilla_alg_names = [alg_name for alg_name in alg_names if 'vanilla' in alg_name] vanilla_alg_results = [rel_means_dict[alg_name] for alg_name in vanilla_alg_names] best_vanilla_alg_name = vanilla_alg_names[np.argmin(vanilla_alg_results)] # get the results again, but now relative to the best vanilla alg, in percent rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, val_metric_name=val_metric_name, test_metric_name=test_metric_name, rel_alg_name=best_vanilla_alg_name, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps) # group different lr values together alg_group_names = [alg_name.split('_')[-2] if len(alg_name.split('_')) >= 2 else '' for alg_name in alg_names] # alg_group_names_unique = list(set(alg_group_names)) rel_means_dict_group = dict() rel_intervals_dict_group = dict() for alg_group_name, display_name in group_labels.items(): # alg names in this group group_alg_names = [an for an, agn in zip(alg_names, alg_group_names) if agn == alg_group_name] if len(group_alg_names) == 0: print(f'No algs for group {alg_group_name}') continue best_alg_name = group_alg_names[np.argmin([rel_means_dict[an] for an in group_alg_names])] print(f'best lr: {best_alg_name.split("_")[-1]} for {alg_group_name}') rel_means_dict_group[alg_group_name] = -rel_means_dict[best_alg_name] low, high = rel_intervals_dict[best_alg_name] rel_intervals_dict_group[alg_group_name] = -high, -low benchmark_results[coll_name] = rel_means_dict_group benchmark_intervals[coll_name] = rel_intervals_dict_group for coll_name in coll_names: for mydict in [benchmark_results, benchmark_intervals]: # copy the last result because we need it twice but we can't have the same dictionary key twice print(f'{list(mydict[coll_name].keys())=}') mydict[coll_name]['final'] = mydict[coll_name]['alt-weight-init-std'] # change keys to descriptions def map_keys(f: Dict, to_be_mapped: Dict): return {f[key]: value for key, value in to_be_mapped.items()} for coll_name in benchmark_results: benchmark_results[coll_name] = map_keys(group_labels, benchmark_results[coll_name]) benchmark_intervals[coll_name] = map_keys(group_labels, benchmark_intervals[coll_name]) 
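# For illustration (hypothetical values): map_keys({'vanilla': 'Vanilla MLP'}, {'vanilla': -3.2})
# returns {'Vanilla MLP': -3.2}, i.e. the short ablation keys are replaced by their display
# labels from group_labels before the results are handed to _create_cumul_abl_plot.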
alg_names = list(benchmark_results['meta-train-class'].keys()) # colors = ['b'] * len(alg_names) colors = get_equidistant_blue_colors(len(list(group_labels.keys()))) # colors = ['tab:blue'] * len(list(group_labels.keys())) if filename is None: filename = f'cumulative_ablations.pdf' file_path = paths.plots() / filename _create_cumul_abl_plot(file_path=file_path, benchmark_results=benchmark_results, benchmark_intervals=benchmark_intervals, alg_names=alg_names, colors=colors, contribs=contribs, improv_groups=improv_groups) def plot_cdd_ax(ax: matplotlib.axes.Axes, paths: Paths, tables: ResultsTables, coll_name: str, alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, tag: Optional[str] = None, use_validation_errors: bool = False): print(f'Creating plot for {coll_name}') table = tables.get(coll_name, n_cv=1, tag=tag or 'paper') simplify_name_fn = get_simplified_name n_splits = 10 task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name, use_validation_errors=use_validation_errors) test_table = test_table.rename_algs(simplify_name_fn) # print(f'{test_table.alg_names=}') # print(f'{filter_alg_names_list=}') test_table = test_table.filter_algs(alg_names) test_table = test_table.filter_n_splits(n_splits) # shape: [n_algs, n_tasks, n_splits] errors = test_table.to_array() errors = np.mean(errors, axis=2) # average over splits # adapted from https://sherbold.github.io/autorank/ data = pd.DataFrame() for i, alg_name in enumerate(test_table.alg_names): data[get_display_name(alg_name)] = errors[i] from autorank import autorank, plot_stats, create_report, latex_table result = autorank(data, alpha=0.05, verbose=False, order='ascending', force_mode='nonparametric') plot_stats(result, ax=ax, allow_insignificant=True) print(create_report(result)) ax.set_title('grinsztajn-class' if coll_name == 'grinsztajn-class-filtered' else coll_name) def plot_cdd(paths: Paths, tables: ResultsTables, coll_names: List[str], alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, filename: Optional[str] = None, tag: Optional[str] = None, use_validation_errors: bool = False): print(f'Plotting critical difference diagram for {coll_names}') old_value = plt.rcParams['text.usetex'] plt.rcParams['text.usetex'] = False # apparently doesn't work with the cdd plot package (autorank) assert len(coll_names) in [1, 2, 4, 6] if len(coll_names) == 1: fig, ax = plt.subplots(1, 1, figsize=(6, 4)) axs_list = [ax] elif len(coll_names) == 2: fig, axs = plt.subplots(1, 2, figsize=(12, 4)) axs_list = [axs[0], axs[1]] elif len(coll_names) == 4: fig, axs = plt.subplots(2, 2, figsize=(12, 8)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] else: fig, axs = plt.subplots(3, 2, figsize=(12, 12)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 0], axs[2, 1]] for coll_name, ax in zip(coll_names, axs_list): plot_cdd_ax(ax=ax, paths=paths, tables=tables, coll_name=coll_name,
alg_names=alg_names, val_metric_name=val_metric_name, test_metric_name=test_metric_name, tag=tag, use_validation_errors=use_validation_errors) name_parts = shorten_coll_names(coll_names) if use_validation_errors: name_parts = ['validation'] + name_parts if filename is None: file_path = paths.plots() / f'cdd_{"_".join(name_parts)}.pdf' else: file_path = paths.plots() / filename utils.ensureDir(file_path) plt.savefig(file_path, bbox_inches='tight') plt.close(fig) plt.rcParams['text.usetex'] = old_value ================================================ FILE: pytabkit/bench/eval/runtimes.py ================================================ from typing import Dict import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.models import utils def get_avg_train_times(paths: Paths, coll_name: str, per_1k_samples: bool = False) -> Dict[str, float]: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) alg_names = [path.name for path in paths.times().iterdir()] result = dict() for alg_name in alg_names: file_paths = [paths.times_alg_task(alg_name, task_desc=task_info.task_desc) / 'times.yaml' for task_info in task_infos] if all(utils.existsFile(file_path) for file_path in file_paths): single_times = [utils.deserialize(file_path, use_yaml=True)['fit_time'] for file_path in file_paths] if per_1k_samples: # use 0.6 since that is the fraction of training samples single_times = [single_time / ((0.6 * task_info.n_samples) / 1000) for single_time, task_info in zip(single_times, task_infos)] mean_time = np.mean(single_times) result[alg_name] = mean_time return result def get_avg_predict_times(paths: Paths, coll_name: str, per_1k_samples: bool = False) -> Dict[str, float]: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) alg_names = [path.name for path in paths.times().iterdir()] result = dict() for alg_name in alg_names: file_paths = [paths.times_alg_task(alg_name, task_desc=task_info.task_desc) / 'times.yaml' for task_info in task_infos] if all(utils.existsFile(file_path) for file_path in file_paths): single_times = [utils.deserialize(file_path, use_yaml=True)['predict_time'] for file_path in file_paths] if per_1k_samples: # use 0.2 since that is the fraction of test samples single_times = [single_time / ((0.2 * task_info.n_samples) / 1000) for single_time, task_info in zip(single_times, task_infos)] mean_time = np.mean(single_times) result[alg_name] = mean_time return result ================================================ FILE: pytabkit/bench/eval/tables.py ================================================ from typing import List, Optional import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.analysis import ResultsTables, get_benchmark_results, get_opt_groups, get_simplified_name, \ get_display_name from pytabkit.bench.eval.evaluation import TaskWeighting, FunctionAlgFilter, MultiResultsTable, DefaultEvalModeSelector from pytabkit.models import utils from pytabkit.models.data.data import TaskType from pytabkit.models.data.nested_dict import NestedDict def _get_table_str(*parts: List[List[str]]): part_rows = [[' & '.join(row) + r' \\' for row in part] for part in parts] n_cols = max(len(row) for part in parts for row in part) begin_table_str = r'\begin{tabular}{' + ('c' * n_cols) + r'}' + '\n' + r'\toprule' end_table_str = r'\bottomrule' + '\n' + r'\end{tabular}' all_row_strs = [begin_table_str] for part
in part_rows[:-1]: all_row_strs.extend(part) all_row_strs.append(r'\midrule') all_row_strs.extend(part_rows[-1]) all_row_strs.append(end_table_str) complete_str = '\n'.join(all_row_strs) return complete_str def generate_ds_table(paths: Paths, task_collection: TaskCollection, include_openml_ids: bool = False): print(f'Generating dataset table for {task_collection.coll_name}') task_infos = task_collection.load_infos(paths) task_infos.sort(key=lambda ti: ti.task_desc.task_name) file_path = paths.plots() / f'datasets_{task_collection.coll_name}.tex' is_classification = any(ti.task_type == TaskType.CLASSIFICATION for ti in task_infos) # columns to include: name, n_samples, n_numerical, n_categorical, largest_category, openml id, # (link), (subsampled), (n_classes), (citation), (weight) table_rows = [['Name', r'\#samples', r'\#num.\ features', r'\#cat.\ features', r'largest \#categories']] if is_classification: table_rows[0].append(r'\#classes') if include_openml_ids: table_rows[0].append('OpenML task ID') for task_info in task_infos: row = [] row.append(task_info.task_desc.task_name.replace('_', r'\_')) row.append(str(task_info.n_samples)) row.append(str(task_info.tensor_infos['x_cont'].get_n_features())) n_cat = task_info.tensor_infos['x_cat'].get_n_features() row.append(str(n_cat)) # subtract 1 for the category that encodes missing values row.append(str(task_info.tensor_infos['x_cat'].get_cat_sizes().max().item() - 1) if n_cat > 0 else '') if is_classification: row.append(str(task_info.tensor_infos['y'].get_cat_size_product())) if include_openml_ids: row.append(str(task_info.more_info_dict.get('openml_task_id', ''))) table_rows.append(row) begin_table_str = r'\begin{tabular}{' + ('c' * len(table_rows[0])) + r'}' + '\n' + r'\toprule' row_strs = [' & '.join(row) + r' \\' for row in table_rows] end_table_str = r'\bottomrule' + '\n' + r'\end{tabular}' all_row_strs = [begin_table_str, row_strs[0], r'\midrule'] + row_strs[1:] + [end_table_str] complete_str = '\n'.join(all_row_strs) utils.writeToFile(file_path, complete_str) def generate_collections_table(paths: Paths): print(f'Creating collections table') coll_display_names = {'meta-train-class': r'$\mathcal{B}^{\operatorname{train}}_{\mathrm{class}}$', 'meta-test-class': r'$\mathcal{B}^{\operatorname{test}}_{\mathrm{class}}$', 'grinsztajn-class-filtered': r'$\mathcal{B}^{\operatorname{Grinsztajn}}_{\mathrm{class}}$', 'meta-train-reg': r'$\mathcal{B}^{\operatorname{train}}_{\mathrm{reg}}$', 'meta-test-reg': r'$\mathcal{B}^{\operatorname{test}}_{\mathrm{reg}}$', 'grinsztajn-reg': r'$\mathcal{B}^{\operatorname{Grinsztajn}}_{\mathrm{reg}}$'} coll_names = list(coll_display_names.keys()) # todo: number of distinct data sets rows = [r'\#datasets', r'\#dataset groups', r'min \#samples', r'max \#samples', r'max \#classes', r'max \#features', r'max \#categories'] table_columns = {'': rows} for coll_name in coll_names: task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_infos.sort(key=lambda ti: ti.task_desc.task_name) is_classification = any(ti.task_type == TaskType.CLASSIFICATION for ti in task_infos) n_samples_list = [] n_features_list = [] max_cat_size_list = [] n_classes_list = [] for task_info in task_infos: n_samples_list.append(task_info.n_samples) n_features_list.append(task_info.tensor_infos['x_cont'].get_n_features() + task_info.tensor_infos['x_cat'].get_n_features()) n_cat = task_info.tensor_infos['x_cat'].get_n_features() # subtract 1 for the category that encodes missing values max_cat_size_list.append(
task_info.tensor_infos['x_cat'].get_cat_sizes().max().item() - 1 if n_cat > 0 else 0) if is_classification: n_classes_list.append(task_info.tensor_infos['y'].get_cat_size_product()) else: n_classes_list.append(0) separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] if coll_name.startswith('meta-train'): n_dataset_groups = TaskWeighting(task_infos, separate_task_names).get_n_groups() else: n_dataset_groups = len(task_infos) table_columns[coll_display_names[coll_name]] = \ [str(len(task_infos)), str(n_dataset_groups), str(min(n_samples_list)), str(max(n_samples_list)), str(max(n_classes_list)), str(max(n_features_list)), str(max(max_cat_size_list))] keys = list(table_columns.keys()) n_info_rows = len(table_columns[keys[0]]) table_rows = [keys] + [[table_columns[key][i] for key in keys] for i in range(n_info_rows)] begin_table_str = r'\begin{tabular}{' + ('c' * len(table_rows[0])) + r'}' + '\n' + r'\toprule' row_strs = [' & '.join(row) + r' \\' for row in table_rows] end_table_str = r'\bottomrule' + '\n' + r'\end{tabular}' all_row_strs = [begin_table_str, row_strs[0], r'\midrule'] + row_strs[1:] + [end_table_str] complete_str = '\n'.join(all_row_strs) file_path = paths.plots() / f'collections_summary.tex' utils.writeToFile(file_path, complete_str) def generate_individual_results_table(paths: Paths, tables: ResultsTables, filename: str, coll_name: str, alg_names: List[str], test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None): table = tables.get(coll_name) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=test_metric_name, val_metric_name=val_metric_name, return_percentages=False, use_task_mean=False, use_geometric_mean=False) alg_names = [an for an in alg_names if an in means] table_head = [['Dataset'] + [get_display_name(an) for an in alg_names]] table_body = [] enumerated_task_infos = list(enumerate(table.test_table.task_infos)) enumerated_task_infos.sort(key=lambda tup: tup[1].task_desc.task_name.lower()) print(f'{coll_name=}') print(f'{list(means.keys())=}') for task_idx, task_info in enumerated_task_infos: row_scores = [means[alg_name][task_idx] for alg_name in alg_names] row_errs = [means[alg_name][task_idx] - intervals[alg_name][0][task_idx] for alg_name in alg_names] min_row_score = np.min(row_scores) is_best_list = [score == min_row_score for score in row_scores] is_significant_list = [score <= min_row_score + stderr for score, stderr in zip(row_scores, row_errs)] row_strs = [] for is_best, is_significant, row_score, row_err in zip(is_best_list, is_significant_list, row_scores, row_errs): cur_str = f'{row_score:4.3f}' if is_best: cur_str = r'\textbf{' + cur_str + r'}' elif is_significant: cur_str = r'\underline{' + cur_str + r'}' cur_str = cur_str + r'$\pm$' + f'{row_err:4.3f}' row_strs.append(cur_str) table_body.append([task_info.task_desc.task_name] + row_strs) # escape underscores for latex table_head = [[val.replace('_', r'\_') for val in row] for row in table_head] table_body = [[val.replace('_', r'\_') for val in row] for row in table_body] table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / filename utils.writeToFile(file_path, table_str) def generate_ablations_table(paths: Paths, tables: ResultsTables): print(f'Generating ablations table') # load results from the right tag (maybe with MLP-best-ablation) # problem: relative model should be the best one of the defaults (with best lr) # group by and optimize lrfactor 
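# Sketch of the grouping logic below (the name format is inferred from the parsing code):
# result names look like 'RealMLP-TD-<...>_<ablation-group>_lrfactor-<f>'; splitting off the
# trailing 'lrfactor-<f>' part groups runs by ablation, and within each group the lr factor
# with the lowest benchmark score is selected and reported next to the error increase.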
coll_names = ['meta-train-class', 'meta-train-reg'] # all_group_names = dict() # all_best_lrfactors = dict() abl_names = [ (r'MLP-TD (without ablation)', 'default'), # (r'MLP-TD (fixed lr factor = 1.0)', 'default_lrfactor-1.0'), ('', ''), (r'Num.\ embeddings: PL', 'num-embeddings-pl'), (r'Num.\ embeddings: PLR', 'num-embeddings-plr'), (r'Num.\ embeddings: None', 'num-embeddings-none'), ('', ''), (r'Adam $\beta_2=0.999$ instead of $\beta_2=0.95$', 'beta2-0.999'), ('', ''), ('Learning rate schedule = cosine decay', 'lr-cos-decay'), ('Learning rate schedule = constant', 'lr-constant'), ('', ''), ('No label smoothing', 'no-label-smoothing'), ('', ''), (r'No learnable scaling', 'no-front-scale'), ('', ''), ('Non-parametric activation', 'non-parametric-act'), ('', ''), (r'Activation=Mish', 'act-mish'), (r'Activation=ReLU', 'act-relu'), (r'Activation=SELU', 'act-selu'), ('', ''), ('No dropout', 'pdrop-0.0'), (r'Dropout prob.\ $0.15$ (constant)', 'pdrop-0.15'), ('', ''), ('No weight decay', 'wd-0.0'), # ('Weight decay = 0.02 ($\operatorname{flat\_cos}$)', 'wd-0.02-flatcos'), ('Weight decay = 0.02 (constant)', 'wd-0.02'), ('', ''), (r'Standard param + no weight decay', 'standard-param_no-wd'), ('', ''), ('No data-dependent init', 'normal-init'), ('', ''), ('First best epoch instead of last best', 'first-best-epoch'), ('', ''), ('Only one-hot encoding', 'no-cat-embs'), # ('First best epoch (fixed lr factor = 0.5)', 'first-best-epoch_lrfactor-0.5'), ] results_dict = NestedDict() # index by [short_group_name][coll_name][property] # possible properties: 'score', 'lower', 'upper', 'best_lr_factor', for coll_name in coll_names: table = tables.get(coll_name, n_cv=1, tag='paper_mlp_ablations') results, _ = get_benchmark_results(paths, table=table, coll_name=coll_name, use_relative_score=False, return_percentages=False, simplify_name_fn=lambda x: x.replace(' [bag-1]', '')) default_keys = [key for key in results if 'default' in key] # print(f'{default_keys=}') default_scores = [results[key] for key in default_keys] best_key = default_keys[np.argmin(default_scores)] rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=best_key, simplify_name_fn=lambda x: x.replace(' [bag-1]', '')) keys = list(key for key in rel_results.keys() if key.startswith('RealMLP-TD-')) # keys = list(rel_results.keys()) group_names = list(set([key.split('lrfactor-')[0] for key in keys])) # all_group_names[coll_name] = group_names for group_name in group_names: # remove the 'MLP-TD-reg-ablation_' and last '_' short_group_name = r'_'.join(group_name.split('_')[1:-1]) group_keys = [key for key in keys if key.startswith(group_name)] group_results = [rel_results[key] for key in group_keys] best_key = group_keys[np.argmin(group_results)] # print(f'{best_key=}') results_dict[short_group_name, coll_name, 'best_lr_factor'] = best_key.split('lrfactor-')[1] results_dict[short_group_name, coll_name, 'score'] = rel_results[best_key] best_interval = rel_intervals[best_key] results_dict[short_group_name, coll_name, 'lower'] = best_interval[0] results_dict[short_group_name, coll_name, 'upper'] = best_interval[1] for key in keys: # also add non-optimized versions to the table # add default with default lr # short_group_name = 'default_lrfactor-1.0' # key = [key for key in rel_results.keys() if key.endswith('default_lrfactor-1.0')][0] short_group_name = '_'.join(key.split('_')[1:]) results_dict[short_group_name, coll_name, 'best_lr_factor'] = '' results_dict[short_group_name, coll_name, 'score'] = 
rel_results[key] best_interval = rel_intervals[key] results_dict[short_group_name, coll_name, 'lower'] = best_interval[0] results_dict[short_group_name, coll_name, 'upper'] = best_interval[1] # all_best_lrfactors[coll_name] = best_lrfactors table_head = [[''] + [r'\multicolumn{2}{c}{' + coll_name + r'}' for coll_name in coll_names], ['Ablation'] + [r'Error increase in \%', 'best lr factor'] * len(coll_names)] # all_group_names = sorted(list(results_dict.get_dict().keys())) table_body = [] # for group_name in all_group_names: for label, short_group_name in abl_names: # short_group_name = r'\_'.join(group_name.split('_')[1:-1]) row = [label] for coll_name in coll_names: if (short_group_name, coll_name, 'best_lr_factor') in results_dict: results = results_dict[short_group_name, coll_name] score = results['score'] lower = results['lower'] upper = results['upper'] row.append(f'{score:2.1f} [{lower:2.1f}, {upper:2.1f}]') row.append(results['best_lr_factor']) else: row.append('') row.append('') table_body.append(row) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / 'ablations.tex' utils.writeToFile(file_path, table_str) def generate_refit_table(paths: Paths, tables: ResultsTables, alg_family: str): print(f'Generating refit table for {alg_family}') coll_names = ['meta-train-class', 'meta-test-class', 'meta-train-reg', 'meta-test-reg'] table_head = [['', r'\multicolumn{4}{c}{Error \textbf{reduction} relative to 1 fold in \%}'], ['Method'] + coll_names] methods_labels_names = [ (f' (bagging, 1 model, indiv. stopping)', f'_mean-cv-False_mean-refit-False [bag-1]'), (f' (bagging, 1 model, joint stopping)', f'_mean-cv-True_mean-refit-True [bag-1]'), (f' (bagging, 5 models, indiv. stopping)', f'_mean-cv-False_mean-refit-False [bag-5]'), (f' (bagging, 5 models, joint stopping)', f'_mean-cv-True_mean-refit-True [bag-5]'), (f' (refitting, 1 model, indiv. stopping)', f'_mean-cv-False_mean-refit-False [ens-1]'), (f' (refitting, 1 model, joint stopping)', f'_mean-cv-True_mean-refit-True [ens-1]'), (f' (refitting, 5 models, indiv. 
stopping)', f'_mean-cv-False_mean-refit-False [ens-5]'), (f' (refitting, 5 models, joint stopping)', f'_mean-cv-True_mean-refit-True [ens-5]') ] labels = [f'{alg_family}-TD{label_suffix}' for label_suffix, _ in methods_labels_names] table_body_columns = [labels[0:]] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=5, tag='paper') # print(f'{table.test_table.alg_names=}') task_type_name = 'class' if 'class' in coll_name else 'reg' rel_alg_name = f'{alg_family}-TD-{task_type_name}_mean-cv-False_mean-refit-False [bag-1]' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name, simplify_name_fn=lambda x: x) alg_names = [f'{alg_family}-TD-{task_type_name}{suffix}' for _, suffix in methods_labels_names] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names[0:]: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) not_significantly_worse = (np.min(results_list) >= lower) result_str = f'{-result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' elif not_significantly_worse: result_str = r'\underline{' + result_str + r'}' column.append(result_str + f' [{-upper:2.1f}, {-lower:2.1f}]') table_body_columns.append(column) table_body = utils.shift_dim_nested(table_body_columns, 0, 1) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / f'refit_table_{alg_family}.tex' utils.writeToFile(file_path, table_str) def generate_preprocessing_table(paths: Paths, tables: ResultsTables): print(f'Generating preprocessing table') coll_names = ['meta-train-class', 'meta-train-reg'] table_head = [['', r'\multicolumn{2}{c}{Error \textbf{increase} relative to robust scale + smooth clip in \%}'], ['Method'] + coll_names] methods_labels_names = [ (r'Robust scale + smooth clip', f'RealMLP-TD-S_tfms-mc-rs-sc-oh'), (r'Robust scale', f'RealMLP-TD-S_tfms-mc-rs-oh'), (r'Standardize + smooth clip', f'RealMLP-TD-S_tfms-std-sc-oh'), (r'Standardize', f'RealMLP-TD-S_tfms-std-oh'), (r'Quantile transform (output dist.\ = normal)', f'RealMLP-TD-S_tfms-quantile-oh'), (r'Quantile transform (RTDL version)', f'RealMLP-TD-S_tfms-quantiletabr-oh'), (r'KDI transform ($\alpha = 1$, output dist.\ = normal)', f'RealMLP-TD-S_tfms-kdi1-oh'), ] labels = [label for label, _ in methods_labels_names] table_body_columns = [labels] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=1, tag='paper_preprocessing') # print(f'{table.test_table.alg_names=}') rel_alg_name = f'RealMLP-TD-S_tfms-mc-rs-sc-oh' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [alg_name for _, alg_name in methods_labels_names] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) not_significantly_worse = (np.min(results_list) >= lower) result_str = f'{result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' elif not_significantly_worse: result_str = r'\underline{' + result_str + r'}' column.append(result_str + f' [{lower:2.1f}, {upper:2.1f}]') table_body_columns.append(column) table_body = utils.shift_dim_nested(table_body_columns, 0, 1) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / f'preprocessing_ablation.tex' utils.writeToFile(file_path, table_str) def 
generate_stopping_table(paths: Paths, tables: ResultsTables): print(f'Generating stopping table') coll_names = ['meta-train-class', 'meta-train-reg'] table_head = [['', r'\multicolumn{2}{c}{Error \textbf{increase} relative to no early stopping in \%}'], ['Method'] + coll_names] table_body = [] for i, method in enumerate(['XGB-TD', 'LGBM-TD', 'CatBoost-TD']): esr_list = [1000, 300, 100, 50, 20, 10] labels = [method + f' (patience = {esr})' for esr in esr_list] table_body_columns = [labels] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=1, tag='paper_early_stopping') # print(f'{table.test_table.alg_names=}') rel_alg_name = method + '_esr-1000' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [method + f'_esr-{esr}' for esr in esr_list] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) result_str = f'{result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' column.append(result_str + f' [{lower:2.1f}, {upper:2.1f}]') table_body_columns.append(column) new_rows = utils.shift_dim_nested(table_body_columns, 0, 1) if i > 0: new_rows[0][0] = r'\midrule' + '\n' + new_rows[0][0] table_body.extend(new_rows) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / f'early_stopping_table.tex' utils.writeToFile(file_path, table_str) def generate_architecture_table(paths: Paths, tables: ResultsTables): print(f'Generating architecture table') coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg'] table_head = [['', r'\multicolumn{4}{c}{Error \textbf{reduction} relative to MLP-D in \%}'], ['Method'] + coll_names] methods_labels_names = [ (r'MLP-D', f'MLP-RTDL-D'), (r'MLP-D (RS+SC)', f'MLP-RTDL-D_rssc'), (r'MLP-D (RS+SC, no wd, meta-tuned lr)', f'MLP-RTDL-reprod'), (r'MLP-D (RS+SC, no wd, meta-tuned lr, PL embeddings)', f'MLP-RTDL-reprod-pl'), (r'MLP-D (RS+SC, no wd, meta-tuned lr, RealMLP architecture)', f'MLP-RTDL-reprod-RealMLP-arch'), (r'RealMLP-TD-S', f'RealMLP-TD-S'), (r'RealMLP-TD', f'RealMLP-TD'), (r'TabR-S-D', f'TabR-S-D'), (r'TabR-S-D (RS+SC)', f'TabR-S-D_rssc'), (r'ResNet-D', f'ResNet-RTDL-D'), (r'ResNet-D (RS+SC)', f'ResNet-RTDL-D_rssc'), ] labels = [label for label, _ in methods_labels_names] table_body_columns = [labels] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=1, tag='paper') # print(f'{table.test_table.alg_names=}') rel_alg_name = f'MLP-RTDL-D' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [alg_name for _, alg_name in methods_labels_names] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) not_significantly_worse = (np.min(results_list) >= lower) # flip sign result = -result lower, upper = -upper, -lower result_str = f'{result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' elif not_significantly_worse: result_str = r'\underline{' + result_str + r'}' column.append(result_str + f' [{lower:2.1f}, {upper:2.1f}]') table_body_columns.append(column) table_body = utils.shift_dim_nested(table_body_columns, 0, 1) table_str = _get_table_str(table_head, table_body) table_str = 
table_str.replace('ccccc', 'lcccc') # make first column left-aligned file_path = paths.plots() / f'arch_and_preprocessing.tex' utils.writeToFile(file_path, table_str) ================================================ FILE: pytabkit/bench/run/__init__.py ================================================ ================================================ FILE: pytabkit/bench/run/results.py ================================================ from pathlib import Path from typing import Dict, List import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskInfo from pytabkit.models import utils class ResultManager: """ Stores experimental results and can save and load them. """ def __init__(self): # indexing convention: # self.metrics_dict['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name] = float self.metrics_dict = {} # indexed by ['cv'/'refit'], then for example fields like ['y_preds'], ['fit_params'] # or ['sub_info'] for hyperopt sub-results self.other_dict = {} # should be a numpy array of shape [n_models, n_samples, output_dim] self.y_preds_cv = None self.y_preds_refit = None def add_results(self, is_cv: bool, results_dict: Dict) -> None: """ Add a dictionary of results. :param is_cv: Whether these results are from cross-validation (True) or refitting (False). :param results_dict: Dictionary of results """ cv_str = 'cv' if is_cv else 'refit' if cv_str not in self.metrics_dict: self.metrics_dict[cv_str] = {} if cv_str not in self.other_dict: self.other_dict[cv_str] = {} for key, value in results_dict.items(): if key == 'metrics': self.metrics_dict[cv_str] = value elif key == 'y_preds': if is_cv: self.y_preds_cv = value else: self.y_preds_refit = value else: self.other_dict[cv_str][key] = value def save(self, path: Path) -> None: utils.serialize(path / 'metrics.yaml', self.metrics_dict, use_yaml=True) # random search hpo often generates numpy datatype scalars, but these cannot be saved by msgpack, # so we convert them other_dict = utils.numpy_to_native_rec(self.other_dict) utils.serialize(path / 'other.msgpack.gz', other_dict, use_msgpack=True, compressed=True) # also save as yaml for readability utils.serialize(path / 'other.yaml', other_dict, use_yaml=True) if self.y_preds_cv is not None: np.savez_compressed(path / 'y_preds_cv.npz', y_preds=self.y_preds_cv) if self.y_preds_refit is not None: np.savez_compressed(path / 'y_preds_refit.npz', y_preds=self.y_preds_refit) @staticmethod def load(path: Path, load_other: bool = True, load_preds: bool = True): """ Load results. :param path: Data path. :param load_other: If True, load other_dict. :param load_preds: If True, load the model predictions. 
:return: The loaded ResultManager. """ rm = ResultManager() rm.metrics_dict = utils.deserialize(path / 'metrics.yaml', use_yaml=True) if load_other: rm.other_dict = utils.deserialize(path / 'other.msgpack.gz', use_msgpack=True, compressed=True) for mode in ['cv', 'refit']: if mode in rm.other_dict and 'y_preds' in rm.other_dict[mode]: # other_dict was created by old code and still contains y_preds if mode == 'cv': rm.y_preds_cv = rm.other_dict[mode]['y_preds'] else: rm.y_preds_refit = rm.other_dict[mode]['y_preds'] if load_preds: if utils.existsFile(path / 'y_preds_cv.npz'): rm.y_preds_cv = np.load(path / 'y_preds_cv.npz')['y_preds'] if utils.existsFile(path / 'y_preds_refit.npz'): rm.y_preds_refit = np.load(path / 'y_preds_refit.npz')['y_preds'] return rm def save_summaries(paths: Paths, task_infos: List[TaskInfo], alg_name: str, n_cv: int, rerun=False) -> None: """ Compress the results into result_summaries that can be loaded faster for evaluation. :param paths: Path configuration. :param task_infos: Task infos of tasks that should be summarized. :param alg_name: Name of the method whose results should be summarized. :param n_cv: Number of cross-validation splits for which the results should be summarized. :param rerun: Whether to re-compute the summaries even if summaries are already present. """ for task_info in task_infos: task_desc = task_info.task_desc src_path = paths.results_alg_task(task_desc, alg_name, n_cv) dest_path = paths.summary_alg_task(task_desc, alg_name, n_cv) if not rerun and utils.existsDir(dest_path): continue # indexed by [split_type][split_idx]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_index)][metric_name] metrics_dict = {} for split_type_path in src_path.iterdir(): split_type = split_type_path.name split_id_metrics_list = [] split_id = 0 while True: split_id_path = split_type_path / str(split_id) if not utils.existsDir(split_id_path): break rm = ResultManager.load(split_id_path, load_other=False, load_preds=False) split_id_metrics_list.append(rm.metrics_dict) split_id += 1 if split_id >= 1: # there exists a split metrics_dict[split_type] = split_id_metrics_list if len(metrics_dict) > 0: # shift split_idx dimension to the end results_dict = utils.shift_dim_nested(metrics_dict, 1, 6) # print(f'{results_dict=}') # results_dict[split_type]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name][split_idx] utils.serialize(dest_path / 'metrics.msgpack.gz', results_dict, use_msgpack=True, compressed=True) ================================================ FILE: pytabkit/bench/run/task_execution.py ================================================ import shutil import traceback from typing import List, Optional import numpy as np from pytabkit.bench.alg_wrappers.general import AlgWrapper from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskPackage, TaskInfo from pytabkit.bench.run.results import save_summaries, ResultManager from pytabkit.bench.scheduling.schedulers import BaseJobScheduler from pytabkit.models import utils from pytabkit.models.training.logging import StdoutLogger import glob import math from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.training.metrics import Metrics class TabBenchJob(AbstractJob): """ Internal helper class implementing AbstractJob for running tabular benchmarking jobs with our scheduling
code. """ def __init__(self, alg_name: str, alg_wrapper: AlgWrapper, task_package: TaskPackage, paths: Paths, metrics: Optional[Metrics] = None): """ :param alg_name: Unique name of the method (for saving results). :param alg_wrapper: Wrapper implementing the ML method. :param task_package: Task package containing information on dataset and splits. :param paths: Data path configuration. :param metrics: Optional Metrics object that specifies which metrics should be evaluated. """ self.alg_name = alg_name self.alg_wrapper = alg_wrapper self.task_package = task_package self.paths = paths self.metrics = metrics def get_group(self) -> str: """ :return: Group name, in this case just the name of the AlgWrapper class. """ return self.alg_wrapper.__class__.__name__ def __call__(self, assigned_resources: NodeResources) -> bool: """ Run the experiment with the given resources. :param assigned_resources: Assigned resources. :return: True if the job ran normally; False if it completed more quickly because results were partially already saved. """ task_desc = self.task_package.task_info.task_desc print(f'Running {self.alg_name} on {len(self.task_package.split_infos)} splits of dataset {task_desc} ' f'with {assigned_resources.get_n_threads()} threads', flush=True) logger = StdoutLogger() # check whether any data directories exist, i.e. whether data is already available dirs_exist = [utils.existsDir(self.paths.results_alg_task_split(task_desc, self.alg_name, self.task_package.n_cv, split_info.split_type, split_info.id)) for split_info in self.task_package.split_infos] # check whether the run is a normal run which does not have unusually short runtime due to pre-computed data finished_normally = self.task_package.rerun or not any(dirs_exist) # create tmp_folders for saving temporary data in case the run is interrupted and needs to be restarted tmp_folders = [self.paths.results_alg_task_split(task_desc, alg_name=self.task_package.alg_name, n_cv=self.task_package.n_cv, split_type=split_info.split_type, split_id=split_info.id) / 'tmp' for split_info in self.task_package.split_infos] result_managers_dict = self.alg_wrapper.run(self.task_package, logger, assigned_resources, tmp_folders, self.metrics) for alg_name_suffix, result_managers in result_managers_dict.items(): for rm, split_info in zip(result_managers, self.task_package.split_infos): rm.save(self.paths.results_alg_task_split(task_desc, self.alg_name + alg_name_suffix, self.task_package.n_cv, split_info.split_type, split_info.id)) # delete tmp_folders to save disk space for tmp_folder in tmp_folders: if utils.existsDir(tmp_folder): shutil.rmtree(tmp_folder) print(f'Finished running {self.alg_name} on {len(self.task_package.split_infos)} splits of dataset {task_desc}', flush=True) return finished_normally def get_required_resources(self) -> RequiredResources: return self.alg_wrapper.get_required_resources(self.task_package) def get_desc(self) -> str: split_ids = [split_info.id for split_info in self.task_package.split_infos] split_str = f'splits {sorted(split_ids)}' if len(split_ids) == 1: split_str = f'split {split_ids[0]}' elif all([split_id == split_ids[0] + i for i, split_id in enumerate(split_ids)]): # we have a range split_str = f'splits {split_ids[0]}-{split_ids[-1]}' return self.alg_name + f' on {split_str} of task {self.task_package.task_info.task_desc}' class RunConfig: """ This class stores some benchmark settings that a method can be run with.
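    Example (an illustrative configuration, not a recommended default):

        config = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False)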
""" def __init__(self, n_tt_splits: int, n_cv: int = 1, n_refit: int = 0, use_default_split: bool = False, trainval_fraction: float = 0.8, train_fraction: float = 0.75, save_y_pred: bool = False, min_split_idx: int = 0, metrics: Optional[Metrics] = None): """ :param n_tt_splits: Number of trainval-test-splits to evaluate the method with. :param n_cv: Number of cross-validation folds. If n_cv=1, use a single random split. :param n_refit: Number of models that should be refitted (and ensembled) on the training and validation set. :param use_default_split: Whether the default split of the datasets should be used. :param trainval_fraction: Fraction in (0, 1) of the data that should be used for training and validation set. The rest will be used for the test set. :param train_fraction: Only used if n_cv=1. In this case, out of the training+validation data, the given fraction of the data is used for training. :param save_y_pred: Whether the predictions on the whole dataset should be saved (can use a considerable amount of disk storage, e.g. 3 GB for running a single method on meta-train and meta-test benchmarks). :param min_split_idx: Minimum index of the split that should be used. Can be set larger than zero if only a sub-range of the splits should be run. :param metrics: Metrics object that specifies which metrics should be evaluated. """ self.n_tt_splits = n_tt_splits self.n_cv = n_cv self.n_refit = n_refit self.use_default_split = use_default_split self.trainval_fraction = trainval_fraction self.train_fraction = train_fraction self.save_y_pred = save_y_pred self.min_split_idx = min_split_idx self.metrics = metrics class TabBenchJobManager: """ This class can be used to add and run jobs for tabular benchmarks. """ def __init__(self, paths: Paths): """ :param paths: Data path configuration. """ self.paths = paths self.jobs = [] self.save_args = [] def add_jobs(self, task_infos: List[TaskInfo], run_config: RunConfig, alg_name: str, alg_wrapper: AlgWrapper, tags: Optional[List[str]] = None, rerun: bool = False) -> None: """ Add jobs for the given method with the given run configuration on all task infos where results are not already available (except if rerun=True). Will also store the algorithm configuration and copy the current source files to the corresponding algorithm folder. :param task_infos: List of TaskInfo objects representing the datasets on which the method should be run. :param run_config: Run configuration. :param alg_name: Name of the method, should be unique (is used for storing and printing the results) :param alg_wrapper: Wrapper implementing the ML method. :param tags: List of tags associated to the method (can be used for selecting a subset of methods later). :param rerun: If True, run all combinations even if there are already computed results stored for it. (For large reruns, we rather recommend renaming the old method with rename_alg.py and then running the jobs again with the new name and rerun=False. This avoids problems if the rerun crashes and preserves the old results for comparison.) 
""" # todo: update after updating project structure if tags is None: tags = ['default'] dummy_task_package = TaskPackage(task_infos[0], split_infos=task_infos[0].get_random_splits(run_config.n_tt_splits, trainval_fraction=run_config.trainval_fraction, train_fraction=run_config.train_fraction)[ 0:1], n_cv=run_config.n_cv, n_refit=run_config.n_refit, paths=self.paths, rerun=rerun, alg_name=alg_name, save_y_pred=run_config.save_y_pred) # possible versions of the same alg that are generated alg_suffixes = alg_wrapper.get_pred_param_names(dummy_task_package) task_packages = [] for task_info in task_infos: if run_config.use_default_split: tt_split_infos = task_info.get_default_splits(run_config.n_tt_splits) else: tt_split_infos = task_info.get_random_splits(run_config.n_tt_splits, trainval_fraction=run_config.trainval_fraction, train_fraction=run_config.train_fraction) tt_split_infos = tt_split_infos[run_config.min_split_idx:] if not rerun: # filter out splits where results have already been computed tt_split_infos = [split_info for split_info in tt_split_infos if not all(utils.existsFile( self.paths.results_alg_task_split(task_info.task_desc, alg_name + suffix, run_config.n_cv, split_info.split_type, split_info.id) / 'metrics.yaml') for suffix in alg_suffixes)] n_tt_splits = len(tt_split_infos) if n_tt_splits == 0: continue max_n_vectorized = alg_wrapper.get_max_n_vectorized(task_info) n_splits_per_package = min(n_tt_splits, max(1, max_n_vectorized // max(run_config.n_cv, run_config.n_refit))) n_packages_per_task = math.ceil(n_tt_splits / n_splits_per_package) # distribute load more evenly across packages # (e.g. have split sizes (4, 4, 4) instead of (5, 5, 2) for n_tt_splits=12) n_splits_per_package = math.ceil(n_tt_splits / n_packages_per_task) batch_idxs = [n_splits_per_package * i for i in range((n_tt_splits - 1) // n_splits_per_package + 1)] \ + [n_tt_splits] for start, stop in zip(batch_idxs[:-1], batch_idxs[1:]): task_packages.append(TaskPackage(task_info, split_infos=tt_split_infos[start:stop], n_cv=run_config.n_cv, n_refit=run_config.n_refit, paths=self.paths, rerun=rerun, alg_name=alg_name, save_y_pred=run_config.save_y_pred)) for tp in task_packages: self.jobs.append(TabBenchJob(alg_name=alg_name, alg_wrapper=alg_wrapper, task_package=tp, paths=self.paths, metrics=run_config.metrics)) if len(task_packages) > 0: for suffix in alg_suffixes: full_alg_name = alg_name + suffix # store alg info because something is actually being run # todo: this might not work on Windows # copy python files py_files = glob.glob('scripts/*.py') + glob.glob('pytabkit/**/*.py', recursive=True) utils.serialize(self.paths.algs() / full_alg_name / 'wrapper.pkl', alg_wrapper) extended_config = utils.join_dicts(alg_wrapper.config, {'alg_name': alg_name, 'pred_params_name': suffix, 'wrapper_class_name': alg_wrapper.__class__.__name__}) utils.serialize(self.paths.algs() / full_alg_name / 'extended_config.yaml', extended_config, use_yaml=True) utils.serialize(self.paths.algs() / full_alg_name / 'tags.yaml', tags, use_yaml=True) for py_file in py_files: utils.copyFile(py_file, self.paths.algs() / full_alg_name / 'src' / py_file) for suffix in alg_suffixes: rerun_summary = True # always create the summary since a part of the results might have changed. self.save_args.append((self.paths, task_infos, alg_name + suffix, run_config.n_cv, rerun_summary)) def run_jobs(self, scheduler: BaseJobScheduler) -> None: """ Runs the added jobs with the given scheduler. 
After all jobs are done, creates the result summaries for faster loading of results. :param scheduler: Scheduler for running the jobs. """ print(f'Starting scheduler') scheduler.add_jobs(self.jobs) scheduler.run() for args in self.save_args: try: save_summaries(*args) except Exception as e: traceback.print_exc() def run_alg_selection(paths: Paths, config: RunConfig, task_infos: List[TaskInfo], target_alg_name: str, alg_names: List[str], val_metric_name: str, tags: List[str] = ['paper'], rerun: bool = False): n_cv = config.n_cv split_type = SplitType.DEFAULT if config.use_default_split else SplitType.RANDOM assert len(alg_names) > 0 assert config.n_refit == 0 # not implemented otherwise for task_info in task_infos: task_desc = task_info.task_desc for split_id in range(config.n_tt_splits): target_path = paths.results_alg_task_split(task_desc, target_alg_name, n_cv, split_type, split_id) if utils.existsFile(target_path / 'metrics.yaml') and not rerun: continue print(f'Running algorithm selection for {target_alg_name} on split {split_id} of task {task_desc}') best_alg_name = None best_val_score = np.inf best_alg_idx = None # find best alg for i, alg_name in enumerate(alg_names): rm = ResultManager.load(paths.results_alg_task_split(task_desc, alg_name, n_cv, split_type, split_id), load_other=False, load_preds=False) # todo: probably shouldn't use i in both loops val_score = np.mean([rm.metrics_dict['cv']['val']['1'][str(j)][val_metric_name] for j in range(n_cv)]) # print(f'validation score for model {i} with alg_name {alg_name}: {val_score}') # print(f'{val_score=}, {alg_name=}, {i=}') if val_score < best_val_score or best_alg_name is None: best_val_score = val_score best_alg_name = alg_name best_alg_idx = i # print(f'{best_val_score=}, {best_alg_name=}, {best_alg_idx=}') # load full results of best alg and save them to target directory rm = ResultManager.load(paths.results_alg_task_split(task_desc, best_alg_name, n_cv, split_type, split_id)) rm.other_dict['cv']['fit_params'] = dict(best_alg_idx=best_alg_idx, best_alg_name=best_alg_name, sub_fit_params=rm.other_dict['cv']['fit_params']) rm.save(target_path) # save alg in algs folder py_files = glob.glob('scripts/*.py') + glob.glob('pytabkit/**/*.py', recursive=True) # utils.serialize(paths.algs() / target_alg_name / 'wrapper.pkl', alg_wrapper) extended_config = dict(sub_algs=alg_names) utils.serialize(paths.algs() / target_alg_name / 'extended_config.yaml', extended_config, use_yaml=True) utils.serialize(paths.algs() / target_alg_name / 'tags.yaml', tags, use_yaml=True) for py_file in py_files: utils.copyFile(py_file, paths.algs() / target_alg_name / 'src' / py_file) # save summaries print(f'Saving summaries') save_summaries(paths, task_infos, target_alg_name, n_cv=n_cv, rerun=True) ================================================ FILE: pytabkit/bench/scheduling/__init__.py ================================================ ================================================ FILE: pytabkit/bench/scheduling/execution.py ================================================ import os import time import multiprocessing as mp import traceback from typing import Tuple, Optional, List import numpy as np from pytabkit.bench.scheduling.jobs import JobRunner from pytabkit.bench.scheduling.resource_manager import ResourceManager, JobInfo from pytabkit.bench.scheduling.resources import NodeResources, SystemResources from pytabkit.models.utils import FunctionProcess def get_gpu_rams_gb(use_reserved: bool = True): """ Returns: gpu_rams_gb: total GPU memory per 
visible device (GB) gpu_rams_fixed_gb: this process GPU memory per visible device (GB) - reserved (default): torch caching allocator reserved bytes (often matches "process used" better) - allocated: live tensor bytes only """ # do it in torch, it respects CUDA_VISIBLE_DEVICES and doesn't need the pynvml dependency BYTES_TO_GB = 1024.0 ** 3 import torch gpu_rams_gb = [] gpu_rams_fixed_gb = [] n = torch.cuda.device_count() # respects CUDA_VISIBLE_DEVICES ("" => 0) for i in range(n): with torch.cuda.device(i): _free_b, total_b = torch.cuda.mem_get_info() gpu_rams_gb.append(total_b / BYTES_TO_GB) if use_reserved: used_b = torch.cuda.memory_reserved(i) else: used_b = torch.cuda.memory_allocated(i) gpu_rams_fixed_gb.append(used_b / BYTES_TO_GB) return gpu_rams_gb, gpu_rams_fixed_gb def measure_node_resources(node_id: int) -> Tuple[NodeResources, NodeResources]: """ Function that measures available resources. :param node_id: Node ID that will be used to identify the node in the returned NodeResources. :return: Returns a tuple of NodeResources objects. The first one contains the total available resources, and the second one contains the resources that a single process (with PyTorch GPU usage) uses without doing anything. """ import torch n_gpus = torch.cuda.device_count() if n_gpus > 0: # init cuda # alloc dummy tensors to know how much memory PyTorch uses for its runtime dummy_tensors = [torch.ones(1).to(f'cuda:{i}') for i in range(n_gpus)] gpu_rams_gb, gpu_rams_fixed_gb = get_gpu_rams_gb() else: gpu_rams_gb = [] gpu_rams_fixed_gb = [] import psutil import os cpu_ram_gb = psutil.virtual_memory().available / (1024. ** 3) cpu_ram_fixed_gb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3 n_threads = mp.cpu_count() n_physical_cores = n_threads // 2 node_resources = NodeResources(node_id=node_id, n_threads=n_threads, cpu_ram_gb=cpu_ram_gb, gpu_usages=np.ones(n_gpus), gpu_rams_gb=np.asarray(gpu_rams_gb), physical_core_usages=np.ones(n_physical_cores)) fixed_node_resources = NodeResources(node_id=node_id, n_threads=0.0, cpu_ram_gb=cpu_ram_fixed_gb, gpu_usages=np.zeros(n_gpus), gpu_rams_gb=np.asarray(gpu_rams_fixed_gb), physical_core_usages=np.zeros(n_physical_cores)) # print('measure_gpu_resources:', gpu_rams_gb, gpu_rams_fixed_gb) # return np.asarray(gpu_rams_gb), np.asarray(gpu_rams_fixed_gb) return node_resources, fixed_node_resources def node_runner(feedback_queue, job_queue, node_id: int): mp.set_start_method('fork', force=True) # get resources in separate process so CUDA runtime is shut down when the process is terminated # this means that this process will not use up CUDA memory all the time node_resources, fixed_node_resources = FunctionProcess(measure_node_resources, node_id).start().pop_result() feedback_queue.put((node_resources, fixed_node_resources)) processes = [] process_rams_gb = [] # print(f'DEBUG: start loop', flush=True) while True: # get new jobs from queue while not job_queue.empty(): try: job_str = job_queue.get(timeout=0.1) # print(f'DEBUG: got job str', flush=True) except Exception as e: print(traceback.format_exc()) # might have been queue.Empty or ray.util.queue.Empty exception break # queue is empty if job_str is False: # termination signal # cannot use None as termination signal since that is already the timeout signal return # or check if processes are still running? 
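            # Note: the scheduler side serializes each job as
            # dill.dumps((job, job_id, assigned_resources)) in RayJobManager.submit_job,
            # so the tuple below is unpacked in that same order by JobRunner.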
import dill job_data = dill.loads(job_str) # print(f'DEBUG: got job data', flush=True) processes.append(FunctionProcess(JobRunner(*job_data)).start()) process_rams_gb.append(0.0) # check for finished processes for i, p in enumerate(processes): process_rams_gb[i] = max(process_rams_gb[i], p.get_ram_usage_gb()) if p.is_done(): result = p.pop_result() result.set_max_cpu_ram_gb(process_rams_gb[i]) # print(f'Node {node_id}: Before putting result in feedback_queue', flush=True) feedback_queue.put(result) # print(f'Node {node_id}: After putting result in feedback_queue', flush=True) del processes[i] del process_rams_gb[i] # print(f'.', end='', flush=True) time.sleep(0.01) # get RAM statistics of all processes and total RAM usage # if any process is finished, send time and RAM statistics of that process through the feedback queue # maybe have a logging queue? class NodeManager: def start(self): raise NotImplementedError() # start nodes, return queues and node ids? def terminate(self): raise NotImplementedError() # terminate nodes? class RayJobManager(NodeManager): def __init__(self, max_n_threads: Optional[int] = None, available_cpu_ram_multiplier: float = 1.0, available_gpu_ram_multiplier: float = 1.0, **ray_kwargs): self.ray_kwargs = ray_kwargs self.runner_futures = [] # keep node_runner futures for termination self.job_queues = [] self.feedback_queues = [] self.resource_manager: Optional[ResourceManager] = None self.max_n_threads = max_n_threads self.available_cpu_ram_multiplier = available_cpu_ram_multiplier self.available_gpu_ram_multiplier = available_gpu_ram_multiplier def start(self) -> None: import ray # take some ray arguments from os.environ if available for (ray_name, environ_name) in [('address', 'ip_head'), ('_redis_password', 'redis_password')]: if environ_name in os.environ and ray_name not in self.ray_kwargs: self.ray_kwargs[ray_name] = os.environ[environ_name] ray.init(**self.ray_kwargs) from ray.util import queue nodes = ray.nodes() print(f'Nodes: {nodes}') feedback_queues = [queue.Queue() for i in range(len(nodes))] job_queues = [queue.Queue() for i in range(len(nodes))] for i, node in enumerate(nodes): node_id = f'node:{node["NodeManagerAddress"]}' num_gpus = 0 if 'GPU' not in node['Resources'] else round(node['Resources']['GPU']) future = ray.remote(num_gpus=num_gpus)(node_runner).options(resources={node_id: 1.0}) \ .remote(feedback_queue=feedback_queues[i], job_queue=job_queues[i], node_id=i) self.runner_futures.append(future) print(f'Started {len(job_queues)} nodes', flush=True) n_nodes = len(job_queues) total_resources: List[Optional[NodeResources]] = [None] * n_nodes fixed_resources: List[Optional[NodeResources]] = [None] * n_nodes for feedback_queue in feedback_queues: nr, fnr = feedback_queue.get() # should be a NodeResources object total_resources[nr.node_id] = nr fixed_resources[fnr.node_id] = fnr if self.max_n_threads is not None: total_resources[nr.node_id].set_n_threads(min(total_resources[nr.node_id].get_n_threads(), self.max_n_threads)) total_resources[nr.node_id].set_cpu_ram_gb( self.available_cpu_ram_multiplier * total_resources[nr.node_id].get_cpu_ram_gb()) total_resources[nr.node_id].set_gpu_rams_gb( self.available_gpu_ram_multiplier * total_resources[nr.node_id].get_gpu_rams_gb()) print(f'Acquired node resources', flush=True) self.resource_manager = ResourceManager(total_resources=SystemResources(total_resources), fixed_resources=SystemResources(fixed_resources)) self.job_queues = job_queues self.feedback_queues = feedback_queues def 
get_resource_manager(self) -> ResourceManager: if self.resource_manager is None: raise RuntimeError('called get_resource_manager() before start()') return self.resource_manager def submit_job(self, job_info: JobInfo) -> None: import dill if self.resource_manager is None: raise RuntimeError('called submit_job() before start()') job = job_info.job job_id = job_info.job_id assigned_resources = job_info.assigned_resources if assigned_resources is None: raise RuntimeError('assigned_resources for submitted job must not be None') node_id = assigned_resources.node_id print(f'Scheduling job {job.get_desc()} on node {node_id}', flush=True) job_str = dill.dumps((job, job_id, assigned_resources)) self.job_queues[node_id].put(job_str) self.resource_manager.job_started(job_info) def pop_finished_job_infos(self, timeout_s: float = -1.0) -> List[JobInfo]: if self.resource_manager is None: raise RuntimeError('called pop_results() before start()') has_new_result = False start_time = time.time() job_infos = [] while not has_new_result: if timeout_s > 0.0 and time.time() > start_time + timeout_s: # timeout return job_infos for feedback_queue in self.feedback_queues: while not feedback_queue.empty(): job_result = feedback_queue.get() job_info = self.resource_manager.job_finished(job_result) job_infos.append(job_info) has_new_result = True if not has_new_result: time.sleep(0.05) return job_infos def terminate(self) -> None: for jq in self.job_queues: jq.put(False) # termination signal import ray # maybe wait only a bit and then hard terminate otherwise? ray.get(self.runner_futures) ray.shutdown() # class LocalNodeManager(NodeManager): # # start node_runner in a thread # pass ================================================ FILE: pytabkit/bench/scheduling/jobs.py ================================================ import time import traceback import sys from typing import Optional from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.base import RequiredResources class JobResult: """ Helper class to store information about a job that has been run. """ def __init__(self, job_id: int, time_s: float, oom_cpu: bool = False, oom_gpu: bool = False, finished_normally: bool = True, exception_msg: Optional[str] = None): """ :param job_id: Job id. :param time_s: Time in seconds that the job ran for. :param oom_cpu: Whether an out-of-memory error occurred on the CPU. :param oom_gpu: Whether an out-of-memory error occurred on the GPU. :param finished_normally: Whether the job ran normally, such that its time and RAM values are representative of how it would normally run. For example, if the job ran faster because the results were already partially precomputed, it should not count towards the time estimation. Of course, if an exception occurred, we should have finished_normally=False. :param exception_msg: Exception message (if there was any). """ self.job_id = job_id self.time_s = time_s self.oom_cpu = oom_cpu self.oom_gpu = oom_gpu self.finished_normally = finished_normally self.exception_msg = exception_msg self.failed = exception_msg is not None self.max_cpu_ram_gb = 0.0 assert exception_msg is None or not finished_normally def set_max_cpu_ram_gb(self, value: float) -> None: """ Set the maximum RAM usage of the job. :param value: maximum RAM usage in GiB. """ self.max_cpu_ram_gb = value class AbstractJob: """ Abstract base class for jobs that can be scheduled using schedulers in schedulers.py. """ def get_group(self) -> str: """ :return: Should return a "group name" string. 
All jobs with the same "group name" will have a common time factor that is adjusted on-the-fly during scheduling based on already completed jobs. """ raise NotImplementedError() def __call__(self, assigned_resources: NodeResources) -> bool: """ Should perform the main computation of the job. Problematic exceptions should not be caught within this method, they will be caught and printed in the scheduler. :param assigned_resources: Resources that are assigned to this job (conforming with the resources requested in get_required_resources()). :return: Should return True if the execution finished normally such that the timing of this job is representative. In cases where pre-computed results were available such that the job is shorter than usual, return False. """ raise NotImplementedError() def get_required_resources(self) -> RequiredResources: """ :return: Return the resources requested by this job. """ raise NotImplementedError() def get_desc(self) -> str: """ :return: Return a description that can be logged, e.g., when the job is started and when it finishes. """ raise NotImplementedError() class JobRunner: """ Helper class that runs an AbstractJob, catches exceptions, measures time and RAM usage, and returns its result. """ def __init__(self, job: AbstractJob, job_id: int, assigned_resources: NodeResources): """ :param job: The job to be run. :param job_id: An ID that will be returned at the end so that the job can be identified. :param assigned_resources: Assigned resources to run the job. """ self.job = job self.job_id = job_id self.assigned_resources = assigned_resources def __call__(self) -> JobResult: """ Runs the job computation. :return: Returns a JobResult object that includes information about the job. """ start_time = time.time() oom_gpu = False oom_cpu = False exception_msg = None try: finished_normally = self.job(self.assigned_resources) except Exception as e: finished_normally = False exception_msg = traceback.format_exc() print(exception_msg, file=sys.stderr, flush=True) if isinstance(e, MemoryError): oom_cpu = True elif isinstance(e, RuntimeError) and 'cuda out of memory' in exception_msg.lower(): oom_gpu = True elif isinstance(e, KeyboardInterrupt): raise e end_time = time.time() return JobResult(job_id=self.job_id, time_s=end_time-start_time, oom_cpu=oom_cpu, oom_gpu=oom_gpu, finished_normally=finished_normally, exception_msg=exception_msg) ================================================ FILE: pytabkit/bench/scheduling/resource_manager.py ================================================ import copy import enum import time from typing import Optional from pytabkit.bench.scheduling.jobs import AbstractJob, JobResult from pytabkit.bench.scheduling.resources import NodeResources, SystemResources class JobStatus(enum.Enum): REMAINING = 0 RUNNING = 1 SUCCEEDED = 2 FAILED = 3 class JobInfo: def __init__(self, job: AbstractJob, job_id: int, start_time: Optional[float] = None, assigned_resources: Optional[NodeResources] = None, job_result: Optional[JobResult] = None): self.job = job self.job_id = job_id self.start_time = start_time self.assigned_resources = assigned_resources self.required_resources = job.get_required_resources() self.job_result = job_result def get_status(self) -> JobStatus: if self.start_time is None: return JobStatus.REMAINING elif self.job_result is None: return JobStatus.RUNNING elif self.job_result.failed: return JobStatus.FAILED else: return JobStatus.SUCCEEDED def set_started(self, assigned_resources: NodeResources): self.start_time = time.time() 
self.assigned_resources = assigned_resources def set_finished(self, job_result: JobResult): self.job_result = job_result def is_remaining(self): return self.get_status() == JobStatus.REMAINING def is_running(self): return self.get_status() == JobStatus.RUNNING def is_finished(self): return self.get_status() in [JobStatus.FAILED, JobStatus.SUCCEEDED] def is_failed(self): return self.get_status() == JobStatus.FAILED def is_succeeded(self): return self.get_status() == JobStatus.SUCCEEDED class ResourceManager: """ Keeps track of running jobs and available resources. """ def __init__(self, total_resources: SystemResources, fixed_resources: SystemResources): self.total_resources = total_resources self.fixed_resources = fixed_resources self.running_job_infos = dict() # map job_id to job_info def get_fixed_resources(self): return self.fixed_resources def get_total_resources(self): return self.total_resources def get_free_resources(self): free_resources = copy.deepcopy(self.total_resources) for ji in self.running_job_infos.values(): ar = ji.assigned_resources free_resources.resources[ar.node_id] -= ar return free_resources def job_started(self, job_info: JobInfo): job_info.start_time = time.time() if job_info.job_id in self.running_job_infos: raise RuntimeError(f'Trying to start job {job_info.job.get_desc()}, which is already running!') self.running_job_infos[job_info.job_id] = job_info def job_finished(self, job_result: JobResult) -> JobInfo: ji = self.running_job_infos[job_result.job_id] ji.set_finished(job_result) if job_result.exception_msg is not None: print(f'Job failed: {ji.job.get_desc()}\nException: {job_result.exception_msg}') del self.running_job_infos[job_result.job_id] return ji ================================================ FILE: pytabkit/bench/scheduling/resources.py ================================================ from typing import Optional, List import numpy as np import copy from pytabkit.models.alg_interfaces.base import InterfaceResources, RequiredResources # already add fixed GPU RAM in assigned resources? (problem: does try_assign know these fixed resources?) # or have fixed_resources: NodeResources that are added each time? # problem: fixed resources only need to be added to those GPUs that are actually assigned # or maybe a method add_fixed_resources that takes in the fixed GPU RAM assignments class NodeResources: """ Represents available/used/free resources on a compute node.
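    The resources are stored in a single flat numpy array with layout
    [n_threads, cpu_ram_gb, *gpu_usages, *gpu_rams_gb, *physical_core_usages],
    so that adding and subtracting resources reduces to elementwise array arithmetic.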
""" def __init__(self, node_id: int, n_threads: float, cpu_ram_gb: float, gpu_usages: np.ndarray, gpu_rams_gb: np.ndarray, physical_core_usages: np.ndarray): self.node_id = node_id self.n_gpus = len(gpu_usages) self.data: np.ndarray = np.array(np.concatenate( [[n_threads, cpu_ram_gb], gpu_usages, gpu_rams_gb, physical_core_usages])) self.data.setflags(write=True) def get_n_threads(self) -> int: return round(self.data[0]) def set_n_threads(self, n_threads: int): # somehow necessary because self.data can get non-writeable after transmitting it from another ray process self.data = np.copy(self.data) self.data[0] = n_threads def get_cpu_ram_gb(self) -> float: return self.data[1] def set_cpu_ram_gb(self, cpu_ram_gb: float) -> None: # somehow necessary because self.data can get non-writeable after transmitting it from another ray process self.data = np.copy(self.data) self.data[1] = cpu_ram_gb def set_gpu_rams_gb(self, gpu_rams_gb: np.ndarray) -> None: # somehow necessary because self.data can get non-writeable after transmitting it from another ray process self.data = np.copy(self.data) self.data[2+self.n_gpus:2+2*self.n_gpus] = gpu_rams_gb def get_gpu_usages(self) -> np.ndarray: return self.data[2:2+self.n_gpus] def get_gpu_rams_gb(self) -> np.ndarray: return self.data[2+self.n_gpus:2+2*self.n_gpus] def get_physical_core_usages(self) -> np.ndarray: return self.data[2+2*self.n_gpus:] def get_n_physical_cores(self) -> int: return len(self.data) - (2+2*self.n_gpus) def get_total_gpu_ram_gb(self) -> float: return np.sum(self.get_gpu_rams_gb()) def get_total_gpu_usage(self) -> float: return np.sum(self.get_gpu_usages()) def get_used_gpu_ids(self) -> np.ndarray: # todo: naming return np.argwhere(self.get_gpu_usages() > 1e-8)[:, 0] def get_used_physical_cores(self) -> np.ndarray: return np.argwhere(self.get_physical_core_usages() > 1e-8)[:, 0] def get_resource_vector(self) -> np.ndarray: return np.asarray([self.get_n_threads(), self.get_cpu_ram_gb(), self.get_total_gpu_usage(), self.get_total_gpu_ram_gb()]) def get_interface_resources(self) -> InterfaceResources: return InterfaceResources(n_threads=self.get_n_threads(), gpu_devices=[f'cuda:{i}' for i in self.get_used_gpu_ids()]) def __iadd__(self, other: 'NodeResources') -> 'NodeResources': # operator += self.data += other.data # todo: some compatibility checks? 
return self def __isub__(self, other: 'NodeResources') -> 'NodeResources': self.data -= other.data return self def __imul__(self, other: 'NodeResources') -> 'NodeResources': self.data *= other.data return self def __itruediv__(self, other: 'NodeResources') -> 'NodeResources': self.data /= other.data return self def __add__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result += other return result def __sub__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result -= other return result def __mul__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result *= other return result def __truediv__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result /= other return result def try_assign(self, required_resources: RequiredResources, fixed_resources: 'SystemResources') -> Optional['NodeResources']: rr = required_resources fr = fixed_resources.resources[self.node_id] if not rr.should_add_fixed_resources(): fr = NodeResources.zeros_like(fr) # todo: distribution across GPUs is potentially suboptimal # CPU stuff n_threads = fr.get_n_threads() + rr.n_threads if self.get_n_threads() < n_threads: return None cpu_ram_gb = fr.get_cpu_ram_gb() + rr.cpu_ram_gb if self.get_cpu_ram_gb() < cpu_ram_gb: return None n_cores = rr.n_explicit_physical_cores physical_core_usages = np.zeros(self.get_n_physical_cores()) if n_cores > 0: free_pcu = self.get_physical_core_usages() free_in_sequence = np.convolve(free_pcu, np.ones(n_cores),'valid') idx = np.argmax(free_in_sequence >= n_cores - 0.5) if free_in_sequence[idx] >= n_cores - 0.5: physical_core_usages[idx:idx+n_cores] = 1.0 else: return None # GPU stuff gpu_usages = np.zeros(self.n_gpus) gpu_rams_gb = np.zeros(self.n_gpus) gpu_usages_all = fr.get_gpu_usages() + rr.gpu_usage gpu_rams_gb_all = fr.get_gpu_rams_gb() + rr.gpu_ram_gb gpu_availability = np.logical_and(gpu_usages_all <= self.get_gpu_usages() + 1e-8, gpu_rams_gb_all <= self.get_gpu_rams_gb()) # print(f'{fr.get_gpu_rams_gb()=}, {rr.gpu_ram_gb=}') # print(f'{gpu_usages_all=}, {gpu_rams_gb_all=}, {self.get_gpu_usages()=}, {self.get_gpu_rams_gb()=}, {gpu_availability=}') available_gpus = np.argwhere(gpu_availability)[:, 0] # squeeze second dimension # sort available gpus by usage available_gpu_usages = self.get_gpu_usages()[available_gpus] # pick gpus with most free resources first available_gpus = available_gpus[np.argsort(available_gpu_usages)[::-1]] # print('gpu selection:', gpu_availability, available_gpu_usages, available_gpus) if len(available_gpus) < rr.n_gpus: return None else: gpu_ids = available_gpus[:rr.n_gpus] for i in gpu_ids: gpu_usages[i] = gpu_usages_all[i] gpu_rams_gb[i] = gpu_rams_gb_all[i] return NodeResources(node_id=self.node_id, n_threads=n_threads, cpu_ram_gb=cpu_ram_gb, gpu_usages=gpu_usages, gpu_rams_gb=gpu_rams_gb, physical_core_usages=physical_core_usages) # todo: maybe a __str__ or __repr__ method for printing? @staticmethod def zeros_like(node_resources: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(node_resources) result.data *= 0 return result class SystemResources: """ System resources, consisting of NodeResources for each node. 
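    The arithmetic operators apply node-wise; for example, ResourceManager.get_free_resources()
    essentially computes total_resources minus the assigned NodeResources of all running jobs.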
""" def __init__(self, resources: List[NodeResources]): self.resources = resources def __getitem__(self, index: int): return self.resources[index] def __len__(self): return len(self.resources) def __iadd__(self, other): for i in range(len(self.resources)): self.resources[i] += other.resources[i] return self def __isub__(self, other): for i in range(len(self.resources)): self.resources[i] -= other.resources[i] return self def __imul__(self, other): for i in range(len(self.resources)): self.resources[i] *= other.resources[i] return self def __itruediv__(self, other): for i in range(len(self.resources)): self.resources[i] /= other.resources[i] return self def __add__(self, other): result = copy.deepcopy(self) result += other return result def __sub__(self, other): result = copy.deepcopy(self) result -= other return result def __mul__(self, other): result = copy.deepcopy(self) result *= other return result def __truediv__(self, other): result = copy.deepcopy(self) result /= other return result def get_n_threads(self): return sum([r.get_n_threads() for r in self.resources]) def get_cpu_ram_gb(self): return sum([r.get_cpu_ram_gb() for r in self.resources]) def get_gpu_usage(self): return sum([r.get_total_gpu_usage() for r in self.resources]) def get_gpu_ram_gb(self): return sum([r.get_total_gpu_ram_gb() for r in self.resources]) def get_num_gpus(self): return sum([r.n_gpus for r in self.resources]) def get_resource_vector(self): return sum([r.get_resource_vector() for r in self.resources]) # todo: maybe a __str__ or __repr__ method for printing? ================================================ FILE: pytabkit/bench/scheduling/schedulers.py ================================================ import copy import sys import time from typing import List, Dict, Union import numpy as np from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.resource_manager import JobInfo def format_length_s(duration: float) -> str: seconds = int(duration) minutes = seconds // 60 seconds -= minutes * 60 hours = minutes // 60 minutes -= hours * 60 days = hours // 24 hours -= days * 24 result = f'{seconds}s' if minutes > 0: result = f'{minutes}m' + result if hours > 0: result = f'{hours}h' + result if days > 0: result = f'{days}d' + result return result def format_date_s(time_s: float) -> str: return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_s)) class BaseJobScheduler: """ Base scheduler class where the logic for selecting which jobs should be run next still has to be implemented. Contains functionality for printing intermediate states and the main loop in run(). 
""" def __init__(self, job_manager: RayJobManager): self.start_time = time.time() self.job_manager = job_manager self.job_infos: List[JobInfo] = [] def _submit_more_jobs(self) -> None: # to be implemented in subclasses raise NotImplementedError() def add_jobs(self, jobs: List[AbstractJob]): for job in jobs: self.job_infos.append(JobInfo(job, job_id=len(self.job_infos))) def run(self): if len(self.job_infos) == 0: print(f'No jobs to run') return self.job_manager.start() self._print_start() while self._has_unfinished_jobs(): self._submit_more_jobs() self._print_progress() wait_period_s = 30 finished_job_infos = self.job_manager.pop_finished_job_infos(timeout_s=wait_period_s) if len(finished_job_infos) == 0: # no jobs finished after wait_period_s, print a running report and then wait for longer self._print_running_jobs() finished_job_infos = self.job_manager.pop_finished_job_infos() for job_info in finished_job_infos: # update the status of the job infos that have been finished self.job_infos[job_info.job_id] = job_info # todo: register finished job infos in self self._print_end() self.job_manager.terminate() def _has_unfinished_jobs(self) -> bool: return any(not ji.is_finished() for ji in self.job_infos) def _print_start(self): self.start_time = time.time() print( f'############################### START REPORT ##################################\n' f'# Start date: {format_date_s(self.start_time)}\n' f'# Number of jobs: {len(self.job_infos)}\n' f'###############################################################################', flush=True ) def _print_end(self): end_time = time.time() duration = end_time - self.start_time group_stats = self._compute_group_stats() ram_factors = [ji.job_result.max_cpu_ram_gb / ji.assigned_resources.get_cpu_ram_gb() for ji in self.job_infos] ram_factors.sort(reverse=True) if len(ram_factors) > 5: ram_factors = ram_factors[:5] time_factors_string = '\n'.join([f'# Time factor for {key}: {value["time_factor"]}' for key, value in group_stats.items()]) n_jobs_failed = len([ji for ji in self.job_infos if ji.is_failed()]) print( f'################################ END REPORT ###################################\n' f'# Start date: {format_date_s(self.start_time)}\n' f'# End date: {format_date_s(end_time)}\n' f'# Duration: {format_length_s(duration)}\n' f'# Number of failed jobs: {n_jobs_failed}\n' f'# Largest RAM factors: {ram_factors}\n' f'{time_factors_string}\n' f'###############################################################################', flush=True ) def _compute_group_stats(self) -> Dict[str, Dict[str, Union[int, float]]]: job_groups = [ji.job.get_group() for ji in self.job_infos] groups = set(job_groups) group_stats = {} for group in groups: job_infos: List[JobInfo] = [ji for ji, jg in zip(self.job_infos, job_groups) if jg == group] started_job_infos = [ji for ji in job_infos if not ji.is_remaining()] running_job_infos = [ji for ji in started_job_infos if ji.is_running()] finished_job_infos = [ji for ji in job_infos if ji.is_finished()] finished_job_infos_with_time = [ji for ji in finished_job_infos if ji.job_result.finished_normally] n_started = len(started_job_infos) n_running = len(running_job_infos) n_finished = len(finished_job_infos) n_finished_with_time = len(finished_job_infos_with_time) if n_started == 0 or (n_finished_with_time == 0 and n_running == 0): time_factor = 1.0 elif n_finished_with_time == 0: current_time = time.time() elapsed_time = sum([current_time - ji.start_time for ji in running_job_infos]) predicted_time_units = 
sum([ji.required_resources.time_s for ji in running_job_infos]) time_factor = max(1.0, elapsed_time / (predicted_time_units + 1e-8)) else: used_time = sum([ji.job_result.time_s for ji in finished_job_infos_with_time]) predicted_time_units = sum([ji.required_resources.time_s for ji in finished_job_infos_with_time]) time_factor = used_time / (predicted_time_units + 1e-8) group_stats[group] = {'time_factor': time_factor, 'n_started': n_started, 'n_running': n_running, 'n_finished': n_finished, 'n_finished_with_time': n_finished_with_time} return group_stats def _get_time_estimates(self, job_infos: List[JobInfo], group_stats: Dict[str, Dict[str, Union[int, float]]]) \ -> np.ndarray: current_time = time.time() startup_time_s = 1.0 # guessed time_estimates = [] for ji in job_infos: if ji.is_finished(): time_estimates.append(0.0) # job is already finished continue rr = ji.required_resources time_estimate = group_stats[ji.job.get_group()]['time_factor'] * rr.time_s if not ji.is_remaining(): time_estimate = max(0.0, time_estimate - (current_time - ji.start_time)) else: time_estimate += startup_time_s time_estimates.append(time_estimate) return np.asarray(time_estimates) def _print_progress(self): group_stats = self._compute_group_stats() resource_manager = self.job_manager.get_resource_manager() start_time = self.start_time current_time = time.time() elapsed_time = current_time - start_time total_resources = resource_manager.get_total_resources() fixed_resources = resource_manager.get_fixed_resources() average_fixed_resources = (fixed_resources * total_resources).get_resource_vector() \ / (total_resources.get_resource_vector() + 1e-8) job_infos = self.job_infos n_jobs_finished = len([ji for ji in job_infos if ji.is_finished()]) # succeeded and failed ones n_jobs_remaining = len([ji for ji in job_infos if ji.is_remaining()]) n_jobs_failed = len([ji for ji in job_infos if ji.is_failed()]) n_jobs_running = len(job_infos) - n_jobs_finished - n_jobs_remaining time_estimates = self._get_time_estimates(job_infos, group_stats=group_stats) argmax_time_estimate = np.argmax(time_estimates) longest_job_desc = job_infos[argmax_time_estimate].job.get_desc() longest_time_estimate: float = time_estimates[argmax_time_estimate] system_resource_vec = total_resources.get_resource_vector() # estimate \sum_{jobs} job_resources * remaining_job_time # (could also do physical cores, but that should be covered by threads) total_job_time_resource_vec = sum([ji.required_resources.get_resource_vector(average_fixed_resources) * te for ji, te in zip(job_infos, time_estimates)]) # todo: improve this estimate towards the end of a run? remaining_time_estimate = np.max(total_job_time_resource_vec / (system_resource_vec + 1e-8)) elapsed_fraction = elapsed_time / (elapsed_time + remaining_time_estimate) end_date_str = format_date_s(current_time + remaining_time_estimate) # todo: also print predicted system usage in percent (relative to criticality of resources)? # or print current relative resource usages and remaining task relative resource usages # todo: also log this somewhere automatically? 
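        # The estimate above treats each resource (threads, CPU RAM, GPU usage, GPU RAM) as a
        # potential bottleneck: sum over jobs of (job usage * remaining job time) divided by the
        # total capacity gives the time needed if only that resource limited throughput; the
        # maximum over the resources is used as remaining_time_estimate.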
print( f'############################ INTERMEDIATE REPORT ##############################\n' f'# {n_jobs_finished} jobs finished ({n_jobs_failed} failed), {n_jobs_running} jobs running, {n_jobs_remaining} jobs remaining\n' f'# Elapsed: {format_length_s(elapsed_time)} ({elapsed_time:.2f}s)\n' f'# Remaining: {format_length_s(remaining_time_estimate)} ({remaining_time_estimate:.2f}s)\n' f'# Percent completed: {100 * elapsed_fraction:.2f}%\n' f'# Estimated end time: {end_date_str}\n' f'# Current time: {format_date_s(current_time)}\n' f'# Longest remaining job: {longest_job_desc} with {format_length_s(longest_time_estimate)}\n' f'###############################################################################', flush=True ) def _print_running_jobs(self): group_stats = self._compute_group_stats() current_time = time.time() job_infos = self.job_infos n_jobs_finished = len([ji for ji in job_infos if ji.is_finished()]) n_jobs_remaining = len([ji for ji in job_infos if ji.is_remaining()]) n_jobs_running = len(job_infos) - n_jobs_finished - n_jobs_remaining time_estimates = self._get_time_estimates(job_infos, group_stats=group_stats) job_strs = [] sorted_time_idxs = np.argsort(time_estimates) # for ji, time_estimate in zip(job_infos, time_estimates): for i in sorted_time_idxs: ji = job_infos[i] time_estimate = time_estimates[i] if not ji.is_running(): continue # job is not currently running job: AbstractJob = ji.job job_desc = job.get_desc() job_str = (f'# Job {job_desc} has been running for {format_length_s(current_time-ji.start_time)}' f', estimated remaining time: {format_length_s(time_estimate)}') job_strs.append(job_str) print( f'############################### RUNNING REPORT ################################\n' f'# Current time: {format_date_s(current_time)}, {n_jobs_running} jobs are running:\n' + '\n'.join(job_strs) + '\n' + f'###############################################################################', flush=True ) class SimpleJobScheduler(BaseJobScheduler): """ Simple scheduler. Submits jobs with the largest estimated time. If a job doesn't fit, jobs with not too much smaller time can be submitted instead. In the beginning, the scheduler ensures that at least three jobs from each group are run (e.g. 3x XGB, 3x LGBM, 3x MLP). """ def _submit_more_jobs(self) -> None: min_starts_per_group = 3 job_infos = [ji for ji in self.job_infos if ji.is_remaining()] # need running jobs as well for n_started_times? 
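        # Illustration of the policy implemented below (hypothetical numbers): with
        # min_time_factor = 0.1, if a job estimated at 1000s cannot currently be placed on any
        # node, only waiting jobs estimated at >= 100s may be started ahead of it.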
if len(job_infos) == 0: print(f'No job infos remaining') return group_stats = self._compute_group_stats() job_times = self._get_time_estimates(job_infos, group_stats) n_started_times = {key: value['n_running'] + value['n_finished_with_time'] for key, value in group_stats.items()} resource_manager = self.job_manager.get_resource_manager() # n_started_times = [group_stats[ji['job'].get_group()]['n_running'] # + group_stats[ji['job'].get_group()]['n_finished_with_time'] for ji in job_infos] free_resources = copy.deepcopy(resource_manager.get_free_resources()) fixed_resources = resource_manager.get_fixed_resources() if any(value < min_starts_per_group for value in n_started_times.values()): # need to start jobs first from groups where we don't have enough time measurements yet # do this by increasing their job_times estimate job_times_offset = 2 * np.max(job_times) for group, n_started in n_started_times.items(): if n_started < min_starts_per_group: job_idxs = np.asarray([i for i, ji in enumerate(job_infos) if ji.job.get_group() == group], dtype=np.int32) sort_perm = np.argsort(job_times[job_idxs]) n_offset = min(len(sort_perm), min_starts_per_group - n_started) # add job_times_offset to the n_offset jobs from this group with largest time estimate job_times[job_idxs[sort_perm[-n_offset:]]] += job_times_offset # if a job with time estimate t cannot be started, # don't start jobs with time estimate less than min_time_factor * t # the maximum value of t is tracked in max_non_started_time min_time_factor = 0.1 max_non_started_time = 0.0 job_idxs_sorted = np.argsort(job_times)[::-1] # sort descending for job_idx in job_idxs_sorted: if job_times[job_idx] < min_time_factor * max_non_started_time: # don't start too fast jobs if other much slower ones are waiting return job_info = job_infos[job_idx] # otherwise, try assigning the job for node_idx, r in enumerate(free_resources.resources): # print(f'{fixed_resources.__dict__=}') # print(f'{job_info.required_resources.__dict__=}') # print(f'{r.data=}, {r.get_resource_vector()=}, {node_idx=}') assigned_resources = r.try_assign(job_info.required_resources, fixed_resources) # print(f'{bool(assigned_resources)=}') if assigned_resources is not None: job_info.set_started(assigned_resources) self.job_manager.submit_job(job_info) free_resources.resources[node_idx] -= assigned_resources break else: # could not assign the job max_non_started_time = max(max_non_started_time, job_times[job_idx]) class CustomJobScheduler(BaseJobScheduler): """ More complicated scheduler with different heuristics for which jobs to submit first (based on which resources it thinks are scarce, estimated time, which methods have not been run yet, etc.). This scheduler can be slow for a large number of jobs (say 10,000 or more). """ def _submit_more_jobs(self) -> None: # todo: how to handle OOM errors? Reduce total memory of nodes? Or increase memory of jobs? # Or add constants to free_resources? 
# maybe check if last error is at least one minute ago or so # current error handling: count job as finished, don't rerun min_starts_per_group = 3 job_infos = [ji for ji in self.job_infos if not ji.is_finished()] group_stats = self._compute_group_stats() job_times = self._get_time_estimates(job_infos, group_stats) n_started_time = {key: value['n_running'] + value['n_finished_with_time'] for key, value in group_stats.items()} resource_manager = self.job_manager.get_resource_manager() # n_started_time = [group_stats[ji['job'].get_group()]['n_running'] # + group_stats[ji['job'].get_group()]['n_finished_with_time'] for ji in job_infos] total_resources = resource_manager.get_total_resources() free_resources = copy.deepcopy(resource_manager.get_free_resources()) fixed_resources = resource_manager.get_fixed_resources() print('total_resources.get_resource_vector():', total_resources.get_resource_vector()) system_rv = total_resources.get_resource_vector() job_availability = np.asarray([1.0 if ji.is_remaining() else 0.0 for ji in job_infos]) # n_nodes x 4 total_node_rvs = np.asarray([r.get_resource_vector() for r in total_resources.resources]) # shape: 4 average_fixed_rv = (fixed_resources * total_resources).get_resource_vector() \ / (total_resources.get_resource_vector() + 1e-8) job_rvs = np.asarray([ji.required_resources.get_resource_vector(average_fixed_rv) for ji in job_infos]) remaining_job_time_rv = sum([job_rv * job_time for job_rv, job_time in zip(job_rvs, job_times)]) remaining_times_by_resource = remaining_job_time_rv / (system_rv + 1e-10) remaining_distr = remaining_times_by_resource / (np.max(remaining_times_by_resource) + 1e-8) criticality = np.exp(5.0 * remaining_distr) criticality /= np.sum(criticality) # tempered softmax # max_remaining_time = np.max(remaining_times_by_resource) node_job_runability = np.asarray( [[r.try_assign(ji.required_resources, fixed_resources) is not None for ji in job_infos] for r in total_resources.resources]) job_runability = np.any(node_job_runability, axis=0) # print('job_runability:', job_runability) for i in np.argwhere(~job_runability): # job i cannot run on any node, even if they are completely empty resource_vector = job_infos[int(i)].required_resources.get_resource_vector(average_fixed_rv) print(f'The following job does not fit on any node: {job_infos[int(i)].job.get_desc()}' f', its required resource vector is {resource_vector}.', file=sys.stderr, flush=True) job_availability[i] = 0.0 while np.sum(job_availability) > 0.0: # if nodes get full before jobs run out, a return statement in the loop is used used_resources = total_resources - free_resources used_node_rvs = np.asarray([r.get_resource_vector() for r in used_resources.resources]) # All scores will have shape n_nodes x n_jobs or broadcast to it # ----- Assignability ----- assignments = [[r.try_assign(ji.required_resources, fixed_resources) for ji in job_infos] for r in free_resources.resources] assignability_score = np.asarray([[1.0 if a is not None else 0.0 for a in l] for l in assignments]) # ----- Uncertainty score ----- uncertainty_score = np.asarray([ max(0.0, min_starts_per_group - n_started_time[ji.job.get_group()]) for ji in job_infos]) uncertainty_score = uncertainty_score[None, :] # ----- Short Job Penalty ----- # only use still available jobs for remaining partial sums job_times_rvs = job_times[:, None] * job_rvs * job_availability[:, None] perm = np.argsort(job_times) time_rv_partial_sums = np.zeros_like(job_times_rvs) time_rv_partial_sums[perm] = np.cumsum(job_times_rvs[perm], 
axis=0) time_partial_sums = [np.max(trps / (system_rv + 1e-8)) for trps in time_rv_partial_sums] max_time = np.max(job_times) # todo: use times of all jobs, including currently running ones? partial_sum_threshold = 3 * max_time # heuristic # penalty in [0, 1], largest for shortest jobs # shape: n_jobs short_job_penalty = (partial_sum_threshold - time_partial_sums) / partial_sum_threshold short_job_penalty[short_job_penalty < 0.0] = 0.0 short_job_penalty = short_job_penalty[None, :] # extend by node dimension # ----- Time score ----- # could also use max_remaining_time in denominator instead time_score = job_times[None, :] / (max_time + 1e-8) # in [0, 1] # ----- Resource score ----- resource_score = np.sum(job_rvs[None, :, :] * criticality[None, None, :], axis=-1) resource_score /= (np.max(resource_score) + 1e-8) # now in [0, 1] # ----- Utilization score ----- # use as shape: n_nodes x n_jobs x 4 new_resources = used_node_rvs[:, None, :] + job_rvs[None, :, :] new_utilization = new_resources / (total_node_rvs[:, None, :] + 1e-10) # what you could have got with uniform utilization new_opt_resources = np.max(new_utilization, axis=-1, keepdims=True) * total_node_rvs[:, None, :] # multiplying utilization with resources avoids the 0/0 GPU utilization problem new_missed_resources = new_opt_resources - new_resources old_resources = used_node_rvs[:, None, :] old_utilization = old_resources / (total_node_rvs[:, None, :] + 1e-10) # what you could have got with uniform utilization old_opt_resources = np.max(old_utilization, axis=-1, keepdims=True) * total_node_rvs[:, None, :] # multiplying utilization with resources avoids the 0/0 GPU utilization problem old_missed_resources = old_opt_resources - old_resources missing_improvement = np.sum((new_missed_resources - old_missed_resources) * criticality[None, None, :], axis=-1) running_improvement = np.sum(job_rvs[None, :, :] * criticality[None, None, :], axis=-1) # should be in (-\infty, 1] utilization_score = np.max(new_utilization, axis=-1) * missing_improvement / (running_improvement + 1e-8) utilization_score = utilization_score / (1.0 + np.abs(utilization_score)) # now in (-1, 1) # ----- Joint score ----- # print(utilization_score.shape, time_score.shape, resource_score.shape, assignability_score.shape, # short_job_penalty.shape, uncertainty_score.shape) joint_score = utilization_score + 0.3 * time_score + 0.2 * resource_score - 0.5 * assignability_score \ - 5 * short_job_penalty + 1000 * uncertainty_score low_value = np.min(joint_score) - 1 joint_score[:, job_availability <= 0.5] = low_value joint_score[~node_job_runability] = low_value # ----- Find next node-job pair ----- # strategy: find next best node-job pair. # If no assignment possible, terminate. # If job can be run now (assignable), add to list and recompute scores. # If job is not assignable to node, # block all jobs on node and block job on all nodes where it is not assignable. # Then loop back to next best node-job pair. 
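# Illustration with hypothetical numbers: for 2 nodes and 3 jobs, joint_score has shape (2, 3), and
# np.unravel_index(np.argmax(joint_score), joint_score.shape) yields the best (node_idx, job_idx)
# pair, e.g. (1, 0) if running job 0 on node 1 currently has the highest joint score.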
while True: # loop until an assignment is found or all nodes are blocked by unassignable jobs best_idxs = np.unravel_index(np.argmax(joint_score), joint_score.shape) if joint_score[best_idxs] == low_value: print('No job remaining') return node_idx = best_idxs[0] job_idx = best_idxs[1] assigned_resources = assignments[node_idx][job_idx] if assigned_resources is None: # node is too full to run job now print('Node too full') # block node for now joint_score[node_idx, :] = low_value # make sure that job can only be stolen by other nodes if they are assignable joint_score[assignability_score[:, job_idx] == 0.0, job_idx] = low_value else: print('Assigning job') job_availability[job_idx] = 0.0 job_info = job_infos[job_idx] job_info.set_started(assigned_resources) self.job_manager.submit_job(job_info) free_resources.resources[node_idx] -= assigned_resources n_started_time[job_info.job.get_group()] += 1 break # leave inner loop, recompute scores
================================================ FILE: pytabkit/models/__init__.py ================================================
================================================ FILE: pytabkit/models/alg_interfaces/__init__.py ================================================
================================================ FILE: pytabkit/models/alg_interfaces/alg_interfaces.py ================================================
import functools import warnings from pathlib import Path from typing import List, Tuple, Any, Optional, Dict import torch from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.nested_dict import NestedDict from pytabkit.models.hyper_opt.hyper_optimizers import HyperOptimizer from pytabkit.models import utils from pytabkit.models.data.data import DictDataset, TaskType from pytabkit.models.torch_utils import cat_if_necessary from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics class AlgInterface: """ AlgInterface is an abstract base class for tabular ML methods with an interface that offers more possibilities than a standard scikit-learn interface. In particular, it allows for parallelized fitting of multiple models, bagging, and refitting. The idea is as follows: - The dataset can be split into a test set and the remaining data. (We call this a trainval-test split.) The fit() method allows specifying multiple such splits, and some AlgInterface implementations (NNAlgInterface) allow vectorizing computations across these splits. However, for vectorization, we may require that the test set sizes are identical in all splits. - The remaining data can further be split into training and validation data. (We call this a train-val split.) AlgInterface allows fitting with one or multiple train-val splits, which can also be vectorized in NNAlgInterface. Optionally, the function `get_refit_interface()` allows extracting an AlgInterface that can be used for fitting the model on the training+validation set with the best settings found on the validation set in the cross-validation stage (represented by self.fit_params). These "best settings" could be an early stopping epoch or number of trees, or the best hyperparameters found by hyperparameter optimization. We call this refitting. Another feature of AlgInterface is that it provides methods to get (an estimate of) required resources and to evaluate metrics on the training, validation, and test sets.
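A minimal usage sketch (hypothetical; `my_interface` stands for any concrete AlgInterface subclass, with the arguments as documented in fit()):

    results = my_interface.fit_and_eval(ds, idxs_list, interface_resources, logger, tmp_folders, name='my_alg', metrics=metrics, return_preds=False)
    y_pred = my_interface.predict(ds)  # shape [n_models, n_samples, output_shape]
    refit_interface = my_interface.get_refit_interface(n_refit=1)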
""" def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ :param fit_params: This parameter can be used to store the best hyperparameters found during fit() in (cross-)validation mode. These can then be used for fit() in refitting mode. If fit_params is not None, it should be a list with one dictionary per trainval-test split. The dictionaries then contain the obtained hyperparameters for each of the trainval-test splits. Normally, there are no best parameters per train-val split as we might not have the same number of refitted models as train-val splits. :param config: Other parameters. """ self.config = config self.fit_params = fit_params self.curr_pred_params_name = '' def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: """ Fit the models on the given data and splits. Should be overridden by subclasses unless fit_and_eval() is overloaded. In the latter case, this method will by default use fit_and_eval() and discard the evaluation. :param ds: DictDataset representing the dataset. Should be on the CPU. :param idxs_list: List containing one SplitIdxs object per trainval-test split. Indices should be on the CPU. :param interface_resources: Resources assigned to fit(). :param logger: Logger that can be used for logging. :param tmp_folders: List of paths that can be used for storing intermediate data. The paths can be None, in which case methods will try not to save intermediate results. There should be one folder per trainval-test-split (i.e. only one per k-fold CV). :param name: Name of the algorithm (for logging). :return: May return information about different possible fit_params settings that can be used. Say a variable `results` is returned that is not None. Then, results[tt_split_idx][tv_split_idx] should be a list of tuples (params, loss). This is useful for k-fold cross-validation, where the params with the best average loss (averaged over tv_split_idx) can be selected for fit_params. """ if self.__class__.fit_and_eval == AlgInterface.fit_and_eval: raise NotImplementedError() # avoid infinite recursion else: self.fit_and_eval(ds, idxs_list, interface_resources, logger, tmp_folders, name, metrics=None, return_preds=False) return None def fit_and_eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str, metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: """ Run fit() with the given parameters and then return the result of eval() with the given metrics. This method can be overridden instead of fit() if it is more convenient. The idea is that for hyperparameter optimization, one has to evaluate each hyperparameter combination anyway after training it, so it is more efficient to implement fit_and_eval() and return the evaluation of the best method at the end. See the documentation of fit() and eval() for the meaning of the parameters and returned values. 
""" if self.__class__.fit == AlgInterface.fit: raise NotImplementedError() # avoid infinite recursion self.fit(ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folders=tmp_folders, name=name) return self.eval(ds=ds, idxs_list=idxs_list, metrics=metrics, return_preds=return_preds) def eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: """ Evaluates the (already fitted) method using various metrics on training, validation, and test sets. The results will also contain the found fit_params and optionally the predictions on the dataset. This method should normally not be overridden in subclasses. :param ds: Dataset. :param idxs_list: List of indices for the training-validation-test splits, one per trainval-test split as in fit(). :param metrics: Metrics object that defines which metrics should be evaluated. If metrics is None, an empty list will be returned (which might avoid unnecessary computation when implementing fit() through fit_and_eval()). :param return_preds: Whether the predictions on the dataset should be included in the returned results. :return: Returns a list with one NestedDict for every trainval-test split. Denote by `results` such a NestedDict object. Then, `results` will contain the following contents: results['metrics', 'train'/'val'/'test', str(n_models), str(start_idx), metric_name] = metric_value Here, an ensemble of the predictions of models [start_idx:start_idx+n_models] will be used. results['y_preds'] = a list (converted from a tensor) with predictions on the whole dataset, included only if return_preds==True. results['fit_params'] = self.fit_params """ if metrics is None: results = [] # for idxs in idxs_list: # result = NestedDict() # for split_name in ['train', 'val', 'test']: # result['metrics'][split_name]['1']['0'] = dict() # if return_preds: # pass # results.append(dict(metrics)) return results X, y = ds.split_xy() y = y.tensors['y'] y_pred_full = self.predict(X).detach().cpu() # print(f'{y_pred_full[0, idxs_list[0].val_idxs[0, 4]]=}') # print(f'{self.predict(X.get_sub_dataset(idxs_list[0].val_idxs[0]))[0, 4]=}') # print(f'{idxs_list[0].val_idxs[0, 4]=}') # print(f'{y=}') # print(f'{y_pred_full=}') # print(f'{y.shape=}') # print(f'{y_pred_full.shape=}') idx = 0 results_list = [] for split_idx, idxs in enumerate(idxs_list): results = NestedDict() y_preds = y_pred_full[idx:idx + idxs.n_trainval_splits] if return_preds: results['y_preds'] = y_preds.numpy().tolist() idx += idxs.n_trainval_splits if idxs.test_idxs is not None: # print(f'{y_preds.shape=}') # print(f'{y.shape=}') results['metrics', 'test'] = metrics.compute_metrics_dict( y_preds=[y_preds[i, idxs.test_idxs] for i in range(y_preds.shape[0])], y=y[idxs.test_idxs], use_ens=True) train_metrics = NestedDict() val_metrics = NestedDict() for i in range(idxs.n_trainval_splits): train_dict = metrics.compute_metrics_dict([y_preds[i, idxs.train_idxs[i]]], y[idxs.train_idxs[i]], use_ens=False) train_metrics['1', str(i)] = train_dict['1', '0'] if idxs.val_idxs is not None and idxs.val_idxs.shape[-1] > 0: # print(f'{y_preds[0, idxs.val_idxs[0, 4]]=}') val_dict = metrics.compute_metrics_dict([y_preds[i, idxs.val_idxs[i]]], y[idxs.val_idxs[i]], use_ens=False) val_metrics['1', str(i)] = val_dict['1', '0'] # print(f'{val_metrics=}') # print(f'{idxs.val_idxs.shape[-1]=}') # print(f'{torch.min(y_preds[0, idxs.val_idxs[0]]).item()=}') # print(f'{ds.tensors["x_cont"][idxs.val_idxs[0, 4]]=}') # 
print(f'{ds.tensors["x_cat"][idxs.val_idxs[0, 4]]=}') results['metrics', 'train'] = train_metrics if idxs.val_idxs is not None: results['metrics', 'val'] = val_metrics if self.fit_params is not None: results['fit_params'] = self.fit_params[split_idx] results_list.append(results) return results_list def predict(self, ds: DictDataset) -> torch.Tensor: """ Method to predict labels on the given dataset. Override in subclasses. :param ds: Dataset on which to predict labels :return: Returns a tensor of shape [n_trainval_splits * n_splits, ds.n_samples, output_shape] In the classification case, output_shape will be the number of classes (even in the binary case) and the outputs will be logits (i.e., softmax should be applied to get probabilities) In the regression case, output_shape will be the target dimension (often 1). """ raise NotImplementedError() def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': """ Returns another AlgInterface that is configured for refitting on the training and validation data. Override in subclasses. :param n_refit: Number of models that should be refitted (with different seeds) per trainval-test split. :param fit_params: Fit parameters (see the constructor) that should be used for refitting. If fit_params is None, self.fit_params will be used instead. :return: Returns the AlgInterface object for refitting. """ raise NotImplementedError() def get_fit_params(self) -> Optional[List[Dict]]: """ :return: Return self.fit_params. """ return self.fit_params def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: """ Estimate the required resources for fit(). :param ds: Dataset. Does not have to contain tensors. :param n_cv: Number of train-val splits per trainval-test split. :param n_refit: Number of refitted models per trainval-test split. :param n_splits: Number of trainval-test splits. :param split_seeds: Seeds for every trainval-test split. :return: Returns estimated required resources. """ raise NotImplementedError() # ------- for alg interfaces that can predict with multiple versions of an algorithm def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: # override in subclasses if more options are available return {'': dict()} def get_current_predict_params_name(self): return self.curr_pred_params_name def get_current_predict_params_dict(self): return self.get_available_predict_params()[self.curr_pred_params_name] def set_current_predict_params(self, name: str) -> None: self.curr_pred_params_name = name def to(self, device: str) -> None: warnings.warn(f'.to() method does nothing for {self.__class__} (not implemented)') class MultiSplitWrapperAlgInterface(AlgInterface): # todo: do we need the option to run this with a "split batch size" > 1 for the NNInterface? 
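# Wrapper that fits one single-split interface per trainval-test split sequentially and concatenates
# their predictions along the first (split) dimension; self.fit_params collects one entry per split.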
def __init__(self, single_split_interfaces: List[AlgInterface], **config): super().__init__(single_split_interfaces=single_split_interfaces, **config) # todo: could allow parallel evaluation, but not for now self.single_split_interfaces = single_split_interfaces def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': # return interface with the hyperparameters found by cross-validation for refitting # this can only be called if some fit method has been called before with validation data fit_params = fit_params or self.fit_params if fit_params is not None: assert len(fit_params) == len(self.single_split_interfaces) fit_params_list = [[p] for p in fit_params] else: fit_params_list = [None] * len(self.single_split_interfaces) return MultiSplitWrapperAlgInterface([s.get_refit_interface(n_refit, p) for p, s in zip(fit_params_list, self.single_split_interfaces)]) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(self.single_split_interfaces) == len(idxs_list) assert len(idxs_list) == len(tmp_folders) for split_idx in range(len(idxs_list)): self.single_split_interfaces[split_idx].fit(ds, [idxs_list[split_idx]], interface_resources, logger, [tmp_folders[split_idx]], name) self.fit_params = [ssi.fit_params[0] for ssi in self.single_split_interfaces] return None def fit_and_eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str, metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: assert len(self.single_split_interfaces) == len(idxs_list) assert len(idxs_list) == len(tmp_folders) results_list = [] for split_idx in range(len(idxs_list)): results_list.extend(self.single_split_interfaces[split_idx].fit_and_eval( ds, [idxs_list[split_idx]], interface_resources, logger, [tmp_folders[split_idx]], name, metrics, return_preds)) return results_list def predict(self, ds: DictDataset) -> torch.Tensor: return cat_if_necessary([s.predict(ds) for s in self.single_split_interfaces], dim=0) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: single_resources = [ ssi.get_required_resources(ds, n_cv, n_refit, n_splits=1, split_seeds=[split_seeds[i]], n_train=n_train) for i, ssi in enumerate(self.single_split_interfaces)] return RequiredResources.combine_sequential(single_resources) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return self.single_split_interfaces[0].get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) for ssi in self.single_split_interfaces: ssi.set_current_predict_params(name) class SingleSplitAlgInterface(AlgInterface): pass # this class is just to document that the fit() and fit_and_eval() functions can only take one split class OptAlgInterface(SingleSplitAlgInterface): def __init__(self, hyper_optimizer: HyperOptimizer, max_resource_config: Dict, **config): super().__init__(**config) # self.create_alg_interface = create_alg_interface self.hyper_optimizer = hyper_optimizer # a configuration that can be passed to self.create_alg_interface() # which should be used for resource estimation. # E.g. 
for tree-based methods this should involve the maximum depth and maximum n_estimators # that can be used during HPO. self.max_resource_config = max_resource_config # self.fit_params['hyper_fit_params'] will contain the optimized parameters, # self.fit_params['sub_fit_params'] will contain the fit_params of the best fitted alg_interface self.best_alg_interface = None self.opt_step = 0 # list where all results from all optimization steps can be stored (except y_preds, to save memory) # this list will then be included into the final results, such that one can retrospectively simulate # what would have happened if the optimization had been terminated earlier self.results_list = [] def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: raise NotImplementedError() def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': if fit_params is not None: assert len(fit_params) == 1 # single split else: assert self.fit_params is not None fit_params = self.fit_params # print(f'{fit_params=}') alg_interface = self.create_alg_interface(n_refit, **utils.join_dicts(self.config, fit_params[0]['hyper_fit_params'])) # the alg_interface itself may have other hypers that have been fit return alg_interface.get_refit_interface(n_refit, fit_params[0]['sub_fit_params']) def objective(self, params, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folder: Optional[Path], name: str, metrics: Optional[Metrics], return_preds: bool) -> Tuple[float, Tuple[List[NestedDict], AlgInterface]]: self.opt_step += 1 tmp_folder = tmp_folder / f'step_{self.opt_step}' if tmp_folder is not None else None could_load = False # try to load results if tmp_folder is not None and utils.existsFile(tmp_folder / 'DONE'): # should be able to load the results alg_interface = utils.deserialize(tmp_folder / 'alg_interface.pkl', compressed=True) results = utils.deserialize(tmp_folder / 'results.pkl') sub_fit_params = utils.deserialize(tmp_folder / 'fit_params.pkl') loaded_params = utils.deserialize(tmp_folder / 'params.pkl') if loaded_params != params: print('Got different params than the saved ones, ' 'hyperparameter optimizer might be non-deterministic') print(f'{params=}') print(f'{loaded_params=}', flush=True) # logger.log(1, 'Got different params than the saved ones, ' # 'hyperparameter optimizer might be non-deterministic') # don't set could_load to true, recompute utils.delete_file(tmp_folder / 'DONE') else: could_load = True if not could_load: # compute results tmp_folders = [tmp_folder / 'alg_interface' if tmp_folder is not None else None] alg_interface = self.create_alg_interface(idxs_list[0].n_trainval_splits, **utils.join_dicts(self.config, params)) results = alg_interface.fit_and_eval(ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folders=tmp_folders, name=name, metrics=metrics, return_preds=return_preds) sub_fit_params = alg_interface.get_fit_params() # save results if tmp_folder is not None: utils.serialize(tmp_folder / 'alg_interface.pkl', alg_interface, compressed=True) utils.serialize(tmp_folder / 'results.pkl', results) # serialize fit_params separately in case the alg_interface cannot be loaded utils.serialize(tmp_folder / 'fit_params.pkl', sub_fit_params) utils.serialize(tmp_folder / 'params.pkl', params) # save the "DONE" file last to indicate that all other files have been completely written utils.writeToFile(tmp_folder / 'DONE', '') # todo: could do 
sub_fit_params[0] instead since it's only one split anyway? results[0]['fit_params'] = {'hyper_fit_params': params, 'sub_fit_params': sub_fit_params} # store all parameters and results (metrics) without predictions self.results_list.append(utils.update_dict(results[0].get_dict(), remove_keys=['y_preds'])) val_loss = metrics.compute_val_score(results[0]['metrics']['val']) return val_loss, (results, alg_interface)
def fit_and_eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str, metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: assert len(idxs_list) == 1 # this is a SingleSplitAlgInterface assert len(tmp_folders) == 1 # this is a SingleSplitAlgInterface split_idxs = idxs_list[0] tmp_folder = tmp_folders[0] opt_desc = f'split {split_idxs.split_id} of {name}' if metrics is None: # create metrics because we need to have a validation score task_type = TaskType.CLASSIFICATION if ds.tensor_infos['y'].is_cat() else TaskType.REGRESSION val_metric_name = self.config.get('val_metric_name', Metrics.default_val_metric_name(task_type)) metrics = Metrics(metric_names=[val_metric_name], val_metric_name=val_metric_name, task_type=task_type) self.opt_step = 0 f = functools.partial(self.objective, ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folder=tmp_folder, name=name, metrics=metrics, return_preds=return_preds) hyper_fit_params, (results, best_alg_interface) = self.hyper_optimizer.optimize( f=f, seed=split_idxs.sub_split_seeds[0], opt_desc=opt_desc, logger=logger) self.best_alg_interface = best_alg_interface self.fit_params = [results[0]['fit_params']] results[0]['opt_step_results'] = self.results_list return results
def predict(self, ds: DictDataset) -> torch.Tensor: return self.best_alg_interface.predict(ds)
def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: ref_alg_interface = self.create_alg_interface(n_sub_splits=1, **self.max_resource_config) single_resources = ref_alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=split_seeds, n_train=n_train) single_resources.time_s *= (self.hyper_optimizer.get_n_hyperopt_steps() * n_cv + n_refit) * n_splits return single_resources
class RandomParamsAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ :param model_idx: used for seeding along with the seed given in fit(), so we can do random search HPO by combining multiple RandomParamsAlgInterface objects with different model_idx values. :param fit_params: Fit parameters (stopping epoch for refitting). :param config: Configuration parameters.
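A hypothetical random-search sketch, using the concrete subclass RandomParamsCatBoostAlgInterface from catboost_interfaces.py:

    candidates = [RandomParamsCatBoostAlgInterface(model_idx=i) for i in range(50)]

Since each interface combines the seed given in fit() with its model_idx, the 50 candidates deterministically sample 50 different hyperparameter configurations.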
""" super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def _sample_params(self, is_classification: bool, seed: int, n_train: int): raise NotImplementedError() # override in subclass def _create_interface_from_config(self, n_tv_splits: int, **config): raise NotImplementedError() def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': raise NotImplementedError('Refit is not fully implemented...') # return RandomParamsNNAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, # **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int, n_train: int, n_tv_splits: int): # this is also set in get_required_resources, but okay if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [self._sample_params(is_classification, hparam_seed, n_train)] # todo: need epoch for refit return self._create_interface_from_config(n_tv_splits=n_tv_splits, fit_params=None, **utils.update_dict(self.config, self.fit_params[0])) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 n_tv_splits = idxs_list[0].n_trainval_splits self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed, n_train=idxs_list[0].n_train, n_tv_splits=n_tv_splits) print(f'{self.fit_params[0]=}') self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) self.fit_params[0]['sub_fit_params'] = self.alg_interface.fit_params[0] def predict(self, ds: DictDataset) -> torch.Tensor: self.alg_interface.set_current_predict_params(self.get_current_predict_params_name()) return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0], n_train=n_train, n_tv_splits=n_cv) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) ================================================ FILE: pytabkit/models/alg_interfaces/autogluon_model_interfaces.py ================================================ import copy import os from typing import List, Any, Optional import numpy as np import pandas as pd import torch from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources, InterfaceResources from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.sub_split_interfaces import SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.utils import FunctionProcess class AutoGluonModelAlgInterface(SklearnSubSplitInterface): # parameters: use_gpu?, hp_family?, model_types, max_n_models_per_type # possible values for hp_family: default, zeroshot, zeroshot_hpo, zeroshot_hpo_hybrid, default_FTT, light # possible values for model_types: 'FASTAI', 'NN_TORCH', 'FT_TRANSFORMER', 'XGB', 'CAT', 'GBM', 'RF', 'XT' def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: from autogluon.tabular import TabularPredictor params_config = [] params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 
else gpu_devices[0] val_metric_name = self.config.get('val_metric_name') # todo: random_state? other_kwargs = dict() if self.n_classes > 0: problem_type = 'binary' if self.n_classes == 2 else 'multiclass' if val_metric_name is None or val_metric_name == 'class_error': eval_metric = 'accuracy' elif val_metric_name == 'cross_entropy': eval_metric = 'log_loss' else: raise ValueError(f'{val_metric_name=} not implemented') else: problem_type = 'regression' if val_metric_name is None or val_metric_name == 'rmse': eval_metric = 'rmse' elif val_metric_name.startswith('pinball('): problem_type = 'quantile' eval_metric = 'pinball_loss' other_kwargs = dict(quantile_levels=[float(val_metric_name[len('pinball('):-1])]) else: raise ValueError(f'{val_metric_name=} not implemented') self.eval_metric = eval_metric return TabularPredictor(label='label', eval_metric=eval_metric, problem_type=problem_type, path=self.config.get('tmp_folder', None), verbosity=self.config.get('verbosity', 0), log_to_file=False, **other_kwargs) def _create_df(self, X: pd.DataFrame, y: Optional[np.ndarray]): new_columns = {'input_' + col_name: X[col_name] for col_name in X.columns} if y is not None: new_columns['label'] = y df = pd.DataFrame(new_columns) if y is not None: is_reg = y.dtype.kind == 'f' df['label'] = df['label'].astype('float64' if is_reg else 'category') return df def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 use_gpu = self.config.get('use_gpu', False) model_types = self.config['model_types'] if isinstance(model_types, str): model_types = [model_types] has_ft_transformer = 'FT_TRANSFORMER' in model_types updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_samples': 8e-5, 'n_samples*n_features': 5e-6} ram_params = {'': 0.5 if use_gpu else 3.0, 'ds_onehot_size_gb': 1.5} gpu_ram_params = {'': 0.4, 'ds_onehot_size_gb': 1.5, 'n_features': 3e-2 if has_ft_transformer else 1e-4} if use_gpu else None rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1 if use_gpu else 0, gpu_usage=0.02 if use_gpu else 0.0) return rc.get_required_resources(ds) def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray, cat_col_names: Optional[List[str]] = None): df = self._create_df(x_df, y) # by default, we ignore the validation set since most sklearn methods do not support it n_samples = len(x_df) train_mask = np.ones(shape=(n_samples,), dtype=np.bool_) train_mask[val_idxs] = False hparams_selected = dict() from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config hparams = copy.deepcopy(get_hyperparameter_config(self.config.get('hp_family', 'default'))) interface_resources: InterfaceResources = self.config['interface_resources'] cuda_ids = [device[len('cuda:'):] for device in interface_resources.gpu_devices if device.startswith('cuda:')] use_gpu = len(cuda_ids) > 0 # todo: this is only correct if the variable wasn't already set before # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(cuda_ids) print(f'_fit_sklearn: {torch.cuda.is_initialized()=}') # todo: does it work? 
print(f'{torch.cuda.device_count()=}') print(f'{cuda_ids=}') print(f'{os.getenv("CUDA_VISIBLE_DEVICES")=}') max_n_models_per_type = self.config.get('max_n_models_per_type', 0) hparams_idx = self.config.get('hparams_idx', None) model_types = self.config['model_types'] if isinstance(model_types, str): model_types = [model_types] for key, value in hparams.items(): # if key in ['FASTAI', 'NN_TORCH', 'FT_TRANSFORMER']: if key not in model_types: continue if not isinstance(value, list): value = [value] if hparams_idx is not None: value = [value[hparams_idx]] if max_n_models_per_type > 0 and len(value) > max_n_models_per_type: value = value[:max_n_models_per_type] for config in value: config['ag_args_fit'] = dict(num_gpus=1 if use_gpu else 0) if key == 'FT_TRANSFORMER': config['ag_args_fit']['_max_features'] = 100_000 config['_max_features'] = 100_000 hparams_selected[key] = value print(f'{hparams_selected=}') self.model.fit(df.iloc[train_mask], tuning_data=df.iloc[~train_mask], presets='medium_quality', fit_weighted_ensemble=False, fit_full_last_level_weighted_ensemble=False, hyperparameters=hparams_selected, ) # fit_func = lambda df, hparams_selected, train_mask, model: model.fit(df.iloc[train_mask], tuning_data=df.iloc[~train_mask], # presets='medium_quality', # fit_weighted_ensemble=False, # fit_full_last_level_weighted_ensemble=False, # hyperparameters=hparams_selected, # ) # # print(f'Running fit on autogluon model') # # # fit_func(df, hparams_selected, train_mask, self.model) # self.model = FunctionProcess(fit_func, df, hparams_selected, train_mask, self.model).start().pop_result() # print(f'fit completed') def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict(self._create_df(x_df, None)).to_numpy() def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict_proba(self._create_df(x_df, None)).to_numpy() ================================================ FILE: pytabkit/models/alg_interfaces/base.py ================================================ from typing import Optional, List import numpy as np import torch class SplitIdxs: """ Represents multiple train-validation-test splits for AlgInterface. """ def __init__(self, train_idxs: torch.Tensor, val_idxs: Optional[torch.Tensor], test_idxs: Optional[torch.Tensor], split_seed: int, sub_split_seeds: List[int], split_id: int): """ :param train_idxs: Tensor of shape (n_trainval_splits, n_train_idxs). Each of the train-val splits needs to have the same number of training samples. The elements of the tensor should index the training set elements in a larger dataset. :param val_idxs: Tensor of shape (n_trainval_splits, n_val_idxs), or None if no validation set should be used. :param test_idxs: Tensor of shape (n_test_idxs,). The same test set will be used for all train-val splits. :param split_seed: Random seed for algorithms on this split. :param sub_split_seeds: Separate random seeds for algorithms on each train-val split (length should be n_trainval_splits). :param split_id: ID of this split (for logging/saving purposes). 
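Hypothetical example with two train-val splits over 8 trainval samples plus 2 test samples:

    split = SplitIdxs(train_idxs=torch.tensor([[0, 1, 2, 3, 4, 5], [2, 3, 4, 5, 6, 7]]),
                      val_idxs=torch.tensor([[6, 7], [0, 1]]),
                      test_idxs=torch.tensor([8, 9]),
                      split_seed=0, sub_split_seeds=[1, 2], split_id=0)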
""" self.train_idxs = train_idxs self.val_idxs = val_idxs self.test_idxs = test_idxs self.split_seed = split_seed self.sub_split_seeds = sub_split_seeds self.split_id = split_id self.n_trainval_splits = train_idxs.shape[0] self.n_train = train_idxs.shape[-1] self.n_val = 0 if val_idxs is None else val_idxs.shape[-1] self.n_test = 0 if test_idxs is None else test_idxs.shape[-1] if len(self.sub_split_seeds) != self.n_trainval_splits: raise ValueError('len(self.alg_seeds) != self.n_trainval_splits') if val_idxs is not None and val_idxs.shape[0] != self.n_trainval_splits: raise ValueError('val_idxs.shape[0] != self.n_trainval_splits') def get_sub_split_idxs(self, i: int) -> 'SubSplitIdxs': return SubSplitIdxs(self.train_idxs[i], self.val_idxs[i] if self.val_idxs is not None else None, self.test_idxs, self.sub_split_seeds[i]) def get_sub_split_idxs_alt(self, i: int) -> 'SplitIdxs': return SplitIdxs(self.train_idxs[i:i+1], self.val_idxs[i:i+1] if self.val_idxs is not None else None, self.test_idxs, self.split_seed, self.sub_split_seeds[i:i+1], split_id=self.split_id) class SubSplitIdxs: """ Represents a single trainval-test split with multiple train-val splits """ def __init__(self, train_idxs: torch.Tensor, val_idxs: Optional[torch.Tensor], test_idxs: Optional[torch.Tensor], alg_seed: int): # train_idxs: n_train_idxs # val_idxs: n_val_idxs (optional) # test_idxs: n_test_idxs (optional) self.train_idxs = train_idxs self.val_idxs = val_idxs self.test_idxs = test_idxs self.alg_seed = alg_seed self.n_train = train_idxs.shape[-1] self.n_val = 0 if val_idxs is None else val_idxs.shape[-1] self.n_test = 0 if test_idxs is None else test_idxs.shape[-1] class InterfaceResources: """ Simple class representing resources that a method is allowed to use (number of threads and GPUs). """ def __init__(self, n_threads: int, gpu_devices: List[str], time_in_seconds: Optional[int] = None): self.n_threads = n_threads self.gpu_devices = gpu_devices self.time_in_seconds = time_in_seconds class RequiredResources: """ Represents estimated/requested resources by a method. 
""" def __init__(self, time_s: float, n_threads: float, cpu_ram_gb: float, n_gpus: int = 0, gpu_usage: float = 1.0, gpu_ram_gb: float = 0.0, n_explicit_physical_cores: int = 0): self.n_threads = n_threads self.cpu_ram_gb = cpu_ram_gb self.n_gpus = n_gpus self.gpu_usage = gpu_usage self.gpu_ram_gb = gpu_ram_gb self.time_s = time_s # for liquidSVM, want to have contiguous core indices self.n_explicit_physical_cores = n_explicit_physical_cores def get_resource_vector(self, fixed_resource_vector: np.ndarray): own_resources = np.asarray([self.n_threads, self.cpu_ram_gb, self.gpu_usage, self.gpu_ram_gb]) if self.should_add_fixed_resources(): # do not use fixed cpu ram since that is also measured for GPU usage own_resources += fixed_resource_vector multiplier = np.asarray([1.0, 1.0, self.n_gpus, self.n_gpus]) return multiplier * own_resources def should_add_fixed_resources(self) -> bool: return self.n_gpus > 0 @staticmethod def combine_sequential(resources_list: List['RequiredResources']): return RequiredResources(time_s=sum([r.time_s for r in resources_list]), n_threads=max([r.n_threads for r in resources_list]), cpu_ram_gb=max([r.cpu_ram_gb for r in resources_list]), n_gpus=max([r.n_gpus for r in resources_list]), gpu_usage=max([r.gpu_usage for r in resources_list]), gpu_ram_gb=max([r.gpu_ram_gb for r in resources_list]), n_explicit_physical_cores=max([r.n_explicit_physical_cores for r in resources_list]), ) ================================================ FILE: pytabkit/models/alg_interfaces/calibration.py ================================================ import traceback from pathlib import Path from typing import List, Optional, Tuple, Dict, Any, Callable import numpy as np import scipy import sklearn import torch import torch.nn as nn from dask.array import greater from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import log_loss, make_scorer from sklearn.model_selection import StratifiedKFold, GridSearchCV from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset from pytabkit.models.training.logging import Logger import math class PostHocCalibrationAlgInterface(AlgInterface): def __init__(self, alg_interface: AlgInterface, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.alg_interface = alg_interface self.calibrators = [] self.n_calibs = [] def _transform_probs(self, probs: np.ndarray) -> np.ndarray: offset = self.config.get('calib_input_offset', 0.0) if offset != 0.0: probs = probs + offset probs = probs / np.sum(probs, axis=-1, keepdims=True) return probs def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) \ -> Optional[List[List[List[Tuple[Dict, float]]]]]: self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) y_preds = self.alg_interface.predict(ds) self.n_tv_splits_list_ = [idxs.n_trainval_splits for idxs in idxs_list] if self.config.get('calibrate_per_fold', True): for tt_split_idx, split_idxs in enumerate(idxs_list): for tv_split_idx in range(split_idxs.n_trainval_splits): val_idxs = split_idxs.val_idxs[tv_split_idx] y = ds.tensors['y'][val_idxs] y_pred = y_preds[len(self.calibrators), val_idxs] y_pred_probs = torch.softmax(y_pred, dim=-1) 
import probmetrics.calibrators import probmetrics.distributions calib = probmetrics.calibrators.get_calibrator(**self.config) if self.config.get('calibrate_with_logits', True): calib.fit_torch(y_pred=probmetrics.distributions.CategoricalLogits(y_pred.detach().cpu()), y_true_labels=y[:, 0]) else: calib.fit(self._transform_probs(y_pred_probs.detach().cpu().numpy()), y.cpu().numpy()[:, 0]) self.calibrators.append(calib) self.n_calibs.append(val_idxs.shape[-1]) else: y_pred_idx = 0 for tt_split_idx, split_idxs in enumerate(idxs_list): y_pred_list = [] y_list = [] for tv_split_idx in range(split_idxs.n_trainval_splits): val_idxs = split_idxs.val_idxs[tv_split_idx] y_pred_list.append(y_preds[y_pred_idx, val_idxs]) y_list.append(ds.tensors['y'][val_idxs]) y_pred_idx += 1 y_pred = torch.cat(y_pred_list, dim=0) y = torch.cat(y_list, dim=0) import probmetrics.calibrators import probmetrics.distributions calib = probmetrics.calibrators.get_calibrator(**self.config) if self.config.get('calibrate_with_logits', True): calib.fit_torch(y_pred=probmetrics.distributions.CategoricalLogits(y_pred.detach().cpu()), y_true_labels=y[:, 0].detach().cpu()) else: calib.fit(self._transform_probs(torch.softmax(y_pred, dim=-1).detach().cpu().numpy()), y.cpu().numpy()[:, 0]) self.calibrators.extend([calib] * split_idxs.n_trainval_splits) self.n_calibs.extend([y_pred.shape[0]] * split_idxs.n_trainval_splits) self.fit_params = [dict(sub_fit_params=fp) for fp in self.alg_interface.fit_params] return None def predict(self, ds: DictDataset) -> torch.Tensor: y_preds = self.alg_interface.predict(ds) y_preds_probs = torch.softmax(y_preds, dim=-1) y_preds_calib = [] if self.config.get('ensemble_before_calib', False): start_idx = 0 for n_tv_splits in self.n_tv_splits_list_: avg_probs = y_preds_probs[start_idx:start_idx+n_tv_splits].mean(dim=0, keepdim=True) y_preds_probs[start_idx:start_idx + n_tv_splits] = avg_probs start_idx += n_tv_splits y_preds = torch.log(y_preds_probs + 1e-30) for i in range(y_preds.shape[0]): if self.config.get('calibrate_with_logits', True): from probmetrics.distributions import CategoricalLogits y_pred_calib = self.calibrators[i].predict_proba_torch( CategoricalLogits(y_preds[i].detach().cpu())).get_probs() else: y_pred_calib = self.calibrators[i].predict_proba( self._transform_probs(y_preds_probs[i].detach().cpu().numpy())) # the np.array(...) is for avoiding read-only array warnings y_pred_calib = torch.as_tensor(np.array(y_pred_calib), dtype=torch.float32) if self.config.get('use_calib_offset', False): y_pred_calib += 1. 
/ self.n_calibs[i] y_pred_calib = torch.log(y_pred_calib + 1e-30) y_preds_calib.append(y_pred_calib) result = torch.stack(y_preds_calib, dim=0) # print(f'{y_preds.shape=}, {result.shape=}') return result def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: return self.alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) def to(self, device: str) -> None: self.alg_interface.to(device) ================================================ FILE: pytabkit/models/alg_interfaces/catboost_interfaces.py ================================================ import copy import warnings from pathlib import Path from typing import Optional, Dict, Any, List, Tuple, Union import numpy as np import torch from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.resource_params import ResourceParams from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import TreeBasedSubSplitInterface, \ SingleSplitWrapperAlgInterface, \ SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, \ OptAlgInterface, RandomParamsAlgInterface from pytabkit.models.training.metrics import Metrics class CatBoostSklearnSubSplitInterface(SklearnSubSplitInterface): def _get_cat_indexes_arg_name(self) -> str: return 'cat_features' def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None, 1000), ('depth', ['depth', 'max_depth'], 6), ('random_strength', None, 1.0), ('l2_leaf_reg', None, 3.0), ('depth', ['depth', 'max_depth'], 6), ('learning_rate', ['lr', 'learning_rate', 'eta']), ('one_hot_max_size', None), ('bagging_temperature', None), ('leaf_estimation_iterations', None), ('bootstrap_type', None), ('subsample', None), ('sampling_frequency', None), ('boosting_type', None), ('colsample_bylevel', ['colsample_bylevel', 'rsm'], None), ('min_data_in_leaf', ['min_data_in_leaf', 'min_child_samples'], None), ('grow_policy', None), ('num_leaves', None), ('border_count', ['border_count', 'max_bin']), ('thread_count', ['thread_count', 'n_threads'], n_threads), ('verbose', None, False), ('allow_writing_files', None, False), ] params = utils.extract_params(self.config, params_config) if self.n_classes > 0: from catboost import CatBoostClassifier return CatBoostClassifier(random_state=seed, **params) else: from catboost import CatBoostRegressor return CatBoostRegressor(random_state=seed, **params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_depth=6), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.cb_class_time, cpu_ram_params=ResourceParams.cb_class_ram) return rc.get_required_resources(ds) class CatBoostCustomMetric: # see https://stackoverflow.com/questions/65462220/how-to-create-custom-eval-metric-for-catboost # and https://catboost.ai/en/docs/concepts/python-usages-examples def __init__(self, metric_name: str, is_classification: bool, is_higher_better: bool = 
False, select_pred_col: Optional[int] = None): self.metric_name = metric_name self.is_classification = is_classification self.is_higher_better = is_higher_better self.select_pred_col = select_pred_col def is_max_optimal(self): return self.is_higher_better def evaluate(self, approxes, target, weight): assert len(target) == len(approxes[0]) assert weight is None y = torch.as_tensor(target, dtype=torch.long if self.is_classification else torch.float32) if len(y.shape) == 1: y = y[:, None] y_pred = torch.as_tensor(np.array(approxes), dtype=torch.float32).t() # CatBoost already provides logits in approxes if self.select_pred_col is not None: y_pred = y_pred[:, self.select_pred_col, None] if self.is_classification and y_pred.shape[1] == 1: # binary classification, CatBoost provides logits of the class 1 p = torch.sigmoid(y_pred) y_pred_probs = torch.cat([1. - p, p], dim=1) y_pred = torch.log(y_pred_probs + 1e-30) # print(f'{y.shape=}, {y_pred.shape=}') # print(f'{y_pred=}') loss = Metrics.apply(y_pred, y, self.metric_name).item() weight_sum = y.shape[0] return weight_sum * loss, weight_sum def get_final_error(self, error, weight): return error / (weight + 1e-38) class CatBoostSubSplitInterface(TreeBasedSubSplitInterface): def _get_params(self): # target parameter names, possible source parameter names, default value params_config = [('n_estimators', None, 1000), ('depth', ['depth', 'max_depth'], 6), ('random_strength', None, 1.0), ('l2_leaf_reg', None, 3.0), ('learning_rate', ['lr', 'learning_rate', 'eta']), ('one_hot_max_size', None), ('bagging_temperature', None), ('leaf_estimation_iterations', None), ('bootstrap_type', None), ('subsample', None), ('boosting_type', None, 'Plain'), # fix default to Plain to equalize CPU and GPU ('colsample_bylevel', ['colsample_bylevel', 'rsm'], None), ('min_data_in_leaf', ['min_data_in_leaf', 'min_child_samples'], None), ('grow_policy', None), ('max_leaves', ['max_leaves', 'num_leaves'], None), ('border_count', ['border_count', 'max_bin'], 254), # fix default to 254 for GPU as well ('used_ram_limit', None), ('od_type', 'Iter'), ('od_pval', None), ('od_wait', ['od_wait', 'early_stopping_rounds'], None), ('sampling_frequency', None), ('max_ctr_complexity', None), ('model_size_reg', None), ] params = utils.extract_params(self.config, params_config) params['verbose'] = self.config.get('verbosity', 0) > 0 bootstrap_type = params.get('bootstrap_type', 'Bayesian') if bootstrap_type == 'Bayesian': if 'subsample' in params: del params['subsample'] elif bootstrap_type == 'Bernoulli': if 'bagging_temperature' in params: del params['bagging_temperature'] grow_policy = params.get('grow_policy', 'SymmetricTree') if grow_policy != 'Lossguide': if 'max_leaves' in params: del params['max_leaves'] if grow_policy == 'SymmetricTree': if 'min_data_in_leaf' in params: del params['min_data_in_leaf'] return params def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': assert n_refit == 1 return CatBoostSubSplitInterface(fit_params=fit_params or self.fit_params, **self.config) def _get_eval_metric(self, val_metric_name: Optional[str], n_classes: int) -> Union[str, CatBoostCustomMetric]: if n_classes == 0: if val_metric_name is None or val_metric_name == 'rmse': return 'RMSE' else: return CatBoostCustomMetric(metric_name=val_metric_name, is_classification=n_classes > 0, is_higher_better=False) # else: # raise ValueError(f'Validation metric "{val_metric_name}" is currently not implemented for CatBoost') else: # classification if 
val_metric_name is None or val_metric_name == 'classification_error': return 'ZeroOneLoss' elif val_metric_name == 'cross_entropy': return 'Logloss' if n_classes == 2 else 'MultiClass' elif val_metric_name == 'brier' and n_classes == 2: # catboost doesn't support brier score for multiclass yet return 'BrierScore' else: return CatBoostCustomMetric(metric_name=val_metric_name, is_classification=n_classes > 0, is_higher_better=False) # else: # raise ValueError(f'Validation metric "{val_metric_name}" is currently not implemented for CatBoost') # adapted from https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/catboost_experiment.py def _preprocess_params(self, params: Dict[str, Any], n_classes: int) -> Dict[str, Any]: params = copy.deepcopy(params) device: Optional[str] = params.pop('device', None) if device is not None and device.startswith('cuda'): params['task_type'] = 'GPU' params['devices'] = device.split(':')[1] if device.startswith('cuda:') else '0' if n_classes == 0: train_metric_name = self.config.get('train_metric_name', 'mse') # val_metric_name = self.config.get('val_metric_name', 'rmse') if train_metric_name == 'mse': params['loss_function'] = 'RMSE' elif train_metric_name.startswith('pinball('): quantile_str = train_metric_name[len('pinball('):-1] params['loss_function'] = f'Quantile:alpha={quantile_str}' else: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') elif n_classes == 2: params.update({'loss_function': 'Logloss'}) else: params.update({'loss_function': 'MultiClass', 'classes_count': n_classes}) params['eval_metric'] = self._get_eval_metric(self.config.get('val_metric_name', None), n_classes) params['allow_writing_files'] = False params['use_best_model'] = False # otherwise trees would get removed based only on a single split for key in ['random_strength', 'one_hot_max_size', 'leaf_estimation_iterations']: if key in params: params[key] = int(params[key]) return params def _convert_ds(self, ds: DictDataset) -> Any: import catboost x_df = ds.without_labels().to_df() if self.config.get('shuffle_columns', False): if not hasattr(self, 'col_perm_'): self.col_perm_ = np.random.permutation(x_df.shape[1]) x_df = x_df.iloc[:, self.col_perm_] label = None if 'y' not in ds.tensors else ds.tensors['y'].cpu().numpy() cat_features = x_df.select_dtypes(include='category').columns.tolist() return catboost.Pool(x_df, label, cat_features=cat_features) def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: import catboost # print(f'Fitting CatBoost') n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() params = self._preprocess_params(params, n_classes) params.update({'random_seed': seed, 'thread_count': n_threads}) if val_ds is None: params = utils.update_dict(params, remove_keys=['od_type', 'od_pval', 'od_wait']) if tmp_folder is not None: params.update({'allow_writing_files': True, 'save_snapshot': True, 'snapshot_file': str(tmp_folder / 'catboost_model.cbm'), 'snapshot_interval': 120.0}) # with these parameters, catboost will reload from the model automatically if it is there bst = catboost.CatBoost(params) with warnings.catch_warnings(): warnings.filterwarnings('ignore', message='Can\'t optimize method "evaluate" because self argument is used') bst.fit(self._convert_ds(train_ds), eval_set=None if val_ds is None else self._convert_ds(val_ds)) if val_ds 
is not None: evals_result = bst.get_evals_result() # print(f'{evals_result["validation"]=}') eval_metric = self._get_eval_metric(self.config.get('val_metric_name', None), n_classes) eval_metric_name = eval_metric if isinstance(eval_metric, str) else eval_metric.__class__.__name__ val_errors = evals_result['validation'][eval_metric_name] else: val_errors = None return bst, val_errors def _predict(self, bst, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: # bst should be of type catboost.CatBoost # print(f'CatBoost _predict(): {other_params=}') ntree_end = 0 if other_params is None else other_params['n_estimators'] prediction_type = 'RawFormulaVal' if n_classes == 0 else 'LogProbability' y_pred = torch.as_tensor( bst.predict(self._convert_ds(ds), ntree_end=ntree_end, prediction_type=prediction_type), dtype=torch.float32) if n_classes == 0: y_pred = y_pred.unsqueeze(-1) # print(f'{y_pred.shape=}') # print(f'{y_pred.mean(dim=0)=}') # # if torch.any(y_pred == -np.inf): # y_pred_prob = torch.softmax(y_pred, dim=-1) # # y_pred_prob = y_pred_prob.clamp(1e-10, 1) # y_pred = torch.log(y_pred_prob + 1e-30) # y_pred = torch.clamp(y_pred, -100.0, 100.0) # todo # print(f'min: {torch.min(y_pred).item():g}, max: {torch.max(y_pred).item():g}') return y_pred def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_n_threads=8, max_depth=6), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.cb_class_time, cpu_ram_params=ResourceParams.cb_class_ram) return rc.get_required_resources(ds) class CatBoostHyperoptAlgInterface(OptAlgInterface): def __init__(self, space=None, n_hyperopt_steps: int = 50, **config): from hyperopt import hp default_config = {} max_config = {} # if space is None: # modified space from catboost quality benchmarks # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/catboost_experiment.py # space = { # 'depth': hp.choice('depth', [6]), # # only 'ctr_target_border_count' exists for this catboost version # # 'ctr_border_count': hp.choice('ctr_border_count', [16]), # 'border_count': hp.choice('border_count', [128]), # # deprecated, CounterMax not allowed # # 'ctr_description': hp.choice('ctr_description', [['Borders', 'CounterMax']]), # 'learning_rate': hp.loguniform('learning_rate', -5, 0), # 'random_strength': hp.choice('random_strength', [1, 20]), # 'one_hot_max_size': hp.choice('one_hot_max_size', [0, 25]), # 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), # 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), # 'used_ram_limit': hp.choice('used_ram_limit', [100000000000]), # } # need to add defaults as well if space is None: space = config.get('hpo_space_name', None) if space == 'NODE' or space == 'popov': # space from NODE paper: # Popov, Morozov, and Babenko, Neural oblivious decision ensembles for deep learning on tabular data # the parameter names in the space are for the alg interface, not directly for the GBDT interface! 
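# note: hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the bounds below are in
# natural-log space; e.g. hp.loguniform('learning_rate', -5, 0) yields learning rates in [exp(-5) ~ 0.0067, 1]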
space = { 'learning_rate': hp.loguniform('learning_rate', -5, 0), 'random_strength': hp.quniform('random_strength', 1, 20, 1), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 10, 1), } default_config = dict(n_estimators=2048) max_config['max_depth'] = 6 elif space == 'shwartz-ziv': # from Shwartz-Ziv and Armon, Tabular data: Deep learning is not all you need # same as NODE except higher upper bound for leaf estimation iterations # the parameter names in the space are for the alg interface, not directly for the GBDT interface! space = { 'learning_rate': hp.loguniform('learning_rate', -5, 0), 'random_strength': hp.quniform('random_strength', 1, 20, 1), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), } default_config = dict(n_estimators=2048) # not specified in the paper, so we take the value from NODE max_config['max_depth'] = 6 elif space == 'tabpfn' or space == 'hollmann': # from Hollmann, Müller, Eggensperger, Hutter, # TabPFN: A Transformer That Solves Small Tabular Classification Problems in a Second # similar to shwartz-ziv except that one_hot_max_size is not specified and n_estimators is optimized # the parameter names in the space are for the alg interface, not directly for the GBDT interface! space = { 'n_estimators': hp.quniform('n_estimators', 100, 4000, 1), 'learning_rate': hp.loguniform('learning_rate', -5, 0), 'random_strength': hp.quniform('random_strength', 1, 20, 1), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), } elif space == 'gorishniy': # from Gorishniy, Rubachev, Khrulkov, Babenko, Revisiting Deep Learning Models for Tabular Data space = { 'max_depth': hp.quniform('max_depth', 3, 10, 1), 'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), 0), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 10, 1), } default_config = dict(n_estimators=2000) max_config['max_depth'] = 10 config = utils.update_dict(default_config, config) super().__init__(hyper_optimizer=HyperoptOptimizer(space=space, fixed_params=dict(), n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config, max_config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**config) for i in range(n_sub_splits)]) class RandomParamsCatBoostAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from Shwartz-Ziv et al.
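# The sampling idiom used throughout _sample_params: np.exp(rng.uniform(np.log(a), np.log(b)))
# draws log-uniformly from [a, b] (median sqrt(a*b), e.g. 5e-2 for a=5e-3 and b=5e-1), and
# rng.integers(lo, hi, endpoint=True) draws integers from the inclusive range [lo, hi].
# Since rng = np.random.default_rng(seed), the sampled configuration is reproducible per seed.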
hpo_space_name = self.config.get('hpo_space_name', 'shwartz-ziv') if hpo_space_name == 'shwartz-ziv': space = { 'learning_rate': np.exp(rng.uniform(-5, 0)), 'random_strength': rng.integers(1, 20, endpoint=True), 'one_hot_max_size': rng.integers(0, 25, endpoint=True), 'l2_leaf_reg': np.exp(rng.uniform(0, np.log(10))), 'bagging_temperature': rng.uniform(0, 1), 'leaf_estimation_iterations': rng.integers(1, 20, endpoint=True), 'n_estimators': 1000, 'max_depth': 6 } elif hpo_space_name == 'large': # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(5e-1))), # bootstrap 'bootstrap_type': rng.choice(['Bayesian', 'Bernoulli']), # todo: could do more 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.5, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise', 'Lossguide']), 'max_depth': rng.integers(1, 10, endpoint=True), # todo: support more for Lossguide 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128.0)))), # only for Depthwise and Lossguide 'max_leaves': round(np.exp(rng.uniform(np.log(4.0), np.log(128.0)))), # only for Lossguide 'colsample_bylevel': rng.uniform(0.5, 1.0), 'random_strength': rng.uniform(0.0, 20.0), # todo: make log-uniform? 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(2, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(2e0))), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), } elif hpo_space_name == 'large-v2': # slightly shrunk version of large # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.5, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise', 'Lossguide']), # todo: could shrink 'max_depth': rng.integers(2, 10, endpoint=True), # todo: support more for Lossguide 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128.0)))), # only for Depthwise and Lossguide 'max_leaves': round(np.exp(rng.uniform(np.log(4.0), np.log(128.0)))), # only for Lossguide 'colsample_bylevel': rng.uniform(0.5, 1.0), 'random_strength': rng.uniform(0.0, 20.0), # todo: make log-uniform? 
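# note (cf. _preprocess_params above): random_strength is int-cast there together with
# one_hot_max_size and leaf_estimation_iterations, so this uniform float sample appears
# to be truncated to an integer before reaching CatBoost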
'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(2, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(2e0))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v3': # slightly shrunk version of large-v2 (shrunk random_strength and max_depth, colsample_bylevel, model_size_reg) # avoided removing lossguide for now # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.5, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise', 'Lossguide']), # todo: could shrink 'max_depth': rng.integers(4, 10, endpoint=True), # todo: support more for Lossguide 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128.0)))), # only for Depthwise and Lossguide 'max_leaves': round(np.exp(rng.uniform(np.log(4.0), np.log(128.0)))), # only for Lossguide 'colsample_bylevel': rng.uniform(0.6, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(2, 128, endpoint=True), # todo: make logarithmic? 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v4': # slightly shrunk version of large-v3: # removed Lossguide -> also removed max_leaves # shrunk colsample_bylevel, min_data_in_leaf, one_hot_max_size # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 
'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v5': # large-v4 but with max_depth <= 8 as in tabrepo1 space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v6': # large-v5 but with tabrepo lr search space space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 
'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v7': # large-v5 but with early_stopping_rounds=50 space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'early_stopping_rounds': 50, 'max_bin': 254, # added this to be sure 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v8-10k': # large-v7 but with 10k estimators and the tabrepo1 lr search space space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'max_bin': 254, # added this to be sure 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 
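# as in large-v4 and later: Lossguide is no longer included in grow_policy, so max_leaves
# (which is only used by Lossguide) is not sampled here either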
'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v9-10k': # large-v8-10k but without tuning random_strength space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'max_bin': 254, # added this to be sure 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'tabrepo1': space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 8, endpoint=True), 'l2_leaf_reg': rng.uniform(1.0, 5.0), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), 'one_hot_max_size': rng.choice([2, 3, 5, 10]), 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), } elif hpo_space_name == 'tabrepo1-es': space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 8, endpoint=True), 'l2_leaf_reg': rng.uniform(1.0, 5.0), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), 'one_hot_max_size': rng.choice([2, 3, 5, 10]), 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), } elif hpo_space_name == 'tabrepo1-es-10k': space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 8, endpoint=True), 'l2_leaf_reg': rng.uniform(1.0, 5.0), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), 'one_hot_max_size': rng.choice([2, 3, 5, 10]), 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), } elif hpo_space_name == 'tabarena': space = { 'n_estimators': 10_000, 'early_stopping_rounds': 300, # probably not exactly equivalent to TabArena 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'bootstrap_type': 
'Bernoulli', 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'colsample_bylevel': rng.uniform(0.85, 1.0), 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'leaf_estimation_iterations': np.floor(np.exp(rng.uniform(np.log(1.0), np.log(21.0)))), # categorical features 'one_hot_max_size': np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0)))), 'model_size_reg': np.exp(rng.uniform(np.log(0.1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'max_bin': 254, # added this to be sure } else: raise ValueError(f'Unknown hpo_space_name "{hpo_space_name}"') return space def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/alg_interfaces/ensemble_interfaces.py ================================================ import copy import time from pathlib import Path from typing import List, Optional, Dict import numpy as np import torch from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset, TaskType from pytabkit.models.torch_utils import cat_if_necessary from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics from pytabkit.models.utils import ObjectLoadingContext class WeightedPrediction: def __init__(self, y_pred_list: List[torch.Tensor], task_type: TaskType): self.task_type = task_type self.y_pred_converted_list = y_pred_list if task_type == TaskType.REGRESSION \ else [torch.softmax(y_pred, dim=-1) for y_pred in y_pred_list] def predict_for_weights(self, weights: np.ndarray): weights = weights.astype(np.float32) norm_weights = weights / np.sum(weights) weighted_sum = sum([w * y_pred for w, y_pred in zip(norm_weights, self.y_pred_converted_list)]) if self.task_type == TaskType.CLASSIFICATION: weighted_sum = torch.log(weighted_sum + 1e-30) return weighted_sum class CaruanaEnsembleAlgInterface(SingleSplitAlgInterface): """ Following a simple variant of Caruana et al.
(2004), "Ensemble selection from libraries of models" without pre-selection of candidates """ def __init__(self, alg_interfaces: List[AlgInterface], fit_params: Optional[List[Dict]] = None, **config): super().__init__(fit_params=fit_params, **config) self.alg_interfaces = alg_interfaces self.task_type = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return CaruanaEnsembleAlgInterface([alg_interface.get_refit_interface(n_refit=n_refit) for alg_interface in self.alg_interfaces], fit_params=fit_params or self.fit_params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 # if tmp_folders is specified, then models will be saved there instead of holding all of them in memory tmp_folder = tmp_folders[0] self.alg_contexts_ = [ObjectLoadingContext(ai, None if tmp_folder is None else tmp_folder / f'model_{i}') for i, ai in enumerate(self.alg_interfaces)] # store copies here, but the ones that will actually be trained are in alg_contexts_ # this means that models should not be held in RAM all the time self.alg_interfaces = copy.deepcopy(self.alg_interfaces) sub_fit_params = [] # train sub-models for alg_idx, alg_ctx in enumerate(self.alg_contexts_): with alg_ctx as alg_interface: sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in tmp_folders] if self.config.get('diversify_seeds', False): sub_idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed + alg_idx, sub_split_seeds=[sss + alg_idx for sss in idxs.sub_split_seeds], split_id=idxs.split_id) for idxs in idxs_list] else: sub_idxs_list = idxs_list alg_interface.fit(ds, sub_idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}') sub_fit_params.append(alg_interface.get_fit_params()[0]) if self.fit_params is not None: # this is the refit stage, there is no validation data set to determine the weights on, # instead the weights are already in fit_params return if idxs_list[0].val_idxs is None: raise ValueError('CaruanaEnsembleAlgInterface.fit(): Neither a validation set ' 'nor ensemble weights were provided') self.task_type = TaskType.CLASSIFICATION if ds.tensor_infos[ 'y'].get_cat_size_product() > 0 else TaskType.REGRESSION val_metric_name = self.config.get('ens_weight_metric_name', self.config.get('val_metric_name', None)) if val_metric_name is None: val_metric_name = Metrics.default_val_metric_name(task_type=self.task_type) n_caruana_steps = self.config.get('n_caruana_steps', 40) # default value is taken from TabRepo paper (IIRC) y_preds_oob_list = [] time_limit_s: Optional[float] = self.config.get('time_limit_s', None) start_time = time.time() for alg_idx, alg_ctx in enumerate(self.alg_contexts_): if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s: break with alg_ctx as alg_interface: y_preds = alg_interface.predict(ds) # get out-of-bag predictions y_preds_oob_list.append(cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]] for j in range(idxs_list[0].val_idxs.shape[0])], dim=0)) # get out-of-bag labels y = ds.tensors['y'] y_oob = cat_if_necessary([y[idxs_list[0].val_idxs[j]] for j in range(idxs_list[0].val_idxs.shape[0])], dim=0) weights = np.zeros(len(self.alg_contexts_), dtype=np.int32) best_weights = np.copy(weights) best_loss = np.inf 
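# Summary of the greedy Caruana loop below: starting from the all-zero integer weight
# vector, each of the n_caruana_steps rounds tries incrementing every model's weight by
# one (and, if allow_negative_weights is set and the weights sum to at least 2, also tries
# decrementing it), scores the weighted ensemble on the out-of-bag predictions, and moves
# to the best candidate; the best weight vector seen overall is stored in fit_params.
# For example, weights [2, 0, 1] are normalized by WeightedPrediction to [2/3, 0, 1/3];
# for classification, the softmax probabilities are averaged with these weights before
# taking the log.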
wp = WeightedPrediction(y_preds_oob_list, self.task_type) allow_negative_weights = self.config.get('allow_negative_weights', False) for step_idx in range(n_caruana_steps): best_step_weights = None best_step_loss = np.inf for weight_idx in range(weights.shape[0]): weights[weight_idx] += 1 y_pred_oob = wp.predict_for_weights(weights) loss = Metrics.apply(y_pred_oob.cpu(), y_oob.cpu(), val_metric_name).item() # print(f'{weights=}, {loss=}') if loss < best_step_loss: best_step_loss = loss best_step_weights = np.copy(weights) weights[weight_idx] -= 1 # negative weights option # check sum(weights) >= 2, allowing for floating-point errors if allow_negative_weights and np.sum(weights) >= 1.5: weights[weight_idx] -= 1 y_pred_oob = wp.predict_for_weights(weights) loss = Metrics.apply(y_pred_oob.cpu(), y_oob.cpu(), val_metric_name).item() # print(f'{weights=}, {loss=}') if loss < best_step_loss: best_step_loss = loss best_step_weights = np.copy(weights) weights[weight_idx] += 1 if best_step_loss < best_loss: best_loss = best_step_loss best_weights = np.copy(best_step_weights) weights = best_step_weights logger.log(2, f'Obtained ensemble weights: {best_weights}') self.fit_params = [dict(alg_weights=best_weights.tolist(), sub_fit_params=sub_fit_params)] def predict(self, ds: DictDataset) -> torch.Tensor: weights = self.fit_params[0]['alg_weights'] sparse_weights = [] sparse_preds = [] for i, w in enumerate(weights): if w != 0: with self.alg_contexts_[i] as alg_interface: sparse_preds.append(alg_interface.predict(ds)) sparse_weights.append(w) wp = WeightedPrediction(sparse_preds, task_type=self.task_type) return wp.predict_for_weights(weights=np.asarray(sparse_weights)) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: single_resources = [ ssi.get_required_resources(ds, n_cv, n_refit, n_splits=n_splits, split_seeds=split_seeds, n_train=n_train) for ssi in self.alg_interfaces] return RequiredResources.combine_sequential(single_resources) def to(self, device: str) -> None: for alg_idx, alg_ctx in enumerate(self.alg_contexts_): with alg_ctx as alg_interface: alg_interface.to(device) class AlgorithmSelectionAlgInterface(SingleSplitAlgInterface): """ Picks the best model out of a list of candidates.
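The selection metric is taken from the alg_sel_metric_name config option (falling back to
val_metric_name); on refit, only the selected sub-interface is trained again, using the
best_alg_idx stored in fit_params. A minimal usage sketch (with hypothetical sub-interface
objects if_a and if_b):

    interface = AlgorithmSelectionAlgInterface([if_a, if_b], val_metric_name='cross_entropy')
    interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name='selection')
    y_pred = interface.predict(ds)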
""" def __init__(self, alg_interfaces: List[AlgInterface], fit_params: Optional[List[Dict]] = None, **config): super().__init__(fit_params=fit_params, **config) self.alg_interfaces = alg_interfaces self.task_type = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': # todo: could use sub_fit_params refit_interfaces = [] for alg_context in self.alg_contexts_: with alg_context as alg_interface: refit_interfaces.append(alg_interface.get_refit_interface(n_refit=n_refit)) return AlgorithmSelectionAlgInterface(refit_interfaces, fit_params=fit_params or self.fit_params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 # if tmp_folders is specified, then models will be saved there instead of holding all of them in memory tmp_folder = tmp_folders[0] self.alg_contexts_ = [ObjectLoadingContext(ai, None if tmp_folder is None else tmp_folder / f'model_{i}') for i, ai in enumerate(self.alg_interfaces)] # store copies here, but the ones that will actually be trained are in alg_contexts_ # this means that models should not be held in RAM all the time self.alg_interfaces = copy.deepcopy(self.alg_interfaces) if self.fit_params is not None: # this is the refit stage, there is no validation data set to determine the best model on, # instead the best model index is already in fit_params best_alg_idx = self.fit_params[0]['best_alg_idx'] sub_tmp_folders = [tmp_folder / str(best_alg_idx) if tmp_folder is not None else None for tmp_folder in tmp_folders] with self.alg_contexts_[best_alg_idx] as alg_interface: alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{best_alg_idx}') return if idxs_list[0].val_idxs is None: raise ValueError('CaruanaEnsembleAlgInterface.fit(): Neither a validation set ' 'nor fit_params were provided') self.task_type = TaskType.CLASSIFICATION if ds.tensor_infos[ 'y'].get_cat_size_product() > 0 else TaskType.REGRESSION val_metric_name = self.config.get('alg_sel_metric_name', self.config.get('val_metric_name', None)) if val_metric_name is None: val_metric_name = Metrics.default_val_metric_name(task_type=self.task_type) # get out-of-bag labels y = ds.tensors['y'] y_oob = cat_if_necessary([y[idxs_list[0].val_idxs[i]] for i in range(idxs_list[0].val_idxs.shape[0])], dim=0) best_alg_idx = 0 best_alg_loss = np.inf best_sub_fit_params = None time_limit_s: Optional[float] = self.config.get('time_limit_s', None) start_time = time.time() for alg_idx, alg_ctx in enumerate(self.alg_contexts_): if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s: break with alg_ctx as alg_interface: sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in tmp_folders] alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}') y_preds = alg_interface.predict(ds) # get out-of-bag predictions y_pred_oob = cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]] for j in range(idxs_list[0].val_idxs.shape[0])], dim=0) loss = Metrics.apply(y_pred_oob.cpu(), y_oob.cpu(), val_metric_name).item() if loss < best_alg_loss: best_alg_loss = loss best_alg_idx = alg_idx best_sub_fit_params = alg_interface.get_fit_params()[0] self.fit_params = [dict(best_alg_idx=best_alg_idx, sub_fit_params=best_sub_fit_params)] logger.log(2, f'Best algorithm has 
index {best_alg_idx}') logger.log(2, f'Algorithm selection fit parameters: {self.fit_params[0]}') def predict(self, ds: DictDataset) -> torch.Tensor: alg_idx = self.fit_params[0]['best_alg_idx'] with self.alg_contexts_[alg_idx] as alg_interface: return alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: # too pessimistic for refit... single_resources = [ ssi.get_required_resources(ds, n_cv, n_refit, n_splits=n_splits, split_seeds=split_seeds, n_train=n_train) for ssi in self.alg_interfaces] return RequiredResources.combine_sequential(single_resources) def to(self, device: str) -> None: for alg_idx, alg_ctx in enumerate(self.alg_contexts_): with alg_ctx as alg_interface: alg_interface.to(device) class PrecomputedPredictionsAlgInterface(SingleSplitAlgInterface): def __init__(self, y_preds_cv: torch.Tensor, y_preds_refit: Optional[torch.Tensor], fit_params_cv: Dict, fit_params_refit: Optional[Dict]): super().__init__() self.y_preds_cv = y_preds_cv self.y_preds_refit = y_preds_refit self.is_refit = None self.fit_params_cv = fit_params_cv self.fit_params_refit = fit_params_refit def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return self # todo: does this work? def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: self.is_refit = idxs_list[0].val_idxs is None self.fit_params = [self.fit_params_refit] if self.is_refit else [self.fit_params_cv] def predict(self, ds: DictDataset) -> torch.Tensor: if ds.n_samples != self.y_preds_cv.shape[1]: raise ValueError('Prediction can only be performed on the exact same dataset ' 'because this uses precomputed predictions') return self.y_preds_refit if self.is_refit else self.y_preds_cv def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: return RequiredResources(time_s=1e-5 * ds.n_samples, cpu_ram_gb=2.0, n_threads=1) ================================================ FILE: pytabkit/models/alg_interfaces/lightgbm_interfaces.py ================================================ import copy from pathlib import Path from typing import Optional, Dict, Tuple, Any, List import numpy as np import torch from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.resource_params import ResourceParams from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import OptAlgInterface, \ AlgInterface, RandomParamsAlgInterface from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import TreeBasedSubSplitInterface, SingleSplitWrapperAlgInterface, \ SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer import warnings from pytabkit.models.training.metrics import Metrics class LGBMCustomMetric: def __init__(self, metric_name: str, is_classification: bool, is_higher_better: bool = False): self.metric_name = metric_name self.is_classification = is_classification self.is_higher_better = is_higher_better def __call__(self, y_pred: np.ndarray, eval_data): # eval_data should be of type lgbm.Dataset y = torch.as_tensor(eval_data.get_label(), 
dtype=torch.long if self.is_classification else torch.float32) if len(y.shape) == 1: y = y[:, None] # print(f'{y_pred.shape=}, {eval_data.get_label().shape=}') y_pred = torch.as_tensor(y_pred, dtype=torch.float32) if len(y_pred.shape) == 1: if self.is_classification: if y_pred.shape[0] == y.shape[0]: # binary classification, transform into both class probabilities y_pred = torch.stack([1. - y_pred, y_pred], dim=-1) else: # bugged multiclass classification, need to reshape # print(y_pred[:7]) y_pred = y_pred.view(-1, y.shape[0]).t().contiguous() # print(y_pred[0, :].sum()) else: y_pred = y_pred[:, None] if self.is_classification: # go from probabilities to logits y_pred = torch.log(y_pred + 1e-30) eval_result = Metrics.apply(y_pred, y, metric_name=self.metric_name) # print(f'LightGBM metric value: {self.metric_name} = {eval_result.item():g}') return self.metric_name, eval_result, self.is_higher_better class LGBMSklearnSubSplitInterface(SklearnSubSplitInterface): def _get_cat_indexes_arg_name(self) -> str: return 'categorical_feature' def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None), ('max_depth', None), ('verbosity', None), ('learning_rate', ['lr', 'learning_rate', 'eta']), ('subsample', ['subsample', 'bagging_fraction']), ('colsample_bytree', ['colsample_bytree', 'feature_fraction']), ('bagging_freq', None), ('min_data_in_leaf', None), ('min_sum_hessian_in_leaf', ['min_sum_hessian_in_leaf', 'min_child_weight']), ('lambda_l1', ['lambda_l1', 'alpha', 'reg_alpha']), ('lambda_l2', ['lambda_l2', 'lambda', 'reg_lambda']), ('num_leaves', None), ('min_child_weight', None), ('boosting_type', None), ('max_bin', None), ('cat_smooth', None), ('cat_l2', None), ('n_jobs', ['n_jobs', 'n_threads'], n_threads), ] params = utils.extract_params(self.config, params_config) if self.n_classes > 0: from lightgbm import LGBMClassifier return LGBMClassifier(random_state=seed, **params) else: from lightgbm import LGBMRegressor return LGBMRegressor(random_state=seed, **params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, num_leaves=31), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.lgbm_class_time, cpu_ram_params=ResourceParams.lgbm_class_ram) return rc.get_required_resources(ds) class LGBMSubSplitInterface(TreeBasedSubSplitInterface): def _get_params(self): params_config = [('n_estimators', None, 1000), ('max_depth', None), ('verbosity', None, -1), ('learning_rate', ['lr', 'learning_rate', 'eta'], 0.1), ('subsample', ['subsample', 'bagging_fraction'], 1.0), ('colsample_bytree', ['colsample_bytree', 'feature_fraction'], 1.0), ('bagging_freq', None, 1), # 1 is not the default in the interface but 0 could be misleading ('min_data_in_leaf', None, 20), ('min_sum_hessian_in_leaf', ['min_sum_hessian_in_leaf', 'min_child_weight'], 1e-3), ('lambda_l1', ['lambda_l1', 'alpha', 'reg_alpha'], 0.0), ('lambda_l2', ['lambda_l2', 'lambda', 'reg_lambda'], 0.0), ('num_leaves', None, 31), ('boosting', ['boosting', 'boosting_type'], None), ('max_bin', None), ('cat_smooth', None), ('cat_l2', None), ('early_stopping_round', ['early_stopping_round', 'early_stopping_rounds'], None), ('extra_trees', None), ('max_cat_to_onehot', None), ('min_data_per_group', None), ] params = 
utils.extract_params(self.config, params_config) return params def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': assert n_refit == 1 return LGBMSubSplitInterface(fit_params=fit_params or self.fit_params, **self.config) # adapted from https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/lightgbm_experiment.py def _preprocess_params(self, params: Dict[str, Any], n_classes: int) -> Dict[str, Any]: params = copy.deepcopy(params) if n_classes == 0: train_metric_name = self.config.get('train_metric_name', 'mse') if train_metric_name == 'mse': params.update({'objective': 'mean_squared_error'}) elif train_metric_name.startswith('pinball('): quantile = float(train_metric_name[len('pinball('):-1]) params.update({'objective': 'quantile', 'alpha': quantile}) else: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') elif n_classes <= 2: params.update({'objective': 'binary'}) elif n_classes > 2: params.update({'objective': 'multiclass', 'num_class': n_classes}) if 'num_leaves' in params: params['num_leaves'] = max(int(params['num_leaves']), 2) if 'min_data_in_leaf' in params: params['min_data_in_leaf'] = int(params['min_data_in_leaf']) return params def _convert_ds(self, ds: DictDataset) -> Any: import lightgbm as lgbm x_cont = ds.tensors['x_cont'].cpu().numpy() label = None if 'y' not in ds.tensors else ds.tensors['y'].cpu().numpy() if label is not None and label.shape[1] == 1: label = label[:, 0] has_cat = 'x_cat' in ds.tensor_infos and ds.tensor_infos['x_cat'].get_n_features() > 0 if not has_cat: # no categorical columns return lgbm.Dataset(x_cont, label=label, categorical_feature=[]) x_df = ds.without_labels().to_df() cat_features = x_df.select_dtypes(include='category').columns.tolist() return lgbm.Dataset(x_df, label, categorical_feature=cat_features) def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: import lightgbm as lgbm from lightgbm import record_evaluation # print(f'Fitting LightGBM') n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() params = self._preprocess_params(params, n_classes) params.update({ 'data_random_seed': 1 + seed, 'feature_fraction_seed': 2 + seed, 'bagging_seed': 3 + seed, 'drop_seed': 4 + seed, 'objective_seed': 5 + seed, 'extra_seed': 6 + seed, 'num_threads': n_threads }) eval_metric = None eval_name = None feval = None if val_ds is not None: if val_metric_name is None: val_metric_name = 'class_error' if n_classes > 0 else 'rmse' if val_metric_name == 'class_error': eval_metric = 'binary_error' if n_classes <= 2 else 'multi_error' elif val_metric_name == 'cross_entropy': eval_metric = 'binary_logloss' if n_classes <= 2 else 'multi_logloss' elif val_metric_name == 'rmse': eval_metric = 'rmse' elif val_metric_name == 'mae': eval_metric = 'mae' else: eval_name = val_metric_name feval = LGBMCustomMetric(val_metric_name, is_classification=n_classes > 0) if eval_metric is None: # specified custom metric, don't use pre-given metric eval_metric = "None" else: eval_name = eval_metric params['metric'] = eval_metric if val_ds is None: params = utils.update_dict(params, remove_keys=['early_stopping_round', 'early_stopping_rounds']) evals = [] if val_ds is None else [self._convert_ds(val_ds)] valid_names = [] if val_ds is None else ['val'] evals_result = {} train_ds = 
self._convert_ds(train_ds) # warning filtering taken from https://auto.gluon.ai/dev/_modules/autogluon/tabular/models/lgb/lgb_model.html with warnings.catch_warnings(): # Filter harmless warnings introduced in lightgbm 3.0, # future versions plan to remove: https://github.com/microsoft/LightGBM/issues/3379 warnings.filterwarnings('ignore', message='Overriding the parameters from Reference Dataset.') warnings.filterwarnings('ignore', message='categorical_column in param dict is overridden.') bst = lgbm.train(utils.update_dict(params, remove_keys=['n_estimators']), train_ds, valid_sets=evals, valid_names=valid_names, feval=feval, callbacks=[record_evaluation(evals_result)], num_boost_round=params['n_estimators']) # print(f'{params["n_estimators"]=}') if val_ds is not None: # print('evals_result val:', evals_result['val'], flush=True) val_errors = evals_result['val'][eval_name] else: val_errors = None return bst, val_errors def _predict(self, bst, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: # bst should be of type lgbm.Booster # print(f'LGBM _predict() with {other_params=}') num_iteration = None if other_params is None else other_params['n_estimators'] y_pred = torch.as_tensor(bst.predict(self._convert_ds(ds).data, num_iteration=num_iteration), dtype=torch.float32) if n_classes == 0: y_pred = y_pred.unsqueeze(-1) elif n_classes <= 2: y_pred = torch.stack([1. - y_pred, y_pred], dim=-1) if n_classes >= 1: y_pred = torch.log(y_pred + 1e-30) # print(f'min: {torch.min(y_pred).item():g}, max: {torch.max(y_pred).item():g}') return y_pred def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, num_leaves=31, max_n_threads=8), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.lgbm_class_time, cpu_ram_params=ResourceParams.lgbm_class_ram) return rc.get_required_resources(ds) class LGBMHyperoptAlgInterface(OptAlgInterface): def __init__(self, space=None, n_hyperopt_steps: int = 50, opt_method: str = 'hyperopt', **config): from hyperopt import hp default_config = {} max_config = dict() if space is None: space = config.get('hpo_space_name', None) if space == 'catboost_quality_benchmarks': # space from catboost quality benchmarks, # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/lightgbm_experiment.py # the parameter names in the space are for the alg interface, not directly for the GBDT interface! space = { 'learning_rate': hp.loguniform('learning_rate', -7, 0), 'num_leaves': hp.qloguniform('num_leaves', 0, 7, 1), 'feature_fraction': hp.uniform('feature_fraction', 0.5, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -16, 5), 'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), 'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]), } default_config = dict(n_estimators=5000) max_config['num_leaves'] = 1000 # about exp(7) elif space == 'tabpfn' or space == 'hollmann': # from Hollmann, Müller, Eggensperger, Hutter, # TabPFN: A Transformer That Solves Small Tabular Classification Problems in a Second # the parameter names in the space are for the alg interface, not directly for the GBDT interface! 
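# note on hp.choice: it places equal prior mass on each option, so
# hp.choice('lambda_l1', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]) below is a uniform prior over
# the grid, while the nested form used in the space above,
# hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), mixes a point
# mass at 0 with a log-uniform branch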
space = { 'n_estimators': hp.quniform('n_estimators', 50, 2000, 1), # in the paper it says that this is not log but that's hard to believe, # especially when e^{-3} is the lower bound 'learning_rate': hp.loguniform('learning_rate', -3, 0), 'num_leaves': hp.qloguniform('num_leaves', np.log(5), np.log(50), 1), 'max_depth': hp.qloguniform('max_depth', np.log(3), np.log(20), 1), 'subsample': hp.uniform('subsample', 0.2, 0.8), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -5, 4), # this is min_child_weight 'lambda_l1': hp.choice('lambda_l1', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]), # this is reg_alpha 'lambda_l2': hp.choice('lambda_l2', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]), # this is reg_lambda } max_config['num_leaves'] = 50 elif space == 'mt-reg': # hand-guessed space for regression if opt_method == 'smac': from ConfigSpace import ConfigurationSpace, Float, Integer space = ConfigurationSpace() space.add_hyperparameters([ Integer('num_leaves', (16, 256), log=True, default=100), Float('feature_fraction', (0.4, 1), default=0.7), Float('bagging_fraction', (0.6, 1), default=1.0), Integer('min_data_in_leaf', (1, 64), log=True, default=3), ]) else: # assume hyperopt space = { 'num_leaves': hp.qloguniform('num_leaves', np.log(16), np.log(256), 1), 'feature_fraction': hp.uniform('feature_fraction', 0.4, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1), np.log(64), 1), } default_config = dict(n_estimators=1000, learning_rate=5e-2, min_sum_hessian_in_leaf=1e-5) max_config['num_leaves'] = 256 elif space == 'mt-reg-2': # hand-guessed space for regression space = { 'num_leaves': hp.qloguniform('num_leaves', np.log(16), np.log(256), 1), 'learning_rate': hp.loguniform('learning_rate', np.log(2.5e-2), np.log(1e-1)), 'feature_fraction': hp.uniform('feature_fraction', 0.4, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1), np.log(64), 1), } default_config = dict(n_estimators=1000, min_sum_hessian_in_leaf=1e-5) max_config['num_leaves'] = 256 config = utils.update_dict(default_config, config) opt_class = SMACOptimizer if opt_method == 'smac' else HyperoptOptimizer super().__init__(hyper_optimizer=opt_class(space=space, fixed_params=dict(), n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config, max_config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**config) for i in range(n_sub_splits)]) class RandomParamsLGBMAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from catboost quality benchmarks hpo_space_name = self.config.get('hpo_space_name', 'cqb') if hpo_space_name == 'cqb': space = { 'learning_rate': np.exp(rng.uniform(-7, 0)), 'num_leaves': round(np.exp(rng.uniform(0, 7))), 'feature_fraction': rng.uniform(0.5, 1), 'bagging_fraction': rng.uniform(0.5, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(0, 6))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(-16, 5)), 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(-16, 2))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(-16, 2))]), 'n_estimators': 1000, } elif hpo_space_name == 'large': space = { 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(5e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0),
np.log(256)))), 'feature_fraction': rng.uniform(0.3, 1), 'bagging_fraction': rng.uniform(0.3, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(20.0))), 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'n_estimators': 1000, 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(256)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(2.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v2': space = { 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(1e-2), np.log(1e-1))), # shrunk 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'n_estimators': 1000, 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v2-10k': space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(2e-2))), # shrunk 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v3-10k': # v2 but with the lr space of tabrepo1 space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), 
np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v4-10k': # v3-10k but without tuning bagging_fraction space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v5-10k': # v3-10k but without tuning all the categorical parameters space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'large-v6-10k': # v3-10k but with the tabrepo1 search space for feature_fraction space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': 
round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v7-10k': # v6-10k but with increased min_data_in_leaf space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(2.0), np.log(64)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v8-10k': # v6-10k but without tuning lambda_l1 and lambda_l2 space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v9-10k': # v8-10k but without tuning min_sum_hessian_in_leaf space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': 
round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v10-10k': # v9-10k but with num_leaves from tabrepo1 space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(16.0), np.log(255)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v11-10k': # v9-10k but without tuning bagging_fraction space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'tabrepo1-es': space = { 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1.0), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(2.0), np.log(60.0)))), 'num_leaves': round(np.exp(rng.uniform(np.log(16.0), np.log(255)))), 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'tabrepo1-es-10k': space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1.0), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(2.0), np.log(60.0)))), 'num_leaves': round(np.exp(rng.uniform(np.log(16.0), np.log(255)))), 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'tabrepo1-fixed-es-10k': space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1.0), 'min_data_in_leaf': rng.integers(2, 60, endpoint=True), 'num_leaves': rng.integers(16, 255, endpoint=True), 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'tabarena': space = { 'early_stopping_rounds': 300, # not exactly equivalent, probably 'n_estimators': 10_000, 'learning_rate': 
np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'bagging_freq': 1, # already the default here but not in original LightGBM 'num_leaves': round(np.floor(np.exp(rng.uniform(np.log(2.0), np.log(201))))), # round() casts the float from np.floor to int, matching the other integer-valued entries 'min_data_in_leaf': round(np.floor(np.exp(rng.uniform(np.log(1.0), np.log(65))))), 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.floor(np.exp(rng.uniform(np.log(2.0), np.log(101))))), 'cat_l2': np.exp(rng.uniform(np.log(5e-3), np.log(2.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0))))), 'lambda_l1': np.exp(rng.uniform(np.log(1e-5), np.log(1.0))), 'lambda_l2': np.exp(rng.uniform(np.log(1e-5), np.log(2.0))), } else: raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}') return space def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/alg_interfaces/nn_interfaces.py ================================================ import copy import warnings from pathlib import Path from typing import List, Optional, Dict, Any, Union import numpy as np import torch from pytabkit.models.training.nn_creator import get_realmlp_auto_batch_size try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl import logging from datetime import timedelta from pytabkit.models import utils from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer from pytabkit.models.nn_models.base import Layer, Variable from pytabkit.models.nn_models.models import NNFactory from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.torch_utils import cat_if_necessary from pytabkit.models.training.lightning_modules import TabNNModule, postprocess_multiquantile from pytabkit.models.training.logging import Logger from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, SingleSplitAlgInterface, OptAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources def get_lignting_accel_and_devices(device: str): if device == 'cpu': pl_accelerator = 'cpu' pl_devices = 'auto' elif device == 'mps': pl_accelerator = 'mps' pl_devices = 'auto' elif device == 'cuda': pl_accelerator = 'gpu' pl_devices = [0] elif device.startswith('cuda:'): pl_accelerator = 'gpu' pl_devices = [int(device[len('cuda:'):])] else: raise ValueError(f'Unknown device "{device}"') return pl_accelerator, pl_devices class NNAlgInterface(AlgInterface): def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.model: Optional[TabNNModule] = None self.device = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return NNAlgInterface(fit_params if fit_params is not None else self.fit_params, **self.config) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str): # the code below requires all splits to have the same number of sub-splits assert np.all([idxs_list[i].train_idxs.shape[0] == idxs_list[0].train_idxs.shape[0] for i in range(len(idxs_list))]) # we can then decompose the overall number of sub-splits into
the number of splits # and the number of sub-splits per split # print(f'Starting NN fit') # have the option to change the seeds (for comparing NNs with different random seeds) random_seed_offset = self.config.get('random_seed_offset', 0) if random_seed_offset != 0: idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed + random_seed_offset, sub_split_seeds=[seed + random_seed_offset for seed in idxs.sub_split_seeds], split_id=idxs.split_id) for idxs in idxs_list] if self.config.get('same_seed_for_sub_splits', False): idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed, sub_split_seeds=[idxs.sub_split_seeds[0]] * len(idxs.sub_split_seeds), split_id=idxs.split_id) for idxs in idxs_list] # https://stackoverflow.com/questions/74364944/how-to-get-rid-of-info-logging-messages-in-pytorch-lightning log = logging.getLogger("lightning") log.propagate = False log.setLevel(logging.ERROR) warnings.filterwarnings("ignore", message="You defined a `validation_step` but have no `val_dataloader`.") old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 torch.backends.cuda.matmul.allow_tf32 = False # to be safe wrt rounding errors, but might not be necessary # todo: allow preprocessing on CPU and then only put batches on GPU in data loader? gpu_devices = interface_resources.gpu_devices self.device = gpu_devices[0] if len(gpu_devices) > 0 else 'cpu' ds = ds.to(self.device) fit_params = self.fit_params if self.fit_params is None and 'stop_epoch' in self.config: fit_params = [dict(stop_epoch=self.config['stop_epoch'])] * len(idxs_list) n_epochs = self.config.get('n_epochs', 256) self.model = TabNNModule(**utils.join_dicts({'n_epochs': 256, 'logger': logger}, self.config), fit_params=fit_params) self.model.compile_model(ds, idxs_list, interface_resources) pl_accelerator, pl_devices = get_lignting_accel_and_devices(self.device) max_time = None if interface_resources.time_in_seconds is None else timedelta( seconds=interface_resources.time_in_seconds) self.min_trainer_kwargs = dict( max_time=max_time, accelerator=pl_accelerator, devices=pl_devices, max_epochs=n_epochs, enable_checkpointing=False, enable_progress_bar=False, num_sanity_val_steps=0, enable_model_summary=False, log_every_n_steps=1, ) # don't save the trainer in self, otherwise it stores the dataset trainer = pl.Trainer( max_time=max_time, accelerator=pl_accelerator, devices=pl_devices, callbacks=self.model.create_callbacks(), max_epochs=n_epochs, enable_checkpointing=False, enable_progress_bar=False, num_sanity_val_steps=0, logger=pl.loggers.logger.DummyLogger(), enable_model_summary=False, log_every_n_steps=1, ) trainer.fit( model=self.model, train_dataloaders=self.model.train_dl, val_dataloaders=self.model.val_dl ) if hasattr(self.model, 'fit_params'): self.fit_params = self.model.fit_params torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 # remove all remaining references to GPU tensors, for some reason this can't be done in the model del self.model._trainer # self.model.to('cpu') # to allow serialization without GPU issues, but doesn't work # print(f'Importances (sorted):', self.get_importances().sort()[0]) def predict(self, ds: DictDataset) -> torch.Tensor: pred_dict = self.get_current_predict_params_dict() if 'val_metric_name' in pred_dict: self.model.restore_ckpt_for_val_metric_name(pred_dict['val_metric_name']) old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 
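# The assignment below repeats the save/toggle/restore pattern that fit() uses for the global TF32 flag:
# TF32 matmuls on Ampere+ GPUs trade precision for speed, so the flag is disabled here to be safe
# about rounding errors and restored afterwards, so that other code sees an unchanged global setting.
# A minimal standalone sketch of the same pattern (illustrative only, not part of this module):
#     old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
#     torch.backends.cuda.matmul.allow_tf32 = False  # force full fp32 matmul precision
#     try:
#         ...  # numerically sensitive code
#     finally:
#         torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32  # restore the saved value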
torch.backends.cuda.matmul.allow_tf32 = False self.model.to(self.device) ds = ds.to(self.device) ds_x, _ = ds.split_xy() pl_accelerator, pl_devices = get_lignting_accel_and_devices(self.device) # create new trainer so we don't have to pickle the full trainer that references the dataset somehow # update devices since the model device may have been moved since trainer = pl.Trainer(**(self.min_trainer_kwargs | dict(accelerator=pl_accelerator, devices=pl_devices, logger=pl.loggers.logger.DummyLogger()))) y_pred = trainer.predict(model=self.model, dataloaders=self.model.get_predict_dataloader(ds_x)) y_pred = cat_if_necessary(y_pred, dim=-2).to('cpu') # concat along batch dimension y_pred = postprocess_multiquantile(y_pred, **self.config) # postprocessing in case of multiquantile loss torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 # remove all remaining references to GPU tensors, for some reason this can't be done in the model del self.model._trainer return y_pred def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: val_metric_names = self.config.get('val_metric_names', None) if val_metric_names is None: return {'': dict()} else: return {f'_val-{val_metric_name}': dict(val_metric_name=val_metric_name) for val_metric_name in val_metric_names} def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: tensor_infos = ds.tensor_infos factory = self.config.get('factory', None) if factory is None: factory = NNFactory(**self.config) fitter = factory.create(tensor_infos) static_fitter, dynamic_fitter = fitter.split_off_dynamic() static_tensor_infos = static_fitter.forward_tensor_infos(tensor_infos) n_params = fitter.get_n_params(tensor_infos) n_forward = fitter.get_n_forward(tensor_infos) n_parallel = max(n_cv, n_refit) * n_splits * self.config.get('n_ens', 1) batch_size = self.config.get('batch_size', 256) if batch_size == 'auto': batch_size = get_realmlp_auto_batch_size(n_train) # print(f'{batch_size=}') n_epochs = self.config.get('n_epochs', 256) # per-element RAM usage: # continuous data requires 4 bytes for forward pass and 4 for backward pass # categorical data requires 8 bytes for forward pass (because torch.long is required) and none for backward pass pass_memory = n_forward * batch_size * 8 # initial batch size ignored ds_size_gb = ds.n_samples * sum([ti.get_n_features() * (8 if ti.is_cat() else 4) for ti in static_tensor_infos.values()]) / (1024 ** 3) ds_ram_gb = 5 * ds_size_gb # ds_ram_gb = 3 * task_info.get_ds_size_gb() / (1024**3) param_memory = 5 * n_params * 8 # 5 because of model, model copy, grads, adam mom, adam sq_mom fixed_ram_gb = 0.3 # go safe # print(f'{pass_memory=}, {param_memory=}') # max memory that would be used if the dataset wasn't used init_ram_gb_full = n_forward * ds.n_samples * 8 / (1024 ** 3) init_ram_gb_max = 1.2 # todo: rough estimate, a bit larger than what is allowed in fit_transform_subsample() init_ram_gb = min(init_ram_gb_max, init_ram_gb_full) # init_ram_gb = 1.5 # print(f'{ds_ram_gb=}, {pass_memory/(1024**3)=}, {param_memory/(1024**3)=}, {init_ram_gb=}') factor = 1.2 # to go safe on ram gpu_ram_gb = fixed_ram_gb + ds_ram_gb + max(init_ram_gb, factor * (n_parallel * (pass_memory + param_memory)) / (1024 ** 3)) gpu_usage = min(1.0, n_parallel / 200) # rather underestimate it and use up all the ram on the gpu # go somewhat safe, should be small anyway cpu_ram_gb = 0.3 + ds_ram_gb + 1.3 * (pass_memory + param_memory) / (1024 ** 3) time_approx = 
ds.n_samples * n_epochs * 4e-5 * (2 if n_refit > 0 else 1) if self.config.get('use_gpu', True): return RequiredResources(time_s=time_approx, n_threads=1.0, cpu_ram_gb=cpu_ram_gb, n_gpus=1, gpu_usage=gpu_usage, gpu_ram_gb=gpu_ram_gb) else: return RequiredResources(time_s=time_approx, n_threads=1.0, cpu_ram_gb=cpu_ram_gb + gpu_ram_gb) def get_model_ram_gb(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int]): tensor_infos = ds.tensor_infos factory = self.config.get('factory', None) if factory is None: factory = NNFactory(**self.config) fitter = factory.create(tensor_infos) n_params = fitter.get_n_params(tensor_infos) n_parallel = max(n_cv, n_refit) * n_splits factor = 1.2 # to go safe on ram return factor * n_parallel * n_params * 4 / (1024 ** 3) def to(self, device: str) -> None: # print(f'Move RealMLP model to device {device}') self.model.to(device) self.device = device def get_importances(self) -> torch.Tensor: net: Layer = self.model.model params = net.parameters() scale = None weight = None importances_param = self.config.get('feature_importances', None) for param in params: param: Variable = param scope_str = str(param.context.scope) if scope_str.endswith('layer-0/scale'): scale = param elif scope_str.endswith('layer-0/weight'): weight = param # print(scope_str) assert weight is not None with torch.no_grad(): # shape: (vectorized network dims) x n_features importances = weight.norm(dim=-1) if scale is not None: importances *= scale[..., 0, :].abs() p = self.config.get('importances_exponent', 1.0) importances = importances ** p # # # hard feature selection # n_remove = int(0.9 * importances.shape[-1]) # new_importances = torch.ones_like(importances) # for i in range(importances.shape[0]): # new_importances[i, torch.argsort(importances[i])[:n_remove]] = 0.0 # importances = new_importances # print(importances) if importances_param is not None: print('Using importances_param') importances *= importances_param[..., :] importances /= (importances.norm(dim=-1, keepdim=True) / np.sqrt(importances.shape[-1])) return importances def get_first_layer_weights(self, with_scale: bool) -> torch.Tensor: net: Layer = self.model.model params = net.parameters() scale = None weight = None for param in params: param: Variable = param scope_str = str(param.context.scope) if scope_str.endswith('layer-0/scale'): scale = param elif scope_str.endswith('layer-0/weight'): weight = param assert weight is not None if scale is not None and with_scale: with torch.no_grad(): return weight * scale[..., 0, :, None] else: return weight.data # todo: have option to move to/from GPU class NNHyperoptAlgInterface(OptAlgInterface): def __init__(self, space: Optional[Union[str, Dict[str, Any]]] = None, n_hyperopt_steps: int = 50, opt_method: str = 'hyperopt', **config): from hyperopt import hp default_config = config # todo max_config = copy.copy(default_config) if space == 'default': space = { 'lr': hp.loguniform('lr', np.log(2e-2), np.log(3e-1)), 'num_emb_type': hp.choice('num_emb_type', ['none', 'pl', 'plr', 'pbld']), 'add_front_scale': hp.pchoice('add_front_scale', [(0.6, True), (0.4, False)]), # weighted (probability, value) options need hp.pchoice; hp.choice would sample the tuples themselves 'p_drop': hp.pchoice('p_drop', [(0.3, 0.0), (0.5, 0.15), (0.2, 0.3)]), 'wd': hp.choice('wd', [0.0, 0.02]), 'plr_sigma': hp.loguniform('plr_sigma', np.log(0.05), np.log(0.5)), 'act': hp.choice('act', ['relu', 'selu', 'mish']), 'hidden_sizes': hp.pchoice('hidden_sizes', [(0.6, [256] * 3), (0.2, [512]), (0.2, [64] * 5)]), 'ls_eps': hp.pchoice('ls_eps', [(0.3, 0.0), (0.7, 0.1)]) }
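# Note on the space above: hyperopt's hp.choice samples uniformly from plain options, while
# hp.pchoice takes (probability, value) pairs, as used for the weighted entries here.
# A minimal self-contained sketch of optimizing over such a space (illustrative only; the
# objective is a stand-in, assuming hyperopt is installed):
#     from hyperopt import hp, fmin, tpe
#     demo_space = {'wd': hp.pchoice('wd', [(0.7, 0.0), (0.3, 0.02)]),
#                   'lr': hp.loguniform('lr', np.log(2e-2), np.log(3e-1))}
#     best = fmin(fn=lambda cfg: cfg['lr'] + cfg['wd'], space=demo_space, algo=tpe.suggest, max_evals=10)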
utils.update_dict(default_config, remove_keys=list(space.keys())) elif not isinstance(space, dict): print(f'Unknown hyperparameter space: {space}') config = utils.update_dict(default_config, config) opt_class = SMACOptimizer if opt_method == 'smac' else HyperoptOptimizer super().__init__(hyper_optimizer=opt_class(space=space, fixed_params=default_config, n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return NNAlgInterface(**config) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: required_resources = super().get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) # add n_steps * model_ram_gb to required resources, because these will be stored alg_interface = NNAlgInterface(**self.max_resource_config) model_ram_gb = alg_interface.get_model_ram_gb(ds, n_cv, n_refit, n_splits, split_seeds) required_resources.cpu_ram_gb += self.hyper_optimizer.get_n_hyperopt_steps() * model_ram_gb return required_resources class RealMLPParamSampler: def __init__(self, is_classification: bool, hpo_space_name: str = 'default', **config): self.is_classification = is_classification self.hpo_space_name = hpo_space_name def sample_params(self, seed: int) -> Dict[str, Any]: assert self.hpo_space_name in ['default', 'clr', 'moresigma', 'moresigmadim', 'moresigmadimreg', 'moresigmadimsize', 'moresigmadimlr', 'probclass', 'probclass-mlp', 'large', 'alt1', 'alt2', 'alt3', 'alt4', 'alt5', 'alt6', 'alt7', 'alt8', 'alt9', 'alt10', 'tabarena', 'tabarena-new', 'alt11', 'alt12', 'alt13', 'alt14', 'alt15', 'alt16', 'alt17', 'alt18', 'alt19', 'alt20'] rng = np.random.default_rng(seed=seed) if self.hpo_space_name == 'probclass-mlp': params = {'lr': np.exp(rng.uniform(np.log(1e-4), np.log(1e-2))), 'p_drop': rng.choice([0.0, 0.1, 0.2, 0.3]), 'wd': rng.choice([0.0, 1e-5, 1e-4, 1e-3])} default_params = DefaultParams.VANILLA_MLP_CLASS if self.is_classification else DefaultParams.VANILLA_MLP_REG return utils.join_dicts(default_params, params) hidden_size_options = [[256] * 3, [64] * 5, [512]] params = {'num_emb_type': rng.choice(['none', 'pbld', 'pl', 'plr']), 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), # convert to actual bool so it can be serialized 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'p_drop': rng.choice([0.0, 0.15, 0.3], p=[0.3, 0.5, 0.2]), 'wd': rng.choice([0.0, 2e-2]), 'plr_sigma': np.exp(rng.uniform(np.log(0.05), np.log(0.5))), 'act': rng.choice(['relu', 'selu', 'mish']), 'hidden_sizes': hidden_size_options[rng.choice([0, 1, 2], p=[0.6, 0.2, 0.2])]} if self.is_classification: params['ls_eps'] = rng.choice([0.0, 0.1], p=[0.3, 0.7]) if self.hpo_space_name == 'clr': params['lr'] = np.exp(rng.uniform(np.log(2e-3), np.log(3e-1))) params['lr_sched'] = 'constant' params['use_early_stopping'] = True params['early_stopping_multiplicative_patience'] = 1 params['early_stopping_additive_patience'] = 16 elif self.hpo_space_name == 'moresigma': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) elif self.hpo_space_name == 'moresigmadim': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) elif self.hpo_space_name == 'moresigmadimreg': params['plr_sigma'] = 
np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) params['p_drop'] = rng.choice([0.0, rng.uniform(0.0, 0.5)]) params['wd'] = np.exp(rng.uniform(np.log(1e-5), np.log(4e-2))) elif self.hpo_space_name == 'moresigmadimsize': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) params['hidden_sizes'] = [rng.choice(np.arange(8, 513))] * rng.choice(np.arange(1, 6)) elif self.hpo_space_name == 'moresigmadimlr': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) params['lr'] = np.exp(rng.uniform(np.log(5e-3), np.log(5e-1))) elif self.hpo_space_name == 'probclass': params['ls_eps'] = rng.choice([0.0, 0.1]) params['wd'] = rng.choice([0.0, 2e-3, 2e-2]) elif self.hpo_space_name == 'large': params = {'num_emb_type': rng.choice(['none', 'pbld', 'pl', 'plr']), 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), 'n_hidden': round(np.exp(rng.uniform(np.log(64), np.log(512)))), 'n_layers': rng.integers(1, 5, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.0, 0.6), 'wd': rng.choice([rng.uniform(0.0, 1e-3), np.exp(rng.uniform(np.log(1e-3), np.log(1e-1)))]), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': rng.choice(['relu', 'selu', 'mish', 'silu', 'gelu']), 'use_parametric_act': rng.choice([False, True]), 'p_drop_sched': rng.choice(['flat_cos', 'constant']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, rng.uniform(0.0, 0.2)]), 'lr_sched': rng.choice(['coslog4', 'cos']), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt1': params = {'num_emb_type': rng.choice(['none', 'pbld']), 'n_hidden': rng.choice([128, 256, 384]), 'n_layers': rng.integers(1, 3, endpoint=True), 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e1))), 'act': rng.choice(['selu', 'mish', 'silu']), # 'use_parametric_act': rng.choice([False, True]), # 'p_drop_sched': rng.choice(['flat_cos', 'constant']), # 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(5e-2)))]), # 'lr_sched': rng.choice(['coslog4', 'cos']), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'use_early_stopping': True, 'early_stopping_multiplicative_patience': 2, 'early_stopping_additive_patience': 20, } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt2': # refined version of large params = {'num_emb_type': 'pbld', 'n_hidden': round(np.exp(rng.uniform(np.log(198), np.log(512)))), 'n_layers': rng.integers(1, 3, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': 
rng.uniform(0.06, 0.6), 'wd': np.exp(rng.uniform(np.log(6e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': rng.choice(['mish', 'silu']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(5e-2)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt3': # refined version of alt2 (better for 20 steps but worse for 50) params = {'num_emb_type': 'pbld', 'n_hidden': round(np.exp(rng.uniform(np.log(323), np.log(480)))), 'n_layers': rng.integers(1, 2, endpoint=True), 'lr': np.exp(rng.uniform(np.log(3e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.1, 0.5), 'wd': np.exp(rng.uniform(np.log(6e-3), np.log(6e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': 'mish', 'wd_sched': 'flat_cos', 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(2e-2)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(4e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(1e-1), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.5), np.log(7.5))), 'p_drop_sched': 'constant', } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt4': # large space for regression params = {'num_emb_type': 'pbld', 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), 'n_hidden': round(np.exp(rng.uniform(np.log(128), np.log(512)))), 'n_layers': rng.integers(1, 4, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': rng.choice(['mish', 'silu', 'elu']), 'use_parametric_act': True, 'p_drop_sched': rng.choice(['flat_cos', 'constant']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'lr_sched': rng.choice(['coslog4', 'cos']), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt5': # refined space for regression params = {'num_emb_type': 'pbld', 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), 'n_hidden': round(np.exp(rng.uniform(np.log(128), np.log(512)))), 'n_layers': 4, 'lr': np.exp(rng.uniform(np.log(3e-2), np.log(1e-1))), 'p_drop': rng.uniform(0.0, 0.45), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': 'mish', 'use_parametric_act': True, 'p_drop_sched': 'flat_cos', 'wd_sched': 'flat_cos', 'lr_sched': 'coslog4', 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(3e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(7.5))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt6': # regression, manually adjusted from alt5 params = {'num_emb_type': 'pbld', 'add_front_scale': True, 'n_hidden': 256, 'n_layers': rng.choice([2, 3, 4]), 'lr': np.exp(rng.uniform(np.log(4e-2), np.log(2e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': 
np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': 'mish', 'use_parametric_act': True, 'p_drop_sched': 'flat_cos', 'wd_sched': 'flat_cos', 'lr_sched': 'coslog4', 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(3e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(7.5))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt7': # refined version of alt2 (classification) params = {'num_emb_type': 'pbld', 'n_hidden': 256, 'n_layers': rng.integers(1, 4, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.0, 0.6), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(30))), 'act': 'mish', 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(2e-1)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt8': # version of alt2 (classification) with some new hyperparameters params = {'num_emb_type': 'pbld', 'hidden_sizes': 'rectangular', 'hidden_width': 256, 'ls_eps_sched': 'coslog4', 'tfms': [['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], ['one_hot', 'mean_center', 'l2_normalize', 'embedding']][rng.choice([0, 1])], 'batch_size': [256, 'auto'][rng.choice([0, 1])], 'n_hidden_layers': rng.integers(1, 4, endpoint=True), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(5.0))), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.06, 0.6), 'wd': np.exp(rng.uniform(np.log(6e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': rng.choice(['mish', 'silu']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(1e-1)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } elif self.hpo_space_name == 'alt9': # version of alt8 (classification) with reduced search spaces, and increased with space # removed batch_size tuning, tfms tuning params = {'num_emb_type': 'pbld', 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), # added 'ls_eps_sched': 'coslog4', 'n_hidden_layers': rng.integers(1, 3, endpoint=True), # reduced 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), # reduced 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), # todo: could reduce this 'p_drop': rng.uniform(0.0, 0.5), # reduced 'wd': np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), # reduced 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': rng.choice(['mish', 'silu']), 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), # reduced 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), # reduced 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } elif self.hpo_space_name == 'alt10': # version 
of alt9, similar to tabrepo params = {'num_emb_type': 'pbld', 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'ls_eps_sched': 'coslog4', 'act': 'mish', 'n_hidden_layers': rng.integers(1, 4, endpoint=True), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(2e-1))), 'use_ls': rng.choice([False, True]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'flat_cos', } elif self.hpo_space_name == 'tabarena': # common search space params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) } if rng.uniform(0.0, 1.0) > 0.5: # large configs params['plr_hidden_1'] = rng.choice([8, 16, 32, 64]).item() params['plr_hidden_2'] = rng.choice([8, 16, 32, 64]).item() params['n_epochs'] = rng.choice([256, 512]).item() params['use_early_stopping'] = True # set in the defaults of RealMLP in TabArena params['early_stopping_multiplicative_patience'] = 3 params['early_stopping_additive_patience'] = 40 else: # default values, used here to always set the same set of parameters params['plr_hidden_1'] = 16 params['plr_hidden_2'] = 4 params['n_epochs'] = 256 params['use_early_stopping'] = False elif self.hpo_space_name == 'tabarena-new': # common search space params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) # added in tabarena-new compared to tabarena 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 8, "ens_av_before_softmax": False, } if rng.uniform(0.0, 1.0) > 0.5: # large configs params['plr_hidden_1'] = rng.choice([8, 16, 32, 
64]).item() params['plr_hidden_2'] = rng.choice([8, 16, 32, 64]).item() params['n_epochs'] = rng.choice([256, 512]).item() params['use_early_stopping'] = True # set in the defaults of RealMLP in TabArena params['early_stopping_multiplicative_patience'] = 3 params['early_stopping_additive_patience'] = 40 else: # default values, used here to always set the same set of parameters params['plr_hidden_1'] = 16 params['plr_hidden_2'] = 4 params['n_epochs'] = 256 params['use_early_stopping'] = False elif self.hpo_space_name == 'alt11': # tabarena without the large configs params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) } elif self.hpo_space_name == 'alt12': # alt11 with n_hidden_layers=1 in the search space params = { 'n_hidden_layers': rng.integers(1, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) } elif self.hpo_space_name == 'alt13': # alt11 with more categorical hyperparameters params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), } elif self.hpo_space_name == 'alt14': # alt13 with weight_init_mode='normal' params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 
'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'weight_init_mode': 'normal', } elif self.hpo_space_name == 'alt15': # alt13 with tuning momentum (beta1) params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'mom': 1.0 - np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), # tune in [0.7, 0.98] } elif self.hpo_space_name == 'alt16': # alt13 with n_ens=2 params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 2, 'ens_av_before_softmax': True, } elif self.hpo_space_name == 'alt17': # alt13 with n_ens=4 params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': 
np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 4, 'ens_av_before_softmax': True, } elif self.hpo_space_name == 'alt18': # alt17 but with averaging after softmax params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 4, 'ens_av_before_softmax': False, } elif self.hpo_space_name == 'alt19': # alt13 with numerical preprocessing tuning tfms_list = [ ['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], ['one_hot', 'mean_center', 'l2_normalize', 'smooth_clip', 'embedding'], ] params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'tfms': tfms_list[int(rng.choice([0, 1]))], 'smooth_clip_max_abs_value': np.exp(rng.uniform(np.log(1.0), np.log(10.0))) } elif self.hpo_space_name == 'alt20': # alt13 with numerical preprocessing tuning (but without the max_abs_value unlike alt19) tfms_list = [ ['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], ['one_hot', 'mean_center', 'l2_normalize', 'smooth_clip', 'embedding'], ] params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 
512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'tfms': tfms_list[int(rng.choice([0, 1]))], } # print(f'{params=}') default_params = DefaultParams.RealMLP_TD_CLASS if self.is_classification else DefaultParams.RealMLP_TD_REG return utils.join_dicts(default_params, params) class RandomParamsNNAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): # model_idx is used for seeding along with the seed given in fit(), # so we can do HPO by combining multiple RandomParamsNNAlgInterface objects with different model_idx values super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': raise NotImplementedError('Refit is not fully implemented...') # return RandomParamsNNAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, # **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int): # this is also set in get_required_resources, but okay if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [RealMLPParamSampler(is_classification, **self.config).sample_params(hparam_seed)] # todo: need epoch for refit params = utils.join_dicts(self.config, self.fit_params[0], self.config.get('override_params', dict()) or dict()) # params = utils.update_dict(self.fit_params[0], self.config) if 'n_epochs' in self.config: params['n_epochs'] = self.config['n_epochs'] self.fit_params[0] = params return NNAlgInterface(fit_params=None, **params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed) logger.log(1, f'{self.fit_params=}') self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) self.fit_params[0]['sub_fit_params'] = self.alg_interface.fit_params[0] def predict(self, ds: DictDataset) -> torch.Tensor: self.alg_interface.set_current_predict_params(self.get_current_predict_params_name()) return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0]) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) def get_available_predict_params(self) -> Dict[str, Dict[str, 
Any]]: return NNAlgInterface(**self.config).get_available_predict_params() def to(self, device: str) -> None: self.alg_interface.to(device) # class NNHyperoptAlgInterface(OptAlgInterface): # def __init__(self, space=None, n_hyperopt_steps: int = 50, **config): # from hyperopt import hp # default_config = {} # max_config = {} # if space is None: # space = { # 'num_emb_type': hp.choice(['none', 'pl-densenet', 'plr']), # 'add_front_scale': hp.choice([True, False]), # 'lr': hp.loguniform([2e-2, 1.5e-1]), # 'p_drop': hp.choice([0.0, 0.15, 0.3, 0.45]), # 'hidden_sizes': hp.choice([[256]*3, [512]]), # 'act': hp.choice(['selu', 'mish', 'relu']), # 'ls_eps': hp.choice([0.0, 1.0]) # } # # todo: have conversion function? # config = utils.update_dict(default_config, config) # super().__init__(hyper_optimizer=HyperoptOptimizer(space=space, fixed_params=dict(), # n_hyperopt_steps=n_hyperopt_steps, # **config), # max_resource_config=utils.join_dicts(config, max_config), # **config) # # def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: # return NNAlgInterface(**config) ================================================ FILE: pytabkit/models/alg_interfaces/other_interfaces.py ================================================ import os from typing import Any, List, Optional import numpy as np import pandas as pd import torch from sklearn.compose import TransformedTargetRegressor from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, \ GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor from sklearn.neural_network import MLPClassifier, MLPRegressor from sklearn.preprocessing import StandardScaler from pytabkit.models.alg_interfaces.alg_interfaces import RandomParamsAlgInterface from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import SklearnSubSplitInterface, SingleSplitWrapperAlgInterface from pytabkit.models import utils from pytabkit.models.data.data import DictDataset class RFSubSplitInterface(SklearnSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None), ('criterion', None), ('max_depth', None), ('min_samples_split', None), ('max_features', None), ('min_samples_leaf', None), ('bootstrap', None), ('min_impurity_decrease', None), ('min_weight_fraction_leaf', None), ('max_leaf_nodes', None), ('max_samples', None), ('n_jobs', ['n_jobs', 'n_threads'], n_threads), ('verbose', ['verbose', 'verbosity'])] params = utils.extract_params(self.config, params_config) if not params.get('bootstrap', True) and 'max_samples' in params: del params['max_samples'] if self.n_classes > 0: return RandomForestClassifier(random_state=seed, **params) else: train_metric_name = self.config.get('train_metric_name', None) if train_metric_name == 'mse': params['criterion'] = 'squared_error' # is the default anyway elif train_metric_name == 'mae': params['criterion'] = 'absolute_error' elif train_metric_name is not None: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') reg = RandomForestRegressor(random_state=seed, **params) if self.config.get('standardize_target', False): reg = TransformedTargetRegressor(reg, transformer=StandardScaler()) return reg def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], 
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class RandomParamsRFAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config.get('hpo_space_name', 'grinsztajn')
        if hpo_space_name == 'grinsztajn':
            # adapted from Grinsztajn et al. (2022)
            space = {
                'n_estimators': 250,
                'max_depth': rng.choice([None, 2, 3, 4], p=[0.7, 0.1, 0.1, 0.1]),
                'criterion': rng.choice(['gini', 'entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': rng.choice([2, 3], p=[0.95, 0.05]),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(50.5)))),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, 0.01, 0.02, 0.05], p=[0.85, 0.05, 0.05, 0.05]),
                'tfms': ['one_hot'],
            }
        elif hpo_space_name == 'large-v1':
            space = {
                'n_estimators': 300,
                # this wasn't used in the experiments
                # 'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 2, 3, 4, 6, 8, 12, 16]),
                'criterion': rng.choice(['gini', 'entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(128.0)))),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-1)))]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v2':
            # large-v1 but reduced max_depth, criterion, min_samples_leaf, min_impurity_decrease
            # added max_leaf_nodes back in
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v3':
            # large-v2 but not tuning min_impurity_decrease, reduced max_depth, reduced min_samples_split,
            # only 100 estimators
            space = {
                'n_estimators': 100,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v4':
            # large-v2 but only ordinal encoding
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v5':
            # large-v3 but with 300 estimators and only ordinal encoding
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v6':
            # large-v4 but only bootstrap=True
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v7':
            # large-v6 but not tuning max_leaf_nodes
            space = {
                'n_estimators': 300,
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v8':
            # large-v4 but not tuning max_leaf_nodes, not allowing absolute_error
            space = {
                'n_estimators': 300,
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v9':
            # large-v8 but tuning max_leaf_nodes again
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v10':
            # large-v9 but not tuning min_impurity_decrease
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v11':
            # large-v9 but tuning one-hot encoding
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'bootstrap': rng.choice([True, False]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v12':
            # very large space like large-v1 but a bit different
            # only 50 estimators -> use with bagging
            space = {
                'n_estimators': 50,
                'max_depth': rng.choice([6, 8, 12, 16, 20]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.choice(['sqrt', 'log2', 0.2, 0.4, 0.6, 0.8, None]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(64.0)))),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(1e-1)))]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v13':
            # reduced version of large-v12 based on talent-reg-small
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_depth': rng.choice([16, 20]),
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(1e-1)))]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v14':
            # reduced version of large-v13 based on talent-reg-small
            # changed max_features, removed max_depth, changed min_impurity_decrease
            # removed tuning max_samples since it doesn't seem to do much?
            # this doesn't perform very well (target was not standardized for regression)
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.uniform(0.2, 0.9),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v15':
            # large-v14 but with standardized target
            # better
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.uniform(0.2, 0.9),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v16':
            # large-v15 but don't tune min_impurity_decrease. Also go back to old max_features
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v17':
            # large-v16 but with tuning max_samples (wasn't used)
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v18':
            # large-v16 but with max_depth limit (equivalent to large-v13 without tuning min_impurity_decrease)
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v19':
            # large-v18 but with tuning max_samples
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v20':
            # large-v19 but with tuning min_impurity_decrease, with 300 estimators, a few more max_depth options
            space = {
                'n_estimators': 300,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([12, 16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v21':
            # large-v20 but with different max_depth, min_impurity_decrease, and 50 estimators
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v22':
            # large-v21 but without bootstrap=False
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v23':
            # large-v21 but with 100 estimators
            space = {
                'n_estimators': 100,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v24':
            # large-v21 but without tuning min_impurity_decrease
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v25':
            # large-v21 but with different min_impurity_decrease space
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v26':
            # large-v25 but with tuning min_samples_leaf
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v27':
            # inspired by XT but with both bootstrap options
            space = {
                'n_estimators': 50,
                'max_features': ['sqrt', 0.5, 0.75, 1.0][rng.integers(4)],
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'tabrepo1':
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'min_samples_leaf': rng.choice([1, 2, 3, 4, 5, 10, 20, 40, 80]),
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'tfms': ['one_hot'],
            }
        elif hpo_space_name == 'tabrepo1-ordinal':
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'min_samples_leaf': rng.choice([1, 2, 3, 4, 5, 10, 20, 40, 80]),
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'tfms': ['ordinal_encoding'],  # failed to fix it
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([RFSubSplitInterface(**config) for i in range(n_tv_splits)])


class ExtraTreesSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [('n_estimators', None), ('criterion', None), ('max_depth', None),
                         ('min_samples_split', None), ('max_features', None), ('min_samples_leaf', None),
                         ('bootstrap', None), ('min_impurity_decrease', None), ('min_weight_fraction_leaf', None),
                         ('max_leaf_nodes', None), ('max_samples', None),
                         ('n_jobs', ['n_jobs', 'n_threads'], n_threads), ('verbose', ['verbose', 'verbosity'])]
        params = utils.extract_params(self.config, params_config)
        if not params.get('bootstrap', True) and 'max_samples' in params:
            del params['max_samples']
        if self.n_classes > 0:
            return ExtraTreesClassifier(random_state=seed, **params)
        else:
            train_metric_name = self.config.get('train_metric_name', None)
            if train_metric_name == 'mse':
                params['criterion'] = 'squared_error'  # is the default anyway
            elif train_metric_name == 'mae':
                params['criterion'] = 'absolute_error'
            elif train_metric_name is not None:
                raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!')
            reg = ExtraTreesRegressor(random_state=seed, **params)
            if self.config.get('standardize_target', False):
                reg = TransformedTargetRegressor(reg, transformer=StandardScaler())
            return reg

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)
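
# Reproducibility note for the _sample_params methods in this file: all random draws come
# from np.random.default_rng(seed), so a (hpo_space_name, seed) pair always maps to the
# same hyperparameter configuration. A minimal standalone sketch of the same pattern
# (names hypothetical, not part of the library):
#
# import numpy as np
#
# def sample_config(seed: int) -> dict:
#     rng = np.random.default_rng(seed)
#     return {'max_depth': rng.choice([None, 12, 16]),
#             'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0))))}
#
# assert sample_config(3) == sample_config(3)  # same seed -> same config
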
class RandomParamsExtraTreesAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config['hpo_space_name']
        if hpo_space_name == 'large-v1':
            space = {
                'n_estimators': 50,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 8, 12, 16]),
                'criterion': rng.choice(['gini', 'entropy']) if is_classification else 'squared_error',
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(8.0)))),
                'max_samples': float(rng.uniform(0.4, 1.0)),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v2':
            # large-v1 shrunken
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(4.5)))),
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v3':
            # large-v2 shrunken
            # very good for classification
            # tuning of max_features may be unnecessary, default might work just as well
            # maybe could go even larger with min_samples_split
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                # 'max_samples': rng.uniform(0.4, 1.0),  # irrelevant without bootstrap
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),  # could decrease upper bound to 5e-4
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v4':
            # large space for regression tests
            space = {
                'n_estimators': 50,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                'max_samples': float(rng.uniform(0.4, 1.0)),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v5':
            # shrunken version of large-v4 for regression
            # min_impurity_decrease could be shrunk more
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': float(rng.uniform(0.5, 1.0)),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                # 'max_samples': float(rng.uniform(0.4, 1.0)),  # irrelevant without bootstrap
                'bootstrap': False,
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-6), np.log(5e-4))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v6':
            # large-v5 without tuning min_impurity_decrease
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': float(rng.uniform(0.5, 1.0)),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                # 'max_samples': float(rng.uniform(0.4, 1.0)),  # irrelevant without bootstrap
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v7':
            # large-v6 with tuning max_leaf_nodes
            # doesn't help
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'max_features': float(rng.uniform(0.5, 1.0)),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                # 'max_samples': float(rng.uniform(0.4, 1.0)),  # irrelevant without bootstrap
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v8':
            # large-v6 but with different tuning space for max_features
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v9':
            # large-v8 but tuning min_samples_leaf
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v10':
            # large-v9 but without tuning min_samples_split
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': 2,
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v11':
            # large-v10 but with fixed tuning space for min_samples_leaf
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': 2,
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v12':
            # large-v9 but with fixed tuning space for min_samples_leaf
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v13':
            # large-v3 with different max_features space
            space = {
                'n_estimators': 50,
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                # 'max_samples': rng.uniform(0.4, 1.0),  # irrelevant without bootstrap
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),  # could decrease upper bound to 5e-4
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v14':
            # large-v3 with different max_features space
            space = {
                'n_estimators': 50,
                'max_features': ['sqrt', 0.5, 0.75, 1.0][rng.integers(4)],
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                # 'max_samples': rng.uniform(0.4, 1.0),  # irrelevant without bootstrap
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),  # could decrease upper bound to 5e-4
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'tabrepo1-mod':
            space = {
                'n_estimators': 50,
                # not completely sure if tabrepo1 uses entropy
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'min_samples_leaf': rng.choice([1, 2, 3, 4, 5, 10, 20, 40, 80]),
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'tfms': ['ordinal_encoding'],
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([ExtraTreesSubSplitInterface(**config) for i in range(n_tv_splits)])


class GBTSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [('n_estimators', None), ('learning_rate', None), ('subsample', None), ('max_depth', None),
                         ('verbose', ['verbose', 'verbosity'])]
        params = utils.extract_params(self.config, params_config)
        if self.n_classes > 0:
            return GradientBoostingClassifier(random_state=seed, **params)
        else:
            train_metric_name = self.config.get('train_metric_name', 'mse')
            if train_metric_name == 'mse':
                pass  # is the default anyway
            elif train_metric_name.startswith('pinball('):
                quantile = float(train_metric_name[len('pinball('):-1])
                params['loss'] = 'quantile'
                params['alpha'] = quantile
            elif train_metric_name == 'mae':
                params['loss'] = 'absolute_error'
            else:
                raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!')
            return GradientBoostingRegressor(random_state=seed, **params)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)
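
# Train-metric convention used by GBTSubSplitInterface above: a train_metric_name like
# 'pinball(0.9)' is parsed into sklearn's quantile loss with alpha=0.9. A small sketch
# of the same string convention (variable names hypothetical):
#
# name = 'pinball(0.9)'
# if name.startswith('pinball('):
#     alpha = float(name[len('pinball('):-1])  # -> 0.9
#     params = {'loss': 'quantile', 'alpha': alpha}
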
class KNNSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [('n_neighbors', None), ('weights', None), ('p', None),
                         ('n_jobs', ['n_jobs', 'n_threads'], n_threads)]
        params = utils.extract_params(self.config, params_config)
        if self.n_classes > 0:
            from sklearn.neighbors import KNeighborsClassifier
            return KNeighborsClassifier(**params)
        else:
            from sklearn.neighbors import KNeighborsRegressor
            return KNeighborsRegressor(**params)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class RandomParamsKNNAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config['hpo_space_name']
        if hpo_space_name == 'v1':
            space = {
                'n_neighbors': int(np.exp(rng.uniform(np.log(1.0), np.log(101.0)))),
                'weights': rng.choice(['uniform', 'distance']),
                # 'p': np.exp(rng.uniform(np.log(0.2), np.log(8.0))),  # values outside of 1 and 2 can be very slow
                'p': rng.choice([1, 2]),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'tabrepo1':
            space = {
                'n_neighbors': rng.choice([3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 20, 30, 40, 50]),
                'weights': rng.choice(['uniform', 'distance']),
                'p': rng.choice([1, 2]),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([KNNSubSplitInterface(**config) for i in range(n_tv_splits)])


class LinearModelSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [
            # ('l1_ratio', None),
            ('fit_intercept', None),
            # ('n_jobs', ['n_jobs', 'n_threads'], n_threads)
        ]
        penalty = self.config.get('penalty', 'l2')
        n_jobs = self.config.get('n_jobs', self.config.get('n_threads', None))
        params = utils.extract_params(self.config, params_config)
        l1_ratio = self.config.get('l1_ratio', 0.5)
        C = self.config.get('C', 1.0)
        if self.n_classes > 0:
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression(random_state=seed, penalty=penalty,
                                      solver='lbfgs' if penalty == 'l2' else 'saga', C=C,
                                      l1_ratio=l1_ratio if penalty == 'elasticnet' else None,
                                      n_jobs=n_jobs, **params)
            # return LogisticRegression(random_state=seed, penalty='l2', solver='newton-cholesky', C=C, **params)
        else:
            alpha = self.config.get('alpha', 1 / C)
            from sklearn.linear_model import Ridge, Lasso, ElasticNet
            if penalty == 'l2':
                return Ridge(random_state=seed, alpha=alpha, **params)
            elif penalty == 'l1':
                return Lasso(random_state=seed, alpha=alpha, **params)
            elif penalty == 'elasticnet':
                return ElasticNet(random_state=seed, alpha=alpha, l1_ratio=l1_ratio, **params)
            else:
                raise ValueError(f'Unknown penalty: {penalty}')
            # from sklearn.linear_model import ElasticNet
            # return ElasticNet(random_state=seed, alpha=alpha, **params)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100, n_threads=1), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class RandomParamsLinearModelAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config['hpo_space_name']
        if hpo_space_name == 'v1':
            space = {
                'penalty': rng.choice(['l1', 'l2', 'elasticnet']),
                'l1_ratio': rng.uniform(0.01, 1.0),
                'C': np.exp(rng.uniform(np.log(1e-2), np.log(1e7))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'v2':
            # smaller version of v1
            space = {
                'penalty': rng.choice(['l1', 'l2', 'elasticnet']),
                'l1_ratio': rng.uniform(0.01, 0.8),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e5))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'v3':
            # smaller version of v1
            space = {
                'penalty': rng.choice(['l1', 'l2', 'elasticnet']),
                'l1_ratio': rng.uniform(0.01, 0.5),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e4))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'v4':
            # smaller version of v1
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e5))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'tabrepo1':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'tabrepo1-rssc3':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['median_center', 'robust_scale', 'smooth_clip', 'one_hot'],
                'smooth_clip_max_abs_value': 3,
            }
        elif hpo_space_name == 'tabrepo1-rssc5':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['median_center', 'robust_scale', 'smooth_clip', 'one_hot'],
                'smooth_clip_max_abs_value': 5,
            }
        elif hpo_space_name == 'tabrepo1-rssc10':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['median_center', 'robust_scale', 'smooth_clip', 'one_hot'],
                'smooth_clip_max_abs_value': 10,
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([LinearModelSubSplitInterface(**config) for i in range(n_tv_splits)])
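
# Regularization convention in LinearModelSubSplitInterface above: classification uses
# LogisticRegression's C (larger C = weaker penalty), while the regression estimators
# take alpha, defaulting to alpha = 1 / C when only C is given. This is the mapping used
# here, not an exact theoretical equivalence. Sketch with a hypothetical C:
#
# C = 10.0
# alpha = 1 / C  # 0.1; passed to Ridge/Lasso/ElasticNet unless 'alpha' is set explicitly
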
class SklearnMLPSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = []  # todo: add parameters
        params = utils.extract_params(self.config, params_config)
        if self.n_classes > 0:
            return MLPClassifier(random_state=seed, **params)
        else:
            reg = MLPRegressor(random_state=seed, **params)
            return TransformedTargetRegressor(regressor=reg, transformer=StandardScaler())

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_onehot_size_gb': 10.0, '1/n_threads*n_samples': 4e-5}
        ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class KANSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        import imodelsx.kan
        params_config = []  # todo: add parameters
        params = utils.extract_params(self.config, params_config)
        params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0]
        if self.n_classes > 0:
            return imodelsx.kan.KANClassifier(**params)
        else:
            reg = imodelsx.kan.KANRegressor(**params)
            return TransformedTargetRegressor(regressor=reg, transformer=StandardScaler())

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config)
        time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_samples': 8e-5}
        ram_params = {'': 0.15, 'ds_onehot_size_gb': 1.5}
        gpu_ram_params = {'': 0.4, 'n_features': 1e-4}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params,
                               cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02)  # , gpu_ram_params)
        return rc.get_required_resources(ds)

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        n_samples = len(x_df)
        train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
        train_mask[val_idxs] = False
        # give train+valid to KAN since it does its own train+valid split
        # (even though that one uses 20% valid instead of 25%)
        # x_df = x_df.iloc[train_mask, :]
        x_np = x_df.to_numpy()
        # y = y[train_mask]
        if cat_col_names is not None and len(cat_col_names) > 0:
            self.model.fit(x_np, y, **{self._get_cat_indexes_arg_name(): cat_col_names})
        else:
            self.model.fit(x_np, y)

    def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        return self.model.predict(x_df.to_numpy())

    def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        return self.model.predict_proba(x_df.to_numpy())
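
# Target-standardization pattern shared by several regression interfaces above: wrapping
# the estimator in TransformedTargetRegressor(transformer=StandardScaler()) fits on
# standardized targets and inverts the transform at predict time. A minimal
# self-contained sketch (data is synthetic):
#
# import numpy as np
# from sklearn.compose import TransformedTargetRegressor
# from sklearn.linear_model import Ridge
# from sklearn.preprocessing import StandardScaler
#
# X, y = np.random.randn(100, 4), 300.0 + 50.0 * np.random.randn(100)
# reg = TransformedTargetRegressor(regressor=Ridge(), transformer=StandardScaler())
# reg.fit(X, y)  # predictions come back on the original target scale
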
""" def __init__(self, **config): self.config = config def fit(self, X, y, X_val, y_val, cat_features: Optional[List[str]] = None): # params_config = [] # todo: add parameters # params = utils.extract_params(self.config, params_config) params = { 'depth': 5, # tree depth 'n_estimators': 2048, # number of estimators / trees 'learning_rate_weights': 0.005, # learning rate for leaf weights 'learning_rate_index': 0.01, # learning rate for split indices 'learning_rate_values': 0.01, # learning rate for split values 'learning_rate_leaf': 0.01, # learning rate for leaves (logits) 'optimizer': 'adam', # optimizer 'cosine_decay_steps': 0, # decay steps for lr schedule (CosineDecayRestarts) # loss function (default 'crossentropy' for binary & multi-class classification and 'mse' for regression) 'focal_loss': False, # use focal loss {True, False} 'temperature': 0.0, # temperature for stochastic re-weighted GD (0.0, 1.0) 'from_logits': True, # use logits for weighting {True, False} 'use_class_weights': True, # use class weights for training {True, False} 'dropout': 0.0, # dropout rate (here, dropout randomly disables individual estimators of the ensemble during training) 'selected_variables': 0.8, # feature subset percentage (0.0, 1.0) 'data_subset_fraction': 1.0, # data subset percentage (0.0, 1.0) } args = { 'epochs': 1, # number of epochs for training 'early_stopping_epochs': 25, # patience for early stopping (best weights are restored) 'batch_size': 64, # batch size for training 'random_seed': 42, 'verbose': 1, } if issubclass(y.dtype.type, np.floating): print(f'regression') self.is_regression_ = True params['loss'] = 'mse' args['objective'] = 'regression' elif len(np.unique(y)) <= 2: self.is_regression_ = False params['loss'] = 'crossentropy' args['objective'] = 'binary' else: self.is_regression_ = False params['loss'] = 'crossentropy' args['objective'] = 'classification' if cat_features is not None: args['cat_idx'] = [X.columns.get_loc(name) for name in cat_features] else: args['cat_idx'] = [] device = self.config.get('device', 'cpu') if device.startswith('cuda'): gpu_idx_str = device[len('cuda:'):] os.environ['CUDA_VISIBLE_DEVICES'] = gpu_idx_str os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' from GRANDE import GRANDE self.model_ = GRANDE(params=params, args=args) self.model_.fit(X.copy(), y, X_val.copy(), y_val) def predict_proba(self, X): return self.model_.predict(X) def predict(self, X): y_pred = self.model_.predict(X) if not self.is_regression_: return np.argmax(y_pred, axis=1) else: return y_pred class GrandeSubSplitInterface(SklearnSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: model = GrandeWrapper(**self.config, device='cpu' if len(gpu_devices) == 0 else gpu_devices[0]) # if self.n_classes == 0: # doesn't work with validation sets anyway # model = TransformedTargetRegressor(regressor=model, transformer=StandardScaler()) return model def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100), self.config) time_params = {'': 0.5, 'ds_onehot_size_gb': 10.0, '1/n_threads*n_samples': 4e-5} ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0} rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params) return rc.get_required_resources(ds) def _fit_sklearn(self, x_df: pd.DataFrame, y: 
class GrandeSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        model = GrandeWrapper(**self.config, device='cpu' if len(gpu_devices) == 0 else gpu_devices[0])
        # if self.n_classes == 0:  # doesn't work with validation sets anyway
        #     model = TransformedTargetRegressor(regressor=model, transformer=StandardScaler())
        return model

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_onehot_size_gb': 10.0, '1/n_threads*n_samples': 4e-5}
        ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        n_samples = len(x_df)
        train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
        train_mask[val_idxs] = False
        x_val_df = x_df.iloc[~train_mask, :]
        y_val_df = y[~train_mask]
        x_df = x_df.iloc[train_mask, :]
        y = y[train_mask]
        if cat_col_names is not None and len(cat_col_names) > 0:
            self.model.fit(x_df, y, x_val_df, y_val_df, cat_features=cat_col_names)
        else:
            self.model.fit(x_df, y, x_val_df, y_val_df)


class TabPFN2SubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [
            # ('n_jobs', ['n_jobs', 'n_threads'], n_threads),
            ('softmax_temperature', None),
            ('average_before_softmax', None),
            ('inference_precision', None),
            ('fit_mode', None),
            ('model_path', None),
        ]
        params = utils.extract_params(self.config, params_config)
        if self.config.get('use_float32', False):
            params['inference_precision'] = torch.float32
        # print(f'{gpu_devices=}')
        if self.n_classes > 0:
            from tabpfn import TabPFNClassifier
            return TabPFNClassifier(random_state=seed,
                                    device=gpu_devices[0] if len(gpu_devices) > 0 else 'cpu',
                                    # device='cuda' if len(gpu_devices) > 0 else 'cpu',
                                    ignore_pretraining_limits=True,
                                    **params)
        else:
            from tabpfn import TabPFNRegressor
            return TabPFNRegressor(random_state=seed,
                                   device=gpu_devices[0] if len(gpu_devices) > 0 else 'cpu',
                                   # device='cuda' if len(gpu_devices) > 0 else 'cpu',
                                   ignore_pretraining_limits=True,
                                   **params)

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        if not self.config.get('fit_on_valid', False):
            n_samples = len(x_df)
            train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
            train_mask[val_idxs] = False
            x_df = x_df.iloc[train_mask, :]
            y = y[train_mask]
        # don't provide a categorical indicator, it should work like this as well
        self.model.fit(x_df, y)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params,
                               n_gpus=1, gpu_usage=1.0, gpu_ram_params={'': 10.0})
        return rc.get_required_resources(ds)
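
# Validation-split convention of the _fit_sklearn overrides in this file: val_idxs indexes
# into the train+val rows, and a boolean mask recovers the training part. Standalone
# sketch (sizes hypothetical):
#
# import numpy as np
#
# n_samples, val_idxs = 10, np.array([7, 8, 9])
# train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
# train_mask[val_idxs] = False
# # rows where train_mask is True are training rows; ~train_mask selects validation rows
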
class TabICLSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [
            # ('n_jobs', ['n_jobs', 'n_threads'], n_threads),
            ('n_estimators', None),
            ('softmax_temperature', None),
            ('average_logits', None),
            ('use_amp', None),
            ('batch_size', None),
            ('model_path', None),
            ('allow_auto_download', None),
            ('norm_methods', None)
        ]
        params = utils.extract_params(self.config, params_config)
        if self.config.get('use_float32', False):
            params['inference_precision'] = torch.float32
        # print(f'{gpu_devices=}')
        if self.n_classes > 0:
            if self.config.get('use_tabiclex', False):
                from tabiclv2 import TabICLClassifier
            else:
                from tabicl import TabICLClassifier
            return TabICLClassifier(random_state=seed,
                                    device=gpu_devices[0] if len(gpu_devices) > 0 else 'cpu',
                                    **params)
        else:
            raise ValueError('TabICL for regression does not exist')

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        if not self.config.get('fit_on_valid', False):
            n_samples = len(x_df)
            train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
            train_mask[val_idxs] = False
            x_df = x_df.iloc[train_mask, :]
            y = y[train_mask]
        x_df = x_df.copy()
        if self.config.get('add_fingerprint_feature', False):
            x_df['__fingerprint_feature'] = np.random.randn(len(x_df))
        if self.config.get('mirror_numerical_features', False):
            self.float_cols_ = x_df.select_dtypes(include=['float']).columns
            print(f'{len(self.float_cols_)=}')
            # Generate random signs (+1 or -1) for each column
            self.signs_ = np.random.choice([-1, 1], size=len(self.float_cols_))
            # Multiply each float column by its random sign
            x_df.loc[:, self.float_cols_] = x_df.loc[:, self.float_cols_] * self.signs_
        # don't provide a categorical indicator, it should work like this as well
        self.model.fit(x_df, y)

    def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        x_df = x_df.copy()
        if self.config.get('add_fingerprint_feature', False):
            x_df['__fingerprint_feature'] = np.random.randn(len(x_df))
        if self.config.get('mirror_numerical_features', False):
            x_df.loc[:, self.float_cols_] = x_df.loc[:, self.float_cols_] * self.signs_
        return super()._predict_sklearn(x_df)

    def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        x_df = x_df.copy()
        if self.config.get('add_fingerprint_feature', False):
            x_df['__fingerprint_feature'] = np.random.randn(len(x_df))
        if self.config.get('mirror_numerical_features', False):
            x_df.loc[:, self.float_cols_] = x_df.loc[:, self.float_cols_] * self.signs_
        return super()._predict_proba_sklearn(x_df)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params,
                               n_gpus=1, gpu_usage=0.999, gpu_ram_params={'': 10.0})
        return rc.get_required_resources(ds)



================================================
FILE: pytabkit/models/alg_interfaces/resource_computation.py
================================================
import numbers
import time
from collections.abc import Callable
from typing import Dict, Union, List, Any, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from pytabkit.models.alg_interfaces.base import RequiredResources
from pytabkit.models import utils
from pytabkit.models.data.data import DictDataset, TensorInfo
from pytabkit.models.nn_models.models import PreprocessingFactory
from pytabkit.models.training.metrics import pinball_loss

# This file contains code to predict required resources (time and RAM) of a ML model on a dataset.
# There are two components:
# - Computing the predicted resources based on a linear model on raw and product features
# - Fitting the linear model coefficients based on evaluations on random parameters.
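
# The model form used throughout this file: a predicted quantity (time or RAM) is a sum of
# coefficients times products of raw features, where a key like '1/n_threads*n_samples'
# denotes the product of those raw features and '' denotes a constant term. Sketch with
# hypothetical coefficients:
#
# raw = {'1/n_threads': 0.25, 'n_samples': 1e5, 'n_estimators': 100.0}
# params = {'': 0.5, '1/n_threads*n_samples*n_estimators': 4e-8}
# time_s = sum(coef * np.prod([raw[f] for f in key.split('*') if f != ''])
#              for key, coef in params.items())  # -> 0.5 + 4e-8 * 0.25 * 1e5 * 100 = 0.6
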
def get_resource_features(config: Dict, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int,
                          **extra_params) -> Dict[str, float]:
    """
    Extracts features that can be used in a linear model for predicting resource usage.
    """
    # in hyperopt method also on number of steps (for time estimation)
    tensor_infos = ds.tensor_infos
    n_samples = ds.n_samples
    n_classes = tensor_infos['y'].get_cat_size_product()
    prep_factory = PreprocessingFactory(**config)
    onehot_factory = PreprocessingFactory(tfms=['one_hot'])
    fitter, out_tensor_infos = prep_factory.create_transform(tensor_infos)
    _, onehot_tensor_infos = onehot_factory.create_transform(tensor_infos)
    n_features = sum([ti.get_n_features() for key, ti in out_tensor_infos.items() if key in ['x_cont', 'x_cat']])
    ds_prep = DictDataset(tensors=None, tensor_infos=out_tensor_infos, device=ds.device, n_samples=n_samples)
    ds_onehot = DictDataset(tensors=None, tensor_infos=onehot_tensor_infos, device=ds.device, n_samples=n_samples)
    cat_size_sum = 0 if 'x_cat' not in out_tensor_infos else out_tensor_infos['x_cat'].get_cat_sizes().sum().item()
    n_cat = ds.tensor_infos['x_cat'].get_n_features()
    ds_size_gb = ds.get_size_gb()
    ds_prep_size_gb = ds_prep.get_size_gb()
    ds_onehot_size_gb = ds_onehot.get_size_gb()
    n_tree_repeats = 1 if n_classes <= 2 else n_classes
    features = dict()
    features['1/n_threads'] = 1 / config.get('n_threads', 1)
    features['ds_size_gb'] = ds_size_gb
    features['ds_prep_size_gb'] = ds_prep_size_gb
    features['ds_onehot_size_gb'] = ds_onehot_size_gb
    features['n_features'] = n_features
    features['n_samples'] = n_samples
    features['n_tree_repeats'] = n_tree_repeats
    features['n_cv_refit'] = n_cv + n_refit
    features['n_splits'] = n_splits
    max_depth = config.get('max_depth', 6)
    if isinstance(max_depth, numbers.Number):
        features['2_power_maxdepth'] = 2 ** max_depth
    features['log_num_leaves'] = np.log(max(1, config.get('num_leaves', 31)))
    features['cat_size_sum'] = cat_size_sum
    features['n_classes'] = n_classes
    features['n_cat'] = n_cat
    return utils.join_dicts(config, features, extra_params)


def process_resource_features(raw_features: Dict[str, Any], feature_spec: List[str]):
    """
    Adds product features to raw features.

    :param raw_features: Raw feature values
    :param feature_spec: List of strings. Each string should be of the form 'feature_1*...*feature_n',
        using the names of the features whose products should be added
    :return: Returns a dictionary of the raw features along with the newly computed product features.
    """
    results = dict()
    for combination in feature_spec:
        # ignore empty factors
        factors = [factor for factor in combination.split('*') if factor != '']
        value = 1.0
        for factor in factors:
            value *= raw_features[factor]
        results[combination] = value
    return results


def eval_linear_product_model(raw_features: Dict[str, Any], params: Dict[str, float]):
    """
    Computes the "inner product" between the feature dictionaries
    (obtained from raw features and products according to the keys in params).

    :return:
    """
    result = 0.0
    for key, param in params.items():
        # ignore empty factors
        factors = [factor for factor in key.split('*') if factor != '']
        value = 1.0
        for factor in factors:
            value *= raw_features[factor]
        result += param * value
    return result
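
# Usage sketch for the two helpers above (feature values hypothetical):
#
# raw = {'1/n_threads': 0.5, 'n_samples': 1000.0, 'ds_size_gb': 0.1}
# process_resource_features(raw, ['', 'ds_size_gb', '1/n_threads*n_samples'])
# # -> {'': 1.0, 'ds_size_gb': 0.1, '1/n_threads*n_samples': 500.0}
# eval_linear_product_model(raw, {'': 2.0, '1/n_threads*n_samples': 1e-3})
# # -> 2.0 + 1e-3 * 0.5 * 1000.0 = 2.5
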
class FeatureSpec:
    """
    Allows to create a list of product feature names from product and powerset operations etc.
    """

    @staticmethod
    def _listify(spec: Union[List, str]):
        if isinstance(spec, list):
            return spec
        elif isinstance(spec, str):
            return [spec]
        else:
            raise ValueError(f'Unsupported spec type {type(spec)}')

    @staticmethod
    def _product_str(first: str, second: str) -> str:
        if len(first) == 0:
            if len(second) == 0:
                return ''
            else:
                return second
        else:
            if len(second) == 0:
                return first
            else:
                return f'{first}*{second}'

    @staticmethod
    def concat(*feature_specs):
        feature_specs = [FeatureSpec._listify(spec) for spec in feature_specs]
        flattened = [spec for lst in feature_specs for spec in lst]
        return flattened

    @staticmethod
    def product(*feature_specs):
        if len(feature_specs) <= 0:
            raise ValueError()
        elif len(feature_specs) == 1:
            return FeatureSpec._listify(feature_specs[0])
        else:
            first, rest = feature_specs[0], feature_specs[1:]
            first_list = FeatureSpec._listify(first)
            rest_product = FeatureSpec.product(*rest)
            return [FeatureSpec._product_str(first_spec, rest_spec)
                    for first_spec in first_list for rest_spec in rest_product]

    @staticmethod
    def powerset_products(*feature_specs):
        if len(feature_specs) == 0:
            return ['']
        elif len(feature_specs) == 1:
            return FeatureSpec.concat('', feature_specs[0])
        else:
            return FeatureSpec.product(FeatureSpec.concat('', feature_specs[0]),
                                       FeatureSpec.powerset_products(*feature_specs[1:]))


# some code for linear regression with different losses, to estimate coefficients for resource prediction

class NormalizedDataRegressor:
    def __init__(self, sub_regressor):
        self.sub_regressor = sub_regressor

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.x_norms_ = np.sqrt(np.mean(X ** 2, axis=0))
        self.y_norm_ = np.sqrt(np.mean(y ** 2))
        self.sub_regressor.fit(X / self.x_norms_[None, :], y / self.y_norm_)

    def get_coefs(self) -> np.ndarray:
        return self.sub_regressor.get_coefs() * self.y_norm_ / self.x_norms_

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.sub_regressor.predict(X / self.x_norms_) * self.y_norm_


class LogLinearModule(nn.Module):
    def __init__(self, n_features: int):
        super().__init__()
        self.params = nn.Parameter(torch.zeros(n_features, dtype=torch.float64))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ torch.exp(self.params)


class LogLinearRegressor:
    def __init__(self, pessimistic: bool):
        self.pessimistic = pessimistic

    def fit(self, X: np.ndarray, y: np.ndarray):
        x = torch.as_tensor(X, dtype=torch.float64)
        y = torch.as_tensor(y, dtype=torch.float64)
        y_log = torch.log(y + 1e-8)
        n_features = x.shape[1]
        self.model_ = LogLinearModule(n_features=n_features)
        opt = torch.optim.Adam(params=self.model_.parameters(), betas=(0.9, 0.95))
        n_it = 10000
        max_lr = 1e-1
        for i in range(n_it):
            for param_group in opt.param_groups:
                # linearly decaying lr schedule
                param_group['lr'] = (1 - i / n_it) * max_lr
            y_pred_log = torch.log(self.model_(x))
            if self.pessimistic:
                loss = pinball_loss(torch.exp(y_pred_log), y, quantile=0.99)
            else:
                loss = ((y_pred_log - y_log) ** 2).mean()
            if i % (n_it // 10) == 0:
                print(f'Loss: {loss.item():g}')
            loss.backward()
            opt.step()
            opt.zero_grad()

    def get_coefs(self) -> np.ndarray:
        return np.exp(self.model_.params.detach().numpy())
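
# Usage sketch for FeatureSpec above; the outputs follow directly from the definitions:
#
# FeatureSpec.product(['a', 'b'], 'c')     # -> ['a*c', 'b*c']
# FeatureSpec.powerset_products('a', 'b')  # -> ['', 'b', 'a', 'a*b']
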
def fit_resource_factors(data: List[Tuple[Dict[str, float], float]], pessimistic: bool, coef_factor: float = 1.0):
    feature_names = list(data[0][0].keys())
    y = np.asarray([data[i][1] for i in range(len(data))])
    X = np.asarray([[data[i][0][feature_names[j]] for j in range(len(feature_names))] for i in range(len(data))])
    # transform data set to implicitly learn with relative mse
    # ((y_pred - y)/y)^2 = ((X/y)c - 1)^2
    # X = X / y[:, None]
    # y = np.ones_like(y)
    # coefs: np.ndarray = np.linalg.lstsq(X, y)[0]
    # always use pessimistic version
    reg = NormalizedDataRegressor(LogLinearRegressor(pessimistic=True))
    reg.fit(X, y)
    coefs = reg.get_coefs()
    coefs[coefs < 0.0] = 0.0
    if pessimistic:
        # rescale to a bit larger than the maximum on the training set
        y_pred = X @ coefs
        coefs *= coef_factor * np.max(y / y_pred)
    else:
        y_pred = X @ coefs
        coefs *= np.mean(y) / np.mean(y_pred)
        # # align their geometric means
        # coefs *= np.exp(np.mean(np.log(y)) - np.mean(np.log(y_pred)))
    return {name: coef for name, coef in zip(feature_names, coefs)}


class TimeWrapper:
    def __init__(self, f: Callable):
        self.f = f

    def __call__(self):
        start_time = time.time()
        self.f()
        end_time = time.time()
        return end_time - start_time


def create_ds(n_samples: int, n_cont: int, n_cat: int, cat_size: int, n_classes: int) -> DictDataset:
    torch.manual_seed(0)
    x_cont = torch.randn(n_samples, n_cont)
    x_cont_info = TensorInfo(feat_shape=[n_cont])
    x_cat = torch.randint(0, cat_size, size=(n_samples, n_cat))
    x_cat_info = TensorInfo(cat_sizes=[cat_size] * n_cat)
    if n_classes > 0:
        y = torch.randint(0, n_classes, size=(n_samples, 1))
        y_info = TensorInfo(cat_sizes=[n_classes])
    else:
        y = torch.randn(n_samples, 1)
        y_info = TensorInfo(feat_shape=[1])
    return DictDataset(tensors=dict(x_cont=x_cont, x_cat=x_cat, y=y),
                       tensor_infos=dict(x_cont=x_cont_info, x_cat=x_cat_info, y=y_info))


class Sampler:
    def sample(self) -> Union[int, float]:
        raise NotImplementedError()


class UniformSampler(Sampler):
    def __init__(self, low: Union[int, float], high: Union[int, float], log=False, is_int=False):
        self.low = low
        self.high = high
        self.log = log
        self.is_int = is_int

    def sample(self) -> Union[int, float]:
        low = self.low
        high = self.high + 1 if self.is_int else self.high  # in the integer case, make the upper bound inclusive
        if self.log:
            sample = np.exp(np.random.uniform(np.log(low), np.log(high)))
        else:
            sample = np.random.uniform(low, high)
        return int(sample) if self.is_int else sample


# class ChoiceSampler:
#     def __init__(self):


def ds_to_xy(ds: DictDataset) -> Tuple[pd.DataFrame, np.ndarray]:
    X = ds.without_labels().to_df()
    y = ds.tensors['y'].numpy()
    return X, y
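
# Usage sketch for UniformSampler above (results vary per call; seeding the global
# np.random state is the caller's responsibility):
#
# np.random.seed(0)
# UniformSampler(1e-4, 1e-1, log=True).sample()  # float in [1e-4, 1e-1], log-uniform
# UniformSampler(1, 10, is_int=True).sample()    # int in {1, ..., 10}
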
""" # in hyperopt method also on number of steps # moreover it should depend on n_threads, and scaling law should be able to be configured # should allow n_threads to depend on the task_info (based on certain thresholds and possibly scaling law) # include a time_factor depending on the method n_samples = ds.n_samples n_classes = ds.tensor_infos['y'].get_cat_sizes()[0].item() ds = DictDataset(tensors=None, tensor_infos=ds.tensor_infos, device='cpu', n_samples=ds.n_samples) raw_features_prelim = get_resource_features(self.config, ds, n_cv=1, n_refit=0, n_splits=1, **extra_params) n_features = raw_features_prelim['n_features'] if 'n_threads' in self.config: n_threads = self.config['n_threads'] else: # for dionis, it's roughly 100k * 60 * 355 = 2_130_000_000 # for robert it's 10k * 7200 * 10 = 720_000_000 # for indoor_loc_building it's roughly 20k * 520 * 3 = 31_200_000 ds_complexity = n_samples * n_features * n_classes thresh = self.config.get('single_thread_complexity_threshold', 200_000_000) # n_threads = min(self.config.get('max_complexity_threads', 128), 1 + int(ds_complexity / thresh)) n_threads = 1 + int(ds_complexity / thresh) config = utils.update_dict(self.config, dict(n_threads=n_threads)) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1, **extra_params) cpu_ram_gb = eval_linear_product_model(raw_features, self.cpu_ram_params) min_threads_per_gb = self.config.get('min_threads_per_gb', 0.3) n_threads = min(self.config.get('max_n_threads', 8), max(n_threads, int(min_threads_per_gb * cpu_ram_gb))) config = utils.update_dict(self.config, dict(n_threads=n_threads)) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1, **extra_params) time_s = eval_linear_product_model(raw_features, self.time_params) cpu_ram_gb = eval_linear_product_model(raw_features, self.cpu_ram_params) gpu_ram_gb = 0.0 if self.gpu_ram_params is None \ else eval_linear_product_model(raw_features, self.gpu_ram_params) # todo: rough correction to prioritize dionis even if it's run with too many threads, # should use better time estimation model time_s += 0.2 * n_threads * time_s return RequiredResources(time_s=time_s, n_threads=n_threads, cpu_ram_gb=cpu_ram_gb, gpu_ram_gb=gpu_ram_gb, n_gpus=self.n_gpus, gpu_usage=0.0 if self.n_gpus == 0 else self.gpu_usage) # if __name__ == '__main__': # features = FeatureSpec.concat('', 'ds_size_gb', # FeatureSpec.product('n_cv_refit', 'n_splits', # FeatureSpec.powerset_products('1/n_threads', 'n_features', # 'n_samples', # 'n_estimators', 'n_tree_repeats'))) # print(features) # print(f'{len(features)=}') ================================================ FILE: pytabkit/models/alg_interfaces/resource_params.py ================================================ class ResourceParams: # determined using estimate_resource_params.py cb_class_time = {'': 1.1074866100217955, 'ds_size_gb': 6.2276292117813865, 'ds_prep_size_gb': 6.2276292117813865, 'ds_onehot_size_gb': 2.0150542417790342e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 2.214973220043591, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 5.1876881836135774e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 3.035559075362487e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 7.13999461225352e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.000849954711796066, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 3.964226717465322e-12, 
'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 2.3531597535778573e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.2994223618739465e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 1.7778533486675952e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 8.378247017832774e-10, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 4.732937240944653e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 5.508439525827261e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 1.285253358050953e-10, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.629510161784679e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 2.627359007275516e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.133320942151551e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 2.651274595052903e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 3.5098969397077584e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 3.673856471950424e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 6.267867148099078e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 2.3903321610037346e-05, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 4.589892590504275e-14, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 2.3930248376103085e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 8.531748659348444e-11} cb_class_ram = {'': 0.9345478156433287, 'ds_size_gb': 1.804116547565268e-05, 'ds_prep_size_gb': 1.804116547565268e-05, 'ds_onehot_size_gb': 0.012758554137232066, 'n_tree_repeats': 4.077606761367131e-09, 'n_samples': 7.243808011863237e-07, 'n_samples*n_tree_repeats': 1.2285638949747794e-07, 'n_features': 3.8863715875356e-09, 'n_features*n_tree_repeats': 1.1947420344843242e-12, 'n_features*n_samples': 3.767039504566679e-08, 'n_features*n_samples*n_tree_repeats': 7.361290583089635e-16, 'max_depth': 0.004088255941858752, 'max_depth*n_tree_repeats': 1.1590969030724202e-09, 'max_depth*n_samples': 1.4477032736637855e-13, 'max_depth*n_samples*n_tree_repeats': 3.3706497906893135e-13, 'max_depth*n_features': 0.0006014917997388746, 'max_depth*n_features*n_tree_repeats': 1.834250929757216e-13, 'max_depth*n_features*n_samples': 4.241634070711833e-09, 'max_depth*n_features*n_samples*n_tree_repeats': 1.197601653926371e-16, '2_power_maxdepth': 2.576133502607949e-09, '2_power_maxdepth*n_tree_repeats': 2.356086562374563e-05, '2_power_maxdepth*n_samples': 1.3036510550142841e-15, '2_power_maxdepth*n_samples*n_tree_repeats': 1.9523394732422347e-09, '2_power_maxdepth*n_features': 7.810833280259485e-12, '2_power_maxdepth*n_features*n_tree_repeats': 6.14544078331367e-15, '2_power_maxdepth*n_features*n_samples': 1.5863977594541182e-13, '2_power_maxdepth*n_features*n_samples*n_tree_repeats': 2.3171956595374328e-17} xgb_class_time = {'': 1.5850150119193643e-06, 'ds_size_gb': 67.40780781613621, 'ds_prep_size_gb': 67.40780781613621, 'ds_onehot_size_gb': 7.555892653328937e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 3.1700300238387285e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 
0.416152219367654, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 1.7981743709586172e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 3.1379386919643983e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 4.361726529019224e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 4.433229074601185e-11, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 3.348195651528877e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 3.4142887744033714e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 9.578781115632407e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 1.2099510988434818e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 1.7180121037111673e-12, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 7.916471324379998e-14, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 0.007922594727428374, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 8.113108001263881e-12, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 6.758297160216264e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.4232541896951673e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 6.35528424560118e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 8.810550042257941e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 7.369774923827121e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 6.186297360838691e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 3.4755127308109863e-05, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 1.1585222842499338e-13, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 2.652000680981318e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.1214153087760665e-11} xgb_class_ram = {'': 0.899804501497566, 'ds_size_gb': 0.41986843027802623, 'ds_prep_size_gb': 0.41986843027802623, 'ds_onehot_size_gb': 7.280007472890875e-06, 'n_tree_repeats': 0.0012854309387287798, 'n_samples': 8.808932580897527e-08, 'n_samples*n_tree_repeats': 8.625259564591089e-10, 'n_features': 1.6375678219943912e-10, 'n_features*n_tree_repeats': 1.302388952570238e-12, 'n_features*n_samples': 3.488627499883473e-11, 'n_features*n_samples*n_tree_repeats': 4.2124781789579334e-11, 'max_depth': 3.280529943711475e-08, 'max_depth*n_tree_repeats': 5.768929558524772e-10, 'max_depth*n_samples': 6.291962320207664e-14, 'max_depth*n_samples*n_tree_repeats': 5.126839919323976e-15, 'max_depth*n_features': 6.35648749681192e-05, 'max_depth*n_features*n_tree_repeats': 1.935402530195678e-13, 'max_depth*n_features*n_samples': 1.28838675675802e-08, 'max_depth*n_features*n_samples*n_tree_repeats': 1.69854661852343e-16, '2_power_maxdepth': 3.26910486762921e-11, '2_power_maxdepth*n_tree_repeats': 1.4676442049665057e-12, '2_power_maxdepth*n_samples': 2.64316777243899e-16, '2_power_maxdepth*n_samples*n_tree_repeats': 1.4901204061072977e-17, '2_power_maxdepth*n_features': 1.140492447521818e-08, '2_power_maxdepth*n_features*n_tree_repeats': 2.404137742885295e-15, '2_power_maxdepth*n_features*n_samples': 3.6325731146686714e-13, 
'2_power_maxdepth*n_features*n_samples*n_tree_repeats': 3.723108372490702e-19} lgbm_class_time = {'': 0.07952271409861912, 'ds_size_gb': 24.914198992356777, 'ds_prep_size_gb': 24.914198992356777, 'ds_onehot_size_gb': 0.6707498854892533, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.15904542819723824, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.75292585133515e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 3.995934332919547e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 4.51061814549484e-13, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.015836831101031235, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 2.885892548234532e-11, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 2.320710370608533e-08, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.006248880421662e-14, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads': 1.6421556695965297e-07, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_tree_repeats': 0.015956943711852814, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples': 2.330829367448416e-12, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.2170171882409568e-13, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features': 0.001802775666445253, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.072475113612503e-12, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples': 3.376112165195102e-07, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 8.92885930282138e-09, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads': 7.505014868911757e-10, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_tree_repeats': 0.00041603300901854167, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples': 9.05403593468941e-15, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 2.3824258787970722e-15, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features': 2.152594512387446e-12, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.26406208478857e-14, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples': 9.221334002333759e-16, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.8809384428115866e-11} lgbm_class_ram = {'': 0.8604627263253337, 'ds_size_gb': 2.0214168208781946, 'ds_prep_size_gb': 2.0214168208781946, 'ds_onehot_size_gb': 3.622669179301401e-06, 'n_tree_repeats': 0.0015219464100389682, 'n_samples': 3.856682701344501e-07, 'n_samples*n_tree_repeats': 1.544688671627044e-10, 'n_features': 3.1028960780988996e-10, 'n_features*n_tree_repeats': 1.4858440058980697e-12, 'n_features*n_samples': 2.5173717397818705e-08, 'n_features*n_samples*n_tree_repeats': 6.656160609292717e-11, 'log_num_leaves': 1.573053922451339e-08, 'log_num_leaves*n_tree_repeats': 1.626145985707e-06, 'log_num_leaves*n_samples': 1.617414150367892e-13, 'log_num_leaves*n_samples*n_tree_repeats': 6.161688826595097e-13, 'log_num_leaves*n_features': 2.930068871528871e-11, 'log_num_leaves*n_features*n_tree_repeats': 2.7540140942935337e-13, 'log_num_leaves*n_features*n_samples': 3.939554526330466e-15, 'log_num_leaves*n_features*n_samples*n_tree_repeats': 3.851475872271092e-15, 'num_leaves': 
7.114807543594747e-11, 'num_leaves*n_tree_repeats': 7.004349205794621e-07, 'num_leaves*n_samples': 6.063719974576439e-16, 'num_leaves*n_samples*n_tree_repeats': 1.1825948996367154e-14, 'num_leaves*n_features': 6.127161836179573e-06, 'num_leaves*n_features*n_tree_repeats': 4.723694325860319e-15, 'num_leaves*n_features*n_samples': 5.682583426130539e-17, 'num_leaves*n_features*n_samples*n_tree_repeats': 2.820814699620109e-14} class ResourceParamsOld: cb_class_time = {'': 0.060695272326207535, 'ds_size_gb': 2.4268955178538847, 'ds_prep_size_gb': 2.4268955178538847, 'ds_onehot_size_gb': 0.040427221672569374, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.12139054465241507, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.0362927572255956e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 5.259225293072914e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.1159977413280863e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.002034550389178136, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 1.972850747965341e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 1.590097554595333e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 2.280000915439824e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 1.374338752023958e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 4.062768369148915e-10, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 1.242824030666801e-12, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 9.32433742185293e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 7.126063129715731e-11, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 3.344879400790812e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 2.631878772648314e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.4077434831895832e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 1.99445077397377e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 1.2520160532307873e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 3.0511461549128756e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 2.873281614024595e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 1.2644593910088394e-05, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 2.235731644015564e-14, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 1.1517663973680398e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 2.4847067022145893e-11} cb_class_ram = {'': 0.8683295939412378, 'ds_size_gb': 2.1956796547330758e-05, 'ds_prep_size_gb': 2.1956796547330758e-05, 'ds_onehot_size_gb': 0.054809311336043706, 'n_tree_repeats': 3.16259450440823e-09, 'n_samples': 5.359259624964122e-07, 'n_samples*n_tree_repeats': 1.817237502556807e-07, 'n_features': 1.728902260462638e-09, 'n_features*n_tree_repeats': 1.1883754249270118e-12, 'n_features*n_samples': 3.2106346545767416e-08, 'n_features*n_samples*n_tree_repeats': 8.080444898120663e-16, 'max_depth': 0.00023942254928693192, 'max_depth*n_tree_repeats': 7.662207891804141e-10, 
'max_depth*n_samples': 2.0135633249657367e-13, 'max_depth*n_samples*n_tree_repeats': 1.9065381412052897e-13, 'max_depth*n_features': 0.0006188384463276942, 'max_depth*n_features*n_tree_repeats': 1.825891231551508e-13, 'max_depth*n_features*n_samples': 4.017104578325911e-09, 'max_depth*n_features*n_samples*n_tree_repeats': 1.2652983818045863e-16, '2_power_maxdepth': 0.0001056123359157812, '2_power_maxdepth*n_tree_repeats': 2.694024798514516e-06, '2_power_maxdepth*n_samples': 1.3780270956209364e-15, '2_power_maxdepth*n_samples*n_tree_repeats': 2.064100170958034e-09, '2_power_maxdepth*n_features': 1.0080022114889349e-10, '2_power_maxdepth*n_features*n_tree_repeats': 6.15051597263584e-15, '2_power_maxdepth*n_features*n_samples': 2.3070275489115195e-12, '2_power_maxdepth*n_features*n_samples*n_tree_repeats': 2.7850591221080067e-17} xgb_class_time = {'': 0.04616911535729873, 'ds_size_gb': 3.47457744189382, 'ds_prep_size_gb': 3.47457744189382, 'ds_onehot_size_gb': 0.0698867127341342, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.09233823071459746, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.035228262559771e-08, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 6.154537890478014e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 8.63288843709104e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 3.291166164590293e-10, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 3.670077849317217e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 1.914319987041818e-13, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 2.926688203905133e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 1.68587043083397e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 0.0026046534716614215, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 3.601942853784541e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.5052320282512473e-14, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 0.0007712724349247164, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.967156404769764e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 1.7162683220472862e-09, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.226904474214378e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 9.064818572421352e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 6.993000397349683e-07, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 2.9578963011700153e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.991428507510768e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 2.802431219594177e-06, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 9.943166031719296e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 5.094046852454207e-14, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.515896055082407e-12} xgb_class_ram = {'': 0.89800664010472, 'ds_size_gb': 0.8958165176491728, 'ds_prep_size_gb': 0.8958165176491728, 'ds_onehot_size_gb': 1.2775211008166364e-05, 'n_tree_repeats': 0.0005355693710144896, 'n_samples': 
7.445989056176149e-08, 'n_samples*n_tree_repeats': 1.1095360093190593e-08, 'n_features': 1.419262523195433e-10, 'n_features*n_tree_repeats': 1.189619404783309e-12, 'n_features*n_samples': 2.1948939540241107e-11, 'n_features*n_samples*n_tree_repeats': 6.761378006837745e-13, 'max_depth': 4.602455291339385e-08, 'max_depth*n_tree_repeats': 5.846802665464209e-10, 'max_depth*n_samples': 6.003527146823594e-14, 'max_depth*n_samples*n_tree_repeats': 5.458849368989926e-15, 'max_depth*n_features': 8.276969896399465e-05, 'max_depth*n_features*n_tree_repeats': 1.73562626225241e-13, 'max_depth*n_features*n_samples': 1.1188204977077247e-08, 'max_depth*n_features*n_samples*n_tree_repeats': 1.2101329730965103e-16, '2_power_maxdepth': 3.500391185762912e-11, '2_power_maxdepth*n_tree_repeats': 1.446703161897511e-12, '2_power_maxdepth*n_samples': 2.6046111134557463e-16, '2_power_maxdepth*n_samples*n_tree_repeats': 1.4647083952656776e-17, '2_power_maxdepth*n_features': 8.730859656468559e-07, '2_power_maxdepth*n_features*n_tree_repeats': 2.253274531849529e-15, '2_power_maxdepth*n_features*n_samples': 5.586329461516387e-11, '2_power_maxdepth*n_features*n_samples*n_tree_repeats': 3.406456640909277e-19} lgbm_class_time = {'': 0.028063263911210914, 'ds_size_gb': 2.970270224525262, 'ds_prep_size_gb': 2.970270224525262, 'ds_onehot_size_gb': 0.09163862856656434, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.05612652782242183, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.7651417047281056e-08, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 3.057993467818764e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 6.264643485181751e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.0018753906815885733, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 2.1257067882553722e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 8.471355616223231e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 3.3001370294885434e-15, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads': 6.47442904885375e-08, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_tree_repeats': 0.0011608214817588585, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples': 9.964309915135878e-13, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 2.608150056678177e-14, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features': 0.0001926020481234091, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 4.598542008079632e-13, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples': 1.3986995179321424e-08, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 6.208468162170729e-10, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads': 1.1569746986292633e-09, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_tree_repeats': 7.442820019642213e-05, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples': 4.6777144377244544e-14, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.075739698121751e-15, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features': 2.0127433109741758e-13, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 5.291223606102416e-15, 
'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples': 2.39530599680757e-16, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.8233627245552183e-12} lgbm_class_ram = {'': 0.8545661661490145, 'ds_size_gb': 2.3080037837801175, 'ds_prep_size_gb': 2.3080037837801175, 'ds_onehot_size_gb': 4.0697094447404033e-07, 'n_tree_repeats': 0.0018080853926450316, 'n_samples': 2.994431799612211e-07, 'n_samples*n_tree_repeats': 1.1377985339470745e-09, 'n_features': 4.08148741723376e-07, 'n_features*n_tree_repeats': 1.4109066020140611e-12, 'n_features*n_samples': 2.3506833903706615e-08, 'n_features*n_samples*n_tree_repeats': 8.047116933926301e-12, 'log_num_leaves': 1.8470627691115034e-08, 'log_num_leaves*n_tree_repeats': 4.350203928522753e-07, 'log_num_leaves*n_samples': 1.4244297306885883e-13, 'log_num_leaves*n_samples*n_tree_repeats': 7.582204707419711e-13, 'log_num_leaves*n_features': 4.90256931677757e-11, 'log_num_leaves*n_features*n_tree_repeats': 2.6408516124748747e-13, 'log_num_leaves*n_features*n_samples': 3.020317664222622e-15, 'log_num_leaves*n_features*n_samples*n_tree_repeats': 2.1876975907194365e-15, 'num_leaves': 1.0490359582375276e-10, 'num_leaves*n_tree_repeats': 1.0650528506541837e-07, 'num_leaves*n_samples': 5.943342181332617e-16, 'num_leaves*n_samples*n_tree_repeats': 1.9123390691308356e-14, 'num_leaves*n_features': 6.105483514684091e-06, 'num_leaves*n_features*n_tree_repeats': 4.533114041820276e-15, 'num_leaves*n_features*n_samples': 3.668665655364504e-17, 'num_leaves*n_features*n_samples*n_tree_repeats': 1.2053037667373442e-13} ================================================ FILE: pytabkit/models/alg_interfaces/rtdl_interfaces.py ================================================ import copy from typing import List, Any, Optional, Dict, Tuple from pathlib import Path import numpy as np import pandas as pd import torch from sklearn.preprocessing import StandardScaler, OrdinalEncoder from sklearn.impute import SimpleImputer from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, SingleSplitAlgInterface, \ RandomParamsAlgInterface from pytabkit.models.alg_interfaces.sub_split_interfaces import SklearnSubSplitInterface, SingleSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import insert_missing_class_columns def allow_single_underscore(params_config: List[Tuple]) -> List[Tuple]: # allow to specify the parameters with __ or with just _ # the reason is that in the sklearn interfaces using __ is problematic # since sklearn thinks these belong to a sub-estimator params_config = copy.deepcopy(params_config) for i in range(len(params_config)): cfg = list(params_config[i]) if cfg[1] is None and '__' in cfg[0]: cfg[1] = [cfg[0], cfg[0].replace('__', '_')] params_config[i] = tuple(cfg) return params_config class SkorchSubSplitInterface(SklearnSubSplitInterface): def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray, cat_col_names: Optional[List[str]] = None): from skorch.helper import predefined_split from skorch.dataset import Dataset # set number of classes if self.n_classes > 0: # classification 
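# note: the skorch model from _create_sklearn_model is apparently built without the class count, so it is set here before fitting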
self.model.set_n_classes(self.n_classes) # get transformed_target from config transformed_target = self.config.get("transformed_target", False) if transformed_target: # do TransformedTargetRegressor by hand (because setting the # validation set in skorch conflicts with TransformedTargetRegressor) self.transformer = StandardScaler() y = self.transformer.fit_transform(y.reshape(-1, 1)) else: self.transformer = None n_samples = len(x_df) train_mask = np.ones(shape=(n_samples,), dtype=np.bool_) train_mask[val_idxs] = False # create val_ds for skorch (see FAQ) # Note that this breaks TransformedTargetRegressor, which is why we do it by hand x_train = np.array(x_df.iloc[train_mask, :], dtype=np.float32) x_val = np.array(x_df.iloc[~train_mask, :], dtype=np.float32) y_train = y[train_mask] y_val = y[~train_mask] if self.n_classes else y[~train_mask].reshape(-1, 1) self.categorical_indicator = None if cat_col_names is not None and len(cat_col_names) > 0: self.categorical_indicator = np.array([name in cat_col_names for name in x_df.columns]) self.model.set_categorical_indicator(self.categorical_indicator) # we do OrdinalEncoder one more time to be sure that there are no "holes" # in the categories # missing values were encoded as zero, we need to make them missing again self.replace_zero_by_nans = SimpleImputer(missing_values=0., strategy="constant", fill_value=np.nan, keep_empty_features=True) x_train[:, self.categorical_indicator] = self.replace_zero_by_nans.fit_transform( x_train[:, self.categorical_indicator]) self.ord_enc = OrdinalEncoder(dtype=np.float32, handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1) x_train[:, self.categorical_indicator] = self.ord_enc.fit_transform(x_train[:, self.categorical_indicator]) x_val[:, self.categorical_indicator] = self.replace_zero_by_nans.transform( x_val[:, self.categorical_indicator]) x_val[:, self.categorical_indicator] = self.ord_enc.transform(x_val[:, self.categorical_indicator]) val_ds = Dataset(x_val, y_val) self.model.set_params(train_split=predefined_split(val_ds)) self.model.fit(x_train, y_train) def predict(self, ds: DictDataset) -> torch.Tensor: # adapted from SklearnSubSplitLearner # should return tensor of shape len(ds) x output_shape if self.tfm is not None: ds = self.tfm.forward_ds(ds) x_df = ds.without_labels().to_df() x_array = np.array(x_df, dtype=np.float32) # added if self.categorical_indicator is not None: x_array[:, self.categorical_indicator] = self.replace_zero_by_nans.transform( x_array[:, self.categorical_indicator]) x_array[:, self.categorical_indicator] = self.ord_enc.transform(x_array[:, self.categorical_indicator]) # skorch doesn't support pandas dataframes if self.n_classes > 0: # classification y_pred = np.log(self.model.predict_proba(x_array) + 1e-30) else: # regression y_pred = self.model.predict(x_array) if len(y_pred.shape) == 1: y_pred = y_pred[:, None] y_pred = torch.as_tensor(y_pred, dtype=torch.float32) # guard against missing classes in the training set # (GBDT interfaces don't need this because they get passed n_classes as a parameter) y_pred = insert_missing_class_columns(y_pred, self.train_ds) # added if self.transformer is not None: y_pred = self.transformer.inverse_transform(y_pred.reshape(-1, 1)) # transform to tensor y_pred = torch.from_numpy(y_pred) return y_pred[None] # add vectorized dimension class RTDL_MLPSubSplitInterface(SkorchSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # the random state is handled by
SklearnSubSplitLearner.fit() which sets # numpy and torch seeds based on self.random_state # which is all we need for skorch, so # we don't need to use seed here params_config = allow_single_underscore([ ("lr_scheduler", None), ("lr", None), ("optimizer", None), ("module__n_layers", None), ("module__d_layers", None), ("module__d_first_layer", None), ("module__d_last_layer", None), ("module__activation", None), ("module__dropout", None), ("module__num_emb_type", None), ("module__num_emb_dim", None), ("module__num_emb_hidden_dim", None), ("module__num_emb_sigma", None), ("module__num_emb_lite", None), ("module__d_embedding", None), ("optimizer__weight_decay", None), ("batch_size", None), ("max_epochs", None), ("use_checkpoints", None), ("es_patience", None), ("lr_patience", None), ("verbose", None), ("checkpoint_dir", "tmp_folder"), ("val_metric_name", None), ]) params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0] if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None: params['checkpoint_dir'] = './rtdl_checkpoints' from pytabkit.models.nn_models.rtdl_resnet import create_mlp_classifier_skorch, create_mlp_regressor_skorch if self.n_classes > 0: return create_mlp_classifier_skorch(**params) else: return create_mlp_regressor_skorch(**params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_samples': 8e-5, 'n_samples*n_features': 8e-7} ram_params = {'': 0.3, 'ds_onehot_size_gb': 3.0} gpu_ram_params = {'': 0.4, 'ds_onehot_size_gb': 1.5, 'n_features': 1.5e-3 if self.config.get('module_num_emb_type', 'none') != 'none' else 1e-4} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds) class ResnetSubSplitInterface(SkorchSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # the random state is handled by SklearnSubSplitLearner.fit() which sets # numpy and torch seeds based on self.random_state # which is all we need for skorch, so # we don't need to use seed here params_config = allow_single_underscore([ ("lr_scheduler", None), ("module__activation", None), ("module__normalization", None), ("module__n_layers", None), ("module__d", None), ("module__d_hidden_factor", None), ("module__hidden_dropout", None), ("module__residual_dropout", None), ("optimizer__weight_decay", None), ("module__d_embedding", None), ("lr", None), ("optimizer", None), ("batch_size", None), ("max_epochs", None), ("use_checkpoints", None), ("es_patience", None), ("lr_patience", None), ("verbose", None), ("checkpoint_dir", "tmp_folder"), ("val_metric_name", None), ]) # allow to specify these parameters with __ or with just _ # the reason is that in the sklearn interfaces using __ is problematic # since sklearn thinks these belong to a sub-estimator # params_config.extend([(key, [key, key.replace('__', '_')], None) for key, source in # ] params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0] if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None: 
params['checkpoint_dir'] = './rtdl_checkpoints' from pytabkit.models.nn_models.rtdl_resnet import create_resnet_classifier_skorch, create_resnet_regressor_skorch if self.n_classes > 0: return create_resnet_classifier_skorch(**params) else: return create_resnet_regressor_skorch(**params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_samples*n_features': 8e-8} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} gpu_ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0, 'n_train': 4e-6, 'n_features': 1e-3, 'cat_size_sum': 1e-3} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) class FTTransformerSubSplitInterface(SkorchSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # the random state is handled by SklearnSubSplitLearner.fit() which sets # numpy and torch seeds based on self.random_state # which is all we need for skorch, so # we don't need to use seed here params_config = allow_single_underscore([ ("lr_scheduler", None), ("module__activation", None), ("module__n_layers", None), ("module__n_heads", None), ("module__token_bias", None), ("module__d_token", None), ("module__d_ffn_factor", None), ("module__attention_dropout", None), ("module__ffn_dropout", None), ("module__residual_dropout", None), ("module__prenormalization", None), ("module__initialization", None), ("module__kv_compression", None, None), ("module__kv_compression_sharing", None, None), ("lr", None), ("optimizer__weight_decay", None), ("optimizer", None), ("batch_size", None), ("max_epochs", None), ("use_checkpoints", None), ("es_patience", None), ("lr_patience", None), ("verbose", None), ("checkpoint_dir", "tmp_folder"), ("val_metric_name", None), ]) params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0] if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None: params['checkpoint_dir'] = './rtdl_checkpoints' from pytabkit.models.nn_models.rtdl_resnet import create_ft_transformer_classifier_skorch, create_ft_transformer_regressor_skorch if self.n_classes > 0: return create_ft_transformer_classifier_skorch(**params) else: return create_ft_transformer_regressor_skorch(**params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 # Bioresponse has 419 features and uses 12.8 GB RAM with batch size 256 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_train*n_features': 8e-6} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} # ram computation: attention matrix is n_layers * n_heads * 4bytes * n_features**2 # (coef = 
4*8*4 * 1e-9 -> just use 1e-7?) # then there is also 3 (QKV) * n_features * d_token * batch_size * n_heads * 4bytes * n_layers * 2(forward+backward) # coef = 3 * 384 * 128 * 8 * 4 * 4 * 2 / (1024)**3 = 3.5e-2 # and embedding: cat_sizes * d_token gpu_ram_params = {'': 0.2, 'ds_onehot_size_gb': 3.0, 'n_train': 4e-6, 'n_features': 3.5e-2, # use slightly smaller value (based on empirical observations) 'n_features*n_features': 4e-6, 'cat_size_sum': 1e-4} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) def choose_batch_size_rtdl(train_size) -> int: # set batch_size depending on the number of samples # as in the rtdl paper # if train_size < 10_000: # return 128 # taken from tabr paper, not used in our paper due to a bug if train_size < 30_000: return 256 elif train_size < 100_000: return 512 else: return 1024 def choose_batch_size_rtdl_new(train_size: int) -> int: # set batch_size depending on the number of samples # as in the rtdl paper if train_size < 10_000: return 128 elif train_size < 30_000: return 256 elif train_size < 100_000: return 512 else: return 1024 class RTDL_MLP_ParamSamplerNew: def __init__(self, is_classification: bool, train_size: int, num_emb_type: str = 'none'): self.is_classification = is_classification self.train_size = train_size self.num_emb_type = num_emb_type def sample_params(self, seed: int) -> Dict[str, Any]: rng = np.random.default_rng(seed=seed) # cutoff to change hp space for large datasets # as in rtdl # the cutoff is between 70K and 300K cutoff_train_size_rtdl = 100_000 is_large_dataset = self.train_size > cutoff_train_size_rtdl params = { # reduced d_layers "module_n_layers": rng.choice(np.arange(1, 17)) if is_large_dataset \ else rng.choice(np.arange(1, 9)), "module_d_layers": rng.choice(np.arange(1, 1025)) if is_large_dataset \ else rng.choice(np.arange(1, 513)), # "Note that the size of the first and the last layers are tuned and set separately, while the size for # “in-between” layers is the same for all of them." from rtdl paper "module_d_first_layer": rng.choice(np.arange(1, 1025)) if is_large_dataset \ else rng.choice(np.arange(1, 513)), "module_d_last_layer": rng.choice(np.arange(1, 1025)) if is_large_dataset \ else rng.choice(np.arange(1, 513)), "module_dropout": rng.choice([rng.uniform(0, 0.5)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-3)))] + [0.] ), "module_d_embedding": rng.choice(np.arange(1, 65)), # have smaller embedding sizes to avoid RAM issues "batch_size": choose_batch_size_rtdl_new(self.train_size), "lr_scheduler": False, "optimizer": "adamw", "max_epochs": 400, "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } # MLP-PLR space from # https://github.com/yandex-research/rtdl-num-embeddings/blob/main/exp/mlp-plr/adult/log_linear_fixed_tuning.toml # lr: loguniform(5e-5, 5e-3) # wd: 0, loguniform(1e-6, 1e-3) # sigma: 1e-3, 1e2 (or 1e-2, 1e2 for a different version) # had one-hot encodings # d_layers: ? 
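# usage sketch (illustrative; 'plr' is an assumed example value for num_emb_type): RTDL_MLP_ParamSamplerNew(is_classification=True, train_size=20_000, num_emb_type='plr').sample_params(seed=0) returns a config with batch_size == 256 (since 10_000 <= 20_000 < 30_000) plus the numerical-embedding parameters added below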
if self.num_emb_type != 'none': params['module_num_emb_type'] = self.num_emb_type params['module_num_emb_dim'] = rng.choice(np.arange(1, 65)) # reduced from upper bound 128 params['module_num_emb_hidden_dim'] = rng.choice(np.arange(1, 65)) # reduced from upper bound 128 params['module_num_emb_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) if self.is_classification: params["transformed_target"] = False else: params["transformed_target"] = True return params class RTDL_ResNet_ParamSampler: def __init__(self, is_classification: bool, train_size: int): self.is_classification = is_classification self.train_size = train_size def sample_params(self, seed: int) -> Dict[str, Any]: rng = np.random.default_rng(seed=seed) # cutoff to change hp space for large datasets # as in rtdl # the cutoff is between 70K and 300K cutoff_train_size_rtdl = 100_000 is_large_dataset = self.train_size > cutoff_train_size_rtdl params = { "module_n_layers": rng.choice(np.arange(1, 17)) if is_large_dataset \ else rng.choice(np.arange(1, 9)), "module_d": rng.choice(np.arange(64, 1025)) if is_large_dataset \ else rng.choice(np.arange(64, 513)), "module_d_hidden_factor": rng.choice(np.arange(1, 5)), "module_hidden_dropout": rng.uniform(0.0, 0.5), "module_residual_dropout": rng.choice([rng.uniform(0, 0.5)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-3)))] + [0.] ), "module_d_embedding": rng.choice(np.arange(8, 32)), # we go lower (than 64) # because we have smaller datasets with categorical features "batch_size": choose_batch_size_rtdl(self.train_size), "module_activation": "relu", "module_normalization": "batchnorm", "lr_scheduler": False, "optimizer": "adamw", "max_epochs": 400, "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } if self.is_classification: params["transformed_target"] = False else: params["transformed_target"] = True return params class RTDL_ResNet_ParamSamplerNew: def __init__(self, is_classification: bool, train_size: int): self.is_classification = is_classification self.train_size = train_size def sample_params(self, seed: int) -> Dict[str, Any]: rng = np.random.default_rng(seed=seed) # cutoff to change hp space for large datasets # as in rtdl # the cutoff is between 70K and 300K cutoff_train_size_rtdl = 100_000 is_large_dataset = self.train_size > cutoff_train_size_rtdl params = { "module_n_layers": rng.choice(np.arange(1, 17)) if is_large_dataset \ else rng.choice(np.arange(1, 9)), "module_d": rng.choice(np.arange(64, 1025)) if is_large_dataset \ else rng.choice(np.arange(64, 513)), "module_d_hidden_factor": rng.choice(np.arange(1, 5)), "module_hidden_dropout": rng.uniform(0.0, 0.5), "module_residual_dropout": rng.choice([rng.uniform(0, 0.5)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-3)))] + [0.] 
), "module_d_embedding": rng.choice(np.arange(1, 65)), # use smaller embedding dimensions "batch_size": choose_batch_size_rtdl_new(self.train_size), "module_activation": "relu", "module_normalization": "batchnorm", "lr_scheduler": False, "optimizer": "adamw", "max_epochs": 400, "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } if self.is_classification: params["transformed_target"] = False else: params["transformed_target"] = True return params class RandomParamsResnetAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return RandomParamsResnetAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int, n_train: int): # this is also set in get_required_resources, but okay if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [RTDL_ResNet_ParamSamplerNew(is_classification, n_train).sample_params(hparam_seed)] # self.fit_params = [RTDL_ResNet_ParamSamplerNew(is_classification, n_train).sample_params(hparam_seed)] return SingleSplitWrapperAlgInterface( [ResnetSubSplitInterface(**utils.update_dict(self.config, self.fit_params[0]))]) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed, idxs_list[0].n_train) self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) def predict(self, ds: DictDataset) -> torch.Tensor: return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0], n_train) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) class RandomParamsFTTransformerAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed=seed) params = { "module_n_layers": rng.choice(np.arange(1, 5)), "module_d_token": 8 * rng.choice(np.arange(2, 49)), # this is different in https://github.com/yandex-research/rtdl-revisiting-models/blob/main/output/adult/ft_transformer/tuning/0.toml # but used like this in the newer tabr paper spaces "module_d_ffn_factor": rng.uniform(2 / 3, 8 / 3), "module_ffn_dropout": rng.uniform(0.0, 0.5), "module_attention_dropout": rng.uniform(0.0, 0.5), "module_residual_dropout": rng.choice([rng.uniform(0, 0.2)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-4)))] + [0.] 
), "batch_size": choose_batch_size_rtdl_new(n_train), "lr_scheduler": False, "max_epochs": 400, # introduced a limit, like for MLP and ResNet "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } if is_classification: params["transformed_target"] = False return utils.join_dicts(DefaultParams.FTT_D_CLASS, params) else: params["transformed_target"] = True return utils.join_dicts(DefaultParams.FTT_D_REG, params) def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([FTTransformerSubSplitInterface(**config) for i in range(n_tv_splits)]) class RandomParamsRTDLMLPAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return RandomParamsRTDLMLPAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int, n_train: int): if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [RTDL_MLP_ParamSamplerNew(is_classification, n_train, num_emb_type=self.config.get('num_emb_type', 'none') ).sample_params(hparam_seed)] return SingleSplitWrapperAlgInterface( [RTDL_MLPSubSplitInterface(**utils.update_dict(self.config, self.fit_params[0]))]) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed, n_train=idxs_list[0].n_train) print(f'{self.fit_params[0]=}') self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) def predict(self, ds: DictDataset) -> torch.Tensor: return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0], n_train) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train) ================================================ FILE: pytabkit/models/alg_interfaces/sub_split_interfaces.py ================================================ import copy import random from pathlib import Path from typing import List, Optional, Dict, Any, Tuple import numpy as np import pandas as pd import torch from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import insert_missing_class_columns class SingleSplitWrapperAlgInterface(SingleSplitAlgInterface): """ AlgInterface that takes multiple AlgInterfaces that can only handle a single train-val-test split and wraps them to handle a trainval-test split (possibly with multiple train-val splits) """ def __init__(self, sub_split_interfaces: 
List[AlgInterface], fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ :param sub_split_interfaces: Interfaces for each sub-split (train-val split). """ super().__init__(fit_params=fit_params, **config) self.sub_split_interfaces = sub_split_interfaces def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': if fit_params is not None: assert len(fit_params) == 1 # single split required orig_fit_params = fit_params fit_params = fit_params or self.fit_params config = utils.join_dicts(self.sub_split_interfaces[0].config, self.config) if config.get('use_best_mean_iteration_for_refit', True): sub_fit_params = [utils.update_dict(fit_params[0], remove_keys='sub_fit_params')] return SingleSplitWrapperAlgInterface( [self.sub_split_interfaces[0].get_refit_interface( n_refit=1, fit_params=sub_fit_params) for i in range(n_refit)], fit_params=fit_params) else: if n_refit != len(self.sub_split_interfaces): raise ValueError('When use_best_mean_iteration_for_refit==False, we must have n_cv==n_refit, ' f'but got n_cv={len(self.sub_split_interfaces)} and {n_refit=}') if orig_fit_params is not None: raise ValueError('When use_best_mean_iteration_for_refit==False, ' 'fit_params in get_refit_interface() should be None') return SingleSplitWrapperAlgInterface( [ssi.get_refit_interface(n_refit=1) for ssi in self.sub_split_interfaces], fit_params=fit_params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 # this is a SingleSplitAlgInterface assert len(tmp_folders) == 1 # this is a SingleSplitAlgInterface if self.config.get('same_seed_for_sub_splits', False): idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed, sub_split_seeds=[idxs.sub_split_seeds[0]] * len(idxs.sub_split_seeds), split_id=idxs.split_id) for idxs in idxs_list] split_idxs = idxs_list[0] tmp_folder = tmp_folders[0] hyper_results_list = [] # todo: this could be parallelized if necessary, but not for now for i in range(split_idxs.n_trainval_splits): sub_split_idxs = [split_idxs.get_sub_split_idxs_alt(i)] sub_tmp_folder = tmp_folder / f'sub_split_{i}' if tmp_folder is not None else None # don't set fit_params here # because we might intentionally not want to set them if use_best_mean_iteration_for_refit==False # see get_refit_interfaces() # if self.fit_params is not None: # self.sub_split_interfaces[i].fit_params = self.fit_params hyper_results = self.sub_split_interfaces[i].fit(ds, sub_split_idxs, interface_resources, logger, [sub_tmp_folder], name=name) hyper_results = hyper_results[0][0] if hyper_results is not None else [] hyper_results_list.append(hyper_results) if self.fit_params is None: # determine best fit parameters (early stopping epoch or so) # by averaging losses across cv splits and then taking the minimum of that n_hyper_results = [len(hyper_result) for hyper_result in hyper_results_list] # print(f'{n_hyper_results=}') # truncate all hyper results to minimum length (could be different in case of early stopping) min_n_hyper_results = min(n_hyper_results) if min_n_hyper_results > 0: for i in range(len(hyper_results_list)): hyper_results_list[i] = hyper_results_list[i][:min_n_hyper_results] n_hyper_results = [len(hyper_result) for hyper_result in hyper_results_list] if not utils.all_equal(n_hyper_results): raise 
RuntimeError(f'Got hyperparameter results of different lengths: {n_hyper_results}') for i in range(n_hyper_results[0]): if not utils.all_equal([frozenset(hyper_result[i][0]) for hyper_result in hyper_results_list]): raise RuntimeError(f'Hyperparameter result lists did not use the same hyperparameters') mean_hyper_results = np.asarray([np.mean([hyper_result[i][1] for hyper_result in hyper_results_list]) for i in range(n_hyper_results[0])]) # use reverse argmin for ties since it sometimes gives better results best_idx = utils.reverse_argmin(mean_hyper_results) self.fit_params = [copy.copy(hyper_results_list[0][best_idx][0])] self.fit_params[0]['sub_fit_params'] = [ssi.fit_params for ssi in self.sub_split_interfaces] # steal the config from the sub_split_interface because it usually gets all the kwargs config = utils.join_dicts(self.sub_split_interfaces[0].config, self.config) if config.get('use_best_mean_iteration_for_cv', False): for ssi in self.sub_split_interfaces: ssi.fit_params = self.fit_params else: self.fit_params = [dict( sub_fit_params=[(ssi.fit_params[0] if ssi.fit_params is not None else None) for ssi in self.sub_split_interfaces])] return None def predict(self, ds: DictDataset) -> torch.Tensor: # todo: pay attention to dimensions return torch.cat([s.predict(ds) for s in self.sub_split_interfaces], dim=0) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_splits == 1 assert n_cv == len(self.sub_split_interfaces) # todo: this is ignoring the refit stage single_resources = [ ssi.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[split_seed], n_train=n_train) for ssi, split_seed in zip(self.sub_split_interfaces, split_seeds)] return RequiredResources.combine_sequential(single_resources) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return self.sub_split_interfaces[0].get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) for ssi in self.sub_split_interfaces: ssi.set_current_predict_params(name) class SklearnSubSplitInterface(SingleSplitAlgInterface): # todo: have another base class """ Base class for AlgInterfaces based on scikit-learn methods. """ def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.tfm = None self.n_classes = None self.model = None self.train_ds = None def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 # print(f'fit(): {torch.cuda.is_initialized()=}') # return List[Tuple[Dict, float]]], i.e., validation scores for every hyperparameter combination # (could be number of trees, early stopping epoch, or hyperparameters from hyperparameter optimization) # if hyperparams is not None, use these and maybe only return one list element? 
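# seed all RNG sources (torch / numpy / random) from the sub-split seed so that fits are reproducible per sub-split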
seed = idxs_list[0].sub_split_seeds[0] torch.manual_seed(seed) # can be useful for label encoding with randomized permutation np.random.seed(seed) random.seed(seed) # print(f'Seeding with seed {seed}') # print(f'{type(seed)=}') self.n_classes = ds.get_n_classes() if idxs_list[0].val_idxs is None: trainval_idxs = idxs_list[0].train_idxs[0] # validation indices such that trainval_idxs[rel_val_idxs] is the val_idxs # can be used to index trainval_ds later rel_val_idxs = torch.zeros(0, dtype=torch.long) else: trainval_idxs = torch.cat([idxs_list[0].train_idxs[0], idxs_list[0].val_idxs[0]], dim=0) rel_val_idxs = torch.arange(idxs_list[0].n_train, trainval_idxs.shape[0], dtype=torch.long) trainval_ds = ds.get_sub_dataset(trainval_idxs) # for filling in missing classes in the train dataset later # might not work when the validation set contains classes that the training set doesn't contain self.train_ds = ds.get_sub_dataset(idxs_list[0].train_idxs[0]) self.config["tmp_folder"] = tmp_folders[0] self.config['interface_resources'] = interface_resources # create preprocessing factory factory = self.config.get('factory', None) if factory is None: factory = PreprocessingFactory(**self.config) # transform according to factory fitter = factory.create(ds.tensor_infos) self.tfm, trainval_ds = fitter.fit_transform(trainval_ds) y = trainval_ds.tensors['y'] self.model = self._create_sklearn_model(seed=seed, n_threads=interface_resources.n_threads, gpu_devices=interface_resources.gpu_devices) if self.n_classes == 0 and trainval_ds.tensor_infos['y'].get_n_features() > 1 \ and self.config.get('use_multioutput_regressor', False): from sklearn.multioutput import MultiOutputRegressor self.model = MultiOutputRegressor(self.model) # todo: test this y = y.numpy() else: y = y[:, 0].numpy() x_df = trainval_ds.without_labels().to_df() cat_col_names = list(x_df.select_dtypes(include='category').columns) self._fit_sklearn(x_df=x_df, y=y, val_idxs=rel_val_idxs.numpy(), cat_col_names=cat_col_names) return None def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray, cat_col_names: Optional[List[str]] = None): # by default, we ignore the validation set since most sklearn methods do not support it n_samples = len(x_df) train_mask = np.ones(shape=(n_samples,), dtype=np.bool_) train_mask[val_idxs] = False x_df = x_df.iloc[train_mask, :] y = y[train_mask] if cat_col_names is not None and len(cat_col_names) > 0: self.model.fit(x_df, y, **{self._get_cat_indexes_arg_name(): cat_col_names}) else: self.model.fit(x_df, y) def predict(self, ds: DictDataset) -> torch.Tensor: # should return tensor of shape len(ds) x output_shape if self.tfm is not None: ds = self.tfm.forward_ds(ds) x_df = ds.without_labels().to_df() if self.n_classes > 0: # classification y_pred = np.log(self._predict_proba_sklearn(x_df) + 1e-30) else: # regression y_pred = self._predict_sklearn(x_df) if len(y_pred.shape) == 1: y_pred = y_pred[:, None] y_pred = torch.as_tensor(y_pred, dtype=torch.float32) # guard against missing classes in the training set # (GBDT interfaces don't need this because they get passed n_classes as a parameter) y_pred = insert_missing_class_columns(y_pred, self.train_ds) return y_pred[None] # add n_models dimension def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict(x_df) def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict_proba(x_df) def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # override this in 
subclasses raise NotImplementedError() def _get_cat_indexes_arg_name(self) -> str: # override this in subclasses if categorical features are supported raise NotImplementedError() class TreeBasedSubSplitInterface(SingleSplitAlgInterface): # todo: insert more appropriate class to inherit from? """ Base class for tree-based ML models (XGB, LGBM, CatBoost). """ def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.config = config self.tfm = None self.n_classes = None self.model = None def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 # return List[Tuple[Dict, float]]], i.e., validation scores for every hyperparameter combination # (could be number of trees, early stopping epoch, or hyperparameters from hyperparameter optimization) # if hyperparams is not None, use these and maybe only return one list element? seed = idxs_list[0].sub_split_seeds[0] torch.manual_seed(seed) # can be useful for label encoding with randomized permutation np.random.seed(seed) random.seed(seed) self.n_classes = ds.get_n_classes() train_idxs = idxs_list[0].train_idxs[0] val_idxs = idxs_list[0].val_idxs[0] if idxs_list[0].val_idxs is not None else None train_ds = ds.get_sub_dataset(train_idxs) is_cv = val_idxs is not None val_ds = ds.get_sub_dataset(val_idxs) if is_cv else None # create preprocessing factory factory = self.config.get('factory', None) if factory is None: factory = PreprocessingFactory(**self.config) # transform according to factory fitter = factory.create(ds.tensor_infos) if is_cv: trainval_ds = ds.get_sub_dataset(torch.cat([train_idxs, val_idxs], dim=0)) else: trainval_ds = train_ds self.tfm = fitter.fit(trainval_ds) train_ds = self.tfm.forward_ds(train_ds) if is_cv: val_ds = self.tfm.forward_ds(val_ds) params = self._get_params() if self.fit_params is not None: params = utils.update_dict(params, self.fit_params[0]) gpu_ids = [int(dev_str[len('cuda:'):]) for dev_str in interface_resources.gpu_devices if dev_str.startswith('cuda:')] if len(gpu_ids) > 0 and self.config.get('allow_gpu', True): params['device'] = f'cuda:{gpu_ids[0]}' # this is for XGBoost 2.0 and CatBoost self.model, val_errors = self._fit(train_ds, val_ds, params=params, seed=seed, n_threads=interface_resources.n_threads, val_metric_name=self.config.get('val_metric_name', None), tmp_folder=tmp_folders[0]) if val_errors is None: return None else: if self.config.get('use_best_checkpoint', True): if isinstance(val_errors, dict): # have multiple errors for different metrics self.fit_params = [dict( n_estimators={key: utils.reverse_argmin(values) + 1 for key, values in val_errors.items()})] else: self.fit_params = [dict(n_estimators=utils.reverse_argmin(val_errors) + 1)] else: self.fit_params = [dict(n_estimators=len(val_errors))] if isinstance(val_errors, dict): return None # not implemented else: return [[[(dict(n_estimators=i + 1), err) for i, err in enumerate(val_errors)]]] def predict(self, ds: DictDataset) -> torch.Tensor: # should return tensor of shape len(ds) x output_shape pred_dict = self.get_current_predict_params_dict() pred_params = dict() if self.fit_params is not None: if 'val_metric_name' in pred_dict: pred_params = dict(n_estimators=self.fit_params[0]['n_estimators'][pred_dict['val_metric_name']]) else: pred_params 
= self.fit_params[0] if self.tfm is not None: ds = self.tfm.forward_ds(ds) return self._predict(self.model, ds, self.n_classes, pred_params)[None] def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: raise NotImplementedError() def _predict(self, bst: Any, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: raise NotImplementedError() def _get_params(self) -> Dict[str, Any]: raise NotImplementedError() def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: val_metric_names = self.config.get('val_metric_names', None) if val_metric_names is None: return {'': dict()} else: return {f'_val-{val_metric_name}': dict(val_metric_name=val_metric_name) for val_metric_name in val_metric_names} ================================================ FILE: pytabkit/models/alg_interfaces/tabm_interface.py ================================================ import functools import math import random from pathlib import Path import scipy import sklearn import torch import numpy as np from pytabkit.models.training.metrics import Metrics from torch import nn from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, RandomParamsAlgInterface from typing import Optional, List, Dict, Any, Union, Tuple, Literal from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.nn_models import rtdl_num_embeddings from pytabkit.models.nn_models.base import Fitter from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.nn_models.tabm import Model, make_parameter_groups from pytabkit.models.training.logging import Logger def get_tabm_auto_batch_size(n_train: int) -> int: # by Yury Gorishniy, inferred from the choices in the TabM paper. 
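    # For example, with the thresholds below: n_train=1_000 -> 32, n_train=5_000 -> 128,
    # n_train=50_000 -> 512, n_train=200_000 -> 1024.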
    if n_train < 2_800:
        return 32
    if n_train < 4_500:
        return 64
    if n_train < 6_400:
        return 128
    if n_train < 32_000:
        return 256
    if n_train < 108_000:
        return 512
    return 1024


class TabMSubSplitInterface(SingleSplitAlgInterface):
    def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config):
        super().__init__(fit_params=fit_params, **config)

    def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface':
        raise NotImplementedError()

    def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources,
            logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[
        List[List[List[Tuple[Dict, float]]]]]:
        assert len(idxs_list) == 1
        assert idxs_list[0].n_trainval_splits == 1
        seed = idxs_list[0].sub_split_seeds[0]
        # print(f'Setting seed: {seed}')
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # hyperparams
        arch_type = self.config.get('arch_type', 'tabm')
        num_emb_type = self.config.get('num_emb_type', 'none')
        n_epochs = self.config.get('n_epochs', 1_000_000_000)
        patience = self.config.get('patience', 16)
        batch_size = self.config.get('batch_size', 256)
        compile_model = self.config.get('compile_model', False)
        lr = self.config.get('lr', 2e-3)
        d_embedding = self.config.get('d_embedding', 16)
        d_block = self.config.get('d_block', 512)
        dropout = self.config.get('dropout', 0.1)
        tabm_k = self.config.get('tabm_k', 32)
        allow_amp = self.config.get('allow_amp', False)
        n_blocks = self.config.get('n_blocks', 'auto')
        num_emb_n_bins = self.config.get('num_emb_n_bins', 48)
        # defaults to False, i.e., each of the k submodels is trained on its own batches
        share_training_batches = self.config.get("share_training_batches", False)
        val_metric_name = self.config.get('val_metric_name', None)
        train_metric_name = self.config.get('train_metric_name', None)
        weight_decay = self.config.get('weight_decay', 0.0)
        gradient_clipping_norm = self.config.get('gradient_clipping_norm', None)

        TaskType = Literal['regression', 'binclass', 'multiclass']
        n_train = idxs_list[0].n_train
        n_classes = ds.get_n_classes()
        cat_cardinalities = ds.tensor_infos['x_cat'].get_cat_sizes().numpy().tolist()
        task_type: TaskType = 'regression' if n_classes == 0 else ('binclass' if n_classes == 2 else 'multiclass')
        device = interface_resources.gpu_devices[0] if len(interface_resources.gpu_devices) >= 1 else 'cpu'
        device = torch.device(device)
        if num_emb_n_bins >= n_train:
            print('Reducing num_emb_n_bins to be smaller than n_train')
            num_emb_n_bins = n_train - 1
        if val_metric_name is None:
            val_metric_name = 'rmse' if task_type == 'regression' else 'class_error'
        if batch_size == "auto":
            batch_size = get_tabm_auto_batch_size(n_train=n_train)

        self.n_classes_ = n_classes
        self.task_type_ = task_type
        self.device_ = device

        # create preprocessing factory
        factory = self.config.get('factory', None)
        if 'tfms' not in self.config:
            self.config['tfms'] = ['quantile_tabr']
        if factory is None:
            factory = PreprocessingFactory(**self.config)
        if idxs_list[0].val_idxs is None:
            raise ValueError('Training without validation set is currently not implemented')
        ds_parts = {'train': ds.get_sub_dataset(idxs_list[0].train_idxs[0]),
                    'val': ds.get_sub_dataset(idxs_list[0].val_idxs[0]),
                    # 'test': ds.get_sub_dataset(idxs_list[0].test_idxs)
                    }
        part_names = ['train', 'val']  # no test
        non_train_part_names = ['val']
        # transform according to factory
        fitter: Fitter = factory.create(ds.tensor_infos)
        self.tfm_, ds_parts['train'] = fitter.fit_transform(ds_parts['train'])
        for part in non_train_part_names:
            ds_parts[part] =
self.tfm_(ds_parts[part]) # filter out numerical columns with only a single value x_cont_train = ds_parts['train'].tensors['x_cont'] for part in part_names: ds_parts[part] = ds_parts[part].to(device) # mask of which columns are not constant self.num_col_mask_ = ~torch.all(x_cont_train == x_cont_train[0:1, :], dim=0) for part in part_names: ds_parts[part].tensors['x_cont'] = ds_parts[part].tensors['x_cont'][:, self.num_col_mask_] # tensor infos are not correct anymore, but might not be used either # update n_cont_features = ds_parts['train'].tensors['x_cont'].shape[1] Y_train = ds_parts['train'].tensors['y'].clone() if task_type == 'regression': assert Y_train.shape[-1] == 1 self.y_mean_ = ds_parts['train'].tensors['y'].mean(dim=0, keepdim=True).item() self.y_std_ = ds_parts['train'].tensors['y'].std(dim=0, keepdim=True, correction=0).item() self.y_max_ = ds_parts['train'].tensors['y'].max().item() self.y_min_ = ds_parts['train'].tensors['y'].min().item() Y_train = (Y_train - self.y_mean_) / (self.y_std_ + 1e-30) data = {part: utils.join_dicts( dict(x_cont=ds_parts[part].tensors['x_cont'], y=ds_parts[part].tensors['y']), dict(x_cat=ds_parts[part].tensors['x_cat']) if ds.tensor_infos['x_cat'].get_n_features() > 0 else dict()) for part in part_names} # adapted from https://github.com/yandex-research/tabm/blob/main/example.ipynb # Automatic mixed precision (AMP) # torch.float16 is implemented for completeness, # but it was not tested in the project, # so torch.bfloat16 is used by default. amp_dtype = ( torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else None ) # Changing False to True will result in faster training on compatible hardware. amp_enabled = allow_amp and amp_dtype is not None and device.type == 'cuda' grad_scaler = torch.cuda.amp.GradScaler() if amp_dtype is torch.float16 else None # type: ignore # fmt: off logger.log(1, f'Device: {device.type.upper()}' f'\nAMP: {amp_enabled} (dtype: {amp_dtype})' f'\ntorch.compile: {compile_model}' ) # fmt: on pass # Choose one of the two configurations below. # TabM bins = None if num_emb_type != 'pwl' or n_cont_features == 0 else rtdl_num_embeddings.compute_bins(data['train']['x_cont'], n_bins=num_emb_n_bins) d_out = n_classes if n_classes > 0 else 1 if train_metric_name is not None and train_metric_name.startswith('multi_pinball'): d_out = train_metric_name.count(',')+1 model = Model( n_num_features=n_cont_features, cat_cardinalities=cat_cardinalities, n_classes=d_out, backbone={ 'type': 'MLP', 'n_blocks': n_blocks if n_blocks != 'auto' else (3 if bins is None else 2), 'd_block': d_block, 'dropout': dropout, }, bins=bins, num_embeddings=( None if bins is None else { 'type': 'PiecewiseLinearEmbeddings', 'd_embedding': d_embedding, 'activation': False, 'version': 'B', } ), arch_type=arch_type, k=tabm_k, share_training_batches=share_training_batches, ).to(device) # import tabm # num_embeddings = None if bins is None else rtdl_num_embeddings.PiecewiseLinearEmbeddings( # bins=bins, # d_embedding=d_embedding, # activation=False, # version='B', # ) # model = tabm.TabM( # n_num_features=n_cont_features, # cat_cardinalities=cat_cardinalities, # d_out = n_classes if n_classes > 0 else 1, # num_embeddings = num_embeddings, # n_blocks=n_blocks if n_blocks != 'auto' else (3 if bins is None else 2), # d_block=d_block, # dropout=dropout, # arch_type=arch_type, # k=tabm_k, # # todo: can introduce activation # share_training_batches=share_training_batches, # todo: disappeared? 
# ) optimizer = torch.optim.AdamW(make_parameter_groups(model), lr=lr, weight_decay=weight_decay) if compile_model: # NOTE # `torch.compile` is intentionally called without the `mode` argument # (mode="reduce-overhead" caused issues during training with torch==2.0.1). model = torch.compile(model) evaluation_mode = torch.no_grad else: evaluation_mode = torch.inference_mode @torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype) # type: ignore[code] def apply_model(part: str, idx: torch.Tensor) -> torch.Tensor: return ( model( data[part]['x_cont'][idx], data[part]['x_cat'][idx] if 'x_cat' in data[part] else None, ) .float() ) if train_metric_name is None: train_metric_name = 'mse' if self.n_classes_ == 0 else 'cross_entropy' if train_metric_name == 'mse': base_loss_fn = torch.nn.functional.mse_loss elif train_metric_name == 'cross_entropy': base_loss_fn = lambda a, b: torch.nn.functional.cross_entropy(a, b.squeeze(-1)) else: base_loss_fn = functools.partial(Metrics.apply, metric_name=train_metric_name) def loss_fn(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor: # TabM produces k predictions per object. Each of them must be trained separately. # (regression) y_pred.shape == (batch_size, k) # (classification) y_pred.shape == (batch_size, k, n_classes) k = y_pred.shape[1] # print(f'{y_pred.flatten(0, 1).shape=}, {y_true.shape=}') return base_loss_fn( y_pred.flatten(0, 1), y_true.repeat_interleave(k) if model.share_training_batches else y_true, ) @evaluation_mode() def evaluate(part: str) -> float: model.eval() # When using torch.compile, you may need to reduce the evaluation batch size. eval_batch_size = 1024 y_pred: torch.Tensor = ( torch.cat( [ apply_model(part, idx) for idx in torch.arange(len(data[part]['y']), device=device).split( eval_batch_size ) ] ) ) if task_type == 'regression': # Transform the predictions back to the original label space. y_pred = y_pred * self.y_std_ + self.y_mean_ # Compute the mean of the k predictions. average_logits = self.config.get('average_logits', False) if average_logits: y_pred = y_pred.mean(dim=1) if task_type != 'regression': # For classification, the mean must be computed in the probability space. y_pred = y_pred.softmax(dim=-1) if not average_logits: y_pred = y_pred.mean(dim=1) y_true = data[part]['y'].cpu() y_pred = y_pred.cpu() if task_type == 'regression' and len(y_true.shape) == 1: y_true = y_true.unsqueeze(-1) if task_type == 'regression' and len(y_pred.shape) == 1: y_pred = y_pred.unsqueeze(-1) # use minus so higher=better score = -Metrics.apply(y_pred, y_true, val_metric_name).item() return float(score) # The higher -- the better. # print(f'Test score before training: {evaluate("test"):.4f}') epoch_size = math.ceil(n_train / batch_size) best = { 'val': -math.inf, # 'test': -math.inf, 'epoch': -1, } best_params = [p.clone() for p in model.parameters()] # Early stopping: the training stops when # there are more than `patience` consecutive bad updates. 
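        # For example, with patience=16, `remaining_patience` counts down 16 -> 15 -> ... -> 0 -> -1
        # over consecutive epochs without a new best validation score, so training stops after 17
        # consecutive non-improving epochs; any improvement resets the counter to `patience`.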
remaining_patience = patience try: if self.config.get('verbosity', 0) >= 1: from tqdm.std import tqdm else: tqdm = lambda arr, desc: arr except ImportError: tqdm = lambda arr, desc: arr logger.log(1, '-' * 88 + '\n') for epoch in range(n_epochs): batches = ( torch.randperm(n_train, device=device).split(batch_size) if model.share_training_batches else [ x.transpose(0, 1).flatten() for x in torch.rand((model.k, n_train), device=device).argsort(dim=1).split(batch_size, dim=1) ] ) model.train() for batch_idx in tqdm(batches, desc=f"Epoch {epoch}"): optimizer.zero_grad(set_to_none=True) preds = apply_model('train', batch_idx) loss = loss_fn(preds, Y_train[batch_idx]) if grad_scaler is None: loss.backward() if gradient_clipping_norm not in (None, 'none'): torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping_norm) # type: ignore optimizer.step() else: grad_scaler.scale(loss).backward() if gradient_clipping_norm not in (None, 'none'): # unscale before clipping so the grads are in FP32 grad_scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping_norm) # type: ignore grad_scaler.step(optimizer) grad_scaler.update() val_score = evaluate('val') # test_score = evaluate('test') # logger.log(1, f'(val) {val_score:.4f} (test) {test_score:.4f}') logger.log(1, f'(val) {val_score:.4f}') if val_score > best['val']: logger.log(1, '🌸 New best epoch! 🌸') # best = {'val': val_score, 'test': test_score, 'epoch': epoch} best = {'val': val_score, 'epoch': epoch} remaining_patience = patience with torch.no_grad(): for bp, p in zip(best_params, model.parameters()): bp.copy_(p) else: remaining_patience -= 1 if remaining_patience < 0: break logger.log(1, '') logger.log(1, '\n\nResult:') logger.log(1, str(best)) logger.log(1, f'Restoring best model') with torch.no_grad(): for bp, p in zip(best_params, model.parameters()): p.copy_(bp) self.model_ = model return None def predict(self, ds: DictDataset) -> torch.Tensor: self.model_.eval() ds = self.tfm_(ds).to(self.device_) ds.tensors['x_cont'] = ds.tensors['x_cont'][:, self.num_col_mask_] eval_batch_size = 1024 with torch.no_grad(): y_pred: torch.Tensor = ( torch.cat( [ self.model_( ds.tensors['x_cont'][idx], ds.tensors['x_cat'][idx] if not ds.tensor_infos['x_cat'].is_empty() else None, ) .float() for idx in torch.arange(ds.n_samples, device=self.device_).split( eval_batch_size ) ] ) ) if self.task_type_ == 'regression': # Transform the predictions back to the original label space. y_pred = y_pred.mean(1) y_pred = y_pred * self.y_std_ + self.y_mean_ if self.config.get('clamp_output', False): y_pred = torch.clamp(y_pred, self.y_min_, self.y_max_) else: average_logits = self.config.get('average_logits', False) if average_logits: y_pred = y_pred.mean(1) else: # For classification, the mean must be computed in the probability space. 
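                # With k heads producing logits z_1, ..., z_k: average_logits=True returns mean_i(z_i),
                # while the default path below returns log(mean_i(softmax(z_i)) + 1e-30), i.e., the
                # log-probabilities of the probability-space average of the k predictions.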
y_pred = torch.log(torch.softmax(y_pred, dim=-1).mean(1) + 1e-30) return y_pred[None].cpu() # add n_models dimension def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_samples*n_features': 8e-8} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} gpu_ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0, 'n_train': 6e-6, 'n_features': 1.5e-3, # reduced from 2e-3 'cat_size_sum': 1e-4} # reduced from 1e-3 rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) class RandomParamsTabMAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from Grinsztajn et al. (2022) hpo_space_name = self.config.get('hpo_space_name', 'default') if hpo_space_name == 'default': params = { "batch_size": "auto", "patience": 16, "allow_amp": True, "arch_type": "tabm-mini", "tabm_k": 32, # "gradient_clipping_norm": 1.0, # wasn't correctly implemented so we remove it in v1.7.0 # this makes it probably slower with numerical embeddings, and also more RAM intensive # according to the paper it's not very important but should be a bit better (?) "share_training_batches": False, "lr": np.exp(rng.uniform(np.log(1e-4), np.log(3e-3))), "weight_decay": rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-1)))]), "n_blocks": rng.choice([1, 2, 3, 4]), "d_block": rng.choice([i for i in range(64, 1024 + 1) if i % 16 == 0]), "dropout": rng.choice([0.0, rng.uniform(0.0, 0.5)]), # numerical embeddings "num_emb_type": "pwl", "d_embedding": rng.choice([i for i in range(8, 32 + 1) if i % 4 == 0]), "num_emb_n_bins": rng.integers(2, 128, endpoint=True), } elif hpo_space_name == 'tabarena': params = { "batch_size": "auto", "patience": 16, "allow_amp": False, # only for GPU, maybe we should change it to True? "arch_type": "tabm-mini", "tabm_k": 32, # "gradient_clipping_norm": 1.0, # wasn't correctly implemented so we remove it in v1.7.0 # this makes it probably slower with numerical embeddings, and also more RAM intensive # according to the paper it's not very important but should be a bit better (?) 
"share_training_batches": False, "lr": np.exp(rng.uniform(np.log(1e-4), np.log(3e-3))), "weight_decay": rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-1)))]), # removed n_blocks=1 according to Yury Gurishniy's advice "n_blocks": rng.choice([2, 3, 4, 5]), # increased lower limit from 64 to 128 according to Yury Gorishniy's advice "d_block": rng.choice([i for i in range(128, 1024 + 1) if i % 16 == 0]), "dropout": rng.choice([0.0, rng.uniform(0.0, 0.5)]), # numerical embeddings "num_emb_type": "pwl", "d_embedding": rng.choice([i for i in range(8, 32 + 1) if i % 4 == 0]), "num_emb_n_bins": rng.integers(2, 128, endpoint=True), } else: raise ValueError(f'Unknown {hpo_space_name=}') return params def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([TabMSubSplitInterface(**config) for i in range(n_tv_splits)]) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return TabMSubSplitInterface(**self.config).get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) ================================================ FILE: pytabkit/models/alg_interfaces/tabr_interface.py ================================================ from typing import List, Any, Optional, Dict, Tuple from pathlib import Path import numpy as np import pandas as pd import torch from sklearn.preprocessing import StandardScaler, OrdinalEncoder from sklearn.impute import SimpleImputer from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, SingleSplitAlgInterface, \ RandomParamsAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources, SubSplitIdxs from pytabkit.models.alg_interfaces.rtdl_interfaces import choose_batch_size_rtdl_new from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.logging import Logger from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.nn_models.tabr import TabrLightning, TabrModel from pytabkit.models.nn_models.tabr_context_freeze import TabrModelContextFreeze, TabrLightningContextFreeze from pytabkit.models.training.metrics import insert_missing_class_columns import torch.utils.data try: import lightning.pytorch as pl from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint except ImportError: import pytorch_lightning as pl from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint class ExceptionPrintingCallback(pl.callbacks.Callback): def on_exception(self, trainer, pl_module, exception): import traceback print(f'caught exception') traceback.print_exception(exception) class TabRSubSplitInterface(AlgInterface): def __init__(self, **config): super().__init__(**config) self.tfm = None self.n_classes = None self.model = None self.train_ds = None def create_model(self, n_num_features, n_bin_features, cat_cardinalities, n_classes, freeze_contexts_after_n_epochs: Optional[int]) -> Any: params_config = [ ('num_embeddings', None, None), ('d_main', None), ('d_multiplier', None), ('encoder_n_blocks', None), ('predictor_n_blocks', None), ('mixer_normalization', None), 
('context_dropout', None), ('dropout0', None), ('dropout1', None), ('normalization', None), ('activation', None), # The following options should be used only when truly needed. ('memory_efficient', None), ('candidate_encoding_batch_size', None), ('add_scaling_layer', None), ('scale_lr_factor', None), ('use_ntp_linear', None), ('linear_init_type', None), ('use_ntp_encoder', None), ] params = utils.extract_params(self.config, params_config) if freeze_contexts_after_n_epochs is not None: return TabrModelContextFreeze( n_num_features=n_num_features, n_bin_features=n_bin_features, cat_cardinalities=cat_cardinalities, n_classes=n_classes, **params ) else: return TabrModel( n_num_features=n_num_features, n_bin_features=n_bin_features, cat_cardinalities=cat_cardinalities, n_classes=n_classes, **params ) def infer_batch_size(self, n_samples_train: int) -> int: # taken from tabr paper table 14 # the cutoffs might not be exactly the same if n_samples_train < 10_000: return 128 elif n_samples_train < 30_000: return 256 elif n_samples_train < 200_000: return 512 else: return 1024 def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 pl.seed_everything(idxs_list[0].sub_split_seeds[0]) use_deterministic_before = torch.are_deterministic_algorithms_enabled() torch.use_deterministic_algorithms(False) self.n_classes = ds.get_n_classes() train_idxs = idxs_list[0].train_idxs[0] val_idxs = idxs_list[0].val_idxs[0] if idxs_list[0].val_idxs is not None else None train_ds = ds.get_sub_dataset(train_idxs) self.train_ds = train_ds is_cv = val_idxs is not None val_ds = ds.get_sub_dataset(val_idxs) if is_cv else None # create preprocessing factory factory = self.config.get('factory', None) if factory is None: factory = PreprocessingFactory(**self.config) # transform according to factory fitter = factory.create(ds.tensor_infos) if is_cv: trainval_ds = ds.get_sub_dataset(torch.cat([train_idxs, val_idxs], dim=0)) else: trainval_ds = train_ds self.tfm = fitter.fit(trainval_ds) train_ds = self.tfm.forward_ds(train_ds) if is_cv: val_ds = self.tfm.forward_ds(val_ds) y = train_ds.tensors['y'] if is_cv: y_val = val_ds.tensors['y'] # equivalent of sklearn's TransformedTargetRegressor transformed_target = self.config.get("transformed_target", False) if transformed_target: #do TransformedTargetRegressor by hand (because setting the # validation set in skorch conflicts with TransformedTargetRegressor) self.transformer_mean = y.mean() self.transformer_std = y.std() y = (y - self.transformer_mean) / self.transformer_std if is_cv: y_val = (y_val - self.transformer_mean) / self.transformer_std else: self.transformer_mean = None self.transformer_std = None # create datasets for pytorch lightning X_num = train_ds.tensors['x_cont'] X_cat = train_ds.tensors['x_cat'] # separate bin and cat cat_sizes = train_ds.tensor_infos['x_cat'].get_cat_sizes() cat_sizes = cat_sizes - 1 # cat sizes contains the size + 1 for unknown values #TODO: I think we could do something cleaner binary_indicator = cat_sizes == 2 to_drop_indicator = cat_sizes <= 1 #TODO: this should be dealt with in the converter or the factory cat_indicator = (~to_drop_indicator) & (~binary_indicator) X_bin = train_ds.tensors['x_cat'][:, binary_indicator] X_cat = train_ds.tensors['x_cat'][:, cat_indicator] cat_sizes_nonbinary = 
cat_sizes[cat_indicator].tolist()

        # create validation dataset
        if is_cv:
            X_num_val = val_ds.tensors['x_cont']
            X_cat_val = val_ds.tensors['x_cat']
            # separate bin and cat
            X_bin_val = val_ds.tensors['x_cat'][:, binary_indicator]
            X_cat_val = val_ds.tensors['x_cat'][:, cat_indicator]

        # We need to do ordinal encoding again here to prevent holes in the categories
        if X_cat.shape[1] > 0:
            # missing values were encoded as 0 in ToDictDatasetConverter, we need to make them missing again
            self.replace_zero_by_nans = SimpleImputer(missing_values=0., strategy="constant", fill_value=np.nan)
            self.ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1,
                                          encoded_missing_value=-1)
            # apparently it doesn't work on the integer tensor
            X_cat = self.replace_zero_by_nans.fit_transform(X_cat.float())
            X_cat = torch.from_numpy(self.ord_enc.fit_transform(X_cat))
            if is_cv:
                X_cat_val = self.replace_zero_by_nans.transform(X_cat_val.float())
                X_cat_val = torch.from_numpy(self.ord_enc.transform(X_cat_val))
        if X_bin.shape[1] > 0:
            # the ToDictDatasetConverter encoded binary features as 1 and 2
            # we need to encode them as 0 and 1
            X_bin = X_bin - 1
            assert torch.logical_or(
                torch.logical_or(
                    (X_bin == -1),  # missing values were encoded as 0
                    (X_bin == 0)
                ),
                (X_bin == 1)).all()
            # replace -1 by 0.5
            X_bin[X_bin == -1] = 0.5
            if is_cv:
                X_bin_val = X_bin_val - 1
                X_bin_val[X_bin_val == -1] = 0.5

        from skorch.dataset import Dataset

        class TabrDataset(Dataset):
            def __init__(self, X_num, X_bin, X_cat, Y):
                self.data = {
                    "Y": Y.reshape(-1)
                }
                if X_num.shape[1] > 0:
                    self.data["X_num"] = X_num.float()
                if X_bin.shape[1] > 0:
                    self.data["X_bin"] = X_bin.long()
                if X_cat.shape[1] > 0:
                    self.data["X_cat"] = X_cat.long()
                self.size = len(Y)

            def __len__(self):
                return self.size

            def __getitem__(self, idx):
                return {"indices": idx}

        train_dataset = TabrDataset(
            X_num,
            X_bin,
            X_cat,
            y,
        )
        if is_cv:
            val_dataset = TabrDataset(
                X_num_val,
                X_bin_val,
                X_cat_val,
                y_val,
            )
        else:
            raise NotImplementedError('training without a validation set is not implemented')
        n_train = idxs_list[0].n_train
        min_context_freeze_train_size = self.config.get('min_context_freeze_train_size', 0)
        freeze_contexts_after_n_epochs = self.config.get('freeze_contexts_after_n_epochs', None)
        if n_train < min_context_freeze_train_size:
            freeze_contexts_after_n_epochs = None  # don't freeze
        torch_model = self.create_model(
            n_num_features=X_num.shape[1],
            n_bin_features=X_bin.shape[1],
            cat_cardinalities=cat_sizes_nonbinary,  # we could save a little memory
            # by recomputing the cardinality on train only, but let's keep it simple
            n_classes=self.n_classes if self.n_classes > 0 else None,
            freeze_contexts_after_n_epochs=freeze_contexts_after_n_epochs
        )
        # set batch size if auto
        if self.config.get('batch_size', None) == 'auto':
            self.config['batch_size'] = self.infer_batch_size(len(y))
        self.config["n_threads"] = interface_resources.n_threads
        self.config["verbosity"] = self.config.get("verbosity", 0)
        class_to_use = TabrLightningContextFreeze if freeze_contexts_after_n_epochs is not None else TabrLightning
        self.model = class_to_use(
            torch_model,
            train_dataset,
            val_dataset,
            C=self.config,
            n_classes=self.n_classes,
        )
        if self.n_classes > 0:
            val_metric_name = self.config.get('val_metric_name', 'class_error')
            if val_metric_name == 'class_error':
                es_callback = EarlyStopping(monitor='val_accuracy', patience=self.config["patience"], mode='max')
                checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_accuracy", mode="max",
                                                      dirpath=tmp_folders[0])
            elif val_metric_name == 'cross_entropy':
                print('Early stopping on cross-entropy loss')
                es_callback = EarlyStopping(monitor='val_loss', patience=self.config["patience"], mode='min')
                checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min",
                                                      dirpath=tmp_folders[0])
            else:
                raise ValueError(f'Validation metric {val_metric_name} not implemented for TabR')
        else:
            es_callback = EarlyStopping(monitor='val_loss', patience=self.config["patience"], mode='min')
            checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min",
                                                  dirpath=tmp_folders[0])
        gpu_devices = interface_resources.gpu_devices
        print("gpu_devices", gpu_devices)
        self.device = gpu_devices[0] if len(gpu_devices) > 0 else 'cpu'
        if self.device == 'cpu':
            pl_accelerator = 'cpu'
            pl_devices = 'auto'
        elif self.device == 'mps':
            pl_accelerator = 'mps'
            pl_devices = 'auto'
        elif self.device == 'cuda':
            pl_accelerator = 'gpu'
            pl_devices = [0]
        elif self.device.startswith('cuda:'):
            pl_accelerator = 'gpu'
            pl_devices = [int(self.device[len('cuda:'):])]
        else:
            raise ValueError(f'Unknown device "{self.device}"')
        self.trainer = pl.Trainer(
            accelerator=pl_accelerator,
            devices=pl_devices,
            deterministic=True,
            callbacks=[es_callback, checkpoint_callback, ExceptionPrintingCallback()],
            max_epochs=self.config["n_epochs"],
            enable_progress_bar=self.config["verbosity"] > 0,
            enable_model_summary=self.config["verbosity"] > 0,
            logger=pl.loggers.logger.DummyLogger(),
        )
        self.trainer.fit(self.model)
        if self.config["verbosity"] > 0:
            print("path to best model", checkpoint_callback.best_model_path)  # prints the path to the best model's checkpoint
            print("best score", checkpoint_callback.best_model_score)  # and prints its score
        # load best model
        class_to_use = TabrLightningContextFreeze if freeze_contexts_after_n_epochs is not None else TabrLightning
        self.model = class_to_use.load_from_checkpoint(checkpoint_callback.best_model_path,
                                                       model=torch_model,
                                                       train_dataset=train_dataset,
                                                       val_dataset=val_dataset,
                                                       C=self.config,
                                                       n_classes=self.n_classes,
                                                       )
        torch.use_deterministic_algorithms(use_deterministic_before)
        return None

    def predict(self, ds: DictDataset) -> torch.Tensor:
        # adapted from SklearnSubSplitLearner
        # should return tensor of shape len(ds) x output_shape
        use_deterministic_before = torch.are_deterministic_algorithms_enabled()
        torch.use_deterministic_algorithms(False)
        if self.tfm is not None:
            ds = self.tfm.forward_ds(ds)
        X_num = ds.tensors['x_cont']
        X_cat = ds.tensors['x_cat']
        # separate bin and cat
        cat_sizes = ds.tensor_infos['x_cat'].get_cat_sizes()
        cat_sizes = cat_sizes - 1  # cat sizes contains the size + 1 for missing values
        binary_indicator = cat_sizes == 2
        to_drop_indicator = cat_sizes <= 1
        cat_indicator = (~to_drop_indicator) & (~binary_indicator)
        X_bin = ds.tensors['x_cat'][:, binary_indicator]
        X_cat = ds.tensors['x_cat'][:, cat_indicator]
        # We need to do ordinal encoding again here to prevent holes in the categories
        if X_cat.shape[1] > 0:
            X_cat = self.replace_zero_by_nans.transform(X_cat.float())
            X_cat = torch.from_numpy(self.ord_enc.transform(X_cat))
        if X_bin.shape[1] > 0:
            # the ToDictDatasetConverter encoded binary features as 1 and 2
            # we need to encode them as 0 and 1
            X_bin = X_bin - 1
            assert torch.logical_or(
                torch.logical_or(
                    (X_bin == -1),  # missing values were encoded as 0
                    (X_bin == 0)
                ),
                (X_bin == 1)).all()
            # replace -1 by 0.5
            X_bin[X_bin == -1] = 0.5
        from skorch.dataset import Dataset

        class TabrDatasetTest(Dataset):
            def __init__(self, X_num, X_bin, X_cat):
                self.data = {}
                if X_num.shape[1] > 0:
                    self.data["X_num"] = X_num.float()
                    self.size = len(X_num)
                if X_bin.shape[1] > 0:
self.data["X_bin"] = X_bin.long() self.size = len(X_bin) if X_cat.shape[1] > 0: self.data["X_cat"] = X_cat.long() self.size = len(X_cat) def __len__(self): return self.size def __getitem__(self, idx): return { key: self.data[key][idx] for key in self.data } test_dataset = TabrDatasetTest( X_num, X_bin, X_cat, ) # create a dataloader test_dataloader = torch.utils.data.DataLoader( test_dataset, batch_size=self.config["eval_batch_size"], shuffle=False, num_workers=0, #min(self.config["n_threads"] - 1, 16) ) y_pred = self.trainer.predict(self.model, test_dataloader) y_pred = torch.cat(y_pred, dim=0) # guard against missing classes in the training set # (GBDT interfaces don't need this because they get passed n_classes as a parameter) y_pred = insert_missing_class_columns(y_pred, self.train_ds) # inverse transform for y (like in TransformedTargetRegressor) if self.transformer_mean is not None: y_pred = y_pred * self.transformer_std + self.transformer_mean torch.use_deterministic_algorithms(use_deterministic_before) return y_pred[None] # add vectorized dimension def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 has_num_emb = self.config.get('num_embeddings', None) is not None if has_num_emb: num_emb_dict = self.config['num_embeddings'] num_emb_size_factor = 1.0 + 0.2 * (num_emb_dict.get('n_frequencies', 8) + num_emb_dict.get('d_embedding', 4)) else: num_emb_size_factor = 1.0 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=1), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 1e-4} ram_params = {'': 4, 'ds_onehot_size_gb': 1.5} gpu_ram_params = {'': 5, 'n_features': num_emb_size_factor * 1e-4, "n_train": 3e-5, 'n_features*n_train': num_emb_size_factor * 0.5e-7, 'n_classes': 0.04} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.3) #, gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) class RandomParamsTabRAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) hpo_space_name = self.config.get('hpo_space_name', 'tabr') if hpo_space_name == 'tabr': params = { # reduced d_layers "d_main": rng.choice(np.arange(96, 385)), "context_dropout": rng.uniform(0.0, 0.6), "dropout0": rng.uniform(0.0, 0.6), "dropout1": 0.0, "optimizer": { "type": "AdamW", "lr": np.exp(rng.uniform(np.log(3e-5), np.log(1e-3))), "weight_decay": rng.choice([0, np.exp(rng.uniform(np.log(1e-6), np.log(1e-4)))]) # paper says 1e-3 but logs on github say 1e-4 for upper bound }, "encoder_n_blocks": rng.choice([0, 1]), "predictor_n_blocks": rng.choice([1, 2]), "num_embeddings": { "type": "PLREmbeddings", "n_frequencies": rng.choice(np.arange(16, 97)), "d_embedding": rng.choice(np.arange(16, 65)), "frequency_scale": np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), "lite": True, }, } if is_classification: params = utils.join_dicts(DefaultParams.TABR_S_D_CLASS, params) else: params = utils.join_dicts(DefaultParams.TABR_S_D_REG, params) elif hpo_space_name == 'realtabr': tfms_list = [['quantile_tabr'], ['median_center', 'robust_scale', 'smooth_clip']] params = { # reduced d_layers "d_main": rng.choice(np.arange(96, 385)), "context_dropout": rng.uniform(0.0, 0.6), "dropout0": rng.uniform(0.0, 0.6), "dropout1": 0.0, "optimizer": { 
"type": "AdamW", "lr": np.exp(rng.uniform(np.log(3e-5), np.log(1e-3))), "weight_decay": rng.choice([0, np.exp(rng.uniform(np.log(1e-6), np.log(1e-4)))]), # paper says 1e-3 but logs on github say 1e-4 for upper bound "betas": (0.9, rng.choice([0.95, 0.999])), }, "encoder_n_blocks": rng.choice([0, 1]), "predictor_n_blocks": rng.choice([1, 2]), "num_embeddings": { "type": "PBLDEmbeddings", # use factor 2 since it results in the same hidden dimension # as for PLR without the factor 2 because of the concat(sin, cos) thing "n_frequencies": 2*rng.choice(np.arange(16, 97)), "d_embedding": rng.choice(np.arange(16, 65)), "frequency_scale": np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), }, "ls_eps": rng.choice([0.0, 0.1]), 'tfms': tfms_list[rng.choice(np.arange(len(tfms_list)))], 'add_scaling_layer': rng.choice([True, False]), 'scale_lr_factor': 96, } if is_classification: params = utils.join_dicts(DefaultParams.RealTABR_D_CLASS, params) else: params = utils.join_dicts(DefaultParams.RealTABR_D_REG, params) else: raise ValueError(f'Unknown HPO space name "{hpo_space_name}"') return params def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/alg_interfaces/xgboost_interfaces.py ================================================ import copy from pathlib import Path from typing import Optional, Dict, Any, Tuple, List, Union import numpy as np import torch from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.resource_params import ResourceParams from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import TreeBasedSubSplitInterface, \ SingleSplitWrapperAlgInterface, \ SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer from pytabkit.models.alg_interfaces.alg_interfaces import OptAlgInterface, AlgInterface, RandomParamsAlgInterface from pytabkit.models.training.metrics import Metrics class XGBCustomMetric: def __init__(self, metric_names: Union[str, List[str]], is_classification: bool, is_higher_better: bool = False): self.metric_names = metric_names self.is_classification = is_classification self.is_higher_better = is_higher_better def __call__(self, y_pred: np.ndarray, dtrain): # dtrain should be of type xgb.DMatrix y = torch.as_tensor(dtrain.get_label(), dtype=torch.long if self.is_classification else torch.float32) if len(y.shape) == 1: y = y[:, None] # print(f'{y_pred.shape=}, {eval_data.get_label().shape=}') y_pred = torch.as_tensor(y_pred, dtype=torch.float32) if len(y_pred.shape) == 1: if self.is_classification: if y_pred.shape[0] == y.shape[0]: # binary classification, transform into both class probabilities y_pred = torch.stack([1. 
- y_pred, y_pred], dim=-1) else: # bugged multiclass classification in LightGBM, need to reshape # print(y_pred[:7]) y_pred = y_pred.view(-1, y.shape[0]).t().contiguous() # print(y_pred[0, :].sum()) else: y_pred = y_pred[:, None] if self.is_classification: # go from probabilities to logits y_pred = torch.log(y_pred + 1e-30) # print(f'{y_pred[4]=}') # print(f'{torch.min(y_pred).item()=}') # print(f'{np.asarray(dtrain.get_data())[4]=}') # print(f'{y_pred.shape=}, {y.shape=}') # print(f'{y_pred=}, {y=}') if isinstance(self.metric_names, str): return self.metric_names, Metrics.apply(y_pred, y, metric_name=self.metric_names).item() elif isinstance(self.metric_names, list): results = [(metric_name, Metrics.apply(y_pred, y, metric_name=metric_name).item()) for metric_name in self.metric_names] # print(results) return results else: raise ValueError(f'Invalid {type(self.metric_names)=}') class XGBSklearnSubSplitInterface(SklearnSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None), ('verbosity', None), ('max_depth', None), ('eta', ['lr', 'learning_rate', 'eta']), ('subsample', None), ('colsample_bytree', None), ('colsample_bylevel', None), ('colsample_bynode', None), ('alpha', ['alpha', 'reg_alpha']), ('lambda', ['lambda', 'reg_lambda']), ('gamma', ['gamma', 'reg_gamma']), ('tree_method', None), ('min_child_weight', None), ('max_delta_step', None), ('max_cat_to_onehot', ['max_cat_to_onehot', 'max_onehot_cat_size', 'one_hot_max_size'], None), ('num_parallel_tree', None), ('max_bin', None), ('nthread', ['nthread', 'n_threads'], n_threads), ] params = utils.extract_params(self.config, params_config) if self.n_classes > 0: from xgboost import XGBClassifier return XGBClassifier(random_state=seed, **params) else: from xgboost import XGBRegressor return XGBRegressor(random_state=seed, **params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_depth=6), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.xgb_class_time, cpu_ram_params=ResourceParams.xgb_class_ram) return rc.get_required_resources(ds) class XGBSubSplitInterface(TreeBasedSubSplitInterface): # for RF: https://xgboost.readthedocs.io/en/latest/tutorials/rf.html def _get_params(self): # n_estimators is not set in params but directly in bst.fit() below params_config = [('verbosity', None, 0), ('max_depth', None, 6), ('eta', ['lr', 'learning_rate', 'eta'], 0.3), ('subsample', None, 1.0), ('colsample_bytree', None, 1.0), ('colsample_bylevel', None, 1.0), ('colsample_bynode', None, 1.0), ('alpha', ['alpha', 'reg_alpha'], 0.0), ('lambda', ['lambda', 'reg_lambda'], 1.0), ('gamma', ['gamma', 'reg_gamma'], 0.0), ('tree_method', None, 'auto'), ('min_child_weight', None), ('max_delta_step', None), ('max_cat_to_onehot', ['max_cat_to_onehot', 'max_onehot_cat_size', 'one_hot_max_size'], None), ('num_parallel_tree', None), ('max_bin', None), ('multi_strategy', None), ('grow_policy', None), ('max_leaves', None), ] params = utils.extract_params(self.config, params_config) if self.config.get('use_gpu', False): params['tree_method'] = 'gpu_hist' return params def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': assert n_refit == 1 return 
XGBSubSplitInterface(fit_params=fit_params or self.fit_params, **self.config) # adapted from https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/xgboost_experiment.py def _preprocess_params(self, params: Dict[str, Any], n_classes: int) -> Dict[str, Any]: params = copy.deepcopy(params) if n_classes == 0: train_metric_name = self.config.get('train_metric_name', 'mse') # val_metric_name = self.config.get('val_metric_name', 'rmse') if train_metric_name == 'mse': params['objective'] = 'reg:squarederror' # params['eval_metric'] = 'rmse' elif train_metric_name.startswith('pinball('): quantile = float(train_metric_name[len('pinball('):-1]) params['objective'] = 'reg:quantileerror' params['quantile_alpha'] = quantile else: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') # params.update({'objective': 'reg:squarederror', 'eval_metric': 'rmse'}) elif n_classes == 2: params.update({'objective': 'binary:logistic'}) elif n_classes > 2: params.update({'objective': 'multi:softprob', 'num_class': n_classes}) if n_classes <= 2 and 'multi_strategy' in params: del params['multi_strategy'] # could use gpu using # param['gpu_id'] = 0 # param['tree_method'] = 'gpu_hist' params['max_depth'] = int(params['max_depth']) return params def _convert_ds(self, ds: DictDataset) -> Any: import xgboost as xgb label = None if 'y' not in ds.tensors else ds.tensors['y'].cpu().numpy() has_cat = 'x_cat' in ds.tensor_infos and ds.tensor_infos['x_cat'].get_n_features() > 0 x_df = ds.without_labels().to_df() # print(f'{x_df.iloc[4 if ds.n_samples < 1000 else 240]=}') # print([x_df[col].cat.categories.tolist() for col in x_df.select_dtypes(include="category").columns]) return xgb.DMatrix(x_df, label, enable_categorical=has_cat) def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: import xgboost as xgb # print(f'Fitting XGBoost') n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() params = self._preprocess_params(params, n_classes) params.update({'seed': seed, 'nthread': n_threads}) evals = [] if val_ds is None else [(self._convert_ds(val_ds), 'val')] evals_result = {} custom_metric = None eval_metric_name = None val_metric_names = self.config.get('val_metric_names', None) if val_ds is not None: # print(f'{val_ds.n_samples=}') if val_metric_names is not None: eval_metric_name = val_metric_names[0] custom_metric = XGBCustomMetric(val_metric_names, is_classification=n_classes > 0) else: # single validation metric if val_metric_name is None: val_metric_name = 'class_error' if n_classes > 0 else 'rmse' if val_metric_name == 'class_error': eval_metric_name = 'error' if n_classes == 2 else 'merror' elif val_metric_name == 'cross_entropy': eval_metric_name = 'logloss' if n_classes == 2 else 'mlogloss' elif val_metric_name == 'rmse': eval_metric_name = 'rmse' elif val_metric_name == 'mae': eval_metric_name = 'mae' else: eval_metric_name = val_metric_name custom_metric = XGBCustomMetric(val_metric_name, is_classification=n_classes > 0) if custom_metric is None: params['eval_metric'] = eval_metric_name else: params['disable_default_eval_metric'] = True extra_train_params = {} if val_ds is not None and 'early_stopping_rounds' in self.config: extra_train_params['early_stopping_rounds'] = self.config['early_stopping_rounds'] n_estimators = self.config.get('n_estimators', 1000) if 'n_estimators' in params: 
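            # (refit path: fit() stores fit_params = [dict(n_estimators=best_iteration + 1)] based on
            # the validation scores, and get_refit_interface passes them on to be joined into params,
            # so the refit trains for exactly the validated number of boosting rounds)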
# can happen for refit because fit_params are directly joined into params n_estimators = int(params['n_estimators']) bst = xgb.train(params, self._convert_ds(train_ds), evals=evals, evals_result=evals_result, custom_metric=custom_metric, num_boost_round=n_estimators, verbose_eval=False, **extra_train_params) # print(f'xgb train completed') # import psutil # import os # print(f'Memory: {psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3} GB') if val_ds is not None: if val_metric_names is not None: val_errors = {vmn: evals_result['val'][vmn] for vmn in val_metric_names} # for vmn in val_metric_names: # print(f'{vmn=}, {np.argmin(val_errors[vmn])=}, {np.min(val_errors[vmn])=}') else: val_errors = evals_result['val'][eval_metric_name] # print(f'{np.min(val_errors)=}') # print(f'{val_ds.tensors["x_cont"][4]=}') # print(f'{val_ds.tensors["x_cat"][4]=}') # print(f'{self._predict(bst, val_ds, n_classes, dict(n_estimators=np.argmin(val_errors)+1))[4]=}') else: val_errors = None return bst, val_errors def _predict(self, bst, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: # bst should be of type xgb.Booster # print(f'XGB _predict() with {other_params=}') # print(f'predict with {ds.n_samples=}, {ds.tensors["x_cont"][4]=}, {ds.tensors["x_cat"][4]=}, {ds.tensors["x_cont"][240]=}, {ds.tensors["x_cat"][240]=}') iteration_range = (0, 0) if other_params is None else (0, int(other_params['n_estimators'])) y_pred = torch.as_tensor(bst.predict(self._convert_ds(ds), iteration_range=iteration_range), dtype=torch.float32) if n_classes == 0: y_pred = y_pred.unsqueeze(-1) elif n_classes == 2: y_pred = torch.stack([1. - y_pred, y_pred], dim=-1) if n_classes >= 2: y_pred = torch.log(y_pred + 1e-30) # print(f'min: {torch.min(y_pred).item():g}, max: {torch.max(y_pred).item():g}') return y_pred def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_depth=6, max_n_threads=8), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.xgb_class_time, cpu_ram_params=ResourceParams.xgb_class_ram) return rc.get_required_resources(ds) class XGBHyperoptAlgInterface(OptAlgInterface): def __init__(self, space=None, n_hyperopt_steps: int = 50, **config): from hyperopt import hp default_config = {} max_config = dict() if space is None: space = config.get('hpo_space_name', None) if space == 'catboost_quality_benchmarks': # space from catboost quality benchmarks # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/xgboost_experiment.py # the parameter names in the space are for the alg interface, not directly for the GBDT interface! 
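            # hyperopt sampler semantics used in these spaces: hp.loguniform(label, low, high) draws
            # exp(Uniform(low, high)), so hp.loguniform('eta', -7, 0) yields eta in [exp(-7), 1] ~= [9.1e-4, 1.0];
            # hp.quniform('max_depth', 2, 10, 1) yields floats, which is why _preprocess_params casts
            # max_depth to int before training.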
space = { 'eta': hp.loguniform('eta', -7, 0), 'max_depth': hp.quniform('max_depth', 2, 10, 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]) } default_config = dict(n_estimators=5000) max_config['max_depth'] = 10 elif space == 'NODE' or space == 'popov': # space from NODE paper: # Popov, Morozov, and Babenko, Neural oblivious decision ensembles for deep learning on tabular data # the parameter names in the space are for the alg interface, not directly for the GBDT interface! # same as catboost_quality_benchmarks except with smaller n_estimators space = { 'eta': hp.loguniform('eta', -7, 0), 'max_depth': hp.quniform('max_depth', 2, 10, 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]) } default_config = dict(n_estimators=2048) max_config['max_depth'] = 10 elif space == 'shwartz-ziv': # from Shwartz-Ziv and Armon, Tabular data: Deep learning is not all you need # the TabPFN-Paper uses the same configuration space = { 'n_estimators': hp.quniform('n_estimators', 100, 4000, 1), 'eta': hp.loguniform('eta', -7, 0), 'max_depth': hp.quniform('max_depth', 1, 10, 1), 'subsample': hp.uniform('subsample', 0.2, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1), 'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]) } max_config['max_depth'] = 10 elif space == 'kadra': # from Kadra, Lindauer, Hutter, and Grabocka, Well-tuned Simple Nets Excel on Tabular Datasets space = { 'n_estimators': hp.quniform('n_estimators', 1, 1000, 1), 'eta': hp.loguniform('eta', np.log(1e-3), 0), 'max_depth': hp.quniform('max_depth', 1, 20, 1), 'subsample': hp.uniform('subsample', 0.01, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bynode': hp.uniform('colsample_bynode', 0.1, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 1), 'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(20.0)), 'max_delta_step': hp.quniform('max_delta_step', 0, 10, 1), 'reg_alpha': hp.loguniform('alpha', np.log(1e-10), 0), 'reg_lambda': hp.loguniform('lambda', np.log(1e-10), 0), 'reg_gamma': hp.loguniform('gamma', np.log(1e-10), 0) } max_config['max_depth'] = 20 elif space == 'grinsztajn': # from Grinsztajn, Oyallon, Varoquaux, # Why do tree-based models still outperform deep learning on typical tabular data? 
# they have early-stopping-rounds=20 # they also use XGBClassifier / XGBRegressor from scikit-learn # they also start the random searches with the default hyperparameters of the model # see https://github.com/LeoGrin/tabular-benchmark/blob/main/src/configs/model_configs/xgb_config.py space = { 'eta': hp.loguniform('eta', np.log(1e-5), np.log(0.7)), 'max_depth': hp.quniform('max_depth', 1, 11, 1), 'min_child_weight': hp.qloguniform('min_child_weight', 0.0, np.log(100.0), 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'reg_alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1e-2)), 'reg_lambda': hp.loguniform('lambda', np.log(1.0), np.log(4.0)), 'reg_gamma': hp.loguniform('gamma', np.log(1e-8), np.log(7.0)) } default_config = dict(n_estimators=1000) max_config['max_depth'] = 11 elif space == 'gorishniy': # from Gorishniy, Rubachev, Khrulkov, Babenko, Revisiting Deep Learning Models for Tabular Data # they also have booster = "gbtree" (default), early-stopping-rounds=50, # n_hyperopt_steps=100 space = { 'eta': hp.loguniform('eta', np.log(1e-5), np.log(1.0)), 'max_depth': hp.quniform('max_depth', 3, 10, 1), 'min_child_weight': hp.qloguniform('min_child_weight', np.log(1e-8), np.log(1e5), 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', np.log(1e-8), np.log(1e2))]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', np.log(1e-8), np.log(1e2))]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', np.log(1e-8), np.log(1e2))]) } default_config = dict(n_estimators=2000) max_config['max_depth'] = 10 elif space == 'custom-v1': space = { 'eta': hp.loguniform('eta', np.log(2e-3), np.log(0.5)), 'max_depth': hp.quniform('max_depth', 1, 10, 1), 'min_child_weight': hp.qloguniform('min_child_weight', np.log(1e-5), np.log(100.0), 1), 'subsample': hp.uniform('subsample', 0.4, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'reg_alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)), 'reg_lambda': hp.loguniform('lambda', np.log(1e-8), np.log(4.0)), 'reg_gamma': hp.loguniform('gamma', np.log(1e-8), np.log(7.0)) } default_config = dict(n_estimators=1000) max_config['max_depth'] = 11 config = utils.update_dict(default_config, config) super().__init__(hyper_optimizer=HyperoptOptimizer(space=space, fixed_params=dict(), n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config, max_config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**config) for i in range(n_sub_splits)]) class RandomParamsXGBAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from Grinsztajn et al. 
(2022) hpo_space_name = self.config.get('hpo_space_name', 'grinsztajn') if hpo_space_name == 'grinsztajn': params = { 'eta': np.exp(rng.uniform(np.log(1e-5), np.log(0.7))), 'max_depth': rng.integers(1, 11, endpoint=True), 'min_child_weight': round(np.exp(rng.uniform(0.0, np.log(100.0)))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-8), np.log(1e-2))), 'reg_lambda': np.exp(rng.uniform(np.log(1.0), np.log(4.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-8), np.log(7.0))) } elif hpo_space_name == 'probclass': params = { 'eta': np.exp(rng.uniform(np.log(1e-3), np.log(0.7))), 'max_depth': rng.integers(1, 11, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-5), np.log(100.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))) } elif hpo_space_name == 'large': params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(1e-3), np.log(0.7))), 'max_depth': rng.integers(1, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-4), np.log(1.0))) } elif hpo_space_name == 'large-v2': # modified (mostly larger) version of large params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(1e-2), np.log(0.2))), # shrunk 'max_depth': rng.integers(1, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'colsample_bynode': rng.uniform(0.5, 1), # added 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-4), np.log(1.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), # added # hard to meta-optimize since 0 is the default # 'max_leaves': round(np.exp(rng.uniform(np.log(1e-1), np.log(64)))) # 'multi_strategy' # 'num_parallel_tree' # makes things slower # 'max_bin' # also makes things slower } elif hpo_space_name == 'large-v3': # shrunk version of large-v2: removed gamma, colsample_bytree params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(1e-2), np.log(8e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), # shrunk 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.6, 1), # shrunk 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), } elif hpo_space_name == 'large-v4': # modified version of large-v3 params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(3e-2), np.log(5e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), 
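# note: endpoint=True makes the upper bound inclusive, so max_depth is sampled uniformly from {3, ..., 10}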
'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(32.0))), # expanded 'subsample': rng.uniform(0.6, 1), # shrunk 'colsample_bylevel': rng.uniform(0.7, 1), # shrunk 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # modified 'reg_lambda': np.exp(rng.uniform(np.log(1e-3), np.log(20.0))), # modified 'grow_policy': rng.choice(['depthwise', 'lossguide']), 'max_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(2048.0)))) # added } elif hpo_space_name == 'large-v5': # shrunk version of large-v3 but without the extra stuff from large-v4 params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(3e-2), np.log(5e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(32.0))), # expanded 'subsample': rng.uniform(0.6, 1), # shrunk 'colsample_bylevel': rng.uniform(0.7, 1), # shrunk 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), # modified 'reg_lambda': np.exp(rng.uniform(np.log(1e-3), np.log(20.0))), # modified } elif hpo_space_name == 'large-v6': # shrunk version of large-v4 params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(3e-2), np.log(5e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(32.0))), # expanded 'subsample': rng.uniform(0.65, 1), # shrunk 'colsample_bylevel': rng.uniform(0.7, 1), # shrunk 'colsample_bynode': rng.uniform(0.9, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), # modified 'reg_lambda': np.exp(rng.uniform(np.log(1e-3), np.log(20.0))), # modified 'grow_policy': rng.choice(['lossguide']), 'max_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(2048.0)))) # added } elif hpo_space_name == 'large-v7-10k': # large-v3 but with tabrepo lr space and with 10k estimators params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.6, 1), 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), } elif hpo_space_name == 'large-v8-10k': # large-v7-10k but really tuning grow_policy this time (it wasn't picked up before) # also tuning max_leaves (which also wasn't picked up before) params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.6, 1), 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), # added 'max_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(2048.0)))) # added } elif hpo_space_name == 'large-v9-10k': # large-v8-10k but with smaller max_leaves space, # larger lower bound for colsample_bynode and colsample_bylevel params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': 
np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.7, 1), 'colsample_bynode': rng.uniform(0.6, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), 'max_leaves': round(np.exp(rng.uniform(np.log(16), np.log(1024.0)))) # added } elif hpo_space_name == 'tabrepo1-es': params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(0.5), np.log(1.5))), 'colsample_bytree': rng.uniform(0.5, 1), # there is enable_categorical, but I don't know how it makes sense to tune it } elif hpo_space_name == 'tabrepo1-es-10k': params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(0.5), np.log(1.5))), 'colsample_bytree': rng.uniform(0.5, 1), # there is enable_categorical, but I don't know how it makes sense to tune it } elif hpo_space_name == 'tabarena': params = { 'n_estimators': 10_000, 'early_stopping_rounds': 300, # probably not exactly equivalent to TabArena 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.6, 1), 'colsample_bylevel': rng.uniform(0.6, 1), 'colsample_bynode': rng.uniform(0.6, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), 'max_cat_to_onehot': int(np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0)))).item()), 'max_leaves': int(np.floor(np.exp(rng.uniform(np.log(8.0), np.log(1025.0)))).item()), } else: raise ValueError(f'Unknown {hpo_space_name=}') return params def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**config) for i in range(n_tv_splits)]) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return XGBSubSplitInterface(**self.config).get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) ================================================ FILE: pytabkit/models/alg_interfaces/xrfm_interfaces.py ================================================ import contextlib import random from pathlib import Path from typing import Optional, List, Any, Tuple, Dict import numpy as np import torch from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface, \ RandomParamsAlgInterface from pytabkit.models.alg_interfaces.base import RequiredResources, SplitIdxs, InterfaceResources from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.nn_models.base import Fitter from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.torch_utils import 
get_available_memory_gb from pytabkit.models.training.logging import Logger class xRFMSubSplitInterface(SingleSplitAlgInterface): def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': raise NotImplementedError() def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 torch.set_float32_matmul_precision('highest') seed = idxs_list[0].sub_split_seeds[0] # print(f'Setting seed: {seed}') torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) n_train = idxs_list[0].n_train n_classes = ds.get_n_classes() device = interface_resources.gpu_devices[0] if len(interface_resources.gpu_devices) >= 1 else 'cpu' self.n_classes_ = n_classes self.device_ = device # create preprocessing factory factory = self.config.get('factory', None) if 'tfms' not in self.config: self.config['tfms'] = ['mean_center', 'l2_normalize', 'one_hot'] if factory is None: print("factory is None, creating factory") factory = PreprocessingFactory(**self.config) if idxs_list[0].val_idxs is None: raise ValueError(f'Training without validation set is currently not implemented') ds_train = ds.get_sub_dataset(idxs_list[0].train_idxs[0]) ds_val = ds.get_sub_dataset(idxs_list[0].val_idxs[0]) num_numerical = ds_train.tensor_infos['x_cont'].get_n_features() raw_cat_sizes = ds_train.tensor_infos['x_cat'].get_cat_sizes() if isinstance(raw_cat_sizes, torch.Tensor): raw_cat_sizes = raw_cat_sizes.tolist() else: raw_cat_sizes = [int(size) for size in raw_cat_sizes] if 'factory' in self.config or 'one_hot' not in self.config['tfms']: cat_sizes = [] # don't apply fast_categorical stuff else: use_missing_zero = self.config.get('use_missing_zero', True) use_binary_drop = self.config.get('use_1d_binary_onehot', True) cat_sizes = [] for size in raw_cat_sizes: adjusted = size - 1 if use_missing_zero else size if adjusted == 2 and use_binary_drop: adjusted = 1 cat_sizes.append(adjusted) # transform according to factory fitter: Fitter = factory.create(ds.tensor_infos) self.tfm_, ds_train = fitter.fit_transform(ds_train) ds_val = self.tfm_(ds_val) # print("Expected shape from ds_train: ", ds_train.tensors['x_cont'].shape) numerical_indices, categorical_indices, categorical_vectors = None, None, None if 'one_hot' in self.config['tfms']: # Simpler categorical_info construction (no standardization): # - Treat one-hots for categories with <=100 levels as numerical features # - Only provide identity vectors for categories with >100 levels numerical_block = torch.arange(num_numerical) categorical_indices = [] categorical_vectors = [] numerical_indices_parts = [] idx = num_numerical for cat_size in cat_sizes: cat_idxs = torch.arange(idx, idx + cat_size) if cat_size > 100: categorical_indices.append(cat_idxs) categorical_vectors.append(torch.eye(cat_size)) else: numerical_indices_parts.append(cat_idxs) idx += cat_size if len(numerical_indices_parts) > 0: numerical_indices = torch.cat([numerical_block] + numerical_indices_parts) else: numerical_indices = numerical_block # assume categoricals are encoded x_train = ds_train.tensors['x_cont'].to(device) x_val = ds_val.tensors['x_cont'].to(device) y_train = ds_train.tensors['y'].to(device) y_val = 
ds_val.tensors['y'].to(device) if self.n_classes_ == 0: # regression assert ds.tensor_infos['y'].get_n_features() == 1 self.y_mean_ = y_train.mean().item() self.y_std_ = y_train.std(correction=0).item() y_train = (y_train - self.y_mean_) / (self.y_std_ + 1e-30) y_val = (y_val - self.y_mean_) / (self.y_std_ + 1e-30) else: y_train = y_train.long() y_val = y_val.long() bandwidth = self.config.get('bandwidth', 10) p_interp = self.config.get('p_interp', 0.0) exponent = self.config.get('exponent', 1.0) reg = self.config.get('reg', 1e-3) iters = self.config.get('rfm_iters', 5) diag = self.config.get('diag', True) min_subset_size = self.config.get('max_leaf_samples', self.config.get('min_subset_size', 60_000)) early_stop_rfm = self.config.get('early_stop_rfm', True) early_stop_multiplier = self.config.get('early_stop_multiplier', 1.1) classification_mode = self.config.get('classification_mode', 'prevalence') fast_categorical = self.config.get('fast_categorical', True) M_batch_size = self.config.get('M_batch_size', 'auto') overlap_fraction = self.config.get('overlap_fraction', 0.1) use_temperature_tuning = self.config.get('use_temperature_tuning', True) temp_tuning_space = self.config.get('temp_tuning_space', None) bandwidth_mode = self.config.get('bandwidth_mode', 'constant') kernel_type = self.config.get('kernel_type', 'l2') split_method = self.config.get('split_method', 'top_vector_agop_on_subset') if bandwidth_mode in ['constant', 'adaptive']: pass elif bandwidth_mode == 'sqrtd': bandwidth *= np.sqrt(x_train.shape[0]) else: raise ValueError() if M_batch_size == 'auto': if kernel_type in ['gen_laplace', 'l1-laplace', 'lpq-laplace', 'l1', 'lpq', 'lpq_kermac']: # heuristic for storing a (n_train, M_batch_size, n_features) tensor in memory # 4 bytes per float full_tensor_size_per_elem_gb = (4 * n_train * ds_train.tensor_infos['x_cont'].get_n_features()) / ( 1024 ** 3) full_tensor_size_per_elem_gb *= 12 # just a heuristic M_batch_size = max(1, min(8192, round(get_available_memory_gb(device) / full_tensor_size_per_elem_gb))) # M_batch_size = 512 if n_train <= 10_000 else (256 if n_train <= 20_000 else 64) else: M_batch_size = 8192 print(f'{kernel_type=}, {M_batch_size=}') model_params, fit_params = {}, {} model_params['kernel'] = kernel_type model_params['bandwidth'] = bandwidth model_params['exponent'] = exponent model_params['norm_p'] = exponent + (2-exponent)*p_interp model_params['bandwidth_mode'] = bandwidth_mode model_params['diag'] = diag model_params['fast_categorical'] = fast_categorical fit_params['reg'] = reg fit_params['iters'] = iters fit_params['verbose'] = True fit_params['early_stop_rfm'] = early_stop_rfm fit_params['early_stop_multiplier'] = early_stop_multiplier fit_params['M_batch_size'] = M_batch_size if self.n_classes_ == 2: fit_params['solver'] = self.config.get('binary_solver', 'solve') else: fit_params['solver'] = 'solve' rfm_params = {'model': model_params, 'fit': fit_params} if 'one_hot' in self.config['tfms']: # Provide identity vectors only for high-cardinality categoricals; treat others as numerical categorical_info = { 'numerical_indices': numerical_indices.to(device), 'categorical_indices': [i.to(device) for i in categorical_indices], 'categorical_vectors': [v.to(device) for v in categorical_vectors], } else: # treat cats like numerical features categorical_info = None classification = self.n_classes_ > 0 val_metric_name = self.config.get('val_metric_name', 'class_error' if classification else 'mse') metric_name_to_metric_class = { '1-auroc-ovr': 'auc', 'class_error': 
'accuracy', 'mse': 'mse', 'rmse': 'rmse', 'logloss': 'logloss', 'cross_entropy': 'logloss', 'brier': 'mse', } tuning_metric = metric_name_to_metric_class[val_metric_name] from xrfm import xRFM self.model_ = xRFM(rfm_params, device=device, min_subset_size=min_subset_size, tuning_metric=tuning_metric, categorical_info=categorical_info, classification_mode=classification_mode, split_method=split_method, overlap_fraction=overlap_fraction, use_temperature_tuning=use_temperature_tuning, temp_tuning_space=temp_tuning_space) self.model_.fit(x_train, y_train, x_val, y_val) return None def predict(self, ds: DictDataset) -> torch.Tensor: ds = self.tfm_(ds).to(self.device_) x_cont = ds.tensors['x_cont'] if self.n_classes_ > 0: with torch.cuda.device(self.device_) if self.device_.startswith('cuda') else contextlib.nullcontext(): y_pred = torch.from_numpy(self.model_.predict_proba(x_cont)).to(self.device_) y_pred = torch.log(y_pred) else: with torch.cuda.device(self.device_) if self.device_.startswith('cuda') else contextlib.nullcontext(): y_pred = torch.from_numpy(self.model_.predict(x_cont)).to(self.device_) y_pred = y_pred * self.y_std_ + self.y_mean_ return y_pred[None] # add n_models dimension def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_samples*n_features': 8e-8} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} # gpu_ram_params = {'': 0.2, 'ds_onehot_size_gb': 5.0, 'n_train': 6e-5, 'n_features': 2e-3, # 'n_train*n_train': 20.0 / (1024 ** 3), 'n_train*n_features': 20.0 / (1024 ** 3)} gpu_ram_params = {'': 0.0, 'ds_onehot_size_gb': 0.0, 'n_train': 0.0, 'n_features': 0.0, 'n_train*n_train': 0.0, 'n_train*n_features': 0.0} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=1) # print("rc.get_required_resources(ds, n_train=n_train)") # rr = rc.get_required_resources(ds, n_train=n_train) # print("rr.n_threads = ", rr.n_threads) # print("rr.cpu_ram_gb = ", rr.cpu_ram_gb) # print("rr.n_gpus = ", rr.n_gpus) # print("rr.gpu_usage = ", rr.gpu_usage) # print("rr.gpu_ram_gb = ", rr.gpu_ram_gb) # print("rr.time_s = ", rr.time_s) # exit() return rc.get_required_resources(ds, n_train=n_train) # return RequiredResources(time_s=10.0, n_threads=16, cpu_ram_gb=50, n_gpus=0) def sample_xrfm_params(seed: int, hpo_space_name: str = 'default'): rng = np.random.default_rng(seed) if hpo_space_name == 'default': # similar or identical to the search space used in TabArena # (but here we also tune the categorical preprocessing) num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth': np.exp(rng.uniform(np.log(0.5), np.log(200.0))), 'reg': np.exp(rng.uniform(np.log(1e-6), np.log(10.))), 'exponent': rng.uniform(0.7, 1.4), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq', 'l2'], p=[0.8, 0.2]), # don't set these here so they can be 
overridden # they're the default values anyway # 'bandwidth_mode': rng.choice(['constant']), # 'min_subset_size': 60_000, # 'rfm_iters': 5, # 'classification_mode': 'prevalence', # 'binary_solver': 'solve', # 'early_stop_rfm': True, # 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) # 'split_method': 'top_vector_agop_on_subset', } elif hpo_space_name == 'only_l2': num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth': np.exp(rng.uniform(np.log(0.5), np.log(200.0))), 'reg': np.exp(rng.uniform(np.log(1e-6), np.log(10.))), 'exponent': rng.uniform(0.7, 1.4), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), # don't set these here so they can be overridden # 'bandwidth_mode': rng.choice(['constant']), # 'kernel_type': 'l2', # 'min_subset_size': 60_000, # 'rfm_iters': 5, # 'classification_mode': 'prevalence', # 'binary_solver': 'solve', # 'early_stop_rfm': True, # 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) # 'split_method': 'top_vector_agop_on_subset', } elif hpo_space_name == 'paper-large': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', # todo: adjust general solver? 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'top_vector_agop_on_subset', 'overlap_fraction': 0.0, 'use_temperature_tuning': False, } elif hpo_space_name == 'paper-large-pca': # like paper-large, but with pca splitting num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, 'rfm_iters': 5, # don't put it here, it's the default anyway and can be overridden 'classification_mode': 'zero_one', 'binary_solver': 'solve', # todo: adjust general solver? 
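# the fixed (non-sampled) settings below are the same as in paper-large; only split_method is changed to 'pca'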
'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'pca', # changed compared to paper-large 'overlap_fraction': 0.0, 'use_temperature_tuning': False, } elif hpo_space_name == 'large-soft': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'top_vector_agop_on_subset', # 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, } elif hpo_space_name == 'large-soft-pca': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'pca', # 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, } elif hpo_space_name == 'large-temptune': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'top_vector_agop_on_subset', 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, 'temp_tuning_space': [0.0] + list(np.logspace(np.log10(0.025), np.log10(4.5), num=15)) } elif hpo_space_name == 'large-temptune-pca': # used on
meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'pca', 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, 'temp_tuning_space': [0.0] + list(np.logspace(np.log10(0.025), np.log10(4.5), num=15)) } elif hpo_space_name == 'large-temptune-rf': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'rf_criterion', 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, 'temp_tuning_space': [0.0] + list(np.logspace(np.log10(0.025), np.log10(4.5), num=15)) } else: raise ValueError(f'Unknown {hpo_space_name=}') return params class RandomParamsxRFMAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): return sample_xrfm_params(seed, self.config.get('hpo_space_name', 'default')) def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([xRFMSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/data/__init__.py ================================================ ================================================ FILE: pytabkit/models/data/conversion.py ================================================ import warnings from typing import Union, List, Optional import numpy as np import pandas as pd import torch from pandas import Index from sklearn.compose import ColumnTransformer, make_column_selector from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer from pytabkit.models.data.data import DictDataset, TensorInfo class ToDictDatasetConverter: def __init__(self, cat_features: Optional[Union[List[bool], np.ndarray]] = None, verbosity: int = 0): self.cat_features = cat_features if cat_features is None else np.asarray(cat_features, dtype=np.bool_) self.num_tf = None 
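# num_tf / cat_tf: scikit-learn ColumnTransformers for the numerical and categorical columns, created and fitted in fit_transform()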
self.cat_tf = None self.fitted = False self.tensor_infos = None self.fitted_columns = None self.fitted_type = None self.verbosity = verbosity def fit_transform(self, x: Union[np.ndarray, pd.DataFrame, pd.Series, DictDataset]) -> DictDataset: self.fitted = True self.fitted_type = type(x) if isinstance(x, DictDataset): return x x = pd.DataFrame(x) self.fitted_columns = set(x.columns) if self.cat_features is not None: cat_columns = list(x.columns[self.cat_features]) num_columns = list(x.columns[~self.cat_features]) self.num_tf = ColumnTransformer(transformers=[ ('continuous', FunctionTransformer(), num_columns), ]) self.cat_tf = ColumnTransformer(transformers=[ ('categorical', OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), cat_columns) ]) else: self.num_tf = ColumnTransformer(transformers=[ ('continuous', FunctionTransformer(), make_column_selector(dtype_include='number')), # ('continuous', FunctionTransformer(), make_column_selector(dtype_exclude=["string", "object", "category", "boolean"])), # todo: include this if we can make skrub a dependency # ('datetime', DatetimeEncoder(), make_column_selector(dtype_include=['datetime', 'datetimetz'])) ]) self.cat_tf = ColumnTransformer(transformers=[ ('categorical', OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), make_column_selector(dtype_include=["string", "object", "category", "boolean"])) ]) x_cont = torch.as_tensor(self.num_tf.fit_transform(x), dtype=torch.float32) x_cat = torch.as_tensor(self.cat_tf.fit_transform(x) + 1, dtype=torch.long) # print(f'{self.num_tf.transformers_=}') # print(f'{self.cat_tf.transformers_=}') selected_cols = [] for col_tfm in [self.num_tf, self.cat_tf]: for name, tfm, cols in col_tfm.transformers_: if tfm != 'drop': selected_cols.extend(list(cols)) if self.verbosity >= 1: print(f'Columns classified as {name}: {list(cols)}') non_selected_cols = self.fitted_columns.difference(set(selected_cols)) if len(non_selected_cols) >= 1: warnings.warn(f'The following columns are not used due to their data type: {list(non_selected_cols)}') cat_sizes = torch.max(x_cat, dim=0)[0] + 1 self.tensor_infos = {'x_cont': TensorInfo(feat_shape=x_cont.shape[1:]), 'x_cat': TensorInfo(cat_sizes=cat_sizes)} return DictDataset(tensors={'x_cont': x_cont, 'x_cat': x_cat}, tensor_infos=self.tensor_infos) def transform(self, x: Union[np.ndarray, pd.DataFrame, pd.Series, DictDataset]) -> DictDataset: if not self.fitted: raise ValueError("Call fit() first to fit the converter.") if not isinstance(x, self.fitted_type): raise ValueError(f'Different input types during fit and predict: {self.fitted_type} and {type(x)}') if isinstance(x, DictDataset): # todo: could check whether cat_sizes etc. match? 
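# DictDataset inputs are passed through unchanged, consistent with fit_transform()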
return x x = pd.DataFrame(x) # print(set(x.columns), self.fitted_columns) if set(x.columns) != self.fitted_columns: print('Raising column error') # second line is to satisfy the sklearn test # check_n_features_in_after_fitting in scikit-learn >= 1.6 raise ValueError(f'Different columns during fit() and predict(): {self.fitted_columns} and {set(x.columns)}\n' f'X has {len(x.columns)} features, but estimator is expecting {len(self.fitted_columns)} features as input') x_cont = torch.as_tensor(self.num_tf.transform(x), dtype=torch.float32) x_cat = torch.as_tensor(self.cat_tf.transform(x) + 1, dtype=torch.long) return DictDataset(tensors={'x_cont': x_cont, 'x_cat': x_cat}, tensor_infos=self.tensor_infos) if __name__ == '__main__': data = {'Continuous1': [1.2, 2.3, 3.4, 4.5, 5.6], 'Continuous2': [5.6, 6.7, 7.8, 8.9, 10.0], 'Category1': ['A', 'B', 'A', 'C', None], 'Category2': ['X', 'Y', None, 'X', None]} df = pd.DataFrame(data) df['Category2'] = df['Category2'].astype('category') print(set(df.columns) == set(df.columns)) print(ToDictDatasetConverter(cat_features=[True, False, True, True]).fit_transform(df).tensors) print(ToDictDatasetConverter().fit_transform(df).tensors) ================================================ FILE: pytabkit/models/data/data.py ================================================ import math from typing import Optional, Union, List, Dict, Tuple import numpy as np import pandas as pd import torch from pytabkit.models import utils from pytabkit.models.torch_utils import seeded_randperm, batch_randperm class TaskType: CLASSIFICATION = 'classification' REGRESSION = 'regression' # todo: add info which values might be missing? # todo: use np arrays instead of torch.Tensor? need to convert back a lot of .item()... class TensorInfo: def __init__(self, feat_shape: Optional[Union[List, np.ndarray, torch.Tensor]] = None, cat_sizes: Optional[Union[List, np.ndarray, torch.Tensor]] = None): self.feat_shape = feat_shape self.cat_sizes = cat_sizes if isinstance(self.feat_shape, torch.Tensor): self.feat_shape = self.feat_shape.detach().cpu().numpy() def get_feat_shape(self) -> np.ndarray: if self.feat_shape is None and self.cat_sizes is not None: self.feat_shape = np.asarray(self.cat_sizes).shape return np.asarray(self.feat_shape) # return torch.as_tensor(self.feat_shape) def get_cat_sizes(self) -> torch.Tensor: if self.cat_sizes is None and self.feat_shape is not None: self.cat_sizes = torch.zeros(*self.feat_shape, dtype=torch.long) return torch.as_tensor(self.cat_sizes) def get_n_features(self) -> int: return np.prod(self.get_feat_shape()) def get_cat_size_product(self) -> int: return torch.prod(self.get_cat_sizes()).item() def is_empty(self) -> bool: return self.get_n_features() == 0 def is_cont(self) -> bool: return self.cat_sizes is None or len(self.cat_sizes) == 0 or self.cat_sizes[ 0] == 0 # todo: might not work for multi-dimensional tensors def is_cat(self) -> bool: return not self.is_cont() def to_dict(self) -> Dict: # convert to list for yaml serialization return {'feat_shape': self.get_feat_shape().tolist(), 'cat_sizes': self.get_cat_sizes().numpy().tolist()} @staticmethod def from_dict(data: Dict) -> 'TensorInfo': return TensorInfo(data['feat_shape'], data['cat_sizes']) @staticmethod def concat(tensor_infos: List['TensorInfo']) -> 'TensorInfo': """ Create the TensorInfo that corresponds to concatenating the tensors. 
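For categorical infos, the cat_sizes are concatenated; for continuous infos, the feature counts are added (e.g., concatenating continuous infos with 3 and 5 features yields one with 8 features).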
:param tensor_infos: :return: """ assert len(tensor_infos) > 0 if tensor_infos[0].is_cat(): return TensorInfo(cat_sizes=torch.cat([ti.get_cat_sizes() for ti in tensor_infos], dim=0)) else: return TensorInfo(feat_shape=sum([ti.get_feat_shape() for ti in tensor_infos])) class DictDataset: # todo: add conversion methods to/from pandas dataframe? # also to/from numpy/torch tensors? def __init__(self, tensors: Optional[Dict[str, torch.Tensor]], tensor_infos: Dict[str, TensorInfo], device: Optional[Union[str, torch.device]] = None, n_samples: Optional[int] = None): """ :param tensors: Can be None, but then device and n_samples must be specified. :param tensor_infos: Information (shape, category sizes) for each tensor. :param device: Device that tensors is on. If tensors is specified, this will be computed automatically. :param n_samples: Number of samples. If tensors is specified, this will be computed automatically. """ self.device = device if device is not None else next(iter(tensors.values())).device self.n_samples = n_samples if n_samples is not None else next(iter(tensors.values())).shape[0] self.tensors = None if tensors is None else {key: t.to(device) for key, t in tensors.items()} self.tensor_infos = tensor_infos def split_xy(self) -> Tuple['DictDataset', 'DictDataset']: y_keys = [key for key in self.tensors if key.startswith('y')] x_keys = [key for key in self.tensors if key not in y_keys] return self[x_keys], self[y_keys] def without_labels(self) -> 'DictDataset': return self.split_xy()[0] def to_df(self) -> pd.DataFrame: tensor_dfs = [] for key in self.tensors: val_np = self.tensors[key].detach().cpu().numpy() col_names = [f'{key}_{i}' for i in range(val_np.shape[1])] if self.tensor_infos[key].is_cat(): cat_sizes = self.tensor_infos[key].get_cat_sizes().numpy() df = pd.DataFrame( {col_names[i]: pd.Categorical(val_np[:, i], categories=list(range(cat_sizes[i]))) for i in range(len(col_names))}) else: df = pd.DataFrame(val_np, columns=col_names) tensor_dfs.append(df) return pd.concat(tensor_dfs, axis=1) def get_batch(self, idxs) -> Dict[str, torch.Tensor]: return {key: t[idxs, :] for key, t in self.tensors.items()} def get_sub_dataset(self, idxs) -> 'DictDataset': return DictDataset(self.get_batch(idxs), self.tensor_infos, device=self.device) def get_shuffled(self, seed) -> 'DictDataset': return self.get_sub_dataset(seeded_randperm(self.n_samples, self.device, seed)) def get_size_gb(self) -> float: """ :return: RAM usage in Gigabytes """ return self.n_samples * sum([ti.get_n_features() * (8 if ti.is_cat() else 4) for ti in self.tensor_infos.values()]) / (1024 ** 3) @staticmethod def join(*datasets): return DictDataset(utils.join_dicts(*[ds.tensors for ds in datasets]), utils.join_dicts(*[ds.tensor_infos for ds in datasets])) def to(self, device): return DictDataset(self.tensors, self.tensor_infos, device=device) def __getitem__(self, key): if isinstance(key, list): return DictDataset({k: self.tensors[k] for k in key}, {k: self.tensor_infos[k] for k in key}, device=self.device, n_samples=self.n_samples) return DictDataset({key: self.tensors[key]}, {key: self.tensor_infos[key]}, device=self.device, n_samples=self.n_samples) def get_n_classes(self): """ :return: Returns the number of classes, given by the category size of the first feature of the y tensor. This only makes sense if there is a y tensor, and it does not check if y has more than one feature. 
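For regression targets, y is continuous and its category size is zero, so this method returns 0.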
""" return self.tensor_infos['y'].get_cat_sizes()[0].item() class ParallelDictDataLoader: def __init__(self, ds: DictDataset, idxs: torch.Tensor, batch_size: int, shuffle: bool = False, adjust_bs: bool = False, drop_last: bool = False, output_device: Optional[Union[str, torch.device]] = None): """ :param dataset: A TaskData instance :param batch_size: default batch size, might be automatically adjusted :param shuffle: whether the dataset should be shuffled before each epoch :param adjust_bs: whether the batch_size may be lowered so that the batches are of more equal size while keeping the number of batches the same :param drop_last: whether the last batch should be omitted if it is smaller than the other ones :param output_device: The device that the returned data should be on (if None, take the device where the data already is) """ self.ds = ds self.idxs = idxs.to(ds.device) self.n_parallel = idxs.shape[0] self.n_samples = idxs.shape[1] self.output_device = ds.device if output_device is None else output_device self.adjust_bs = adjust_bs self.shuffle = shuffle self.drop_last = drop_last self.specified_batch_size = batch_size self.batch_size = min(batch_size, self.n_samples) if self.drop_last: self.n_batches = math.floor(self.n_samples / self.batch_size) if adjust_bs: self.batch_size = math.floor(self.n_samples / self.n_batches) self.sep_idxs = [self.batch_size * i for i in range(self.n_batches + 1)] else: self.n_batches = math.ceil(self.n_samples / self.batch_size) if adjust_bs: self.batch_size = math.ceil(self.n_samples / self.n_batches) self.sep_idxs = [self.batch_size * i for i in range(self.n_batches)] + [self.n_samples] def get_num_samples(self): return self.n_samples def get_num_iterated_samples(self): if self.drop_last: return self.n_batches * self.batch_size return self.get_num_samples() def __len__(self): return self.n_batches def __iter__(self): if self.shuffle: perms = batch_randperm(self.n_parallel, self.n_samples, device=self.ds.device) for start, stop in zip(self.sep_idxs[:-1], self.sep_idxs[1:]): batches = self.ds.get_batch(idxs=self.idxs.gather(1, perms[:, start:stop])) yield {key: t.to(self.output_device) for key, t in batches.items()} else: for start, stop in zip(self.sep_idxs[:-1], self.sep_idxs[1:]): batches = self.ds.get_batch(idxs=self.idxs[:, start:stop]) yield {key: t.to(self.output_device) for key, t in batches.items()} class ValDictDataLoader: def __init__(self, ds: DictDataset, val_idxs: torch.Tensor, val_batch_size=256): """ Create a Prediction Dataloader from Dataset and validation indices """ ds_x, ds_y = ds.split_xy() self.val_x_dl = ParallelDictDataLoader(ds_x, val_idxs, batch_size=val_batch_size) self.val_idxs = val_idxs self.val_y = ds_y.get_batch(val_idxs).get('y', None) self.n_samples = val_idxs.shape[1] def __len__(self): return self.n_samples def __iter__(self): return self.val_x_dl.__iter__() ================================================ FILE: pytabkit/models/data/nested_dict.py ================================================ from typing import Union, List, Tuple, Dict from pytabkit.models import utils class NestedDict: """ Dictionary that can be used with multiple indices. 
Instead of d = dict() d['first'] = dict() d['first']['second'] = 1.0 we can use d = NestedDict() d['first', 'second'] = 1.0 """ def __init__(self, data_dict=None): self.data_dict = data_dict if data_dict is not None else {} def __getitem__(self, idxs): if not isinstance(idxs, tuple): idxs = (idxs,) d = self.data_dict for idx in idxs: d = d[idx] return d def __setitem__(self, idxs, value): if not isinstance(idxs, tuple): idxs = (idxs,) if isinstance(value, NestedDict): value = value.data_dict # allow to properly "hang in" value in the case that value is of type NestedDict? d = self.data_dict for i, idx in enumerate(idxs): if idx not in d or i+1 == len(idxs): v = value for rev_idx in idxs[:i:-1]: v = {rev_idx: v} d[idx] = v return d = d[idx] def __contains__(self, item: Union[List, Tuple]): current_dict = self.data_dict for elem in item: if elem not in current_dict: return False current_dict = current_dict[elem] return True def get(self, idxs, default=None): try: return self[idxs] except KeyError: return default def _dict_update_rec(self, d1: dict, d2: dict): for key in d2: if key in d1: self._dict_update_rec(d1[key], d2[key]) else: d1[key] = d2[key] def update(self, other: 'NestedDict'): self._dict_update_rec(self.data_dict, other.data_dict) def __str__(self): return str(self.data_dict) def __repr__(self): return f'NestedDict({str(self)})' def get_dict(self) -> Dict: return self.data_dict @staticmethod def from_kwargs(**kwargs): return NestedDict( {key: (value.data_dict if isinstance(value, NestedDict) else value) for key, value in kwargs.items()} ) if __name__ == '__main__': nd = NestedDict() nd['test', 'test'] = 1 print(nd['test']) ================================================ FILE: pytabkit/models/data/splits.py ================================================ import math from typing import Tuple, List, Optional import torch from pytabkit.models import utils from pytabkit.models.data.data import DictDataset from pytabkit.models.torch_utils import seeded_randperm # splits should not reference tasks, since tasks should only be loaded in the respective processes in the DevicePool, # while splits are loaded earlier class Split: def __init__(self, ds: DictDataset, idxs: Tuple[torch.Tensor, torch.Tensor]): """ :param ds: The dataset that is split into parts :param idxs: Tuple of Tensors containing indices of the different parts of ds """ self.ds = ds self.idxs = idxs def get_sub_ds(self, i): return self.ds.get_sub_dataset(self.idxs[i]) def get_sub_idxs(self, i): return self.idxs[i] class Splitter: def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError() def split_ds(self, ds: DictDataset) -> Split: idxs = self.get_idxs(ds) return Split(ds, idxs) def get_split_sizes(self, n_samples: int) -> Tuple: raise NotImplementedError() class RandomSplitter(Splitter): def __init__(self, seed, first_fraction=0.8, max_n_first: Optional[int] = None): self.seed = seed self.first_fraction = first_fraction self.max_n_first = max_n_first def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: # use ceil such that e.g. in the case of 1 sample, the sample ends up in the training set. 
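# e.g., for n_samples=5 and first_fraction=0.8: split_idx = ceil(4.0) = 4, i.e., 4 samples in the first part and 1 in the second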
split_idx = int(math.ceil(self.first_fraction * ds.n_samples)) if self.max_n_first is not None: split_idx = min(split_idx, self.max_n_first) perm = seeded_randperm(ds.n_samples, ds.device, self.seed) return perm[:split_idx], perm[split_idx:] def get_split_sizes(self, n_samples: int) -> Tuple: split_idx = int(math.ceil(self.first_fraction * n_samples)) if self.max_n_first is not None: split_idx = min(split_idx, self.max_n_first) return split_idx, n_samples-split_idx class IndexSplitter(Splitter): def __init__(self, index): self.index = index def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: idxs = torch.arange(ds.n_samples, device=ds.device, dtype=torch.long) return idxs[:self.index], idxs[self.index:] def get_split_sizes(self, n_samples: int) -> Tuple: return self.index, n_samples-self.index class AllNothingSplitter(Splitter): def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: all = torch.arange(ds.n_samples, device=ds.device, dtype=torch.long) nothing = torch.zeros(0, device=ds.device, dtype=torch.long) return all, nothing def split_ds(self, ds: DictDataset) -> Split: idxs = self.get_idxs(ds) return Split(ds, idxs) def get_split_sizes(self, n_samples: int) -> Tuple: return n_samples, 0 class MultiSplitter: def get_idxs(self, ds: DictDataset) -> List[Tuple[torch.Tensor, torch.Tensor]]: raise NotImplementedError() def split_ds(self, ds: DictDataset) -> List[Split]: idxs_list = self.get_idxs(ds) return [Split(ds, idxs) for idxs in idxs_list] class KFoldSplitter(MultiSplitter): def __init__(self, k: int, seed: int, stratified=False): if k <= 1: raise ValueError(f'KFoldSplitter: required k>=2, but received {k=}') self.k = k self.seed = seed self.stratified = stratified def get_idxs(self, ds: DictDataset) -> List[Tuple[torch.Tensor, torch.Tensor]]: idxs = seeded_randperm(ds.n_samples, device=ds.device, seed=self.seed) if self.stratified: # do it with random shuffling such that elements of the same class are still shuffled perm = torch.argsort(ds.tensors['y'][idxs, 0]) idxs = idxs[perm] fold_len = (ds.n_samples // self.k) * self.k fold_idxs = [idxs[start:fold_len:self.k] for start in range(self.k)] rest_idxs = idxs[fold_len:] idxs_list = [] for i in range(self.k): idxs_1 = torch.cat([fold_idxs[j] for j in range(self.k) if j != i] + [rest_idxs], dim=-1) idxs_list.append((idxs_1, fold_idxs[i])) return idxs_list def get_split_sizes(self, n_samples: int) -> Tuple: n_val = n_samples // self.k return n_samples - n_val, n_val class SplitInfo: def __init__(self, splitter: Splitter, split_type: str, id: int, alg_seed: int, train_fraction: float = 0.75): self.splitter = splitter self.split_type = split_type # one of "random", "default" self.id = id self.alg_seed = alg_seed self.train_fraction = train_fraction def get_sub_seed(self, split_idx: int, is_cv: bool): return utils.combine_seeds(self.alg_seed, 2 * split_idx + int(is_cv)) # return self.alg_seed + 5000 * int(is_cv) + 10000 * split_idx def get_sub_splits(self, ds: DictDataset, n_splits: int, is_cv: bool) -> List[Split]: if not is_cv: split = AllNothingSplitter().split_ds(ds) return [split] * n_splits if n_splits <= 1: return [RandomSplitter(seed=self.alg_seed, first_fraction=self.train_fraction).split_ds(ds)] else: is_classification = ds.tensor_infos['y'].get_cat_sizes()[0].item() > 0 return KFoldSplitter(n_splits, seed=self.alg_seed, stratified=is_classification).split_ds(ds) def get_train_and_val_size(self, n_samples: int, n_splits: int, is_cv: bool) -> Tuple[int, int]: n_trainval, n_test = 
self.splitter.get_split_sizes(n_samples) if not is_cv: return n_trainval, 0 elif n_splits <= 1: return RandomSplitter(seed=self.alg_seed, first_fraction=self.train_fraction).get_split_sizes(n_trainval) else: # stratified doesn't influence split sizes return KFoldSplitter(n_splits, seed=self.alg_seed, stratified=False).get_split_sizes(n_samples) ================================================ FILE: pytabkit/models/hyper_opt/__init__.py ================================================ ================================================ FILE: pytabkit/models/hyper_opt/coord_opt.py ================================================ from pathlib import Path import numpy as np from typing import Union, Callable, Any, Optional, Dict, Tuple from pytabkit.models import utils from pytabkit.models.hyper_opt.hyper_optimizers import HyperOptimizer # implementing a custom coordinate-descent style hyperparameter optimizer def identity(x): return x class Hyperparameter: def __init__(self, start_value: Union[int, float], min_step_size: Union[int, float], importance: float, log_scale: bool = False, only_int: bool = False, min_value: Union[int, float] = -np.inf, max_value: Union[int, float] = np.inf, out_func: Callable[[Any], Any] = None, max_step_size: float = np.inf): # if log_scale=True, min_value, max_value, min_step_size, and max_step_size are on the log scale, # i.e., min_value can still be negative # in this case, the values will be exponentiated at the end self.start_value = start_value self.min_step_size = min_step_size self.max_step_size = max_step_size self.importance = importance self.log_scale = log_scale self.only_int = only_int self.min_value = min_value self.max_value = max_value self.out_func = out_func or identity self.tfm = (lambda x: np.exp(x)) if log_scale else identity self.inv_tfm = (lambda x: np.log(x)) if log_scale else identity self.quant_tfm = (lambda x: round(x)) if only_int else identity # if log_scale: # self.min_value = np.log(min_value) if 0 < min_value < np.inf else -np.inf # self.max_value = np.log(max_value) if 0 < max_value < np.inf else np.inf if self.log_scale and self.only_int: # need to avoid having values < 0 for which round(exp(value)) = 0, which is not representable in log-space self.min_value = max(self.min_value, 0.0) def adjust_step_size(self, current_value: float, step_size: float) -> Optional[float]: # should return suggested step size that satisfies all constraints, or None if no suitable step size is found # We have three constraints: step size limit, min_value/max-value, and quantization. # Updating each of them could violate one of the others. 
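# (e.g., clamping the step at min_value/max_value can shrink it below min_step_size, and quantization can change the effective step again)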
# do a loop and check if all three are satisfied # if it doesn't work after a certain number of iterations, we fail and return None for i in range(5): updated = False step_size_sign = np.sign(step_size) # check min_step_size / max_step_size if np.abs(step_size) < self.min_step_size - 1e-8: step_size = step_size_sign * self.min_step_size updated = True if np.abs(step_size) > self.max_step_size + 1e-8: step_size = step_size_sign * self.max_step_size updated = True # check min_value / max_value candidate = current_value + step_size if candidate < self.min_value - 1e-8: candidate = self.min_value updated = True elif candidate > self.max_value + 1e-8: candidate = self.max_value updated = True step_size = candidate - current_value print(f'CoordOpt: {self.min_value=}, {self.max_value=}, {self.start_value=}') print(f'CoordOpt: {current_value=}, {candidate=}') curr_t = self.tfm(current_value) cand_t = self.tfm(candidate) curr_q = self.quant_tfm(curr_t) cand_q = self.quant_tfm(cand_t) if curr_q == cand_q: cand_q = curr_q + step_size_sign if self.log_scale and self.only_int and cand_q <= 0.5: return None # curr_q is 1 and we want to make cand_q = 0 but this doesn't exist in log scale step_size = self.inv_tfm(cand_q) - current_value updated = True if not updated: # step size fulfilled all three constraints in this loop and hence has not been updated return step_size return None # did not find a step size that fulfills all constraints def apply_tfms(self, x: Any) -> Any: return self.out_func(self.quant_tfm(self.tfm(x))) class CoordOptimizerImpl: # potential improvements: # increase the importances in an UCB-style # in coord_opt_idx allow to explore the reverse direction if the first step in the previous direction fails def __init__(self, f: Callable[[Dict], Tuple[float, Any]], space: Dict[str, Hyperparameter], n_steps: int, beta: float = 0.5, step_dec_factor: float = 0.5, step_inc_factor: float = 2.0, initial_step_multiplier: float = 8.0): self.f = f self.space = space self.n_steps = n_steps self.n_f_evals = 0 if n_steps <= 0: raise ValueError(f'CoordOptimizerImpl: Got {n_steps=} but need n_steps > 0') # hyperparameters of the HPO method self.beta = beta self.step_dec_factor = step_dec_factor self.step_inc_factor = step_inc_factor self.initial_step_multiplier = initial_step_multiplier self.max_coord_opt_steps = 10 self.keys = [k for k, v in space.items()] # preserve the order in space self.d = len(self.keys) self.hps = [space[key] for key in self.keys] self.prior_importances = [hp.importance for hp in self.hps] self.priorities = np.argsort(np.asarray(self.prior_importances))[::-1] self.importances = np.zeros(self.d) self.min_step_sizes = np.asarray([hp.min_step_size for hp in self.hps]) self.hp_values = np.asarray([hp.start_value for hp in self.hps]) self.step_sizes = self.initial_step_multiplier * self.min_step_sizes for idx in range(self.d): # adjust direction of step sizes if self.hp_values[idx] - self.hps[idx].min_value > self.hps[idx].max_value - self.hp_values[idx]: # there is more space in the negative direction, start in the other direction self.step_sizes[idx] *= -1 # current best hyperparameter values (before transformation, i.e., can be in log-space) self.evaluated_hp_values = [] # to avoid evaluating the same point twice # eval loss on starting values self.loss, self.additional_info = self.eval(self.hp_values) self.blocked_directions = np.zeros(self.d, dtype=np.int32) def suggest(self, new_hp_values) -> float: # return loss difference, update optimum if necessary etc. 
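# a negative returned loss difference means the new hyperparameters improved on the current best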
# unblock variables if new optimum is found new_loss, new_additional_info = self.eval(new_hp_values) loss_diff = new_loss - self.loss if new_loss < self.loss: # update parameters self.loss = new_loss self.additional_info = new_additional_info self.hp_values = new_hp_values # unblock all coordinates print(f'CoordOpt: Unblocking all coordinates') self.blocked_directions = np.zeros(self.d, dtype=np.int32) return loss_diff def convert_hp_values(self, values: np.ndarray) -> Dict[str, Any]: return {key: hp.apply_tfms(value) for (key, value, hp) in zip(self.keys, values, self.hps)} def eval(self, new_hp_values: np.ndarray) -> Tuple[float, Any]: # convert hyperparameters, call function, increase step counter, raise error if step count is full if self.n_f_evals >= self.n_steps: raise StopIteration() self.n_f_evals += 1 print(f'CoordOpt: Evaluating hyperparameters in step {self.n_f_evals}: {new_hp_values}') self.evaluated_hp_values.append(new_hp_values) converted = {key: hp.apply_tfms(value) for (key, value, hp) in zip(self.keys, new_hp_values, self.hps)} return self.f(converted) def already_evaluated(self, new_hp_values: np.ndarray) -> bool: """ :param new_hp_values: New hyperparameter values that should be tried. :return: True if these hyperparameters have already been evaluated before """ for old_hp_values in self.evaluated_hp_values: if np.allclose(new_hp_values, old_hp_values): return True return False def coord_opt_idx(self, idx: int): # implicitly update importance # keep track of step? or use an exception to break when the step count has finished? for i in range(self.max_coord_opt_steps): print(f'CoordOpt: Optimizing coordinate {idx}, step {i}') # loop while line search over coordinate still finds an improvement # adjust step size adj_step = self.hps[idx].adjust_step_size(current_value=self.hp_values[idx], step_size=self.step_sizes[idx]) if adj_step is None: print(f'CoordOpt: adj_step is None') # no suitable step size was found, for example because the boundary is reached self.step_sizes[idx] = -self.step_dec_factor * self.step_sizes[idx] # if this would bring us below the minimum step size, block the variable if np.abs(self.step_sizes[idx]) < self.hps[idx].min_step_size: print(f'CoordOpt: Blocking coordinate {idx}') self.blocked_directions[idx] += 1 return # make step with suggest() new_hp_values = np.copy(self.hp_values) new_hp_values[idx] += adj_step if self.already_evaluated(new_hp_values): print(f'CoordOpt: Already evaluated hyperparameters') self.step_sizes[idx] = -self.step_dec_factor * adj_step self.blocked_directions[idx] += 1 return loss_diff = self.suggest(new_hp_values) # update importance self.importances[idx] = self.beta * self.importances[idx] + (1 - self.beta) * np.abs(loss_diff) if loss_diff < 0: print(f'CoordOpt: Loss decreased') self.step_sizes[idx] = self.step_inc_factor * adj_step else: print(f'CoordOpt: Loss did not decrease') # if loss didn't reduce, *= - step_dec_factor, return self.step_sizes[idx] = -self.step_dec_factor * adj_step # if this would bring us below the minimum step size, block the variable if np.abs(self.step_sizes[idx]) < self.hps[idx].min_step_size: print(f'CoordOpt: Blocking coordinate {idx}') self.blocked_directions[idx] += 1 return def run(self) -> None: # wrap everything in try/catch for termination try: while True: # select best index according to importance if np.all(self.blocked_directions >= 2): print(f'CoordOpt: Reached a local optimum') return if len(self.priorities) > 0: hp_idx = self.priorities[0] self.priorities = self.priorities[1:] 
else: # print(f'{self.importances=}') importances = np.copy(self.importances) importances[self.blocked_directions >= 2] = -1.0 hp_idx = np.argmax(importances) if self.blocked_directions[hp_idx] >= 2: raise RuntimeError('CoordOpt: selected blocked index, this should not occur') # run coord_opt_idx on the index self.coord_opt_idx(hp_idx) except StopIteration: return class CoordOptimizer(HyperOptimizer): class CoordOptFuncWrapper: def __init__(self, f: Callable[[dict], Tuple[float, Any]], fixed_params: Dict[str, Any]): self.f = f self.fixed_params = fixed_params def __call__(self, params: Dict[str, Any], seed: int = 0): params = utils.join_dicts(params, self.fixed_params) loss, additional_info = self.f(params) return np.inf if np.isnan(loss) else loss, None def __init__(self, space: Dict[str, Hyperparameter], fixed_params: Dict[str, Any], n_hyperopt_steps: int = 50, **config): super().__init__(n_hyperopt_steps=n_hyperopt_steps) self.space = space self.n_hyperopt_steps = n_hyperopt_steps self.fixed_params = fixed_params self.config = config def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: fn = CoordOptimizer.CoordOptFuncWrapper(f, self.fixed_params) opt = CoordOptimizerImpl(fn, self.space, n_steps=self.n_hyperopt_steps) opt.run() ================================================ FILE: pytabkit/models/hyper_opt/hyper_optimizers.py ================================================ import time from pathlib import Path from typing import Callable, Tuple, Any, Dict, Union, Optional import numpy as np from pytabkit.models import utils from pytabkit.models.training.logging import Logger class FunctionEvaluationTracker: """ Helper class to keep track of where the function to be optimized is evaluated and what are the best parameters """ def __init__(self, f: Callable[[dict], Tuple[float, Any]], n_steps: int, opt_desc: str, logger: Logger): self.f = f self.n_steps = n_steps self.opt_desc = opt_desc self.logger = logger self.best_params = None self.best_result = None self.n_calls = 0 def __call__(self, params: dict) -> Tuple[float, Any]: # params = utils.join_dicts(params, self.fixed_params) start_time = time.time() result = self.f(params) if np.isnan(result[0]): result = (np.inf, result[1]) eval_time = time.time() - start_time if self.best_result is None or (result[0] <= self.best_result[0]): # print(f'new best result') self.best_params = params self.best_result = result self.n_calls += 1 self.logger.log(-1, f'Hyperopt step {self.n_calls}/{self.n_steps} on {self.opt_desc} took {eval_time:g} s') # don't return the second part of result as HPO libraries might store all of them, causing RAM problems return result[0], None def get_best_params_and_result(self) -> Tuple[Dict, Tuple[float, Any]]: return self.best_params, self.best_result class HyperOptimizer: def __init__(self, n_hyperopt_steps: int): self.n_hyperopt_steps = n_hyperopt_steps def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: # override this in subclasses raise NotImplementedError() def optimize(self, f: Callable[[dict], Tuple[float, Any]], seed: int, opt_desc: str, logger: Logger) \ -> Tuple[Dict, Any]: """ :param f: Function to minimize. 
It should take a dict of parameters and return a tuple containing the validation loss and additional information about the run (additional information could for example be the early stopping epoch found in this particular run, for example {'n_estimators': best_n_estimators}) :param seed: Random seed for optimization :param opt_desc: name of the optimized algorithm / optimization problem (used for printing optimization intermediate state) :param logger: Logger used for printing information :return: Returns a tuple containing a dictionary with the optimal parameters and the additional info generated by the function at the optimal parameters """ # todo: could also add verbosity level # todo: may need to be able to treat failures, hence make the tuple optional? # todo: could allow to pass the iteration number to the function tracker = FunctionEvaluationTracker(f, n_steps=self.n_hyperopt_steps, opt_desc=opt_desc, logger=logger) self._optimize_impl(tracker, seed=seed) best_params, best_result = tracker.get_best_params_and_result() return best_params, best_result[1] def get_n_hyperopt_steps(self) -> int: return self.n_hyperopt_steps # todo: have one class that does performance tracking of all intermediate steps (or do that in HyperoptAlgInterface?) # and maybe also do logging separately? # maybe have wrapper function / callable class that tracks it? # Then implement something like _optimize() that gets the wrapped function? class ConstantHyperOptimizer(HyperOptimizer): def __init__(self, params: dict): super().__init__(n_hyperopt_steps=1) self.params = params def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: f(self.params) def f_unpack_dict(dct): """ Unpacks all sub-dictionaries in given dictionary recursively. There should be no duplicated keys across all nested subdictionaries, or some instances will be lost without warning Source: https://www.kaggle.com/fanvacoolt/tutorial-on-hyperopt Parameters: ---------------- dct : dictionary to unpack Returns: ---------------- : unpacked dictionary """ res = {} for (k, v) in dct.items(): if isinstance(v, dict): res = {**res, **f_unpack_dict(v)} else: res[k] = v return res class HyperoptOptimizer(HyperOptimizer): class HyperoptFuncWrapper: def __init__(self, f: Callable[[dict], Tuple[float, Any]], fixed_params: dict): self.f = f self.fixed_params = fixed_params def __call__(self, params: dict): params = f_unpack_dict(params) # for nested/conditional params from hyperopt import STATUS_FAIL, STATUS_OK params = utils.join_dicts(params, self.fixed_params) loss, additional_info = self.f(params) return {'loss': loss, 'additional_info': additional_info, 'status': STATUS_FAIL if np.isnan(loss) else STATUS_OK, 'params': params.copy()} def __init__(self, space, fixed_params, n_hyperopt_steps: int = 50, **config): super().__init__(n_hyperopt_steps=n_hyperopt_steps) self.space = space self.fixed_params = fixed_params self.config = config def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: import hyperopt trials = hyperopt.Trials() # todo: could serialize the trials object for restarting algo_name = self.config.get('hyperopt_algo', 'tpe') if algo_name == 'tpe': algo = hyperopt.tpe.suggest elif algo_name == 'atpe': # atpe seems to be not deterministic even when setting the seed... 
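            # (hyperopt does ship hyperopt.atpe.suggest, see the commented-out line below,
            # but we refuse it here for the sake of reproducibility)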
raise ValueError('atpe for hyperopt is not implemented since it is not deterministic and can throw errors') # print(f'Using atpe', flush=True) # algo = hyperopt.atpe.suggest elif algo_name == 'rand': print(f'Using rand', flush=True) algo = hyperopt.rand.suggest else: raise ValueError(f'Unknown hyperopt_algo name "{algo_name}"') fn = HyperoptOptimizer.HyperoptFuncWrapper(f, self.fixed_params) time_limit_s: Optional[float] = self.config.get('time_limit_s', None) _ = hyperopt.fmin(fn=fn, timeout=None if time_limit_s is None else int(time_limit_s), space=self.space, algo=algo, max_evals=self.n_hyperopt_steps, trials=trials, rstate=np.random.default_rng(seed=seed), verbose=False, show_progressbar=False) class SMACOptimizer(HyperOptimizer): class SMACFuncWrapper: def __init__(self, f: Callable[[dict], Tuple[float, Any]], fixed_params: Dict[str, Any]): self.f = f self.fixed_params = fixed_params def __call__(self, params, seed: int = 0): # params should be of type ConfigSpace.Configuration params = params.get_dictionary() params = utils.join_dicts(params, self.fixed_params) loss, additional_info = self.f(params) return np.inf if np.isnan(loss) else loss def __init__(self, space, fixed_params: Dict[str, Any], n_hyperopt_steps: int = 50, tmp_folder: Union[str, Path] = 'smac3_output', **config): super().__init__(n_hyperopt_steps=n_hyperopt_steps) self.space = space self.n_hyperopt_steps = n_hyperopt_steps self.fixed_params = fixed_params self.config = config self.tmp_folder = tmp_folder def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: use_gp = self.config.get('smac_surrogate', 'RF') == 'GP' fn = SMACOptimizer.SMACFuncWrapper(f, self.fixed_params) import smac scenario = smac.Scenario(self.space, deterministic=True, n_trials=self.n_hyperopt_steps, seed=seed, use_default_config=True, output_directory=self.tmp_folder) max_ratio = 0.25 n_configs_per_hyperparameter = 8 if use_gp else 10 if 'n_initial_design' in self.config: max_ratio = self.config['n_initial_design'] / self.n_hyperopt_steps n_configs_per_hyperparameter = self.config['n_initial_design'] from smac.initial_design import SobolInitialDesign initial_design = SobolInitialDesign( scenario=scenario, n_configs=None, n_configs_per_hyperparameter=n_configs_per_hyperparameter, max_ratio=max_ratio, additional_configs=[], ) # Now we use SMAC to find the best hyperparameters if use_gp: print(f'Using SMAC with GP surrogate') facade = smac.BlackBoxFacade( scenario=scenario, target_function=fn.__call__, overwrite=True, logging_level=False, initial_design=initial_design ) else: facade = smac.HyperparameterOptimizationFacade( scenario, fn.__call__, # We pass the target function here overwrite=True, # Overrides any previous results that are found that are inconsistent with the meta-data logging_level=False, # no logging initial_design=initial_design, ) facade.optimize() ================================================ FILE: pytabkit/models/nn_models/__init__.py ================================================ ================================================ FILE: pytabkit/models/nn_models/activations.py ================================================ import torch import torch.nn.functional as F from typing import Dict # ------ from fastai2 from torch.jit import script from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import Variable, Fitter, FitterFactory, FunctionFitter, Layer @script def _swish_jit_fwd(x): return x.mul(torch.sigmoid(x)) @script def _swish_jit_bwd(x, 
grad_output): x_sigmoid = torch.sigmoid(x) return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid))) class _SwishJitAutoFn(torch.autograd.Function): @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return _swish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_variables[0] return _swish_jit_bwd(x, grad_output) # don't use the optimized version since this seems to behave slightly differently for Pytorch Lightning # def swish(x): return _SwishJitAutoFn.apply(x) def swish(x): return x * torch.sigmoid(x) @script def _mish_jit_fwd(x): return x.mul(torch.tanh(F.softplus(x))) @script def _mish_jit_bwd(x, grad_output): x_sigmoid = torch.sigmoid(x) x_tanh_sp = F.softplus(x).tanh() return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp)) class MishJitAutoFn(torch.autograd.Function): @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return _mish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_tensors[0] return _mish_jit_bwd(x, grad_output) # don't use the optimized version since this seems to behave slightly differently for Pytorch Lightning # def mish(x): return MishJitAutoFn.apply(x) def mish(x): return x.mul(torch.tanh(F.softplus(x))) def golu(x): return x * torch.exp(-torch.exp(-torch.clamp(x, min=-10))) # ----- end fastai2 class ParametricActivationLayer(Layer): def __init__(self, f, weight): super().__init__() self.f = f self.weight = weight def forward_cont(self, x): # print(f'{self.weight.mean().item()=:g}') return x + (self.f(x) - x) * self.weight def _stack(self, layers): return ParametricActivationLayer(self.f, Variable.stack([l.weight for l in layers])) class ParametricActivationFitter(Fitter): def __init__(self, f, **config): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.f = f self.act_lr_factor = config.get('act_lr_factor', 1.0) self.act_wd_factor = config.get('act_wd_factor', 1.0) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: n_cont = ds.tensor_infos['x_cont'].get_n_features() return ParametricActivationLayer(self.f, Variable(torch.ones(1, n_cont, device=ds.device), trainable=True, hyper_factors={'lr': self.act_lr_factor, 'wd': self.act_wd_factor})) class ActivationFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config def _create(self, tensor_infos) -> Fitter: # todo: implement more activations, also parametric ones act_name = self.config.get('act_name', self.config.get('act', 'relu')) if act_name == 'relu': f = torch.relu elif act_name == 'selu': f = torch.selu elif act_name == 'swish' or act_name == 'silu': f = swish elif act_name == 'sswish': # normalized by output variance f = lambda x: 1.6765 * swish(x) elif act_name == 'mish': f = mish elif act_name == 'smish': # normalized by output variance f = lambda x: 1.6 * mish(x) elif act_name == 'gelu': f = F.gelu elif act_name == 'elu': f = F.elu elif act_name == 'golu': f = golu else: raise ValueError(f'Activation {act_name} unknown') if self.config.get('use_parametric_act', False): return ParametricActivationFitter(f, **self.config) else: return FunctionFitter(f) ================================================ FILE: pytabkit/models/nn_models/base.py ================================================ from pytabkit.models import torch_utils, utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.training.coord import 
HyperparamManager import torch import torch.nn.functional as F import torch.nn as nn from torch._C import _disabled_torch_function_impl import numpy as np import threading import re import copy from contextlib import contextmanager from typing import Optional, List, Union, Dict, Tuple # have a layer that allows to split/merge DictDatasets? # need something like numerical_preprocess # could specify a input to output mapping, e.g. {'x_cont': None, 'x_cat': 'x_cont'}, which could also allow to merge # or just have a ParallelFitter that merges outputs, with a tensor subselection beforehand # e.g. # num_pipeline = SequentialFactory([FilterFactory('x_cont'), PreprocessingFactory(), NumericalEmbeddingFactory()]) # cat_pipeline = SequentialFactory([FilterFactory('x_cat', 'y'), OneHotFactory(), PreprocessingFactory(), EmbeddingFactory()] # pipeline = ConcatFactory([num_pipeline, cat_pipeline]) # theoretically, could allow to split off fitters by max RAM usage / max num features depending on size of dataset # then, small datasets could be preprocessed in advance even with heavy parallelization # but this would require a parallelized version of DictDataset... # fitter.fit() should pass context to Variable - how? After returning! # But pass scope into fit already, otherwise parent scope will not be known # fit() should then take scope, hp_manager - how to pass that on to sub-fitters? Or have nn.ModuleList-like system? # in the latter case, could have a set_context() function - but then would need to ensure that this is called... # problem: setting later to layer needs to be done still before layer is called - if fit_transform is implemented, # this will not work # use context manager instead? # use self.create_variable()? (could also easily be forgotten) # what about if each layer takes a fitter? # layer could also forget to pass scope to variable # should a variable have hyper_getter itself instead of having it in the optimizer? # Fitter constructor should have an attribute scope_name or so # use context managers at factory creation, then replicate context manager in create() and fit() # and implement fit_impl() and create_impl() then grab context in Layer() and Variable() constructors? # in order to let factory set its own context as well (e.g. weight), include constructor parameter? # need separate context for HyperparamManager around fit()? # can we have a thread-local context? # can those contexts also be used to select configs? (like for first_layer_config etc.) # could use linux-like scope /first_layer/block/weight or /pipeline/1/robust_scale and then filter it using regexes # run in problems with stack() and register_hypers() twice? or no problem because of new naming convention? # or should stack() not call register_hypers() again but use a list of getters? # (that would be good for having different hypers for different parallel layers, # but bad for dropout implementation and maybe speed) # could have a simplify() function in Fitter to remove Identity layers and empty SequentialLayers recursively # then could maybe save the IdentityLayer check in SequentialLayer # todo: does multiple inheritance from Fitter and FitterFactory work with contexts? 
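# A minimal usage sketch of the scope mechanism defined below (names are illustrative, not from the library):
#
#   with sub_scope_context('first_layer'):
#       with sub_scope_context('weight'):
#           scope = TrainContext.get_global_context().scope
#           str(scope)                      # -> '/first_layer/weight'
#           scope.matches('/first_layer')   # -> True (re.match against the path string)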
class Scope: def __init__(self, names: Optional[List[str]] = None): self._names = names or [] def get_sub_scope(self, name: str) -> 'Scope': return Scope(self._names + [name]) def __str__(self): return '/' + '/'.join(self._names) def matches(self, regex: Union[str, re.Pattern]) -> bool: if isinstance(regex, str): regex = re.compile(regex) return bool(regex.match(str(self))) class TrainContext: # see https://stackoverflow.com/questions/51849395/how-can-we-associate-a-python-context-manager-to-the-variables-appearing-in-it _data = threading.local() def __init__(self, scope: Optional[Scope] = None, hp_manager: Optional[HyperparamManager] = None): self.scope = scope or Scope() self.hp_manager = hp_manager def clone(self): return TrainContext(copy.deepcopy(self.scope), self.hp_manager) @staticmethod def get_global_context() -> 'TrainContext': if not hasattr(TrainContext._data, 'context'): TrainContext._data.context = TrainContext() return TrainContext._data.context @contextmanager def sub_scope_context(name: str): current_context = TrainContext.get_global_context() old_scope = current_context.scope current_context.scope = old_scope.get_sub_scope(name) yield current_context.scope = old_scope @contextmanager def sub_scopes_context(names: List[str]): current_context = TrainContext.get_global_context() old_scope = current_context.scope new_scope = old_scope for name in names: new_scope = new_scope.get_sub_scope(name) current_context.scope = new_scope yield current_context.scope = old_scope @contextmanager def set_scope_context(scope: Scope): current_context = TrainContext.get_global_context() old_scope = current_context.scope current_context.scope = scope yield current_context.scope = old_scope @contextmanager def set_hp_context(hp_manager: Optional[HyperparamManager]): current_context = TrainContext.get_global_context() old_hp_manager = current_context.hp_manager if hp_manager: current_context.hp_manager = hp_manager yield current_context.hp_manager = old_hp_manager class ContextAware: def __init__(self, scope_names: Optional[List[str]] = None): super().__init__() # needed in case of multiple inheritance from ContextAware and another base class self.scope_names = scope_names or [] def add_scope(self, name: str): self.scope_names.append(name) return self def add_others_scope(self, other: 'ContextAware'): self.scope_names.extend(other.scope_names) return self @contextmanager def set_context(self): with sub_scopes_context(self.scope_names): yield class ContextRecorder: def __init__(self): super().__init__() # needed in case of multiple inheritance from ContextRecorder and another base class self.context = TrainContext.get_global_context().clone() @contextmanager def set_context(self): with set_scope_context(self.context.scope): with set_hp_context(self.context.hp_manager): yield class StringConvertible: def __init__(self): super().__init__() # for multiple inheritance def __repr__(self): return str(self) def __str__(self): return self.__class__.__name__ + '(' \ + ', '.join([f'{key} = {value}' for key, value in self.__dict__.items()]) + ')' class Variable(ContextRecorder, nn.Parameter): def __new__(cls, data=None, trainable=True, requires_grad=None, hyper_factors=None): if data is None: data = torch.Tensor() if requires_grad is None: requires_grad = trainable obj = super().__new__(cls, data, requires_grad) obj.hyper_factors = hyper_factors or dict() obj.trainable = trainable return obj def __init__(self, data=None, trainable=True, requires_grad=None, hyper_factors=None): super().__init__() def 
__deepcopy__(self, memo): if id(self) in memo: return memo[id(self)] else: result = type(self)(self.data.clone(memory_format=torch.preserve_format), self.trainable, self.requires_grad, self.hyper_factors) memo[id(self)] = result return result def __repr__(self): return f'Variable(trainable={self.trainable}) containing:\n' + super(Variable, self).__repr__() __torch_function__ = _disabled_torch_function_impl @staticmethod def stack(vars: List['Variable'], dim=0): # vars must not be an empty list # todo: could make hyper_factors stackable with vars[0].set_context(): with torch.no_grad(): return Variable(torch.stack(vars, dim=dim), trainable=vars[0].trainable, requires_grad=vars[0].requires_grad, hyper_factors=vars[0].hyper_factors) # ------- Layers ------- class Layer(ContextRecorder, StringConvertible, nn.Module): """ Extended version of nn.Module, allowing vectorization, processing data sets with multiple tensors, using Variable instead of Parameter, ... The following methods need to be overridden: - forward_tensor_infos (but if the output is constant, we can just set new_tensor_infos in the constructor) - forward_tensor or forward_cont (the latter if only x_cont is changed) - _stack() - optionally __repr__() and __str__() """ def __init__(self, new_tensor_infos: Optional[Dict[str, TensorInfo]] = None, fitter: Optional['Fitter'] = None, remove_keys: Optional[Union[str, List[str]]] = None): """ Constructor. Puts the layer in eval mode, since it might be used inside the fit_transform() of the Fitter. The parameters provide different opportunities to specify a default implementation for forward_tensor_infos(). The default implementation is: ``` if self.fitter is not None: return self.fitter.forward_tensor_infos(tensor_infos) return utils.update_dict(tensor_infos, self.new_tensor_infos, remove_keys=self.remove_keys) ``` """ super().__init__() self.new_tensor_infos = {} if new_tensor_infos is None else new_tensor_infos self.remove_keys = remove_keys self.fitter = fitter self.hp_manager = None # don't put in eval mode, so we have realistic behavior during fit_transform() self.eval() # todo: remove def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: """ Override this method if the information from constructor is not sufficient. :param tensor_infos: Tensor infos (shapes etc.) :return: Transformed tensor infos. """ if self.fitter is not None: return self.fitter.forward_tensor_infos(tensor_infos) return utils.update_dict(tensor_infos, self.new_tensor_infos, remove_keys=self.remove_keys) def forward(self, data: Union[DictDataset, Dict[str, torch.Tensor]]) -> Union[DictDataset, Dict[str, torch.Tensor]]: """ This is an implementation of the nn.Module forward() function, which is called by __call__(). Don't override this method. :param data: data set or dict of tensors. :return: Transformed version of the data set or dict of tensors. 
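        (Dispatches to forward_ds() for DictDataset inputs and to forward_tensors() for dicts of tensors.)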
""" if isinstance(data, DictDataset): return self.forward_ds(data) else: return self.forward_tensors(data) def forward_ds(self, ds: DictDataset) -> DictDataset: # default implementation return DictDataset(None if ds.tensors is None else self.forward_tensors(ds.tensors), self.forward_tensor_infos(ds.tensor_infos), device=ds.device, n_samples=ds.n_samples) def forward_cont(self, x: torch.Tensor) -> torch.Tensor: # only needs to be overridden if the default implementation of forward_tensors() is used # we check this to avoid infinite recursion if forward_tensors() is not overridden if self.__class__.forward_tensors != Layer.forward_tensors: return self.forward_tensors({'x_cont': x})['x_cont'] raise NotImplementedError() def forward_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Transforms the given tensors. :param tensors: :return: """ # default implementation just updates x_cont using self.forward_cont() # print(f'{self.__class__.__name__}: {tensors.keys()=}') return utils.join_dicts(tensors, {'x_cont': self.forward_cont(tensors['x_cont'])}) def _stack(self, layers: List['Layer']) -> 'Layer': """ Implementation of stack(). Can be overridden. Vectorizes the given layers. The given layers should all have the same structure. If layers[0] has no parameters (trainable or buffer), then the default implementation simply returns layers[0]. Override if another implementation is desired. :param layers: Layers that should be stacked for vectorization. :return: Returns the stacked Layer object """ # this needs to be overridden by some classes if len(list(layers[0].state_dict())) == 0: # no parameters, can simply vectorize by taking the first layer return layers[0] else: raise NotImplementedError() def stack(self, layers: List['Layer']) -> 'Layer': """ Vectorizes the given layers. The given layers should all have the same structure. Do not override this method, override _stack() instead. :param layers: Layers that should be stacked for vectorization. :return: Returns the stacked Layer object """ with self.set_context(): return self._stack(layers) def __setattr__(self, name, value): # adapted from nn.Module.__setattr__ # first checks whether the value is a Variable, otherwise uses nn.Module.__setattr__ def remove_from(*dicts_or_sets): for d in dicts_or_sets: if name in d: if isinstance(d, dict): del d[name] else: d.discard(name) if isinstance(value, Variable): if value.trainable: if self.__dict__.get('_parameters') is None: raise AttributeError( "cannot assign parameters before Module.__init__() call") remove_from(self.__dict__, self._parameters, self._buffers, self._modules, self._non_persistent_buffers_set) self.register_parameter(name, value) else: if self.__dict__.get('_buffers') is None: raise AttributeError( "cannot assign parameters before Module.__init__() call") remove_from(self.__dict__, self._parameters, self._buffers, self._modules, self._non_persistent_buffers_set) self.register_buffer(name, value) else: super(Layer, self).__setattr__(name, value) class IdentityLayer(Layer): # Attention: do not inherit from IdentityLayer since this might mess with optimizations in SequentialLayer! 
def forward_tensors(self, x): return x class SequentialLayer(Layer): def __init__(self, tfms: List[Layer]): super().__init__() self.tfms = nn.ModuleList([tfm for tfm in tfms if not isinstance(tfm, IdentityLayer)]) def forward_tensor_infos(self, tensor_infos): for tfm in self.tfms: tensor_infos = tfm.forward_tensor_infos(tensor_infos) return tensor_infos def forward_ds(self, ds: DictDataset): for tfm in self.tfms: ds = tfm.forward_ds(ds) return ds def forward_tensors(self, tensors): for tfm in self.tfms: tensors = tfm(tensors) return tensors def _stack(self, seq_tfms): return SequentialLayer([seq_tfms[0].tfms[i].stack([seq_tfm.tfms[i] for seq_tfm in seq_tfms]) for i in range(len(seq_tfms[0].tfms))]) def __repr__(self): return str(self) def __str__(self): sub_strings = [' ' + line for tfm in self.tfms for line in str(tfm).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class ResidualLayer(Layer): def __init__(self, inner_layer: Layer): super().__init__() self.inner_layer = inner_layer def forward_tensor_infos(self, tensor_infos): return self.inner_layer.forward_tensor_infos(tensor_infos) def forward_tensors(self, tensors: Dict[str, torch.Tensor]): new_tensors = self.inner_layer.forward_tensors(tensors) new_tensors['x_cont'] = tensors['x_cont'] + new_tensors['x_cont'] return new_tensors def _stack(self, seq_tfms): return ResidualLayer(seq_tfms[0].inner_layer.stack([seq_tfm.inner_layer for seq_tfm in seq_tfms])) def __repr__(self): return str(self) def __str__(self): sub_strings = [' ' + line for line in str(self.inner_layer).split('\n')] return f'ResidualLayer [\n' + '\n'.join(sub_strings) + '\n]\n' class ConcatParallelLayer(Layer): """ Executes all layers on the given input and combines the resulting output tensors by concatenating along the last dimension (as in DenseNet, for example). Not all layers need to output the same tensors, e.g., one can output only 'x_cont' and the other can output 'x_cont' and 'y', in which case 'y' will not be concatenated with another tensor. 
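    For example (illustrative): if one inner layer outputs an 'x_cont' of width 3 and another one of width 5,
    the combined 'x_cont' has width 8.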
""" def __init__(self, layers: List[Layer], fitter: 'Fitter'): super().__init__(fitter=fitter) self.layers = nn.ModuleList(layers) def forward_tensors(self, tensors): out_tensors = [layer.forward_tensors(tensors) for layer in self.layers] out_keys = {key for t in out_tensors for key in t.keys()} # print(f'{[t["x_cont"].shape for t in out_tensors]=}') return {key: torch_utils.cat_if_necessary([t[key] for t in out_tensors if key in t], dim=-1) for key in out_keys} def _stack(self, tfms: List[Layer]): return ConcatParallelLayer([tfms[0].layers[i].stack([tfm.layers[i] for tfm in tfms]) for i in range(len(tfms[0].layers))], fitter=tfms[0].fitter) def __repr__(self): return str(self) def __str__(self): sub_strings = [' ' + line for tfm in self.layers for line in str(tfm).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class FilterTensorsLayer(Layer): """ Only returns those tensors whose name is in a list of names """ def __init__(self, include_keys: Optional[List[str]], exclude_keys: Optional[List[str]], fitter: 'Fitter'): """ :param keys: List of tensor names that is allowed to pass through """ super().__init__(fitter=fitter) self.include_keys = include_keys self.exclude_keys = exclude_keys def forward_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: # return {key: value for key, value in tensors.items() if key in self.keys} result = {key: (value if (self.include_keys is None or key in self.include_keys) and (self.exclude_keys is None or key not in self.exclude_keys) else value[..., :0]) for key, value in tensors.items()} # print(result) return result class FunctionLayer(Layer): def __init__(self, f): super().__init__() self.f = f def forward_cont(self, x: torch.Tensor) -> torch.Tensor: return self.f(x) class BiasLayer(Layer): def __init__(self, bias: Variable, factor: float = 1.0): super().__init__() self.bias = bias self.factor = factor def forward_cont(self, x): if self.factor != 1.0: x = x + self.factor * self.bias else: x = x + self.bias return x def _stack(self, tfms): return BiasLayer(Variable.stack([tfm.bias for tfm in tfms]), factor=tfms[0].factor) class ScaleLayer(Layer): def __init__(self, scale: Variable): super().__init__() self.scale = scale def forward_cont(self, x): # print(f'{x.norm().item()=:g}, {self.scale.norm().item()=:g}') return x * self.scale def _stack(self, tfms): return ScaleLayer(Variable.stack([tfm.scale for tfm in tfms])) class WeightLayer(Layer): def __init__(self, weight: Variable, factor: float = 1.0): super().__init__(new_tensor_infos={'x_cont': TensorInfo(feat_shape=[weight.shape[-1]])}) # weight should be x in_features x out_features unlike in nn.Linear self.weight = weight self.factor = factor def forward_cont(self, x): x = x.matmul(self.weight) if self.factor != 1.0: x = self.factor * x return x def _stack(self, tfms): return WeightLayer(Variable.stack([tfm.weight for tfm in tfms]), factor=tfms[0].factor) class RenameTensorLayer(Layer): def __init__(self, old_name: str, new_name: str, fitter: 'Fitter'): super().__init__(fitter=fitter) self.old_name = old_name self.new_name = new_name def forward_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: if self.old_name not in tensors: return tensors elif self.new_name not in tensors: return utils.update_dict(tensors, {self.new_name: tensors[self.old_name]}, remove_keys=self.old_name) else: # print(f'{tensors[self.new_name].shape=}, {tensors[self.old_name].shape=}') new_tensor = torch.cat([tensors[self.new_name], 
tensors[self.old_name]], dim=-1) return utils.update_dict(tensors, {self.new_name: new_tensor}, remove_keys=self.old_name) def _stack(self, layers: List['Layer']) -> 'Layer': return layers[0] # ------ Fitters ------ class Fitter(ContextAware, StringConvertible): """ Fitters produce Layer objects given a data set (of inputs to the fitter at initialization) """ def __init__(self, needs_tensors: bool = True, is_individual: bool = True, scope_names: Optional[List[str]] = None, modified_tensors: Optional[List[str]] = None): """ :param needs_tensors: Set to true if the fitter needs to have the tensors in fit() or fit_transform(). If false, then in fit(ds) or fit_transform(ds), ds.tensors is allowed to be None. :param is_individual: Set to false if fit(ds) deterministically produces a non-trainable layer. (In this case, this Fitter only needs to be called once in k-fold CV on the train+val set.) :param scope_names: List of names to add to the scope (will be present in the names of Variables constructed in this Fitter) :param modified_tensors: List of names of tensors that are modified by this Fitter, e.g., ['x_cont']. This is used for the default implementation of get_n_forward(), which is used to get a RAM estimate for the forward pass. The default RAM estimate is simply the size of all modified tensors. """ super().__init__(scope_names=scope_names) # needs_data=False specifies that in fit(ds), ds.tensors is allowed to be None # is_individual=False specifies that fit(ds) deterministically produces a non-trainable layer self.needs_tensors = needs_tensors self.is_individual = is_individual self.modified_tensors = modified_tensors def _get_n_values(self, tensor_infos: Dict[str, TensorInfo], relevant_tensors: Optional[List[str]]): """ Helper function that can be used internally to get the number of elements of a list of tensors. Should not be overridden. :param tensor_infos: Tensor infos of the data set :param relevant_tensors: List of tensor names that should be considered. If None, 0 is returned. :return: Returns the number of components of a list of tensors (per batch element). """ if relevant_tensors is None: return 0 return sum([ti.get_n_features() for key, ti in tensor_infos.items() if key in relevant_tensors]) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: """ Should be overridden if the fitter produces layers with trainable parameters. :param tensor_infos: Tensor infos. :return: Returns the number of parameters of the fitted layer for the given tensor_infos. """ return 0 def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: """ Should be overridden if the fitter does more than just one operation. :param tensor_infos: Ingoing tensor infos. :return: Should return the number of bytes used in the forward pass per batch element """ if self.modified_tensors is None: return 0 return self._get_n_values(self.forward_tensor_infos(tensor_infos), self.modified_tensors) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: """ Should be overridden if the fitter changes the tensor shapes. :param tensor_infos: Tensor infos (for shapes and category sizes). :return: Transformed tensor infos. """ return tensor_infos # should be overridden by subclasses if tensor_infos change def fit(self, ds: DictDataset) -> Layer: """ Produces a layer initialized based on a given data set. This method should not be overridden, override _fit() instead. :param ds: Data set. :return: Layer object. 
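        Typical usage (illustrative): ``layer = fitter.fit(ds)``, then ``layer.forward_ds(ds)`` to apply it.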
""" with self.set_context(): return self._fit(ds) def fit_transform(self, ds: DictDataset, needs_tensors: bool = True) -> Tuple[Layer, DictDataset]: """ Produces a layer initialized based on a given data set. This method should not be overridden, override _fit_transform() instead. :param ds: Data set. :param needs_tensors: Whether the transformed data set should also contain transformed tensors (compared to only transformed tensor_infos). :return: Layer object and the data set transformed by the Layer. """ with self.set_context(): return self._fit_transform(ds, needs_tensors) def fit_transform_subsample(self, ds: DictDataset, ram_limit_gb: float, needs_tensors: bool = True) \ -> Tuple[Layer, DictDataset]: """ Similar to fit_transform(), but may subsample the data set in order to stay within a given RAM limit. This method should not be overridden, override _fit_transform_subsample() instead. :param ds: Data set. :param ram_limit_gb: RAM limit in GB. :param needs_tensors: Whether the transformed tensors should be output. :return: Tuple of the resulting Layer and the transformed DictDataset. """ with self.set_context(): return self._fit_transform_subsample(ds, ram_limit_gb, needs_tensors) def _fit(self, ds: DictDataset) -> Layer: """ Implementation of fit(). At least one of _fit() or _fit_transform() should be overridden by subclasses. :param ds: Data set. :return: Initialized Layer object. """ if self.__class__._fit_transform != Fitter._fit_transform: # avoid infinite recursion if the method is not overridden tfm, ds = self._fit_transform(ds, False) return tfm elif self.__class__._fit_transform_subsample != Fitter._fit_transform_subsample: # avoid infinite recursion if the method is not overridden tfm, ds = self._fit_transform_subsample(ds, ram_limit_gb=np.inf, needs_tensors=False) return tfm if isinstance(self, Layer): return self raise NotImplementedError() def _fit_transform(self, ds: DictDataset, needs_tensors: bool) -> Tuple[Layer, DictDataset]: """ Implementation of fit_transform(). At least one of _fit() or _fit_transform() should be overridden by subclasses. :param ds: Data set. :param needs_tensors: Whether the transformed data set should also contain transformed tensors (compared to only transformed tensor_infos). 
:return: Initialized Layer object and transformed data set """ if self.__class__._fit_transform_subsample != Fitter._fit_transform_subsample: return self._fit_transform_subsample(ds, ram_limit_gb=np.inf, needs_tensors=needs_tensors) else: tfm = self._fit(ds) if needs_tensors: return tfm, tfm.forward_ds(ds) else: return tfm, DictDataset(None, tfm.forward_tensor_infos(ds.tensor_infos), ds.device, ds.n_samples) def _fit_transform_subsample(self, ds: DictDataset, ram_limit_gb: float, needs_tensors: bool = True) \ -> Tuple[Layer, DictDataset]: n_forward = self.get_n_forward(ds.tensor_infos) # check if subsampling is necessary if ram_limit_gb < np.inf and n_forward > 0 and ds.tensors is not None and (self.needs_tensors or needs_tensors): # optimistically assume 4 bytes per number, while 8 are needed for categorical values max_n_samples = max(1, int(ram_limit_gb * (1024 ** 3) / (4 * n_forward))) if max_n_samples < ds.n_samples: # subsample the data set subsample_idxs = torch.randperm(ds.n_samples, device=ds.device)[:max_n_samples] ds = ds.get_sub_dataset(subsample_idxs) return self._fit_transform(ds, needs_tensors) def split_off_dynamic(self) -> Tuple['Fitter', 'Fitter']: """ Can be overridden by subclasses if a trivial split based on self.needs_tensors and self.is_individual is not desired. :return: Returns a tuple of a static and a dynamic transform such that self is equivalent to SequentialFitter([static, dynamic]) and such that the static transform does not need data and is not trainable. The idea is that in the vectorized setting, the static transform only needs to be applied once to the data set, while the dynamic transform needs to be applied separately for each of the vectorized models. """ if self.needs_tensors or self.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() def split_off_individual(self): """ Can be overridden by subclasses if a trivial split based on self.is_individual is not desired. :return: Returns a tuple of a non-individual and an individual transform such that self is equivalent to SequentialFitter([non_individual, individual]) and such that the non_individual transform deterministically produces a non-trainable layer. The idea is that the non-individual transform only needs to be applied once in k-fold cross-validation. 
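        For example (illustrative), a deterministic normalization fitter would fall into the non-individual part,
        while a fitter that randomly initializes trainable weights belongs to the individual part.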
""" if self.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() class IdentityFitter(Fitter): def __init__(self, **config): super().__init__(needs_tensors=False, is_individual=False) def _fit(self, ds: DictDataset) -> Layer: return IdentityLayer() class SequentialFitter(Fitter): def __init__(self, fitters: List[Fitter], **config): super().__init__(needs_tensors=np.any([f.needs_tensors for f in fitters]), is_individual=np.any([f.is_individual for f in fitters])) self.fitters = fitters # print(f'Creating SequentialFitter with fitters {fitters} and {self.needs_tensors=}') def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]): for f in self.fitters: tensor_infos = f.forward_tensor_infos(tensor_infos) return tensor_infos def get_n_params(self, tensor_infos: Dict[str, TensorInfo]): n_params = 0 for f in self.fitters: n_params += f.get_n_params(tensor_infos) tensor_infos = f.forward_tensor_infos(tensor_infos) return n_params def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]): forward_bytes = 0 for f in self.fitters: forward_bytes += f.get_n_forward(tensor_infos) tensor_infos = f.forward_tensor_infos(tensor_infos) return forward_bytes def _fit_transform(self, ds: DictDataset, needs_tensors: bool = True): needs_tensors_list = [f.needs_tensors for f in self.fitters] + [needs_tensors] max_tensors_idx = np.max(np.argwhere(needs_tensors_list)) if np.any(needs_tensors_list) else 0 tfms = [] for i, fitter in enumerate(self.fitters): tfm, ds = fitter.fit_transform(ds, needs_tensors=(i < max_tensors_idx)) tfms.append(tfm) return SequentialLayer(tfms), ds def _fit_transform_subsample(self, ds: DictDataset, ram_limit_gb: float, needs_tensors: bool = True) \ -> Tuple[Layer, DictDataset]: needs_tensors_list = [f.needs_tensors for f in self.fitters] + [needs_tensors] max_tensors_idx = np.max(np.argwhere(needs_tensors_list)) if np.any(needs_tensors_list) else 0 tfms = [] for i, fitter in enumerate(self.fitters): tfm, ds = fitter.fit_transform_subsample(ds, ram_limit_gb=ram_limit_gb, needs_tensors=(i < max_tensors_idx)) tfms.append(tfm) return SequentialLayer(tfms), ds def split_off_dynamic(self): is_dynamic = [f.needs_tensors or f.is_individual for f in self.fitters] if np.any(is_dynamic): first_dynamic = np.min(np.argwhere(is_dynamic)) static, dynamic = self.fitters[first_dynamic].split_off_dynamic() return SequentialFitter(self.fitters[:first_dynamic] + [static]).add_others_scope(self), \ SequentialFitter([dynamic] + self.fitters[first_dynamic + 1:]).add_others_scope(self) else: return self, IdentityFitter() def split_off_individual(self): is_individual = [f.is_individual for f in self.fitters] if np.any(is_individual): first_indiv = np.min(np.argwhere(is_individual)) non_indiv, indiv = self.fitters[first_indiv].split_off_individual() return SequentialFitter(self.fitters[:first_indiv] + [non_indiv]).add_others_scope(self), \ SequentialFitter([indiv] + self.fitters[first_indiv + 1:]).add_others_scope(self) else: return self, IdentityFitter() def __str__(self): sub_strings = [' ' + line for fitter in self.fitters for line in str(fitter).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class ResidualFitter(Fitter): def __init__(self, inner_fitter: Fitter, **config): super().__init__(needs_tensors=inner_fitter.needs_tensors, is_individual=inner_fitter.is_individual) self.inner_fitter = inner_fitter def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]): return 
self.inner_fitter.forward_tensor_infos(tensor_infos) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]): return self.inner_fitter.get_n_params(tensor_infos) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]): return self.inner_fitter.get_n_forward(tensor_infos) + self._get_n_values(tensor_infos, ['x_cont']) def _fit_transform(self, ds: DictDataset, needs_tensors=True): layer = ResidualLayer(self.inner_fitter.fit(ds)) if needs_tensors: ds = layer.forward_ds(ds) return layer, ds def split_off_dynamic(self): if self.inner_fitter.needs_tensors or self.inner_fitter.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() def split_off_individual(self): if self.inner_fitter.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() def __str__(self): sub_strings = [' ' + line for fitter in [self.inner_fitter] for line in str(fitter).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class FunctionFitter(Fitter): def __init__(self, f, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont']) self.f = f def _fit(self, ds: DictDataset): return FunctionLayer(self.f) class ConcatParallelFitter(Fitter): # todo: could implement better _fit_transform_subsample() def __init__(self, fitters: List[Fitter]): super().__init__(needs_tensors=np.any([f.needs_tensors for f in fitters]), is_individual=np.any([f.is_individual for f in fitters])) self.fitters = fitters def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: out_tensor_infos = self.forward_tensor_infos(tensor_infos) # pessimistic bound assuming that all tensors need to get concatenated concat_space = self._get_n_values(out_tensor_infos, relevant_tensors=list(out_tensor_infos.keys())) return sum([f.get_n_forward(tensor_infos) for f in self.fitters]) + concat_space def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return sum([f.get_n_params(tensor_infos) for f in self.fitters]) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: out_tensor_infos_list = [f.forward_tensor_infos(tensor_infos) for f in self.fitters] out_keys = {key for ti in out_tensor_infos_list for key in ti.keys()} return {key: TensorInfo(cat_sizes=torch_utils.cat_if_necessary([ti[key].get_cat_sizes() for ti in out_tensor_infos_list if key in ti], dim=-1)) for key in out_keys} def _fit(self, ds: DictDataset) -> Layer: return ConcatParallelLayer([f.fit(ds) for f in self.fitters], fitter=self) # ------ Factory ------- class FitterFactory(ContextAware, StringConvertible): """ Class that allows to create Fitter objects depending on tensor_infos (the shape and category sizes of the tensors). """ def __init__(self, scope_names: Optional[List[str]] = None): super().__init__(scope_names=scope_names) def create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: """ Creates a Fitter object with the scope given in the constructor. Do not override this method, override _create() or _create_transform() instead. :param tensor_infos: Tensor infos (shapes etc.) :return: Fitter object. """ fitter = self._create(tensor_infos) if fitter is self: return fitter return fitter.add_others_scope(self) def create_transform(self, tensor_infos: Dict[str, TensorInfo]) -> Tuple[Fitter, Dict[str, TensorInfo]]: """ Creates a Fitter object with the scope given in the constructor. Do not override this method, override _create() or _create_transform() instead. 
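        (Note: if the subclass is itself a Fitter and _create() returns self, the scope is not added again.)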
:param tensor_infos: Tensor infos (shapes etc.) :return: Fitter object and the transformed tensor infos. """ fitter, tensor_infos = self._create_transform(tensor_infos) if fitter is self: return fitter, tensor_infos return fitter.add_others_scope(self), tensor_infos def _create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: """ If the subclass also inherits from Fitter, this will just return self. Otherwise, override at least one of _create() or _create_transform(). :param tensor_infos: Tensor infos. :return: Fitter object. """ if self.__class__._create_transform != FitterFactory._create_transform: # don't have to worry about infinite recursion return self._create_transform(tensor_infos)[0] if isinstance(self, Fitter): return self raise NotImplementedError() def _create_transform(self, tensor_infos: Dict[str, TensorInfo]) -> Tuple[Fitter, Dict[str, TensorInfo]]: fitter = self._create(tensor_infos) return fitter, fitter.forward_tensor_infos(tensor_infos) class SequentialFactory(FitterFactory): def __init__(self, factories: List[FitterFactory]): super().__init__() self.factories = factories def _create_transform(self, tensor_infos: Dict[str, TensorInfo]): fitters = [] for f in self.factories: fitter, tensor_infos = f.create_transform(tensor_infos) fitters.append(fitter) return SequentialFitter(fitters), tensor_infos def __str__(self): sub_strings = [' ' + line for factory in self.factories for line in str(factory).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class IdentityFactory(FitterFactory): def _create(self, tensor_infos): return IdentityFitter() class FunctionFactory(FitterFactory): def __init__(self, f): super().__init__() self.f = f def _create(self, tensor_infos): return FunctionFitter(self.f) class ConcatParallelFactory(FitterFactory): def __init__(self, factories: List[FitterFactory]): super().__init__() self.factories = factories def _create(self, tensor_infos) -> Fitter: return ConcatParallelFitter([factory.create(tensor_infos) for factory in self.factories]) class FilterTensorsFactory(Fitter, FitterFactory): def __init__(self, include_keys: Optional[List[str]] = None, exclude_keys: Optional[List[str]] = None): super().__init__(needs_tensors=False, is_individual=False) self.include_keys = include_keys self.exclude_keys = exclude_keys def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: return {key: (ti if ((self.include_keys is None or key in self.include_keys) and (self.exclude_keys is None or key not in self.exclude_keys)) else TensorInfo(feat_shape=0 * ti.get_feat_shape())) for key, ti in tensor_infos.items()} def _fit(self, ds: DictDataset) -> Layer: return FilterTensorsLayer(include_keys=self.include_keys, exclude_keys=self.exclude_keys, fitter=self) class RenameTensorFactory(Fitter, FitterFactory): def __init__(self, old_name: str, new_name: str, **config): super().__init__(needs_tensors=False, is_individual=False) self.old_name = old_name self.new_name = new_name def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: if self.old_name in tensor_infos and self.new_name in tensor_infos: return self._get_n_values(tensor_infos, [self.old_name, self.new_name]) else: return 0 def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: if self.old_name not in tensor_infos: return tensor_infos elif self.new_name not in tensor_infos: return utils.update_dict(tensor_infos, {self.new_name: tensor_infos[self.old_name]}, remove_keys=self.old_name) 
else: # both names exist in tensor_infos new_tensor_info = TensorInfo.concat([tensor_infos[self.new_name], tensor_infos[self.old_name]]) return utils.update_dict(tensor_infos, {self.new_name: new_tensor_info}, remove_keys=self.old_name) def _fit(self, ds: DictDataset) -> Layer: return RenameTensorLayer(old_name=self.old_name, new_name=self.new_name, fitter=self) ================================================ FILE: pytabkit/models/nn_models/categorical.py ================================================ from typing import Iterable, List, Dict, Tuple, Any, Callable, Optional, Union import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import FitterFactory, IdentityFitter, Layer, Fitter, Variable from pytabkit.models.torch_utils import cat_if_necessary class SingleEncodingFactory(FitterFactory): def __init__(self, create_fitter, min_cat_size=0, max_cat_size=-1): super().__init__() self.min_cat_size = min_cat_size self.max_cat_size = max_cat_size self.create_fitter = create_fitter def apply_on(self, cat_size: int, n_classes: int): # can be overridden return cat_size >= self.min_cat_size and (self.max_cat_size < 0 or cat_size <= self.max_cat_size) def _create(self, tensor_infos): if 'x_cat' not in tensor_infos: return IdentityFitter() x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() if len(x_cat_sizes) != 1: raise ValueError( 'SingleEncoderFactory has to be applied to a single category but was applied to category sizes ' + str(x_cat_sizes)) cat_size = x_cat_sizes[0] n_classes = tensor_infos['y'].get_cat_sizes()[0].item() if self.apply_on(cat_size, n_classes): return self.create_fitter(tensor_infos) return IdentityFitter() class EncodingLayer(Layer): def __init__(self, single_enc_layers: Iterable[Layer], enc_output_name: str, fitter): super().__init__(fitter=fitter) self.emb_layers = nn.ModuleList(single_enc_layers) self.enc_output_name = enc_output_name def forward_tensors(self, tensors): x_cat = tensors['x_cat'] prev_output_tensors = [tensors[self.enc_output_name]] if self.enc_output_name in tensors else [] new_tensors = [] for i, l in enumerate(self.emb_layers): sub_x_cat = x_cat[tuple([slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)])] sub_tensors = {'x_cat': sub_x_cat} if 'y' in tensors: sub_tensors['y'] = tensors['y'] new_tensors.append(l.forward_tensors(sub_tensors)) output_tensors = prev_output_tensors + [t['x_cont'] for t in new_tensors if 'x_cont' in t] if len(output_tensors) == 0: # create empty tensor new_conts = torch.zeros(*x_cat.shape[:-1], 0, device=x_cat.device, dtype=torch.float32) else: new_conts = cat_if_necessary(output_tensors, dim=-1) cat_tensors = [t['x_cat'] for t in new_tensors if 'x_cat' in t] if len(cat_tensors) > 0: new_cats = torch.cat(cat_tensors, dim=-1) return utils.update_dict(tensors, {self.enc_output_name: new_conts, 'x_cat': new_cats}) else: return utils.update_dict(tensors, {self.enc_output_name: new_conts}, remove_keys='x_cat') def _stack(self, layers: List['EncodingLayer']): return EncodingLayer([layers[0].emb_layers[i].stack([layers[j].emb_layers[i] for j in range(len(layers))]) for i in range(len(layers[0].emb_layers))], layers[0].enc_output_name, layers[0].fitter) class EncodingFitter(Fitter): def __init__(self, single_encoder_fitters: List[Fitter], enc_output_name: str = 'x_cont', **config): super().__init__(needs_tensors=any([enc.needs_tensors for enc in 
single_encoder_fitters]), is_individual=any([enc.is_individual for enc in single_encoder_fitters])) self.single_encoder_fitters = single_encoder_fitters self.enc_output_name = enc_output_name # allow to have something other than x_cont assert enc_output_name != 'x_cat' def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return sum([f.get_n_params(ti) for f, ti in zip(self.single_encoder_fitters, self._sub_tensor_infos(tensor_infos))]) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: # for splitting categories forward_bytes = self._get_n_values(tensor_infos, ['x_cat']) forward_bytes += sum([f.get_n_forward(ti) for f, ti in zip(self.single_encoder_fitters, self._sub_tensor_infos(tensor_infos))]) # for concat forward_bytes += self._get_n_values(self.forward_tensor_infos(tensor_infos), [self.enc_output_name, 'x_cat']) return forward_bytes def _sub_tensor_infos(self, tensor_infos): x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() if 'y' in tensor_infos: return [{'x_cat': TensorInfo(cat_sizes=[cat_sz]), 'y': tensor_infos['y']} for cat_sz in x_cat_sizes] return [{'x_cat': TensorInfo(cat_sizes=[cat_sz])} for cat_sz in x_cat_sizes] def forward_tensor_infos(self, tensor_infos): x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() n_cont = tensor_infos[self.enc_output_name].get_n_features() \ if self.enc_output_name in tensor_infos else 0 out_cat_sizes = [] for cat_sz, enc in zip(x_cat_sizes, self.single_encoder_fitters): ti = {'x_cat': TensorInfo(cat_sizes=[cat_sz])} out_ti = enc.forward_tensor_infos(ti) if 'x_cont' in out_ti: n_cont += out_ti['x_cont'].get_n_features() else: out_cat_sizes.append(out_ti['x_cat'].get_cat_sizes()[0].item()) if len(out_cat_sizes) > 0: return utils.update_dict(tensor_infos, {self.enc_output_name: TensorInfo(feat_shape=[n_cont]), 'x_cat': TensorInfo(cat_sizes=out_cat_sizes)}) else: return utils.update_dict(tensor_infos, {self.enc_output_name: TensorInfo(feat_shape=[n_cont])}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: x_cat_sizes = ds.tensor_infos['x_cat'].get_cat_sizes().numpy() enc_layers = [] for i in range(len(x_cat_sizes)): enc = self.single_encoder_fitters[i] if enc.needs_tensors: tensors = {'x_cat': ds.tensors['x_cat'][:, i:i+1]} if 'y' in ds.tensors: tensors['y'] = ds.tensors['y'] else: tensors = None tensor_infos = {'x_cat': TensorInfo(cat_sizes=[x_cat_sizes[i]])} if 'y' in ds.tensor_infos: tensor_infos['y'] = ds.tensor_infos['y'] enc_layers.append(enc.fit(DictDataset(tensors, tensor_infos, ds.device, ds.n_samples))) return EncodingLayer(enc_layers, self.enc_output_name, self) # def split_off_dynamic(self): # splits = [f.split_off_dynamic() for f in self.single_encoder_fitters] # s0 = [s[0] for s in splits] # s1 = [s[1] for s in splits] # # todo class EncodingFactory(FitterFactory): def __init__(self, single_encoder_factory, enc_output_name: str = 'x_cont'): super().__init__() self.single_encoder_factory = single_encoder_factory self.enc_output_name = enc_output_name def _create(self, tensor_infos): if 'x_cat' not in tensor_infos or tensor_infos['x_cat'].get_n_features() == 0: return IdentityFitter() x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() single_encoder_fitters = [self.single_encoder_factory.create({'x_cat': TensorInfo(cat_sizes=[cat_sz]), 'y': tensor_infos['y']}) for cat_sz in x_cat_sizes] return EncodingFitter(single_encoder_fitters, enc_output_name=self.enc_output_name) # ----- One-Hot ------ class SingleOneHotLayer(Layer): def __init__(self, fitter: Fitter, onoff, 
cat_size, use_missing_zero: bool, use_1d_binary_onehot: bool): super().__init__(fitter=fitter) self.onoff = onoff self.cat_size = cat_size self.use_missing_zero = use_missing_zero self.use_1d_binary_onehot = use_1d_binary_onehot def _binary(self, x_cat, values): src = torch.as_tensor(values, dtype=torch.float32, device=x_cat.device) # add other dimensions to match those of x_cat src = src[tuple([None] * (x_cat.dim()-1) + [slice(None)])].expand(*(list(x_cat.shape[:-1]) + [-1])) return src.gather(dim=-1, index=x_cat) def _multiple(self, x_cat, on_value, off_value): cont_shape = (*x_cat.shape[:-1], self.cat_size) cont = torch.full(cont_shape, off_value, dtype=torch.float32, device=x_cat.device) src = torch.full([1] * x_cat.dim(), on_value, dtype=torch.float32, device=x_cat.device).expand(*x_cat.shape) cont.scatter_(dim=-1, index=x_cat, src=src) return cont def forward_tensors(self, tensors): x_cat = tensors['x_cat'] # default_slices = [slice(None)] * (x_cat_sq.dim() - 1) on_value = self.onoff[0] off_value = self.onoff[1] if self.use_missing_zero: if self.cat_size == 2 and self.use_1d_binary_onehot: # should not be used with use_missing_zero anyway cont = self._binary(x_cat, [-on_value, on_value]) elif self.cat_size == 3 and self.use_1d_binary_onehot: cont = self._binary(x_cat, [off_value, on_value, -on_value]) else: cont = self._multiple(x_cat, on_value=on_value, off_value=off_value) # cont = cont[[slice(None)] * (x_cat.dim() - 1) + [slice(1, None)]] cont = cont[..., 1:] # cut off the dimension for the missing value one-hot else: if self.cat_size == 2 and self.use_1d_binary_onehot: cont = self._binary(x_cat, [off_value, on_value]) else: cont = self._multiple(x_cat, on_value=on_value, off_value=off_value) return utils.update_dict(tensors, {'x_cont': cont}, remove_keys='x_cat') class SingleOneHotFitter(Fitter): def __init__(self, use_missing_zero: bool, bin_onoff: Tuple[float, float], multi_onoff: Tuple[float, float], use_1d_binary_onehot: bool): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont', 'x_cat']) self.use_missing_zero = use_missing_zero self.bin_onoff = bin_onoff self.multi_onoff = multi_onoff self.use_1d_binary_onehot = use_1d_binary_onehot def forward_tensor_infos(self, tensor_infos): cat_size = tensor_infos['x_cat'].get_cat_sizes()[0].item() if self.use_missing_zero: cat_size -= 1 if cat_size == 2 and self.use_1d_binary_onehot: cat_size = 1 return utils.update_dict(tensor_infos, {'x_cont': TensorInfo(feat_shape=[cat_size])}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: cat_size = ds.tensor_infos['x_cat'].get_cat_sizes()[0].item() is_binary = cat_size - int(self.use_missing_zero) <= 2 return SingleOneHotLayer(self, onoff=self.bin_onoff if is_binary else self.multi_onoff, cat_size=cat_size, use_missing_zero=self.use_missing_zero, use_1d_binary_onehot=self.use_1d_binary_onehot) class SingleOneHotFactory(SingleEncodingFactory): def __init__(self, use_missing_zero=True, bin_onoff=(1.0, 0.0), multi_onoff=(1.0, 0.0), min_one_hot_cat_size=0, max_one_hot_cat_size=-1, max_one_hot_size_by_n_classes=False, use_1d_binary_onehot: bool = True, **config): super().__init__(create_fitter=lambda tensor_infos: SingleOneHotFitter(use_missing_zero=use_missing_zero, bin_onoff=bin_onoff, multi_onoff=multi_onoff, use_1d_binary_onehot=use_1d_binary_onehot), min_cat_size=min_one_hot_cat_size, max_cat_size=max_one_hot_cat_size) self.max_one_hot_size_b_n_classes = max_one_hot_size_by_n_classes def apply_on(self, cat_size: int, n_classes: int): if 
self.max_one_hot_size_b_n_classes: return cat_size <= n_classes else: return super().apply_on(cat_size, n_classes) # ------ Embedding -------- class SingleEmbeddingLayer(Layer): def __init__(self, emb: Variable): super().__init__(new_tensor_infos={'x_cont': TensorInfo(feat_shape=[emb.shape[-1]])}, remove_keys='x_cat') # emb.shape should be (parallel dims) x cat_size x emb_size # print(f'{emb.numel()=}') self.emb = emb def forward_tensors(self, tensors): x_cat = tensors['x_cat'] # print(f'{x_cat.shape=}') x_cat = x_cat.squeeze(-1) # squeeze feature dimension, we assume that there is only one feature parallel_dims = self.emb.dim() - 2 # subtract category and feature dimension # idxs = [] # for dim in range(parallel_dims): # # todo: could cache these and not create them newly every time? # view_shape = [1] * (parallel_dims+1) # view_shape[dim] = self.emb.shape[dim] # idxs.append(torch.arange(self.emb.shape[dim], dtype=torch.long, device=self.emb.device).view(*view_shape)) # idxs.append(x_cat) # x_cont = self.emb[idxs] # code using index_select which is faster than fancy indexing # put all parallel dimensions into the batch dimension cat_size = self.emb.shape[-2] n_flattened_idxs = cat_size n_batch = x_cat.shape[-1] # shape: (n_parallel * cat_size) x n_features emb_flat = self.emb.reshape(-1, self.emb.shape[-1]) while x_cat.dim() > 1: # merge batch dimension with all parallel dimensions n_parallel = x_cat.shape[-2] parallel_idxs = torch.arange(x_cat.shape[-2], dtype=torch.long, device=self.emb.device) # add offsets to parallel dimension x_cat = x_cat + n_flattened_idxs * parallel_idxs[:, None] # merge parallel and batch dimension x_cat = x_cat.reshape(*x_cat.shape[:-2], -1) # now the indexes span a larger range n_flattened_idxs *= n_parallel # for dim in range(parallel_dims): # # todo: # pass # print(f'{x_cat.shape=}, {emb_flat.shape=}, {n_flattened_idxs=}, {x_cat.max().item()=}') x_cont = emb_flat.index_select(0, x_cat) x_cont = x_cont.reshape(*self.emb.shape[:-2], n_batch, self.emb.shape[-1]) # print(f'{torch.norm(x_cont)=}, {torch.norm(x_cont-x_cont_other)=}') return utils.update_dict(tensors, {'x_cont': x_cont}, remove_keys='x_cat') def _stack(self, layers: List['SingleEmbeddingLayer']): return SingleEmbeddingLayer(Variable.stack([layer.emb for layer in layers])) def fastai_emb_size_fn(n_cat: int): return min(600, round(1.6 * n_cat ** 0.56)) class ConstantFunction: def __init__(self, value: Any): self.value = value def __call__(self, *args, **kwargs) -> Any: return self.value def get_embedding_size(fn: Optional[Union[int, str, Callable[[int], int]]]) -> Callable[[int], int]: if fn is None: fn = 'fastai' if isinstance(fn, int): return ConstantFunction(value=fn) elif isinstance(fn, str): if fn == 'howard' or fn == 'fastai': # heuristic by Jeremy Howard in fastai return fastai_emb_size_fn else: raise ValueError(f'Unknown embedding_size name "{fn}"') else: return fn class SingleEmbeddingFitter(Fitter): def __init__(self, embedding_size=None, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont', 'x_cat']) # default option is taken from fastai2 self.size_func = get_embedding_size(embedding_size) if embedding_size is not None \ else fastai_emb_size_fn self.emb_init_mode = config.get('emb_init_mode', 'normal') self.emb_init_gain = config.get('emb_init_gain', 1.0) self.emb_reduce_norm = config.get('emb_reduce_norm', False) self.emb_lr_factor = config.get('emb_lr_factor', 1.0) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: cat_sz = 
tensor_infos['x_cat'].get_cat_sizes()[0].item() return cat_sz * self.size_func(cat_sz) def forward_tensor_infos(self, tensor_infos): new_info = TensorInfo(feat_shape=[self.size_func(tensor_infos['x_cat'].get_cat_sizes()[0].item())]) return utils.update_dict(tensor_infos, {'x_cont': new_info}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: cat_size = ds.tensor_infos['x_cat'].get_cat_sizes()[0].item() emb_size = self.size_func(cat_size) if self.emb_init_mode == 'normal': emb = torch.randn(cat_size, emb_size, device=ds.device) elif self.emb_init_mode == 'uniform': emb = 2*torch.rand(cat_size, emb_size, device=ds.device) - 1 elif self.emb_init_mode == 'kaiming-uniform-t': # as in the RTDL nets, use 1/sqrt(out_features) emb = (1./np.sqrt(emb_size)) * (2 * torch.rand(cat_size, emb_size, device=ds.device) - 1) emb[0, :] = 0.0 # set unknown/missing category to 0 else: raise ValueError(f'Unknown emb_init_mode: {self.emb_init_mode}') # todo: should emb_reduce_norm be used differently as for NTK param (Adam vs not Adam)? emb_factor = self.emb_init_gain * (np.sqrt(1.0/emb_size) if self.emb_reduce_norm else 1.0) return SingleEmbeddingLayer(Variable(emb_factor * emb, trainable=True, hyper_factors={'lr': self.emb_lr_factor})) class SingleEmbeddingFactory(SingleEncodingFactory): def __init__(self, embedding_size=None, min_embedding_cat_size=0, max_embedding_cat_size=-1, **config): super().__init__(create_fitter=lambda tensor_infos: SingleEmbeddingFitter(embedding_size=embedding_size, **config), min_cat_size=min_embedding_cat_size, max_cat_size=max_embedding_cat_size) # ------- Target Encoding (a kind of fixed embedding) ------- class SingleTargetEncodingFitter(Fitter): def __init__(self, n_classes, **config): super().__init__(is_individual=False, modified_tensors=['x_cont', 'x_cat']) self.n_classes = n_classes def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_classes = tensor_infos['y'].get_cat_sizes()[0].item() emb_sz = 1 if n_classes <= 2 else n_classes cat_sz = tensor_infos['x_cat'].get_cat_sizes()[0].item() return emb_sz * cat_sz def forward_tensor_infos(self, tensor_infos): new_info = TensorInfo(feat_shape=[1 if self.n_classes <= 2 else self.n_classes]) return utils.update_dict(tensor_infos, {'x_cont': new_info}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: x_cat = ds.tensors['x_cat'].squeeze(-1) x_cat_size = ds.tensor_infos['x_cat'].get_cat_sizes()[0].item() y = ds.tensors['y'] y_cat_sizes = ds.tensor_infos['y'].get_cat_sizes().numpy() if y_cat_sizes[0] > 2: # multi-class classification y = F.one_hot(y[:, 0], num_classes=y_cat_sizes[0]).float() elif y_cat_sizes[0] == 2: # binary classification y = y.float() # convert int to float prior = y.mean(dim=-2) # mean over batch dimension sums = torch.zeros(x_cat_size, y.shape[-1], device=y.device) # In the following, scatter_add_ executes sums[x_cat[:, i][j], k] += y[j, k] # see also https://discuss.pytorch.org/t/pytorch-equivalent-to-tf-unsorted-segment-sum/25275/5 sums.scatter_add_(0, x_cat[:,None].expand(-1, y.shape[-1]), y) frequencies = torch.bincount(x_cat, minlength=x_cat_size) # could also give the prior a different weight, this is just an option emb = (sums + prior[None, :]) / (frequencies[:, None] + 1) return SingleEmbeddingLayer(Variable(emb, trainable=False)) class SingleTargetEncodingFactory(SingleEncodingFactory): def __init__(self, min_targetenc_cat_size=0, max_targetenc_cat_size=-1, **config): create_fitter = lambda tensor_infos: 
SingleTargetEncodingFitter(n_classes=tensor_infos['y'].get_cat_sizes()[0].item()) super().__init__(create_fitter=create_fitter, min_cat_size=min_targetenc_cat_size, max_cat_size=max_targetenc_cat_size) # ------- Label Encoding ------- class SingleOrdinalEncodingLayer(Layer): def __init__(self, fitter, cat_size: int, permute_ordinal_encoding: bool = False): super().__init__(fitter=fitter) self.cat_size = cat_size self.permute_ordinal_encoding = permute_ordinal_encoding self.perm = None if permute_ordinal_encoding: self.perm = Variable(torch.randperm(cat_size, dtype=torch.long), trainable=False) def forward_tensors(self, tensors): x_cat = tensors['x_cat'] if self.permute_ordinal_encoding: x_cat = self.perm[x_cat] return utils.update_dict(tensors, {'x_cont': x_cat.type(torch.float32)}, remove_keys='x_cat') class SingleOrdinalEncodingFitter(Fitter): def __init__(self, permute_ordinal_encoding: bool = False, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont', 'x_cat']) self.permute_ordinal_encoding = permute_ordinal_encoding def forward_tensor_infos(self, tensor_infos): return utils.update_dict(tensor_infos, {'x_cont': tensor_infos['x_cat']}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: return SingleOrdinalEncodingLayer(self, cat_size=ds.tensor_infos['x_cat'].get_cat_sizes()[0].item(), permute_ordinal_encoding=self.permute_ordinal_encoding) class SingleOrdinalEncodingFactory(SingleEncodingFactory): def __init__(self, min_labelenc_cat_size=0, max_labelenc_cat_size=-1, **config): super().__init__(create_fitter=lambda tensor_infos: SingleOrdinalEncodingFitter(**config), min_cat_size=min_labelenc_cat_size, max_cat_size=max_labelenc_cat_size) ================================================ FILE: pytabkit/models/nn_models/models.py ================================================ import copy import functools from typing import Dict, Tuple import numpy as np import torch from sklearn.preprocessing import QuantileTransformer from pytabkit.models.nn_models.activations import ActivationFactory from pytabkit.models.nn_models.base import FitterFactory, SequentialFitter, ResidualFitter, Fitter, RenameTensorFactory, \ FunctionFactory, \ SequentialFactory, FilterTensorsFactory, ConcatParallelFactory from pytabkit.models.nn_models.categorical import EncodingFactory, SingleOneHotFactory, SingleEmbeddingFactory, \ SingleOrdinalEncodingFactory, \ SingleTargetEncodingFactory from pytabkit.models.nn_models.nn import DropoutFitter, WeightFitter, BiasFitter, ScaleFitter, NoiseFitter, \ PLREmbeddingsFactory, ScaleFactory, \ PeriodicEmbeddingsFactory, RFFeatureImportanceFactory, LabelSmoothingFactory, StochasticLabelNoiseFactory, \ StochasticGateFactory, FeatureImportanceFactory, FixedWeightFactory, AntisymmetricInitializationFactory, \ NormalizeOutputFactory, ClampOutputFactory from pytabkit.models.nn_models.pipeline import MedianCenterFactory, RobustScaleFactory, MeanCenterFactory, \ GlobalScaleNormalizeFactory, \ L2NormalizeFactory, L1NormalizeFactory, ThermometerCodingFactory, CircleCodingFactory, SklearnTransformFactory, \ RobustScaleV2Factory, MinMaxScaleFactory from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo from pytabkit.models.utils import TabrQuantileTransformer class BlockFactory(FitterFactory): def __init__(self, out_features: int, block_str: str = 'w-b-a', **config): super().__init__() # could also make this a SequentialFactory if there were factories for all the individual fitters # or a LambdaFactory 
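# Illustration (an assumed example, not from the original code): block_str is a small
# mini-language parsed left to right in _create_transform() below: 'w' = weight, 'b' = bias,
# 'a' = activation, 'd' = dropout, 's' = scale, 'noise' = noise layer, and 'r' wraps the
# fitters built so far in a residual connection. E.g., BlockFactory(out_features=256,
# block_str='w-b-a-d') builds linear -> bias -> activation -> dropout, while NNFactory
# overrides the last layer with block_str='w-b' to get a plain affine output layer.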
self.block_str = block_str self.out_features = out_features self.config = config def _create_transform(self, tensor_infos): in_features = tensor_infos['x_cont'].get_n_features() fitters = [] for layer_str in self.block_str.split('-'): # todo: mixup layer? if layer_str in ['a', 'act', 'activation']: fitters.append(ActivationFactory(**self.config).create(tensor_infos).add_scope('act')) elif layer_str in ['d', 'drop', 'dropout']: fitters.append(DropoutFitter()) elif layer_str in ['w', 'weight']: fitters.append(WeightFitter(self.out_features, **self.config).add_scope('weight')) elif layer_str in ['b', 'bias']: fitters.append(BiasFitter(in_features=in_features, **self.config).add_scope('bias')) # elif layer_str == 'D': # alpha-dropout for self-normalizing neural networks # pass # todo elif layer_str in ['s', 'scale']: fitters.append(ScaleFitter(**self.config).add_scope('scale')) # elif layer_str == 'n': # pass # todo: batchnorm elif layer_str in ['noise']: fitters.append(NoiseFitter(**self.config)) elif layer_str in ['r', 'res', 'residual']: out_tensor_infos = SequentialFitter(fitters).forward_tensor_infos(tensor_infos) if np.array_equal(tensor_infos['x_cont'].get_feat_shape(), out_tensor_infos['x_cont'].get_feat_shape()): # can use residual connection fitters = [ResidualFitter(SequentialFitter(fitters))] else: raise ValueError(f'BlockFactory: Unknown layer string {layer_str}') tensor_infos = fitters[-1].forward_tensor_infos(tensor_infos) return SequentialFitter(fitters), tensor_infos def smooth_clip_func(x, max_abs_value: float = 3.0): return x / (1 + (1 / (max_abs_value ** 2)) * x ** 2).sqrt() def tanh_clip_func(x): return 5 * torch.tanh(0.2 * x) class PreprocessingFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config def _create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: tfm_factories = [] for tfm in self.config.get('tfms', []): if tfm == 'one_hot': tfm_factories.append(EncodingFactory(SingleOneHotFactory(**self.config), enc_output_name='x_one_hot')) tfm_factories.append(RenameTensorFactory(old_name='x_one_hot', new_name='x_cont')) elif tfm == 'median_center': tfm_factories.append(MedianCenterFactory(**self.config)) elif tfm == 'robust_scale': tfm_factories.append(RobustScaleFactory(**self.config)) elif tfm == 'smooth_clip': tfm_factories.append(FunctionFactory(functools.partial(smooth_clip_func, max_abs_value=self.config.get( 'smooth_clip_max_abs_value', 3.0)))) elif tfm == 'tanh_5_clip': tfm_factories.append(FunctionFactory(tanh_clip_func)) elif tfm == 'mean_center': tfm_factories.append(MeanCenterFactory(**self.config)) elif tfm == 'embedding': tfm_factories.append(EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb')) elif tfm == 'global_scale_normalize': tfm_factories.append(GlobalScaleNormalizeFactory(**self.config)) elif tfm == 'l2_normalize': tfm_factories.append(L2NormalizeFactory(**self.config)) elif tfm == 'l1_normalize': tfm_factories.append(L1NormalizeFactory(**self.config)) elif tfm == 'minmax': tfm_factories.append(MinMaxScaleFactory(**self.config)) elif tfm == 'thermometer_coding': tfm_factories.append(ThermometerCodingFactory(**self.config)) elif tfm == 'circle_coding': tfm_factories.append(CircleCodingFactory(**self.config)) elif tfm == 'ordinal_encoding': tfm_factories.append(EncodingFactory(SingleOrdinalEncodingFactory(**self.config))) elif tfm == 'target_encoding': tfm_factories.append(EncodingFactory(SingleTargetEncodingFactory(**self.config))) elif tfm == 'kdi': from kditransform import KDITransformer tfm =
KDITransformer(alpha=self.config.get('kdi_alpha', 1.0), output_distribution=self.config.get('kdi_output_distribution', 'normal'), random_state=0) tfm_factories.append(SklearnTransformFactory(tfm)) elif tfm == 'quantile': tfm = QuantileTransformer(output_distribution=self.config.get('quantile_output_distribution', 'normal'), random_state=0) tfm_factories.append(SklearnTransformFactory(tfm)) elif tfm == "quantile_tabr": tfm = TabrQuantileTransformer(random_state=0) tfm_factories.append(SklearnTransformFactory(tfm)) else: raise NotImplementedError(f"Transformation '{tfm}' is not implemented.") # old interface, using 'tfms' is preferred if self.config.get('use_one_hot', False): tfm_factories.append(EncodingFactory(SingleOneHotFactory(**self.config))) if self.config.get('use_median_center', False): tfm_factories.append(MedianCenterFactory(**self.config)) if self.config.get('use_robust_scale', False): tfm_factories.append(RobustScaleFactory(**self.config)) if self.config.get('use_robust_scale_v2', False): tfm_factories.append(RobustScaleV2Factory(**self.config)) if self.config.get('use_smooth_clip', False): tfm_factories.append(FunctionFactory(lambda x: x / (1 + (1 / 9) * x ** 2).sqrt())) if self.config.get('use_mean_center', False): tfm_factories.append(MeanCenterFactory(**self.config)) if self.config.get('use_embedding', False): tfm_factories.append(EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb')) if self.config.get('use_global_scale_normalize', False): tfm_factories.append(GlobalScaleNormalizeFactory(**self.config)) return SequentialFactory(tfm_factories).add_scope('tfms').create(tensor_infos=tensor_infos) class NNFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config if 'use_embedding' not in config: # dirty fix to not miss out on categorical values here, # but do not use this as a default in PreprocessingFactory since that is also used for GBDTs # that can have native categorical processing capabilities self.config['use_embedding'] = True def _create_transform(self, tensor_infos: Dict[str, TensorInfo]) -> Tuple[Fitter, Dict[str, TensorInfo]]: y_cat_sizes = tensor_infos['y'].get_cat_sizes().numpy() n_classes = y_cat_sizes[0] factories = [] net_factories = [] if 'one_hot' in self.config.get('tfms', []) or self.config.get('use_one_hot', False): # do it already here so it can get done once instead of per batch factories.append(EncodingFactory(SingleOneHotFactory(**self.config), enc_output_name='x_one_hot')) prep_factory = PreprocessingFactory(**self.config) num_emb_type = self.config.get('num_emb_type', None) num_emb_config = copy.copy(self.config) if num_emb_type is None or num_emb_type == 'ignore': pass # don't modify the other configuration parameters elif num_emb_type == 'none': num_emb_config['use_plr_embeddings'] = False num_emb_config['use_periodic_emb'] = False elif num_emb_type == 'pl': num_emb_config['use_plr_embeddings'] = True num_emb_config['plr_use_densenet'] = False num_emb_config['plr_use_cos_bias'] = False num_emb_config['plr_act_name'] = 'linear' elif num_emb_type == 'plr': num_emb_config['use_plr_embeddings'] = True num_emb_config['plr_use_densenet'] = False num_emb_config['plr_use_cos_bias'] = False num_emb_config['plr_act_name'] = 'relu' elif num_emb_type == 'pbld': num_emb_config['use_plr_embeddings'] = True num_emb_config['plr_use_densenet'] = True num_emb_config['plr_use_cos_bias'] = True num_emb_config['plr_act_name'] = 'linear' elif num_emb_type == 'pblrd': num_emb_config['use_plr_embeddings'] = True
num_emb_config['plr_use_densenet'] = True num_emb_config['plr_use_cos_bias'] = True num_emb_config['plr_act_name'] = 'relu' else: raise ValueError(f'Unknown numerical embedding type: {num_emb_type=}') if num_emb_config.get('use_plr_embeddings', False): plr_factory = PLREmbeddingsFactory(**num_emb_config).add_scope('plr') if num_emb_config.get('use_plr_scale', False): plr_factory = SequentialFactory([ScaleFactory(**num_emb_config), plr_factory]) num_factory = SequentialFactory([ FilterTensorsFactory(include_keys=['x_cont']), prep_factory, plr_factory ]) cat_factory = SequentialFactory([ FilterTensorsFactory(exclude_keys=['x_cont']), # EncodingFactory(SingleOneHotFactory(**self.config)), prep_factory, # EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb') ]) factories.append(ConcatParallelFactory([num_factory, cat_factory])) elif num_emb_config.get('use_periodic_emb', False): periodic_emb_factory = PeriodicEmbeddingsFactory(**num_emb_config).add_scope('periodic_emb') num_factory = SequentialFactory([ FilterTensorsFactory(include_keys=['x_cont']), prep_factory, periodic_emb_factory ]) cat_factory = SequentialFactory([ FilterTensorsFactory(exclude_keys=['x_cont']), # EncodingFactory(SingleOneHotFactory(**self.config)), prep_factory, # EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb') ]) factories.append(ConcatParallelFactory([num_factory, cat_factory])) else: factories.append(prep_factory) if self.config.get('use_rf_importances', False): factories.append(RFFeatureImportanceFactory()) if self.config.get('use_ls', False) and n_classes > 0: factories.append(LabelSmoothingFactory(**self.config)) if self.config.get('use_sln', False) and n_classes > 0: factories.append(StochasticLabelNoiseFactory()) if self.config.get('use_sg', False) and n_classes > 0: factories.append(StochasticGateFactory()) if self.config.get('add_importance_layer', False): factories.append(FeatureImportanceFactory()) if self.config.get('add_fixed_weight_layer', False): factories.append(FixedWeightFactory()) hidden_sizes = self.config.get('hidden_sizes', [256] * 3) if hidden_sizes == 'rectangular': hidden_sizes = [self.config.get('hidden_width', 256)] * self.config.get('n_hidden_layers', 3) train_metric_name = self.config.get('train_metric_name', None) if isinstance(train_metric_name, str) and train_metric_name.startswith('multi_pinball('): out_factor = train_metric_name.count(',') + 1 else: out_factor = 1 out_sizes = hidden_sizes + [len(y_cat_sizes) * out_factor if n_classes == 0 else n_classes] for i in range(len(out_sizes)): layer_position = 'middle' block_scope_2 = f'layer-{i}' config = self.config if i + 1 == len(out_sizes): config = utils.join_dicts(config, {'block_str': 'w-b'}, config.get('last_layer_config', {})) layer_position = 'last' elif i == 0: config = utils.join_dicts(config, config.get('first_layer_config', {})) if config.get('add_front_scale', False): config['block_str'] = 's-' + config.get('block_str', 'w-b-a-d') first_layer_lr_factor = config.get('first_layer_lr_factor', None) if first_layer_lr_factor is not None: config['weight_lr_factor'] = config.get('weight_lr_factor', 1.0) * first_layer_lr_factor config['bias_lr_factor'] = config.get('bias_lr_factor', 1.0) * first_layer_lr_factor layer_position = 'first' block_scope = layer_position + '_layer' net_factories.append( BlockFactory(out_features=out_sizes[i], layer_position=layer_position, **config).add_scope(block_scope).add_scope(block_scope_2)) factories.append(SequentialFactory(net_factories).add_scope('net')) if 
self.config.get('use_antisymmetric_initialization', False): factories = [AntisymmetricInitializationFactory(SequentialFactory(factories), **self.config)] if self.config.get('output_factor', 1.0) != 1.0: factories.append(FunctionFactory(lambda x, c=self.config['output_factor']: c * x)) if self.config.get('normalize_output', False): factories.append(NormalizeOutputFactory(**self.config)) if self.config.get('clamp_output', False): # use clamp after normalization! factories.append(ClampOutputFactory(**self.config)) factory = SequentialFactory(factories) return factory.create_transform(tensor_infos) ================================================ FILE: pytabkit/models/nn_models/nn.py ================================================ import copy from typing import Dict import numpy as np import torch import torch.nn.functional as F from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import Fitter, Variable, WeightLayer, BiasLayer, ScaleLayer, FitterFactory, Layer, \ TrainContext, sub_scope_context, SequentialFitter, SequentialLayer, FunctionLayer from pytabkit.models.torch_utils import gauss_cdf class WeightFitter(Fitter): def __init__(self, out_features, **config): super().__init__(modified_tensors=['x_cont']) self.out_features = out_features self.weight_init_mode = config.get('weight_init_mode', 'normal') self.weight_init_gain = config.get('weight_init_gain', 1.0) self.weight_lr_factor = config.get('weight_lr_factor', 1.0) self.weight_l2_factor = config.get('weight_l2_factor', 1.0) self.weight_l1_factor = config.get('weight_l1_factor', 1.0) self.weight_wd_factor = config.get('weight_wd_factor', 1.0) # use abc parameterization here? 
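# Illustration (an assumed example with in_features=256, added for clarity): the
# weight_param options handled in _fit() below decide where the 1/sqrt(fan_in) factor
# lives. With weight_param='standard', sqrt(1/256) is folded into init_factor and the
# forward-pass factor stays at weight_gain; with weight_param='ntk', init_factor stays at
# weight_init_gain and the forward factor becomes weight_gain * sqrt(1/256) = weight_gain / 16.
# The initial output distribution is the same in both cases, but gradients w.r.t. the
# stored weights (and hence effective per-layer learning rates) differ.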
# todo: ntk param can imply different learning rate factors for different optimizers # also, the influence of Adam's epsilon can be different # maybe this can be resolved using abc-style parameterization self.use_ntk_param = config.get('use_ntk_param', False) self.use_ntk_param_v2 = config.get('use_ntk_param_v2', False) self.use_ntk_param_v3 = config.get('use_ntk_param_v3', False) self.weight_param = config.get('weight_param', 'standard') if self.use_ntk_param: raise ValueError(f'use_ntk_param is discontinued, use weight_param="ntk" instead') if self.use_ntk_param_v2: raise ValueError(f'use_ntk_param_v2 is discontinued, use weight_param="ntk-v2" instead') if self.use_ntk_param_v3: raise ValueError(f'use_ntk_param_v3 is discontinued, use weight_param="ntk-v3" instead') self.use_norm_weight = config.get('use_norm_weight', False) self.norm_weight_transpose = config.get('norm_weight_transpose', False) self.layer_position = config.get('layer_position', None) self.weight_gain = config.get('weight_gain', 1.0) super().__init__(needs_tensors=self.weight_init_mode in ['std']) # todo: adjust for some weight init modes def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self.out_features * self._get_n_values(tensor_infos, ['x_cont']) def forward_tensor_infos(self, tensor_infos): return utils.update_dict(tensor_infos, {'x_cont': TensorInfo(feat_shape=[self.out_features])}) def _fit(self, ds: DictDataset): in_features = ds.tensor_infos['x_cont'].get_n_features() init_factor = self.weight_init_gain * np.sqrt(1.0 / in_features) lr_factor = self.weight_lr_factor wd_factor = self.weight_wd_factor weight_gain = self.weight_gain l2_factor = self.weight_l2_factor l1_factor = self.weight_l1_factor if self.weight_param == 'xavier': # todo: this is not a parametrization, use weight_init_mode instead init_factor = self.weight_init_gain * np.sqrt(2.0 / (in_features + self.out_features)) elif self.weight_param == 'ntk' or self.weight_param == 'ntk-v3': weight_gain = self.weight_gain * np.sqrt(1.0 / in_features) init_factor = self.weight_init_gain elif self.weight_param == 'ntk-old': lr_factor *= weight_gain * np.sqrt(1.0 / in_features) init_factor *= weight_gain weight_gain = 1.0 elif self.weight_param == 'ntk-v2': lr_factor = self.weight_lr_factor * weight_gain * np.sqrt(1.0 / in_features) init_factor *= weight_gain weight_gain = 1.0 # this is chosen because wd is multiplied by lr when performing weight decay, # and the effective wd step size should not scale with in_features wd_factor = self.weight_wd_factor * np.sqrt(in_features) / weight_gain # print(f'{self.weight_gain=}, {lr_factor=}, {wd_factor=}') elif self.weight_param == 'ntk-adam': init_factor = self.weight_init_gain * self.weight_gain * np.sqrt(1.0 / in_features) lr_factor = self.weight_lr_factor * self.weight_gain / in_features / np.sqrt(self.out_features) wd_factor = self.weight_wd_factor l2_factor = self.weight_l2_factor * np.sqrt(1.0 / self.out_features) l1_factor = self.weight_l1_factor * np.sqrt(1.0 / self.out_features) weight_gain = 1.0 elif self.weight_param == 'mup-adam': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor / in_features elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-adam: {self.layer_position}')
elif self.weight_param == 'mup-sgd': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor * self.out_features elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-sgd: {self.layer_position}') elif self.weight_param == 'mup-adam-custom': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor / in_features elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor / in_features elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-adam-custom: {self.layer_position}') elif self.weight_param == 'mup-adam-custom-2': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" # with custom weight decay factors if self.layer_position == 'first': lr_factor = self.weight_lr_factor / in_features wd_factor = self.weight_wd_factor * np.sqrt(in_features) elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor / in_features wd_factor = self.weight_wd_factor * np.sqrt(in_features) elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features # unclear if this wd is the right one, # but here the lr_factor is already on the scale of the initialization wd_factor = self.weight_wd_factor else: raise ValueError(f'Unknown layer_position for mup-adam-custom-2: {self.layer_position}') elif self.weight_param == 'mup-sgd-custom': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-sgd-custom: {self.layer_position}') elif self.weight_param == 'standard': pass # standard parameterization else: raise ValueError(f'Unknown weight_param "{self.weight_param}"') # pytorch default is # for weights: # kaiming_uniform from unif[-bound, bound] # bound = sqrt(3) * gain / sqrt(in_features) # gain = sqrt(2 / (1 + sqrt(5)^2)) = sqrt(1/3) # therefore bound = 1 / sqrt(in_features) # for biases it's also unif[-1/sqrt(in_features), 1/sqrt(in_features)] if self.weight_init_mode == 'normal': weight = torch.randn(in_features, self.out_features, device=ds.device) elif self.weight_init_mode == 'uniform': # include np.sqrt(3) to ensure variance = 1 weight = np.sqrt(3) * (2 * torch.rand(in_features, self.out_features, device=ds.device) - 1) elif self.weight_init_mode == 'zeros' or self.weight_init_mode == 'zero': weight = torch.zeros(in_features, self.out_features, device=ds.device) elif self.weight_init_mode == 'std': weight = torch.randn(in_features, self.out_features, device=ds.device) x = ds.tensors['x_cont'] weight = weight / x.matmul(weight_gain * init_factor * weight).std(dim=-2, correction=0, keepdim=True) elif self.weight_init_mode == 'sqmom': weight =
torch.randn(in_features, self.out_features, device=ds.device) x = ds.tensors['x_cont'] weight = weight / x.matmul(weight_gain * init_factor * weight).square().sum(dim=-2, keepdim=True).sqrt() else: raise ValueError(f'Unknown weight_init_mode: {self.weight_init_mode}') # print(f'{repr(weight)=}') # print(f'{hash_tensor(weight)=}') if self.use_norm_weight: factor = np.sqrt(self.out_features / in_features) if self.norm_weight_transpose else 1.0 return NormWeightLayer(Variable(init_factor * weight, trainable=True, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l2': l2_factor, 'l1': l1_factor}), factor=weight_gain * factor, fitter=self, transpose=self.norm_weight_transpose) else: return WeightLayer(Variable(init_factor * weight, trainable=True, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l2': l2_factor, 'l1': l1_factor}), factor=weight_gain) class BiasFitter(Fitter): def __init__(self, **config): super().__init__(modified_tensors=['x_cont']) self.in_features = config.get('in_features', None) self.bias_init_mode = config.get('bias_init_mode', 'zeros') self.bias_init_gain = config.get('bias_init_gain', 1.0) self.bias_lr_factor = config.get('bias_lr_factor', 1.0) self.bias_l1_reg_factor = config.get('bias_l1_reg_factor', 1.0) self.bias_l2_reg_factor = config.get('bias_l2_reg_factor', 1.0) self.bias_wd_factor = config.get('bias_wd_factor', 1.0) self.bias_param = config.get('bias_param', 'standard') self.layer_position = config.get('layer_position', None) self.bias_gain = config.get('bias_gain', 1.0) # todo: adjust for some bias init modes super().__init__( needs_tensors=self.bias_init_mode in ['he+5', 'mean', 'neg-uniform-dynamic', 'neg-uniform-dynamic-2', 'normal-dynamic']) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def heplus_bias(self, x, n_simplex): idxs = torch.randint(0, x.shape[0], size=(x.shape[1], n_simplex), device=x.device) simplex_weights = torch.distributions.Exponential(1.0).sample((x.shape[1], n_simplex)) simplex_weights = simplex_weights.to(x.device) simplex_weights /= simplex_weights.sum(dim=1)[:, None] out_selected = torch.stack([x[idxs[:, i], torch.arange(x.shape[1], device=x.device)] for i in range(n_simplex)], dim=1) return -(out_selected * simplex_weights).sum(dim=1) def _fit(self, ds: DictDataset): n_features = ds.tensor_infos['x_cont'].get_n_features() lr_factor = self.bias_lr_factor bias_gain = self.bias_gain l2_factor = self.bias_l2_reg_factor l1_factor = self.bias_l1_reg_factor wd_factor = self.bias_wd_factor if self.bias_param == 'mup-sgd' and self.layer_position == 'first': # corresponds to fan_out in Table 3 of "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" lr_factor *= self.in_features elif self.bias_param == 'ntk-adam': lr_factor = self.bias_lr_factor / np.sqrt(n_features) l1_factor = self.bias_l1_reg_factor / np.sqrt(n_features) l2_factor = self.bias_l2_reg_factor / np.sqrt(n_features) if self.bias_init_mode == 'zeros' or self.bias_init_mode == 'zero': bias = torch.zeros(n_features, device=ds.device) elif self.bias_init_mode == 'normal': bias = torch.randn(n_features, device=ds.device) elif self.bias_init_mode == 'uniform': # include np.sqrt(3) to ensure variance = 1 bias = np.sqrt(3) * (2 * torch.rand(n_features, device=ds.device) - 1) elif self.bias_init_mode == 'neg-uniform': bias = np.sqrt(3) * (-torch.rand(n_features, device=ds.device)) elif self.bias_init_mode == 'neg-uniform-dynamic': mean = ds.tensors['x_cont'].mean(dim=-2) std = 
ds.tensors['x_cont'].std(dim=-2, correction=0) bias = -std * (mean + np.sqrt(3) * torch.rand(n_features, device=ds.device)) elif self.bias_init_mode == 'neg-uniform-dynamic-2': mean = ds.tensors['x_cont'].mean(dim=-2) std = ds.tensors['x_cont'].std(dim=-2, correction=0) bias = -mean - std * np.sqrt(3) * torch.rand(n_features, device=ds.device) elif self.bias_init_mode == 'normal-dynamic': mean = ds.tensors['x_cont'].mean(dim=-2) std = ds.tensors['x_cont'].std(dim=-2, correction=0) bias = -mean + std * torch.randn(n_features, device=ds.device) elif self.bias_init_mode == 'he+5': bias = self.heplus_bias(ds.tensors['x_cont'], 5) elif self.bias_init_mode == 'mean': bias = -ds.tensors['x_cont'].mean(dim=-2) elif self.bias_init_mode == 'pytorch-default': bias = np.sqrt(1.0 / self.in_features) * (2 * torch.rand(n_features, device=ds.device) - 1) else: raise ValueError(f'Unknown bias_init_mode: {self.bias_init_mode}') # print(f'{repr(bias)=}') # print(f'{hash_tensor(bias)=}') return BiasLayer(Variable(self.bias_init_gain * bias[None, :] / bias_gain, trainable=True, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l1_reg': l1_factor, 'l2_reg': l2_factor}), factor=bias_gain) class ScaleFitter(Fitter): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) self.scale_init_gain = config.get('scale_init_gain', 1.0) self.scale_lr_factor = config.get('scale_lr_factor', 1.0) self.scale_wd_factor = config.get('scale_wd_factor', 1.0) self.scale_l2_reg_factor = config.get('scale_l2_reg_factor', 1.0) self.scale_l1_reg_factor = config.get('scale_l1_reg_factor', 1.0) self.scale_trainable = config.get('scale_trainable', True) self.scale_param = config.get('scale_param', 'standard') def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset): in_features = ds.tensor_infos['x_cont'].get_n_features() lr_factor = self.scale_lr_factor init_gain = self.scale_init_gain wd_factor = self.scale_wd_factor l2_reg_factor = self.scale_l2_reg_factor l1_reg_factor = self.scale_l1_reg_factor if self.scale_param == 'mup-adam-custom': lr_factor = self.scale_lr_factor / in_features elif self.scale_param == 'ntk-v2': lr_factor = self.scale_lr_factor / np.sqrt(in_features) elif self.scale_param == 'ntk-adam': lr_factor = self.scale_lr_factor / np.sqrt(in_features) elif self.scale_param == 'ntk-adam-v2': lr_factor = self.scale_lr_factor / np.sqrt(in_features) l2_reg_factor = self.scale_l2_reg_factor / np.sqrt(in_features) l1_reg_factor = self.scale_l1_reg_factor / np.sqrt(in_features) n_features = ds.tensor_infos['x_cont'].get_n_features() scale = init_gain * torch.ones(n_features, device=ds.device) return ScaleLayer(Variable(scale[None, :], trainable=self.scale_trainable, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l2_reg': l2_reg_factor, 'l1_reg': l1_reg_factor})) class ScaleFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config def _create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: return ScaleFitter(**self.config) class DropoutLayer(Layer): def __init__(self): super().__init__() self.hyper_getter = self.context.hp_manager.register_hyper('p_drop', self.context.scope) def forward_cont(self, x): p_drop = self.hyper_getter() if p_drop == 0.0: return x return F.dropout(x, p_drop, training=self.training) class DropoutFitter(Fitter): def __init__(self): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) 
-> Layer: return DropoutLayer() class NoiseLayer(Layer): def __init__(self): super().__init__() self.sigma_getter = self.context.hp_manager.register_hyper('layer_noise_sigma', self.context.scope) def forward_cont(self, x): sigma = self.sigma_getter() if sigma == 0.0 or not self.training: return x return x + sigma * torch.randn_like(x) class NoiseFitter(Fitter): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: return NoiseLayer() # ------ Regression output rescaling / clamping ------- class ClampLayer(Layer): def __init__(self, low: Variable, high: Variable): super().__init__() self.low = low self.high = high def forward_cont(self, x): if self.training: return x else: return torch.min(torch.max(x, self.low), self.high) def _stack(self, layers): return ClampLayer(Variable.stack([l.low for l in layers]), Variable.stack([l.high for l in layers])) class ClampOutputFactory(Fitter, FitterFactory): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) self.config = config def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 2 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: y = TrainContext.get_global_context().hp_manager.get_more_info_dict()['trainval_ds'].tensors['y'] return ClampLayer(low=Variable(y.min(dim=-2, keepdim=True)[0], trainable=False), high=Variable(y.max(dim=-2, keepdim=True)[0], trainable=False)) class NormalizeOutputLayer(Layer): def __init__(self, mean: Variable, std: Variable): super().__init__() self.mean = mean self.std = std def forward_tensors(self, tensors): tensors = copy.copy(tensors) # shallow copy if self.training: assert 'y' in tensors tensors['y'] = (tensors['y'] - self.mean) / (self.std + 1e-30) else: tensors['x_cont'] = tensors['x_cont'] * self.std + self.mean return tensors def _stack(self, layers): return NormalizeOutputLayer(mean=Variable.stack([l.mean for l in layers]), std=Variable.stack([l.std for l in layers])) class NormalizeOutputFactory(Fitter, FitterFactory): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 2 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: y = TrainContext.get_global_context().hp_manager.get_more_info_dict()['trainval_ds'].tensors['y'] return NormalizeOutputLayer(mean=Variable(y.mean(dim=-2, keepdim=True), trainable=False), std=Variable(y.std(dim=-2, correction=0, keepdim=True), trainable=False)) class NormWeightLayer(Layer): def __init__(self, weight: Variable, factor: float, fitter: Fitter, transpose=False): super().__init__(fitter=fitter) self.weight = weight self.factor = factor self.transpose = transpose def forward_cont(self, x): return x.matmul(self.factor * self.weight / self.weight.norm(dim=-1 if self.transpose else -2, keepdim=True)) def _stack(self, layers): return NormWeightLayer(weight=Variable.stack([l.weight for l in layers]), factor=layers[0].factor, fitter=layers[0].fitter, transpose=layers[0].transpose) class FixedScaleFactory(Fitter, FitterFactory): def __init__(self, scale: torch.Tensor): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.scale = scale def _fit(self, ds: DictDataset) -> Layer: return ScaleLayer(Variable(self.scale, trainable=False)) class FeatureImportanceFactory(Fitter, FitterFactory): def __init__(self): 
super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: scale = TrainContext.get_global_context().hp_manager.get_more_info_dict()['feature_importances'][None, :] return ScaleLayer(Variable(scale.to(ds.device), trainable=False)) class FixedWeightFactory(Fitter, FitterFactory): def __init__(self): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: weight = TrainContext.get_global_context().hp_manager.get_more_info_dict()['fixed_weights'] return WeightLayer(Variable(weight.to(ds.device), trainable=False)) class RFFeatureImportanceFactory(Fitter, FitterFactory): def __init__(self): super().__init__(needs_tensors=True, is_individual=True, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: x = ds.tensors['x_cont'].cpu().numpy() y = ds.tensors['y'].cpu().numpy() n_estimators = 50 if ds.tensor_infos['y'].is_cont(): # assume it's regression model = RandomForestRegressor(n_estimators=n_estimators, n_jobs=1) else: # assume it's classification model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=1) model.fit(x, y) scale = torch.as_tensor(model.feature_importances_, dtype=torch.float32, device=ds.device) # print(f'RF feature importances: {scale}') scale *= np.sqrt(scale.shape[0]) / scale.norm(dim=-1) return ScaleLayer(Variable(scale[None, :], trainable=False)) # ------ Mixup and Label smoothing ------ class PLREmbeddingsFactory(Fitter, FitterFactory): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings def __init__(self, plr_sigma: float = 1.0, plr_hidden_1: int = 8, plr_hidden_2: int = 8, plr_lr_factor: float = 1.0, plr_lr_factor_1: float = 1.0, plr_lr_factor_2: float = 1.0, plr_wd_factor: float = 1.0, plr_act_name: str = 'relu', plr_use_densenet: bool = False, plr_use_cos_bias: bool = False, **config): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.plr_sigma = plr_sigma self.plr_hidden_1 = plr_hidden_1 self.plr_hidden_2 = plr_hidden_2 self.plr_lr_factor = plr_lr_factor self.plr_lr_factor_1 = plr_lr_factor_1 self.plr_lr_factor_2 = plr_lr_factor_2 self.plr_wd_factor = plr_wd_factor self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet self.plr_use_cos_bias = plr_use_cos_bias if not plr_use_cos_bias: assert plr_hidden_1 % 2 == 0 def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_cont = self._get_n_values(tensor_infos, ['x_cont']) hidden_2 = self.plr_hidden_2 if self.plr_use_densenet: hidden_2 -= 1 # don't count densenet output for parameters if self.plr_use_cos_bias: return n_cont * (2 * self.plr_hidden_1 + (self.plr_hidden_1 + 1) * hidden_2) else: return n_cont * (self.plr_hidden_1 // 2 + (self.plr_hidden_1 + 1) * hidden_2) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: hidden_2 = self.plr_hidden_2 if self.plr_use_densenet: # for before the torch.cat() and after the torch.cat() hidden_2 = 2 * hidden_2 - 1 if self.plr_act_name != 'linear': hidden_2 += self.plr_hidden_2 n_cont = self._get_n_values(tensor_infos, ['x_cont']) if self.plr_use_cos_bias: # 3 for wx, wx+b, cos(wx+b) return n_cont * (3 * self.plr_hidden_1 + hidden_2) else: # in first hidden layer, have wx, sin(wx), cos(wx), cat(...) 
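# Worked example (added for clarity, assuming the default plr_hidden_1=8): weight_1 has
# plr_hidden_1 // 2 = 4 columns, so per continuous feature the first hidden layer
# materializes wx (4 values), cos(wx) (4), sin(wx) (4) and their concatenation (8),
# i.e. 20 = 2.5 * plr_hidden_1 values, which is where the factor 2.5 below comes from.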
return n_cont * (int(2.5 * self.plr_hidden_1) + hidden_2) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: return utils.update_dict(tensor_infos, {'x_cont': TensorInfo( feat_shape=[tensor_infos['x_cont'].get_n_features() * self.plr_hidden_2])}) def _fit(self, ds: DictDataset) -> Layer: n_cont = ds.tensor_infos['x_cont'].get_n_features() # assuming that the shape is rank 1 hyper_factors_1 = {'lr': self.plr_lr_factor * self.plr_lr_factor_1, 'wd': self.plr_wd_factor} hyper_factors_2 = {'lr': self.plr_lr_factor * self.plr_lr_factor_2, 'wd': self.plr_wd_factor} if self.plr_use_cos_bias: with sub_scope_context('weight_1'): weight_1 = Variable(self.plr_sigma * torch.randn(n_cont, 1, self.plr_hidden_1, device=ds.device), hyper_factors=hyper_factors_1) with sub_scope_context('bias_1'): # use uniform [-pi, pi] instead of uniform [0, 2pi] for smaller values in case of weight decay bias_1 = Variable(np.pi * (-1 + 2 * torch.rand(n_cont, 1, self.plr_hidden_1, device=ds.device)), hyper_factors=hyper_factors_1) else: # normal initialization as in the paper with sub_scope_context('weight_1'): weight_1 = Variable(self.plr_sigma * torch.randn(n_cont, 1, self.plr_hidden_1 // 2, device=ds.device), hyper_factors=hyper_factors_1) # kaiming init from nn.Linear in_features = self.plr_hidden_1 hidden_2 = self.plr_hidden_2 if self.plr_use_densenet: hidden_2 -= 1 with sub_scope_context('weight_2'): weight_2 = Variable( (-1 + 2 * torch.rand(n_cont, self.plr_hidden_1, hidden_2, device=ds.device)) / np.sqrt(in_features), hyper_factors=hyper_factors_2) with sub_scope_context('bias_2'): bias_2 = Variable( (-1 + 2 * torch.rand(n_cont, 1, hidden_2, device=ds.device)) / np.sqrt(in_features), hyper_factors=hyper_factors_2) if self.plr_use_cos_bias: return PLREmbeddingsLayerCosBias(fitter=self, weight_1=weight_1, weight_2=weight_2, bias_1=bias_1, bias_2=bias_2, plr_act_name=self.plr_act_name, plr_use_densenet=self.plr_use_densenet) else: return PLREmbeddingsLayer(fitter=self, weight_1=weight_1, weight_2=weight_2, bias_2=bias_2, plr_act_name=self.plr_act_name, plr_use_densenet=self.plr_use_densenet) class PLREmbeddingsLayer(Layer): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings # see https://github.com/yandex-research/rtdl-num-embeddings/tree/main/package def __init__(self, fitter: Fitter, weight_1: Variable, weight_2: Variable, bias_2: Variable, plr_act_name: str, plr_use_densenet: bool = False): super().__init__(fitter=fitter) self.weight_1 = weight_1 self.weight_2 = weight_2 self.bias_2 = bias_2 self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet def forward_cont(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight_1) # matmul is automatically batched x = torch.cat([torch.cos(x), torch.sin(x)], dim=-1) x = x.matmul(self.weight_2) # matmul is automatically batched x = x + self.bias_2 if self.plr_act_name == 'relu': x = torch.relu(x) elif self.plr_act_name == 'linear': pass else: raise ValueError(f'Unknown plr_act_name "{self.plr_act_name}"') # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.plr_use_densenet: x = torch.cat([x, x_orig], dim=-1) return x def _stack(self, layers): return 
PLREmbeddingsLayer(fitter=layers[0].fitter, weight_1=Variable.stack([l.weight_1 for l in layers]), weight_2=Variable.stack([l.weight_2 for l in layers]), bias_2=Variable.stack([l.bias_2 for l in layers]), plr_act_name=layers[0].plr_act_name, plr_use_densenet=layers[0].plr_use_densenet) class PLREmbeddingsLayerCosBias(Layer): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings # see https://github.com/yandex-research/rtdl-num-embeddings/tree/main/package def __init__(self, fitter: Fitter, weight_1: Variable, bias_1: Variable, weight_2: Variable, bias_2: Variable, plr_act_name: str, plr_use_densenet: bool = False): super().__init__(fitter=fitter) self.weight_1 = weight_1 self.weight_2 = weight_2 self.bias_1 = bias_1 self.bias_2 = bias_2 self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet def forward_cont(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight_1) # matmul is automatically batched x = x + self.bias_1 # x = torch.sin(x) x = torch.cos(x) x = x.matmul(self.weight_2) # matmul is automatically batched x = x + self.bias_2 if self.plr_act_name == 'relu': x = torch.relu(x) elif self.plr_act_name == 'linear': pass else: raise ValueError(f'Unknown plr_act_name "{self.plr_act_name}"') # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.plr_use_densenet: x = torch.cat([x, x_orig], dim=-1) return x def _stack(self, layers): return PLREmbeddingsLayerCosBias(fitter=layers[0].fitter, weight_1=Variable.stack([l.weight_1 for l in layers]), weight_2=Variable.stack([l.weight_2 for l in layers]), bias_1=Variable.stack([l.bias_1 for l in layers]), bias_2=Variable.stack([l.bias_2 for l in layers]), plr_act_name=layers[0].plr_act_name, plr_use_densenet=layers[0].plr_use_densenet) class PeriodicEmbeddingsFactory(Fitter, FitterFactory): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings def __init__(self, periodic_emb_sigma: float = 1.0, periodic_emb_dim: int = 8, periodic_emb_lr_factor: float = 1.0, periodic_emb_wd_factor: float = 1.0, periodic_emb_only_cos: bool = False, periodic_emb_densenet: bool = False, **config): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.periodic_emb_sigma = periodic_emb_sigma self.periodic_emb_dim = periodic_emb_dim self.periodic_emb_lr_factor = periodic_emb_lr_factor self.periodic_emb_wd_factor = periodic_emb_wd_factor self.periodic_emb_only_cos = periodic_emb_only_cos self.periodic_emb_densenet = periodic_emb_densenet def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_params_single = self.periodic_emb_dim if self.periodic_emb_densenet: n_params_single -= 1 if self.periodic_emb_only_cos: n_params_single *= 2 else: n_params_single //= 2 return self._get_n_values(tensor_infos, ['x_cont']) * n_params_single def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_cont = self._get_n_values(tensor_infos, ['x_cont']) # factor 2 * for sin, cos, x, and concat return 2 * n_cont * self.periodic_emb_dim def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: return utils.update_dict(tensor_infos, {'x_cont': TensorInfo( 
feat_shape=[tensor_infos['x_cont'].get_n_features() * self.periodic_emb_dim])}) def _fit(self, ds: DictDataset) -> Layer: n_cont = ds.tensor_infos['x_cont'].get_n_features() # assuming that the shape is rank 1 hyper_factors = {'lr': self.periodic_emb_lr_factor, 'wd': self.periodic_emb_wd_factor} param_dim = self.periodic_emb_dim if self.periodic_emb_densenet: param_dim -= 1 if self.periodic_emb_only_cos: # not implemented because it turned out to be not so good to omit the linear layer afterward raise NotImplementedError() else: if param_dim % 2 == 1: raise ValueError(f'Wrong parity for periodic_emb_dim, got {self.periodic_emb_dim=}') param_dim //= 2 with sub_scope_context('weight'): weight = Variable( self.periodic_emb_sigma * torch.randn(n_cont, 1, param_dim, device=ds.device), hyper_factors=hyper_factors) return PeriodicEmbeddingsLayerSinCos(self, weight, periodic_emb_densenet=self.periodic_emb_densenet) class PeriodicEmbeddingsLayerSinCos(Layer): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings # see https://github.com/yandex-research/rtdl-num-embeddings/tree/main/package def __init__(self, fitter: Fitter, weight: Variable, periodic_emb_densenet: bool): super().__init__(fitter=fitter) self.weight = weight self.periodic_emb_densenet = periodic_emb_densenet def forward_cont(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight) # matmul is automatically batched x = torch.cat([torch.cos(x), torch.sin(x)], dim=-1) # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.periodic_emb_densenet: x = torch.cat([x, x_orig], dim=-1) return x def _stack(self, layers): return PeriodicEmbeddingsLayerSinCos(fitter=layers[0].fitter, weight=Variable.stack([l.weight for l in layers]), periodic_emb_densenet=layers[0].periodic_emb_densenet) class ToSoftLabelLayer(Layer): def __init__(self, y_tensor_info, fitter: Fitter): super().__init__(fitter=fitter) self.y_tensor_info = y_tensor_info def forward_tensors(self, tensors): if 'y' not in tensors: return tensors else: y = tensors['y'] y_cs = self.y_tensor_info.get_cat_sizes().numpy() new_y_cols = [] for i, cs in enumerate(y_cs): if cs == 0: # already continuous new_y_cols.append(y[tuple([slice(None)] * (y.dim() - 1) + [slice(i, i + 1)])]) else: # make continuous # todo: is there a better one-hot function without the long -> float conversion? 
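                    #  (a possible alternative, as an untested sketch: write the float
                    #  one-hot directly with scatter_, avoiding the long -> float cast:
                    #      idx = y[tuple([slice(None)] * (y.dim() - 1) + [slice(i, i + 1)])]
                    #      one_hot = torch.zeros(*idx.shape[:-1], cs, device=y.device)
                    #      one_hot.scatter_(-1, idx, 1.0)
                    #  here idx keeps a trailing dimension of size 1)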
new_y_cols.append(F.one_hot(y[tuple([slice(None)] * (y.dim() - 1) + [i])],
                                                num_classes=cs).float())
        return utils.join_dicts(tensors, {'y': torch.cat(new_y_cols, dim=-1)})


class ToSoftLabelFitter(Fitter):
    def __init__(self):
        super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['y'])

    def forward_tensor_infos(self, tensor_infos):
        if 'y' not in tensor_infos:
            return tensor_infos
        new_y_shape = sum([max(1, cs) for cs in tensor_infos['y'].get_cat_sizes().numpy()])
        return utils.update_dict(tensor_infos, {'y': TensorInfo(feat_shape=[new_y_shape])})

    def _fit(self, ds: DictDataset) -> Layer:
        return ToSoftLabelLayer(y_tensor_info=ds.tensor_infos['y'], fitter=self)


class LabelSmoothingLayer(Layer):
    # assumes soft labels as inputs
    def __init__(self, ls_dist: Variable):
        super().__init__()
        self.hyper_getter = self.context.hp_manager.register_hyper('ls_eps', self.context.scope)
        self.ls_dist = ls_dist

    def forward_tensors(self, tensors):
        # print(f'{self.training=}, {list(tensors.keys())=}')
        # if not self.training or 'y' not in tensors:
        if 'y' not in tensors:
            return tensors
        ls_eps = self.hyper_getter()
        # print(f'{ls_eps=:g}')
        y = tensors['y']
        y = (1.0 - ls_eps) * y + ls_eps * self.ls_dist
        return utils.update_dict(tensors, {'y': y})

    def _stack(self, layers):
        return LabelSmoothingLayer(Variable.stack([l.ls_dist for l in layers]))


class LabelSmoothingFitter(Fitter):
    def __init__(self, use_ls_prior=False, **config):
        # todo: we set needs_tensors=True and is_individual=True here
        #  because the transformation can depend on the hyperparameter ls_eps, which can be scheduled.
        #  If needs_tensors=True, this fitter is not fitted for one-time preprocessing,
        #  where the schedules are not yet available.
        #  ideally, super().__init__() would use another parameter is_dynamic or so which could be set to true instead
        #  formerly, we used needs_tensors=use_ls_prior
        super().__init__(needs_tensors=True, is_individual=True, modified_tensors=['y'])
        self.use_ls_prior = use_ls_prior

    def _fit(self, ds: DictDataset) -> Layer:
        # consistency check since y must be soft labels and not hard labels
        assert ds.tensor_infos['y'].is_cont()  # y is assumed to already be converted to one-hot
        if self.use_ls_prior:
            y = ds.tensors['y']
            ls_dist = y.mean(dim=-2, keepdim=True)
        else:
            n_classes = ds.tensor_infos['y'].get_n_features()
            ls_dist = torch.ones(1, n_classes, device=ds.device) / n_classes
        return LabelSmoothingLayer(Variable(ls_dist, trainable=False))


class LabelSmoothingFactory(FitterFactory):
    def __init__(self, **config):
        super().__init__()
        self.config = config

    def _create(self, tensor_infos) -> Fitter:
        if tensor_infos['y'].get_cat_sizes()[0].item() > 0:
            # labels are still in categorical form
            return SequentialFitter([ToSoftLabelFitter(), LabelSmoothingFitter(**self.config)])
        return LabelSmoothingFitter(**self.config)


class StochasticLabelNoiseLayer(Layer):
    def __init__(self):
        super().__init__()
        self.sigma_getter = self.context.hp_manager.register_hyper('sln_sigma', self.context.scope)

    def forward_tensors(self, tensors):
        if 'y' not in tensors:
            return tensors
        y = tensors['y']
        return utils.join_dicts(tensors, {'y': y + self.sigma_getter() * torch.randn_like(y)})


class StochasticLabelNoiseFitter(Fitter):
    def __init__(self):
        super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['y'])

    def _fit(self, ds: DictDataset) -> Layer:
        # todo: could do a consistency check since y must be soft labels and not hard labels
        return StochasticLabelNoiseLayer()


class StochasticLabelNoiseFactory(FitterFactory):
    def
_create(self, tensor_infos) -> Fitter: if tensor_infos['y'].get_cat_sizes()[0].item() > 0: # labels are still in categorical form return SequentialFitter([ToSoftLabelFitter(), StochasticLabelNoiseFitter()]) return StochasticLabelNoiseFitter() # implementing "Feature Selection using Stochastic Gates" class StochasticGateLayer(Layer): def __init__(self, mu: Variable): super().__init__() self.sigma_getter = self.context.hp_manager.register_hyper('sg_sigma', self.context.scope) self.lambda_getter = self.context.hp_manager.register_hyper('sg_lambda', self.context.scope) self.mu = mu def forward_cont(self, x): mu = self.mu if self.training: sigma = self.sigma_getter() mu = mu + sigma * torch.randn_like(x) reg = gauss_cdf(self.mu / sigma).mean(dim=-1).mean(dim=-1).sum() self.context.hp_manager.add_reg_term(self.lambda_getter() * reg) z = mu.clamp(0.0, 1.0) # z = z / (z.mean(dim=-1, keepdim=True) + 1e-8) return x * z def _stack(self, layers): return StochasticGateLayer(Variable.stack([l.mu for l in layers])) class StochasticGateFactory(Fitter, FitterFactory): def __init__(self): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: # rough upper bound return 15 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: # see https://github.com/runopti/stg/blob/9f630968c4f14cff6da4e54421c497f24ac1e08e/python/stg/layers.py#L10 n_cont = ds.tensor_infos['x_cont'].get_n_features() return StochasticGateLayer(Variable(0.5 * torch.ones(1, n_cont, device=ds.device), hyper_factors={'wd': 0.0})) class AntisymmetricInitializationFactory(FitterFactory): def __init__(self, factory, **config): super().__init__() self.factory = factory self.config = config def _create(self, tensor_infos) -> Fitter: fitter = self.factory.create(tensor_infos) # return AntisymmetricInitializationFitter(fitter, **self.config) # only duplicate the part of the fitter that is actually learnable common, individual = fitter.split_off_individual() return SequentialFitter([common, AntisymmetricInitializationFitter(individual, **self.config)]) class AntisymmetricInitializationFitter(Fitter): """ Implements the antisymmetric initialization trick from http://proceedings.mlr.press/v107/zhang20a/zhang20a.pdf """ def __init__(self, fitter: Fitter, **config): super().__init__(needs_tensors=fitter.needs_tensors, is_individual=fitter.is_individual, scope_names=fitter.scope_names, modified_tensors=fitter.modified_tensors) self.fitter = fitter self.asi_factor = config.get('asi_factor', 1 / np.sqrt(2)) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]): return self.fitter.forward_tensor_infos(tensor_infos) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]): return 2 * self.fitter.get_n_params(tensor_infos) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]): return 2 * self.fitter.get_n_forward(tensor_infos) # maybe not entirely accurate but almost def _fit(self, ds: DictDataset) -> Layer: tfm1 = self.fitter.fit(ds) tfm2 = self.fitter.fit(ds) with torch.no_grad(): for p1, p2 in zip(tfm1.parameters(), tfm2.parameters()): p2.data = p1.data for b1, b2 in zip(tfm1.buffers(), tfm2.buffers()): b2.data = b1.data # multiply by 1/sqrt(2) at the end to preserve the learning speed for SGD, # however, would need to multiply by 0.5 for adam return SequentialLayer([SubtractionLayer(tfm1, tfm2), 
FunctionLayer(lambda x, a=self.asi_factor: a * x)]) def __str__(self): sub_strings = [' ' + line for line in str(self.fitter).split('\n')] return f'{self.__class__.__name__} (\n' + '\n'.join(sub_strings) + '\n)\n' class SubtractionLayer(Layer): def __init__(self, layer1: Layer, layer2: Layer): super().__init__() self.layer1 = layer1 self.layer2 = layer2 def forward_tensor_infos(self, tensor_infos): return utils.join_dicts(self.layer1.forward_tensor_infos(tensor_infos), self.layer2.forward_tensor_infos(tensor_infos)) def forward_tensors(self, tensors): out1 = self.layer1.forward_tensors(tensors) out2 = self.layer2.forward_tensors(tensors) if 'x_cont' not in out2: return utils.join_dicts(out1, out2) return utils.join_dicts(out1, out2, {'x_cont': out1['x_cont'] - out2['x_cont']}) def _stack(self, layers): return SubtractionLayer(layers[0].layer1.stack([l.layer1 for l in layers]), layers[0].layer2.stack([l.layer2 for l in layers])) ================================================ FILE: pytabkit/models/nn_models/pipeline.py ================================================ from typing import List, Dict, Union import sklearn import torch from sklearn.base import BaseEstimator, TransformerMixin from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import Layer, Variable, Fitter, FitterFactory, IdentityLayer, BiasLayer, ScaleLayer, \ SequentialLayer from pytabkit.models.torch_utils import torch_np_quantile # todo: add factories class ReplaceMissingContLayer(Layer): def __init__(self, means: Variable): super().__init__() if not isinstance(means, Variable): raise ValueError('means is not a Variable') self.means = means def forward_cont(self, x): return torch.where(torch.isnan(x), self.means, x) def _stack(self, layers: List['ReplaceMissingContLayer']): return ReplaceMissingContLayer(Variable.stack([layer.means for layer in layers])) class MeanReplaceMissingContFactory(Fitter, FitterFactory): def __init__(self, trainable=False, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] is_nan = torch.isnan(x_cont) x_cont_replaced = torch.where(is_nan, torch.zeros_like(x_cont), x_cont) means = x_cont_replaced.sum(dim=-2, keepdim=True) \ / (x_cont.shape[-2] - is_nan.float().sum(dim=-2, keepdim=True) + 1e-30) return ReplaceMissingContLayer(Variable(means, trainable=self.trainable)) class MeanCenterFactory(Fitter, FitterFactory): def __init__(self, trainable=False, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() return BiasLayer(Variable(-ds.tensors['x_cont'].mean(dim=-2, keepdim=True), trainable=self.trainable)) class MedianCenterFactory(Fitter, FitterFactory): def __init__(self, median_center_trainable=False, **config): super().__init__(is_individual=median_center_trainable, modified_tensors=['x_cont']) self.trainable = median_center_trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 
self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: # quantile requires PyTorch >= 1.7.0 if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() # use quantile function from numpy since the torch one can use large amounts of RAM for some reason return BiasLayer(Variable(-torch_np_quantile(ds.tensors['x_cont'], 0.5, dim=-2, keepdim=True), trainable=self.trainable)) class L2NormalizeFactory(Fitter, FitterFactory): def __init__(self, trainable=False, l2_normalize_eps=1e-8, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable self.eps = l2_normalize_eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() scale = 1.0 / (ds.tensors['x_cont'] ** 2 + self.eps).mean(dim=-2, keepdim=True).sqrt() scale[:, (ds.tensors['x_cont'] ** 2).mean(dim=-2) == 0.0] = 0.0 return ScaleLayer(Variable(scale, trainable=self.trainable)) class L1NormalizeFactory(Fitter, FitterFactory): def __init__(self, trainable=False, eps=1e-8, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable self.eps = eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() scale = 1.0 / (ds.tensors['x_cont'].abs() + self.eps).mean(dim=-2, keepdim=True) return ScaleLayer(Variable(scale, trainable=self.trainable)) class MinMaxScaleFactory(Fitter, FitterFactory): def __init__(self, trainable=False, eps=1e-8, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable self.eps = eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 2 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] x_min = x_cont.min(dim=-2, keepdim=True)[0] x_max = x_cont.max(dim=-2, keepdim=True)[0] scale = 2.0 / (x_max - x_min + self.eps) bias = -0.5 * (x_max + x_min) return SequentialLayer([BiasLayer(Variable(bias, trainable=self.trainable)), ScaleLayer(Variable(scale, trainable=self.trainable))]) class RobustScaleFactory(Fitter, FitterFactory): def __init__(self, robust_scale_trainable=False, robust_scale_eps=1e-30, **config): super().__init__(is_individual=robust_scale_trainable, modified_tensors=['x_cont']) self.trainable = robust_scale_trainable self.robust_scale_eps = robust_scale_eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] quant_diff = torch_np_quantile(x_cont, 0.75, dim=-2) - torch_np_quantile(x_cont, 0.25, dim=-2) max, _ = x_cont.max(dim=-2) min, _ = x_cont.min(dim=-2) idxs = quant_diff == 0.0 quant_diff[idxs] = 0.5 * (max[idxs] - min[idxs]) factors = 1.0 / (quant_diff + self.robust_scale_eps) factors[quant_diff == 0.0] = 0.0 return ScaleLayer(Variable(factors[None, :], trainable=self.trainable)) class RobustScaleV2Factory(Fitter, FitterFactory): def __init__(self, robust_scale_trainable=False, **config): super().__init__(is_individual=robust_scale_trainable, 
modified_tensors=['x_cont']) self.trainable = robust_scale_trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] x_cont_sorted, _ = torch.sort(x_cont, dim=-2) quantiles = torch.linspace(0.0, 1.0, x_cont.shape[-2], device=x_cont.device) opposite_dists = x_cont_sorted.flip(dims=[-2]) - x_cont_sorted opposite_quantile_dists = quantiles.flip(dims=[0]) - quantiles quarter_idx = x_cont.shape[-2] // 4 + 1 possible_factors = 2.0 * opposite_quantile_dists[:quarter_idx, None] / \ (1e-30 + opposite_dists[..., :quarter_idx, :]) factors = possible_factors.min(dim=-2, keepdim=True)[0] return ScaleLayer(Variable(factors, trainable=self.trainable)) class GlobalScaleNormalizeFactory(Fitter, FitterFactory): def __init__(self, global_scale_factor=1.0, **config): super().__init__(is_individual=False, modified_tensors=['x_cont']) self.global_scale_factor = global_scale_factor def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] scale = self.global_scale_factor / (x_cont ** 2 + 1e-30).mean().sqrt().item() return ScaleLayer(Variable(scale * torch.ones(1, 1, device=x_cont.device), trainable=False)) class ThermometerCodingLayer(Layer): def __init__(self, centers: Variable, scale: float, fitter: Fitter): super().__init__(fitter=fitter) self.centers = centers self.scale = scale # todo: could make scale a variable and allow for different scales per center def forward_cont(self, x): shifted = self.scale * (x.unsqueeze(-1) - self.centers) return torch.tanh(shifted.reshape(list(x.shape[:-1]) + [-1])) def _stack(self, layers): return ThermometerCodingLayer(Variable.stack([l.centers for l in layers]), layers[0].scale, layers[0].fitter) class ThermometerCodingFactory(Fitter, FitterFactory): def __init__(self, tc_low=-1.0, tc_high=1.0, tc_num=3, tc_scale=1.0, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont']) self.tc_low = tc_low self.tc_high = tc_high self.tc_num = tc_num self.tc_scale = tc_scale def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self.tc_num def forward_tensor_infos(self, tensor_infos): n_cont = tensor_infos['x_cont'].get_n_features() return utils.join_dicts(tensor_infos, {'x_cont': TensorInfo(feat_shape=[n_cont * self.tc_num])}) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() centers = torch.linspace(self.tc_low, self.tc_high, self.tc_num, device=ds.device)[None, None, :] centers = Variable(centers, trainable=False) return ThermometerCodingLayer(centers=centers, scale=self.tc_scale, fitter=self) class CircleCodingLayer(Layer): def __init__(self, scale: float, fitter: Fitter): super().__init__(fitter=fitter) self.scale = scale def forward_cont(self, x): x = (1.0 / self.scale) * x factor = 1.0 / torch.sqrt(1.0 + x ** 2) return torch.cat([x * factor, torch.ones_like(x) * factor], dim=-1) def _stack(self, layers): return CircleCodingLayer(layers[0].scale, layers[0].fitter) class CircleCodingFactory(Fitter, FitterFactory): def __init__(self, circle_coding_scale=1.0, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont']) self.scale = 
circle_coding_scale def forward_tensor_infos(self, tensor_infos): n_cont = tensor_infos['x_cont'].get_n_features() return utils.join_dicts(tensor_infos, {'x_cont': TensorInfo(feat_shape=[n_cont * 2])}) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() return CircleCodingLayer(scale=self.scale, fitter=self) def apply_tfms_rec(tfms: Union[BaseEstimator, List], x: torch.Tensor): if isinstance(tfms, list): return torch.stack([apply_tfms_rec(tfm, x[i]) for i, tfm in enumerate(tfms)], dim=0) else: return torch.as_tensor(tfms.transform(x.detach().cpu().numpy()), dtype=x.dtype, device=x.device) class SklearnTransformLayer(Layer): def __init__(self, tfms: Union[BaseEstimator, List], fitter: Fitter): super().__init__(fitter=fitter) self.tfms = tfms def forward_cont(self, x): return apply_tfms_rec(self.tfms, x) def _stack(self, layers): return SklearnTransformLayer(tfms=[l.tfms for l in layers], fitter=layers[0].fitter) class SklearnTransformFactory(Fitter, FitterFactory): def __init__(self, tfm: BaseEstimator, **config): super().__init__(needs_tensors=True, is_individual=False, modified_tensors=['x_cont']) self.tfm = tfm def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() tfm = sklearn.base.clone(self.tfm) tfm.fit(ds.tensors['x_cont'].detach().cpu().numpy()) return SklearnTransformLayer(tfm, fitter=self) ================================================ FILE: pytabkit/models/nn_models/rtdl_num_embeddings.py ================================================ # taken from https://github.com/yandex-research/rtdl-num-embeddings/blob/main/package/rtdl_num_embeddings.py """On Embeddings for Numerical Features in Tabular Deep Learning.""" __version__ = '0.0.11' __all__ = [ 'LinearEmbeddings', 'LinearReLUEmbeddings', 'PeriodicEmbeddings', 'compute_bins', 'PiecewiseLinearEncoding', 'PiecewiseLinearEmbeddings', ] import math import warnings from collections import OrderedDict from typing import Any, Literal, Optional, Union try: import sklearn.tree as sklearn_tree except ImportError: sklearn_tree = None import torch import torch.nn as nn from torch import Tensor from torch.nn.parameter import Parameter try: from tqdm import tqdm except ImportError: tqdm = None def _check_input_shape(x: Tensor, expected_n_features: int) -> None: if x.ndim < 1: raise ValueError( f'The input must have at least one dimension, however: {x.ndim=}' ) if x.shape[-1] != expected_n_features: raise ValueError( 'The last dimension of the input was expected to be' f' {expected_n_features}, however, {x.shape[-1]=}' ) class LinearEmbeddings(nn.Module): """Linear embeddings for continuous features. **Shape** - Input: `(*, n_features)` - Output: `(*, n_features, d_embedding)` **Examples** >>> batch_size = 2 >>> n_cont_features = 3 >>> x = torch.randn(batch_size, n_cont_features) >>> d_embedding = 4 >>> m = LinearEmbeddings(n_cont_features, d_embedding) >>> m(x).shape torch.Size([2, 3, 4]) """ def __init__(self, n_features: int, d_embedding: int) -> None: """ Args: n_features: the number of continuous features. d_embedding: the embedding size. 
""" if n_features <= 0: raise ValueError(f'n_features must be positive, however: {n_features=}') if d_embedding <= 0: raise ValueError(f'd_embedding must be positive, however: {d_embedding=}') super().__init__() self.weight = Parameter(torch.empty(n_features, d_embedding)) self.bias = Parameter(torch.empty(n_features, d_embedding)) self.reset_parameters() def reset_parameters(self) -> None: d_rqsrt = self.weight.shape[1] ** -0.5 nn.init.uniform_(self.weight, -d_rqsrt, d_rqsrt) nn.init.uniform_(self.bias, -d_rqsrt, d_rqsrt) def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" _check_input_shape(x, self.weight.shape[0]) return torch.addcmul(self.bias, self.weight, x[..., None]) class LinearReLUEmbeddings(nn.Sequential): """Simple non-linear embeddings for continuous features. **Shape** - Input: `(*, n_features)` - Output: `(*, n_features, d_embedding)` **Examples** >>> batch_size = 2 >>> n_cont_features = 3 >>> x = torch.randn(batch_size, n_cont_features) >>> >>> d_embedding = 32 >>> m = LinearReLUEmbeddings(n_cont_features, d_embedding) >>> m(x).shape torch.Size([2, 3, 32]) """ def __init__(self, n_features: int, d_embedding: int = 32) -> None: super().__init__( OrderedDict( [ ( 'linear', LinearEmbeddings(n_features, d_embedding), ), ('activation', nn.ReLU()), ] ) ) class _Periodic(nn.Module): """ NOTE: THIS MODULE SHOULD NOT BE USED DIRECTLY. Technically, this is a linear embedding without bias followed by the periodic activations. The scale of the initialization (defined by the `sigma` argument) plays an important role. """ def __init__(self, n_features: int, k: int, sigma: float) -> None: if sigma <= 0.0: raise ValueError(f'sigma must be positive, however: {sigma=}') super().__init__() self._sigma = sigma self.weight = Parameter(torch.empty(n_features, k)) self.reset_parameters() def reset_parameters(self): """Reset the parameters.""" # NOTE[DIFF] # Here, extreme values (~0.3% probability) are explicitly avoided just in case. # In the paper, there was no protection from extreme values. bound = self._sigma * 3 nn.init.trunc_normal_(self.weight, 0.0, self._sigma, a=-bound, b=bound) def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" _check_input_shape(x, self.weight.shape[0]) x = 2 * math.pi * self.weight * x[..., None] x = torch.cat([torch.cos(x), torch.sin(x)], -1) return x # _NLinear is a simplified copy of delu.nn.NLinear: # https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html class _NLinear(nn.Module): """N *separate* linear layers for N feature embeddings. In other words, each feature embedding is transformed by its own dedicated linear layer. """ def __init__( self, n: int, in_features: int, out_features: int, bias: bool = True ) -> None: super().__init__() self.weight = Parameter(torch.empty(n, in_features, out_features)) self.bias = Parameter(torch.empty(n, out_features)) if bias else None self.reset_parameters() def reset_parameters(self): """Reset the parameters.""" d_in_rsqrt = self.weight.shape[-2] ** -0.5 nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt) if self.bias is not None: nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt) def forward(self, x: torch.Tensor) -> torch.Tensor: """Do the forward pass.""" if x.ndim != 3: raise ValueError( '_NLinear supports only inputs with exactly one batch dimension,' ' so `x` must have a shape like (BATCH_SIZE, N_FEATURES, D_EMBEDDING).' 
) assert x.shape[-(self.weight.ndim - 1) :] == self.weight.shape[:-1] x = x.transpose(0, 1) x = x @ self.weight x = x.transpose(0, 1) if self.bias is not None: x = x + self.bias return x class PeriodicEmbeddings(nn.Module): """Embeddings for continuous features based on periodic activations. See README for details. **Shape** - Input: `(*, n_features)` - Output: `(*, n_features, d_embedding)` **Examples** >>> batch_size = 2 >>> n_cont_features = 3 >>> x = torch.randn(batch_size, n_cont_features) >>> >>> d_embedding = 24 >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=False) >>> m(x).shape torch.Size([2, 3, 24]) >>> >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=True) >>> m(x).shape torch.Size([2, 3, 24]) >>> >>> # PL embeddings. >>> m = PeriodicEmbeddings(n_cont_features, d_embedding=8, activation=False, lite=False) >>> m(x).shape torch.Size([2, 3, 8]) """ # noqa: E501 def __init__( self, n_features: int, d_embedding: int = 24, *, n_frequencies: int = 48, frequency_init_scale: float = 0.01, activation: bool = True, lite: bool, ) -> None: """ Args: n_features: the number of features. d_embedding: the embedding size. n_frequencies: the number of frequencies for each feature. (denoted as "k" in Section 3.3 in the paper). frequency_init_scale: the initialization scale for the first linear layer (denoted as "sigma" in Section 3.3 in the paper). **This is an important hyperparameter**, see README for details. activation: if `False`, the ReLU activation is not applied. Must be `True` if ``lite=True``. lite: if True, the outer linear layer is shared between all features. See README for details. """ super().__init__() self.periodic = _Periodic(n_features, n_frequencies, frequency_init_scale) self.linear: Union[nn.Linear, _NLinear] if lite: # NOTE[DIFF] # The lite variation was introduced in a different paper # (about the TabR model). if not activation: raise ValueError('lite=True is allowed only when activation=True') self.linear = nn.Linear(2 * n_frequencies, d_embedding) else: self.linear = _NLinear(n_features, 2 * n_frequencies, d_embedding) self.activation = nn.ReLU() if activation else None def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" x = self.periodic(x) x = self.linear(x) if self.activation is not None: x = self.activation(x) return x def _check_bins(bins: list[Tensor]) -> None: if not bins: raise ValueError('The list of bins must not be empty') for i, feature_bins in enumerate(bins): if not isinstance(feature_bins, Tensor): raise ValueError( 'bins must be a list of PyTorch tensors. ' f'However, for {i=}: {type(bins[i])=}' ) if feature_bins.ndim != 1: raise ValueError( 'Each item of the bin list must have exactly one dimension.' f' However, for {i=}: {bins[i].ndim=}' ) if len(feature_bins) < 2: raise ValueError( 'All features must have at least two bin edges.' f' However, for {i=}: {len(bins[i])=}' ) if not feature_bins.isfinite().all(): raise ValueError( 'Bin edges must not contain nan/inf/-inf.' f' However, this is not true for the {i}-th feature' ) if (feature_bins[:-1] >= feature_bins[1:]).any(): raise ValueError( 'Bin edges must be sorted.' f' However, the for the {i}-th feature, the bin edges are not sorted' ) if len(feature_bins) == 2: warnings.warn( f'The {i}-th feature has just two bin edges, which means only one bin.' 
                ' Strictly speaking, using a single bin for the'
                ' piecewise-linear encoding should not break anything,'
                ' but it is the same as using sklearn.preprocessing.MinMaxScaler'
            )


def compute_bins(
    X: torch.Tensor,
    n_bins: int = 48,
    *,
    tree_kwargs: Optional[dict[str, Any]] = None,
    y: Optional[Tensor] = None,
    regression: Optional[bool] = None,
    verbose: bool = False,
) -> list[Tensor]:
    """Compute the bin boundaries for `PiecewiseLinearEncoding` and `PiecewiseLinearEmbeddings`.

    **Usage**

    Compute bins using quantiles (Section 3.2.1 in the paper):

    >>> X_train = torch.randn(10000, 2)
    >>> bins = compute_bins(X_train)

    Compute bins using decision trees (Section 3.2.2 in the paper):

    >>> X_train = torch.randn(10000, 2)
    >>> y_train = torch.randn(len(X_train))
    >>> bins = compute_bins(
    ...     X_train,
    ...     y=y_train,
    ...     regression=True,
    ...     tree_kwargs={'min_samples_leaf': 64, 'min_impurity_decrease': 1e-4},
    ... )

    Args:
        X: the training features.
        n_bins: the number of bins.
        tree_kwargs: keyword arguments for `sklearn.tree.DecisionTreeRegressor`
            (if ``regression=True``) or `sklearn.tree.DecisionTreeClassifier`
            (if ``regression=False``).
            NOTE: requires ``scikit-learn>=1.0,<2`` to be installed.
        y: the training labels (must be provided if ``tree_kwargs`` is not None).
        regression: whether the labels are regression labels
            (must be provided if ``tree_kwargs`` is not None).
        verbose: if True and ``tree_kwargs`` is not None, then ``tqdm``
            (must be installed) will report the progress while fitting trees.

    Returns:
        A list of bin edges for all features. For one feature:

        - the maximum possible number of bin edges is ``n_bins + 1``.
        - the minimum possible number of bin edges is ``1``.
    """  # noqa: E501
    if not isinstance(X, Tensor):
        raise ValueError(f'X must be a PyTorch tensor, however: {type(X)=}')
    if X.ndim != 2:
        raise ValueError(f'X must have exactly two dimensions, however: {X.ndim=}')
    if X.shape[0] < 2:
        raise ValueError(f'X must have at least two rows, however: {X.shape[0]=}')
    if X.shape[1] < 1:
        raise ValueError(f'X must have at least one column, however: {X.shape[1]=}')
    if not X.isfinite().all():
        raise ValueError('X must not contain nan/inf/-inf.')
    if (X == X[0]).all(dim=0).any():
        raise ValueError(
            'All columns of X must have at least two distinct values.'
            ' However, X contains columns with just one distinct value.'
        )
    if n_bins <= 1 or n_bins >= len(X):
        raise ValueError(
            'n_bins must be more than 1, but less than len(X), however:'
            f' {n_bins=}, {len(X)=}'
        )

    if tree_kwargs is None:
        if y is not None or regression is not None or verbose:
            raise ValueError(
                'If tree_kwargs is None, then y must be None, regression must be None'
                ' and verbose must be False'
            )

        _upper = 2**24  # 16_777_216
        if len(X) > _upper:
            warnings.warn(
                f'Computing quantile-based bins for more than {_upper} objects'
                ' may not be possible due to the limitation of PyTorch'
                ' (for details, see https://github.com/pytorch/pytorch/issues/64947;'
                ' if that issue is successfully resolved, this warning may be irrelevant).'  # noqa
                ' As a workaround, subsample the data, i.e. instead of'
                '\ncompute_bins(X, ...)'
                '\ndo'
                '\ncompute_bins(X[torch.randperm(len(X), device=X.device)[:16_777_216]], ...)'  # noqa
                '\nOn CUDA, the computation can still fail with OOM even after'
                ' subsampling. If this is the case, try passing features by groups:'
                '\nbins = sum('
                '\n    compute_bins(X[:, idx], ...)'
                '\n    for idx in torch.arange(len(X), device=X.device).split(group_size),'  # noqa
                '\n    start=[]'
                '\n)'
                '\nAnother option is to perform the computation on CPU:'
                '\ncompute_bins(X.cpu(), ...)'
            )
        del _upper
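        # Illustrative example of the deduplication below: for a discrete feature,
        # many of the n_bins + 1 quantiles coincide, and `.unique()` merges them
        # into fewer bin edges, e.g. (hypothetical input):
        #     >>> x = torch.tensor([0.0, 0.0, 0.0, 1.0])
        #     >>> torch.quantile(x, torch.linspace(0.0, 1.0, 5))
        #     tensor([0.0000, 0.0000, 0.0000, 0.2500, 1.0000])
        # which `.unique()` collapses to the edges [0.0, 0.25, 1.0].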
        # NOTE[DIFF]
        # The code below is more correct than the original implementation,
        # because the original implementation contains an unintentional divergence
        # from what is written in the paper. That divergence affected only the
        # quantile-based embeddings, but not the tree-based embeddings.
        # For historical reference, here is the original, less correct, implementation:
        # https://github.com/yandex-research/tabular-dl-num-embeddings/blob/c1d9eb63c0685b51d7e1bc081cdce6ffdb8886a8/bin/train4.py#L612C30-L612C30
        # (explanation: limiting the number of quantiles by the number of distinct
        # values is NOT the same as removing identical quantiles after computing them).
        bins = [
            q.unique()
            for q in torch.quantile(
                X, torch.linspace(0.0, 1.0, n_bins + 1).to(X), dim=0
            ).T
        ]
        _check_bins(bins)
        return bins

    else:
        if sklearn_tree is None:
            raise RuntimeError(
                'The scikit-learn package is missing.'
                ' See README.md for installation instructions'
            )
        if y is None or regression is None:
            raise ValueError(
                'If tree_kwargs is not None, then y and regression must not be None'
            )
        if y.ndim != 1:
            raise ValueError(f'y must have exactly one dimension, however: {y.ndim=}')
        if len(y) != len(X):
            raise ValueError(
                f'len(y) must be equal to len(X), however: {len(y)=}, {len(X)=}'
            )
        if 'max_leaf_nodes' in tree_kwargs:
            raise ValueError(
                'tree_kwargs must not contain the key "max_leaf_nodes"'
                ' (it will be set to n_bins automatically).'
            )
        if verbose:
            if tqdm is None:
                raise ImportError('If verbose is True, tqdm must be installed')
            tqdm_ = tqdm
        else:
            tqdm_ = lambda x: x  # noqa: E731

        if X.device.type != 'cpu' or y.device.type != 'cpu':
            warnings.warn(
                'Computing tree-based bins involves the conversion of the input PyTorch'
                ' tensors to NumPy arrays. The provided PyTorch tensors are not'
                ' located on CPU, so the conversion has some overhead.',
                UserWarning,
            )
        X_numpy = X.cpu().numpy()
        y_numpy = y.cpu().numpy()
        bins = []
        for column in tqdm_(X_numpy.T):
            feature_bin_edges = [float(column.min()), float(column.max())]
            tree = (
                (
                    sklearn_tree.DecisionTreeRegressor
                    if regression
                    else sklearn_tree.DecisionTreeClassifier
                )(max_leaf_nodes=n_bins, **tree_kwargs)
                .fit(column.reshape(-1, 1), y_numpy)
                .tree_
            )
            for node_id in range(tree.node_count):
                # The following condition is True only for split nodes. Source:
                # https://scikit-learn.org/1.0/auto_examples/tree/plot_unveil_tree_structure.html#tree-structure
                if tree.children_left[node_id] != tree.children_right[node_id]:
                    feature_bin_edges.append(float(tree.threshold[node_id]))
            bins.append(torch.as_tensor(feature_bin_edges).unique())
        _check_bins(bins)
        return [x.to(device=X.device, dtype=X.dtype) for x in bins]


class _PiecewiseLinearEncodingImpl(nn.Module):
    """Piecewise-linear encoding.

    NOTE: THIS CLASS SHOULD NOT BE USED DIRECTLY.
    In particular, this class does *not* add any positional information
    to feature encodings. Thus, for Transformer-like models,
    `PiecewiseLinearEmbeddings` is the only valid option.

    Note:
        This is the *encoding* module, not the *embedding* module,
        so it only implements Equation 1 (Figure 1) from the paper,
        and does not have trainable parameters.
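
    For intuition, an illustrative example (values chosen here for exposition):
    with bin edges ``[0.0, 1.0, 3.0]`` (two bins), the value ``x = 2.0`` lies in
    the second bin, so its piecewise-linear encoding (see the formula below) is
    ``[1.0, (2.0 - 1.0) / (3.0 - 1.0)] = [1.0, 0.5]``.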
**Shape** * Input: ``(*, n_features)`` * Output: ``(*, n_features, max_n_bins)``, where ``max_n_bins`` is the maximum number of bins over all features: ``max_n_bins = max(len(b) - 1 for b in bins)``. To understand the output structure, consider a feature with the number of bins ``n_bins``. Formally, its piecewise-linear encoding is a vector of the size ``n_bins`` that looks as follows:: x_ple = [1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0] However, this class will instead produce a vector of the size ``max_n_bins``:: x_ple_actual = [*x_ple[:-1], *zeros(max_n_bins - n_bins), x_ple[-1]] In other words: * The last encoding component is **always** located in the end, even if ``n_bins == 1`` (i.e. even if it is the only component). * The leading ``n_bins - 1`` components are located in the beginning. * Everything in-between is always set to zeros (like "padding", but in the middle). This implementation is *significantly* faster than the original one. It relies on two key observations: * The piecewise-linear encoding is just a non-trainable linear transformation followed by a clamp-based activation. Pseudocode: `PiecewiseLinearEncoding(x) = Activation(Linear(x))`. The parameters of the linear transformation are defined by the bin edges. * Aligning the *last* encoding channel across all features allows applying the aforementioned activation simultaneously to all features without the loop over features. """ weight: Tensor """The weight of the linear transformation mentioned in the class docstring.""" bias: Tensor """The bias of the linear transformation mentioned in the class docstring.""" single_bin_mask: Optional[Tensor] """The indicators of the features with only one bin.""" mask: Optional[Tensor] """The indicators of the "valid" (i.e. "non-padding") part of the encoding.""" def __init__(self, bins: list[Tensor]) -> None: """ Args: bins: the bins computed by `compute_bins`. """ assert len(bins) > 0 super().__init__() n_features = len(bins) n_bins = [len(x) - 1 for x in bins] max_n_bins = max(n_bins) self.register_buffer('weight', torch.zeros(n_features, max_n_bins)) self.register_buffer('bias', torch.zeros(n_features, max_n_bins)) single_bin_mask = torch.tensor(n_bins) == 1 self.register_buffer( 'single_bin_mask', single_bin_mask if single_bin_mask.any() else None ) self.register_buffer( 'mask', # The mask is needed if features have different number of bins. None if all(len(x) == len(bins[0]) for x in bins) else torch.row_stack( [ torch.cat( [ # The number of bins for this feature, minus 1: torch.ones((len(x) - 1) - 1, dtype=torch.bool), # Unused components (always zeros): torch.zeros(max_n_bins - (len(x) - 1), dtype=torch.bool), # The last bin: torch.ones(1, dtype=torch.bool), ] ) # x is a tensor containing the bin bounds for a given feature. for x in bins ] ), ) for i, bin_edges in enumerate(bins): # Formally, the piecewise-linear encoding of one feature looks as follows: # `[1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0]` # The linear transformation based on the weight and bias defined below # implements the expression in the middle before the clipping to [0, 1]. # Note that the actual encoding layout produced by this class # is slightly different. See the docstring of this class for details. 
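            # Worked example with hypothetical bins: for bin_edges = [0.0, 1.0, 3.0],
            # bin_width = [1.0, 2.0], hence w = [1.0, 0.5] and b = [0.0, -0.5];
            # for x = 2.0 this gives w * x + b = [2.0, 0.5], which the clamping in
            # `forward` turns into the encoding [1.0, 0.5].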
bin_width = bin_edges.diff()
            w = 1.0 / bin_width
            b = -bin_edges[:-1] / bin_width
            # The last encoding component:
            self.weight[i, -1] = w[-1]
            self.bias[i, -1] = b[-1]
            # The leading encoding components:
            self.weight[i, : n_bins[i] - 1] = w[:-1]
            self.bias[i, : n_bins[i] - 1] = b[:-1]
            # All in-between components will always be zeros,
            # because the weight and bias are initialized with zeros.

    def get_max_n_bins(self) -> int:
        return self.weight.shape[-1]

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = torch.addcmul(self.bias, self.weight, x[..., None])
        if x.shape[-1] > 1:
            x = torch.cat(
                [
                    x[..., :1].clamp_max(1.0),
                    x[..., 1:-1].clamp(0.0, 1.0),
                    (
                        x[..., -1:].clamp_min(0.0)
                        if self.single_bin_mask is None
                        else torch.where(
                            # For features with only one bin,
                            # the whole "piecewise-linear" encoding effectively behaves
                            # like min-max scaling
                            # (assuming that the edges of the single bin
                            # are the minimum and maximum feature values).
                            self.single_bin_mask[..., None],
                            x[..., -1:],
                            x[..., -1:].clamp_min(0.0),
                        )
                    ),
                ],
                dim=-1,
            )
        return x


class PiecewiseLinearEncoding(nn.Module):
    """Piecewise-linear encoding.

    See README for detailed explanation.

    **Shape**

    - Input: ``(*, n_features)``
    - Output: ``(*, total_n_bins)``, where ``total_n_bins`` is the total number of bins
      for all features: ``total_n_bins = sum(len(b) - 1 for b in bins)``.

    Technically, the output of this module is the flattened output
    of `_PiecewiseLinearEncodingImpl` with all "padding" values removed.
    """

    def __init__(self, bins: list[Tensor]) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
        """
        super().__init__()
        self.impl = _PiecewiseLinearEncodingImpl(bins)

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = self.impl(x)
        return x.flatten(-2) if self.impl.mask is None else x[:, self.impl.mask]


class PiecewiseLinearEmbeddings(nn.Module):
    """Piecewise-linear embeddings.

    **Shape**

    - Input: ``(batch_size, n_features)``
    - Output: ``(batch_size, n_features, d_embedding)``
    """

    def __init__(
        self,
        bins: list[Tensor],
        d_embedding: int,
        *,
        activation: bool,
        version: Literal[None, 'A', 'B'] = None,
    ) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
            d_embedding: the embedding size.
            activation: if True, the ReLU activation is additionally applied in the end.
            version: the preset for various implementation details, such as
                parametrization and initialization. See README for details.
        """
        if d_embedding <= 0:
            raise ValueError(
                f'd_embedding must be a positive integer, however: {d_embedding=}'
            )
        _check_bins(bins)
        if version is None:
            warnings.warn(
                'The `version` argument is not provided, so version="A" will be used'
                ' for backward compatibility.'
                ' See README for recommendations regarding `version`.'
                ' In future, omitting this argument will result in an exception.'
            )
            version = 'A'

        super().__init__()
        n_features = len(bins)
        # NOTE[DIFF]
        # version="B" was introduced in a different paper (about the TabM model).
        is_version_B = version == 'B'
        self.linear0 = (
            LinearEmbeddings(n_features, d_embedding) if is_version_B else None
        )
        self.impl = _PiecewiseLinearEncodingImpl(bins)
        self.linear = _NLinear(
            len(bins),
            self.impl.get_max_n_bins(),
            d_embedding,
            # For the version "B", the bias is already present in self.linear0.
            bias=not is_version_B,
        )
        if is_version_B:
            # Because of the following line, at initialization,
            # the whole embedding behaves like a linear embedding.
            # The piecewise-linear component is incrementally learnt during training.
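            # In other words: with self.linear.weight zeroed and bias=False, the
            # forward pass computes linear0(x) + 0, so at initialization the module
            # coincides with `LinearEmbeddings`, and training gradually activates
            # the piecewise-linear term.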
nn.init.zeros_(self.linear.weight) self.activation = nn.ReLU() if activation else None def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" if x.ndim != 2: raise ValueError( 'For now, only inputs with exactly one batch dimension are supported.' ) x_linear = None if self.linear0 is None else self.linear0(x) x_ple = self.impl(x) x_ple = self.linear(x_ple) if self.activation is not None: x_ple = self.activation(x_ple) return x_ple if x_linear is None else x_linear + x_ple ================================================ FILE: pytabkit/models/nn_models/rtdl_resnet.py ================================================ import math import numbers import typing as ty import torch import torch.nn as nn import torch.nn.functional as F import torch.nn.init as nn_init from torch import Tensor import numpy as np import pandas as pd import torch.nn as nn import skorch from skorch.callbacks import EarlyStopping, LRScheduler, PrintLog from skorch import NeuralNetRegressor, NeuralNetClassifier from skorch.dataset import Dataset from skorch.callbacks import EpochScoring from skorch.callbacks import WandbLogger from skorch.callbacks import Callback, Checkpoint from torch.optim.lr_scheduler import ReduceLROnPlateau from torch.optim import AdamW, Adam, SGD # import sys # sys.path.append("") import numpy as np import os from functools import partial from copy import deepcopy from .rtdl_num_embeddings import PeriodicEmbeddings # code adapted from https://github.com/yandex-research/rtdl/tree/e5dac7f1bb33078699f5079ce301dc907c5b512a/bin def reglu(x: Tensor) -> Tensor: a, b = x.chunk(2, dim=-1) return a * F.relu(b) def geglu(x: Tensor) -> Tensor: a, b = x.chunk(2, dim=-1) return a * F.gelu(b) def get_activation_fn(name: str) -> ty.Callable[[Tensor], Tensor]: return ( reglu if name == 'reglu' else geglu if name == 'geglu' else torch.sigmoid if name == 'sigmoid' else getattr(F, name) ) def get_nonglu_activation_fn(name: str) -> ty.Callable[[Tensor], Tensor]: return ( F.relu if name == 'reglu' else F.gelu if name == 'geglu' else get_activation_fn(name) ) def print_but_serializable(*args, **kwargs): # this is a dummy function to prevent an obscure error in pickling skorch objects # containing callbacks with sink=print # The error occurs when ray.init() and FunctionProcess() are both used. 
Error message: # _pickle.PicklingError: Can't pickle : it's not the same object as builtins.print print(*args, **kwargs) class RTDL_MLP(nn.Module): # baseline MLP def __init__( self, *, d_in: int, n_layers: int, d_layers: ty.Union[int, ty.List[int]], d_first_layer: int, d_last_layer: int, dropout: float, d_out: int, categories: ty.Optional[ty.List[int]], d_embedding: int, regression: bool, categorical_indicator, num_emb_type: str = 'none', num_emb_dim: int = 24, num_emb_hidden_dim: int = 48, num_emb_sigma: float = 0.01, num_emb_lite: bool = False ) -> None: super().__init__() self.regression = regression self.categorical_indicator = categorical_indicator # Added self.categories = categories # Added if num_emb_type == 'none': self.num_emb_layer = nn.Identity() elif num_emb_type == 'plr': self.num_emb_layer = nn.Sequential(PeriodicEmbeddings(d_in, num_emb_dim, n_frequencies=num_emb_hidden_dim, frequency_init_scale=num_emb_sigma, lite=num_emb_lite), nn.Flatten()) d_in = d_in * num_emb_dim elif num_emb_type == 'pl': self.num_emb_layer = nn.Sequential(PeriodicEmbeddings(d_in, num_emb_dim, n_frequencies=num_emb_hidden_dim, frequency_init_scale=num_emb_sigma, activation=False, lite=num_emb_lite), nn.Flatten()) d_in = d_in * num_emb_dim else: raise ValueError(f'Unknown numerical embedding type "{num_emb_type}"') if categories is not None and len(categories) > 0: d_in += len(categories) * d_embedding category_offsets = torch.tensor( np.concatenate([[0], np.array(categories[:-1], dtype=np.int64) ]) ).cumsum(0) self.register_buffer("category_offsets", category_offsets) self.category_embeddings = nn.Embedding(int(sum(categories)), d_embedding) nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) # set the embedding of the last category of each feature to zero # it represents the "missing" category, i.e. 
the categories that is not present # in the training set for i, c in enumerate(categories): self.category_embeddings.weight.data[ category_offsets[i] + c - 1 ].zero_() if isinstance(d_layers, numbers.Number): d_layers = [d_first_layer] + [d_layers for _ in range(n_layers)] + [d_last_layer] # CHANGED else: assert len(d_layers) == n_layers self.layers = nn.ModuleList( [ nn.Linear(d_layers[i - 1] if i else d_in, x) for i, x in enumerate(d_layers) ] ) self.dropout = dropout self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) def forward(self, x): if not self.categorical_indicator is None: x_num = x[:, ~self.categorical_indicator].float() x_cat = x[:, self.categorical_indicator].long() else: x_num = x x_cat = None # Added: Numerical embeddings x_num = self.num_emb_layer(x_num) x = [] if x_num is not None: x.append(x_num) if x_cat is not None: # replace -1 by the last category for i in range(x_cat.shape[1]): x_cat[:, i][x_cat[:, i] == -1] = self.categories[i] - 1 x.append( self.category_embeddings(x_cat + self.category_offsets[None]).view( x_cat.size(0), -1 ) ) x = torch.cat(x, dim=-1) for layer in self.layers: x = layer(x) x = F.relu(x) if self.dropout: x = F.dropout(x, self.dropout, self.training) x = self.head(x) if not self.regression: x = x.squeeze(-1) return x class ResNet(nn.Module): def __init__( self, *, d_in: int, categories: ty.Optional[ty.List[int]], d_embedding: int, d: int, d_hidden_factor: float, n_layers: int, activation: str, normalization: str, hidden_dropout: float, residual_dropout: float, d_out: int, regression: bool, categorical_indicator ) -> None: super().__init__() def make_normalization(): return {"batchnorm": nn.BatchNorm1d, "layernorm": nn.LayerNorm}[ normalization ](d) self.categorical_indicator = categorical_indicator # Added self.regression = regression self.main_activation = get_activation_fn(activation) self.last_activation = get_nonglu_activation_fn(activation) self.residual_dropout = residual_dropout self.hidden_dropout = hidden_dropout d_hidden = int(d * d_hidden_factor) self.categories = categories if categories is not None and len(categories) > 0: d_in += len(categories) * d_embedding category_offsets = torch.tensor( np.concatenate([[0], np.array(categories[:-1], dtype=np.int64) ]) ).cumsum(0) self.register_buffer("category_offsets", category_offsets) self.category_embeddings = nn.Embedding(int(sum(categories)), d_embedding) nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) # set the embedding of the last category of each feature to zero # it represents the "missing" category, i.e. 
the categories that is not present # in the training set for i, c in enumerate(categories): self.category_embeddings.weight.data[ category_offsets[i] + c - 1 ].zero_() self.first_layer = nn.Linear(d_in, d) self.layers = nn.ModuleList( [ nn.ModuleDict( { "norm": make_normalization(), "linear0": nn.Linear( d, d_hidden * (2 if activation.endswith("glu") else 1) ), "linear1": nn.Linear(d_hidden, d), } ) for _ in range(n_layers) ] ) self.last_normalization = make_normalization() self.head = nn.Linear(d, d_out) def forward(self, x) -> Tensor: if not self.categorical_indicator is None: x_num = x[:, ~self.categorical_indicator].float() x_cat = x[:, self.categorical_indicator].long() else: x_num = x x_cat = None x = [] if x_num is not None and x_num.numel() > 0: x.append(x_num) if x_cat is not None and x_cat.numel() > 0: # replace -1 by the last category for i in range(x_cat.shape[1]): x_cat[:, i][x_cat[:, i] == -1] = self.categories[i] - 1 x.append( self.category_embeddings(x_cat + self.category_offsets[None]).view( x_cat.size(0), -1 ) ) x = torch.cat(x, dim=-1) x = self.first_layer(x) for layer in self.layers: layer = ty.cast(ty.Dict[str, nn.Module], layer) z = x z = layer["norm"](z) z = layer["linear0"](z) z = self.main_activation(z) if self.hidden_dropout: z = F.dropout(z, self.hidden_dropout, self.training) z = layer["linear1"](z) if self.residual_dropout: z = F.dropout(z, self.residual_dropout, self.training) x = x + z x = self.last_normalization(x) x = self.last_activation(x) x = self.head(x) if not self.regression: x = x.squeeze(-1) return x class Tokenizer(nn.Module): category_offsets: ty.Optional[Tensor] def __init__( self, d_numerical: int, categories: ty.Optional[ty.List[int]], d_token: int, bias: bool, ) -> None: #categories = None super().__init__() if categories is None: d_bias = d_numerical self.category_offsets = None self.category_embeddings = None else: d_bias = d_numerical + len(categories) category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0) self.register_buffer('category_offsets', category_offsets) self.category_embeddings = nn.Embedding(sum(categories), d_token) nn_init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) print(f'{self.category_embeddings.weight.shape=}') # set the embedding of the last category of each feature to zero # it represents the "missing" category, i.e. 
the categories that is not present # in the training set for i, c in enumerate(categories): self.category_embeddings.weight.data[ category_offsets[i] + c - 1 ].zero_() # take [CLS] token into account self.weight = nn.Parameter(Tensor(d_numerical + 1, d_token)) self.bias = nn.Parameter(Tensor(d_bias, d_token)) if bias else None # The initialization is inspired by nn.Linear nn_init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: nn_init.kaiming_uniform_(self.bias, a=math.sqrt(5)) self.categories = categories @property def n_tokens(self) -> int: return len(self.weight) + ( 0 if self.category_offsets is None else len(self.category_offsets) ) def forward(self, x_num: Tensor, x_cat: ty.Optional[Tensor]) -> Tensor: x_some = x_num if x_cat is None else x_cat assert x_some is not None x_num = torch.cat( [torch.ones(len(x_some), 1, device=x_some.device)] # [CLS] + ([] if x_num is None else [x_num]), dim=1, ) x = self.weight[None] * x_num[:, :, None] if x_cat is not None: # replace -1 by the last category for i in range(x_cat.shape[1]): x_cat[:, i][x_cat[:, i] == -1] = self.categories[i] - 1 x = torch.cat( [x, self.category_embeddings(x_cat + self.category_offsets[None])], dim=1, ) if self.bias is not None: bias = torch.cat( [ torch.zeros(1, self.bias.shape[1], device=x.device), self.bias, ] ) x = x + bias[None] return x class MultiheadAttention(nn.Module): def __init__( self, d: int, n_heads: int, dropout: float, initialization: str ) -> None: if n_heads > 1: assert d % n_heads == 0 assert initialization in ['xavier', 'kaiming'] super().__init__() self.W_q = nn.Linear(d, d) self.W_k = nn.Linear(d, d) self.W_v = nn.Linear(d, d) self.W_out = nn.Linear(d, d) if n_heads > 1 else None self.n_heads = n_heads self.dropout = nn.Dropout(dropout) if dropout else None for m in [self.W_q, self.W_k, self.W_v]: if initialization == 'xavier' and (n_heads > 1 or m is not self.W_v): # gain is needed since W_qkv is represented with 3 separate layers nn_init.xavier_uniform_(m.weight, gain=1 / math.sqrt(2)) nn_init.zeros_(m.bias) if self.W_out is not None: nn_init.zeros_(self.W_out.bias) def _reshape(self, x: Tensor) -> Tensor: batch_size, n_tokens, d = x.shape d_head = d // self.n_heads return ( x.reshape(batch_size, n_tokens, self.n_heads, d_head) .transpose(1, 2) .reshape(batch_size * self.n_heads, n_tokens, d_head) ) def forward( self, x_q: Tensor, x_kv: Tensor, key_compression: ty.Optional[nn.Linear], value_compression: ty.Optional[nn.Linear], ) -> Tensor: q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv) for tensor in [q, k, v]: assert tensor.shape[-1] % self.n_heads == 0 if key_compression is not None: assert value_compression is not None k = key_compression(k.transpose(1, 2)).transpose(1, 2) v = value_compression(v.transpose(1, 2)).transpose(1, 2) else: assert value_compression is None batch_size = len(q) d_head_key = k.shape[-1] // self.n_heads d_head_value = v.shape[-1] // self.n_heads n_q_tokens = q.shape[1] q = self._reshape(q) k = self._reshape(k) attention = F.softmax(q @ k.transpose(1, 2) / math.sqrt(d_head_key), dim=-1) if self.dropout is not None: attention = self.dropout(attention) x = attention @ self._reshape(v) x = ( x.reshape(batch_size, self.n_heads, n_q_tokens, d_head_value) .transpose(1, 2) .reshape(batch_size, n_q_tokens, self.n_heads * d_head_value) ) if self.W_out is not None: x = self.W_out(x) return x class FT_Transformer(nn.Module): """Transformer. 
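
    In short: `Tokenizer` turns every numerical and categorical feature into a
    token and prepends a [CLS] token, the attention/FFN layers mix the tokens,
    and the prediction head is applied to the final [CLS] representation.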
References: - https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html - https://github.com/facebookresearch/pytext/tree/master/pytext/models/representations/transformer - https://github.com/pytorch/fairseq/blob/1bba712622b8ae4efb3eb793a8a40da386fe11d0/examples/linformer/linformer_src/modules/multihead_linear_attention.py#L19 """ def __init__( self, *, # tokenizer d_in: int, #changed name categories: ty.Optional[ty.List[int]], token_bias: bool, # transformer n_layers: int, d_token: int, n_heads: int, d_ffn_factor: float, attention_dropout: float, ffn_dropout: float, residual_dropout: float, activation: str, prenormalization: bool, initialization: str, # linformer kv_compression: ty.Optional[float], kv_compression_sharing: ty.Optional[str], # d_out: int, regression: bool, categorical_indicator ) -> None: assert (kv_compression is None) ^ (kv_compression_sharing is not None) super().__init__() self.tokenizer = Tokenizer(d_in, categories, d_token, token_bias) n_tokens = self.tokenizer.n_tokens # print("d_token {}".format(d_token)) self.categorical_indicator = categorical_indicator self.regression = regression def make_kv_compression(): assert kv_compression compression = nn.Linear( n_tokens, int(n_tokens * kv_compression), bias=False ) if initialization == 'xavier': nn_init.xavier_uniform_(compression.weight) return compression self.shared_kv_compression = ( make_kv_compression() if kv_compression and kv_compression_sharing == 'layerwise' else None ) def make_normalization(): return nn.LayerNorm(d_token) d_hidden = int(d_token * d_ffn_factor) self.layers = nn.ModuleList([]) for layer_idx in range(n_layers): layer = nn.ModuleDict( { 'attention': MultiheadAttention( d_token, n_heads, attention_dropout, initialization ), 'linear0': nn.Linear( d_token, d_hidden * (2 if activation.endswith('glu') else 1) ), 'linear1': nn.Linear(d_hidden, d_token), 'norm1': make_normalization(), } ) if not prenormalization or layer_idx: layer['norm0'] = make_normalization() if kv_compression and self.shared_kv_compression is None: layer['key_compression'] = make_kv_compression() if kv_compression_sharing == 'headwise': layer['value_compression'] = make_kv_compression() else: assert kv_compression_sharing == 'key-value' self.layers.append(layer) self.activation = get_activation_fn(activation) self.last_activation = get_nonglu_activation_fn(activation) self.prenormalization = prenormalization self.last_normalization = make_normalization() if prenormalization else None self.ffn_dropout = ffn_dropout self.residual_dropout = residual_dropout self.head = nn.Linear(d_token, d_out) def _get_kv_compressions(self, layer): return ( (self.shared_kv_compression, self.shared_kv_compression) if self.shared_kv_compression is not None else (layer['key_compression'], layer['value_compression']) if 'key_compression' in layer and 'value_compression' in layer else (layer['key_compression'], layer['key_compression']) if 'key_compression' in layer else (None, None) ) def _start_residual(self, x, layer, norm_idx): x_residual = x if self.prenormalization: norm_key = f'norm{norm_idx}' if norm_key in layer: x_residual = layer[norm_key](x_residual) return x_residual def _end_residual(self, x, x_residual, layer, norm_idx): if self.residual_dropout: x_residual = F.dropout(x_residual, self.residual_dropout, self.training) x = x + x_residual if not self.prenormalization: x = layer[f'norm{norm_idx}'](x) return x def forward(self, x) -> Tensor: if not self.categorical_indicator is None: x_num = x[:, ~self.categorical_indicator].float() 
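            # (clarifying note: `x` packs numerical and categorical columns into a
            # single float tensor; `categorical_indicator` is a boolean column mask.
            # The categorical columns hold integer codes, with -1 for missing values,
            # which `Tokenizer.forward` remaps to the last, "missing" category.)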
x_cat = x[:, self.categorical_indicator].long() #TODO else: x_num = x x_cat = None #x_cat = None #FIXME x = self.tokenizer(x_num, x_cat) for layer_idx, layer in enumerate(self.layers): is_last_layer = layer_idx + 1 == len(self.layers) layer = ty.cast(ty.Dict[str, nn.Module], layer) x_residual = self._start_residual(x, layer, 0) x_residual = layer['attention']( # for the last attention, it is enough to process only [CLS] (x_residual[:, :1] if is_last_layer else x_residual), x_residual, *self._get_kv_compressions(layer), ) if is_last_layer: x = x[:, : x_residual.shape[1]] x = self._end_residual(x, x_residual, layer, 0) x_residual = self._start_residual(x, layer, 1) x_residual = layer['linear0'](x_residual) x_residual = self.activation(x_residual) if self.ffn_dropout: x_residual = F.dropout(x_residual, self.ffn_dropout, self.training) x_residual = layer['linear1'](x_residual) x = self._end_residual(x, x_residual, layer, 1) assert x.shape[1] == 1 x = x[:, 0] if self.last_normalization is not None: x = self.last_normalization(x) x = self.last_activation(x) x = self.head(x) if not self.regression: x = x.squeeze(-1) return x class InputShapeSetterResnet(skorch.callbacks.Callback): def __init__( self, regression=False, batch_size=None, cat_features=None, categories=None ): self.cat_features = cat_features self.regression = regression self.batch_size = batch_size self.categories = categories def on_train_begin(self, net, X, y): if net.categorical_indicator is None: if self.cat_features is not None: # TODO: it's redundant net.set_categorical_indicator( np.array([i in self.cat_features for i in range(X.shape[1])]) ) else: d_in = X.shape[1] categories = None else: d_in = X.shape[1] - sum(net.categorical_indicator) if self.categories is None: categories = [ # +1 for the unknown category len(set(X[:, i])) + 1 for i in np.where(net.categorical_indicator)[0] ] else: categories = self.categories if self.regression: d_out = 1 else: if hasattr(net, "n_classes"): d_out = net.n_classes else: assert y.max() + 1 == len(set(y)) d_out = int(y.max() + 1) net.set_params( module__d_in=d_in, module__categories=categories, # FIXME #lib.get_categories(X_cat), module__categorical_indicator=torch.BoolTensor(net.categorical_indicator) if net.categorical_indicator is not None else None, module__d_out=d_out, ) class LearningRateLogger(Callback): def on_epoch_begin(self, net, dataset_train=None, dataset_valid=None, **kwargs): callbacks = net.callbacks for callback in callbacks: if isinstance(callback, WandbLogger): callback.wandb_run.log( {"log_lr": np.log10(net.optimizer_.param_groups[0]["lr"])} ) class UniquePrefixCheckpoint(Checkpoint): """ This class has two purposes: - add a unique prefix to the checkpoint file to avoid conflicts between different runs in parallel - remove the checkpoint file when training is finished to avoid having too many files """ def initialize(self): print("Initializing UniquePrefixCheckpoint") self.fn_prefix = str(id(self)) print("fn_prefix is {}".format(self.fn_prefix)) return super(UniquePrefixCheckpoint, self).initialize() # override method to delete the checkpoint file def on_train_end(self, net, **kwargs): print("train end") if not self.load_best or self.monitor is None: return self._sink("Loading best checkpoint after training.", net.verbose) is_regression = isinstance(net, NeuralNetRegressorWrapped) try: net.load_params(checkpoint=self, use_safetensors=self.use_safetensors) # addition print(f"removing {self.dirname}/{self.fn_prefix}params.pt") 
os.remove(f"{self.dirname}/{self.fn_prefix}params.pt") # if doing regression check if constant_val_mse is better than valid_loss_best # if so, replace the model prediction with constant prediction if is_regression: constant_val_mse = net.history[:, "constant_val_mse"][0] # all the same all_val_mse = net.history[:, "valid_loss"] # remove nan and inf all_val_mse = np.array(all_val_mse)[~np.isnan(all_val_mse)] if not len(all_val_mse) or np.all(all_val_mse > constant_val_mse): print("All valid loss are worse than constant prediction") print("Replacing model prediction with constant prediction") net.set_predict_mean(True) except FileNotFoundError: print("COULD NOT FIND CHECKPOINT FILE") if not is_regression: # this should only happen for regression raise # check that valid loss is always nan or inf valid_loss = net.history[:, "valid_loss"] assert np.all(np.isnan(valid_loss) | np.isinf(valid_loss)) print("valid loss is always nan or inf") print("Replacing model prediction with constant prediction") net.set_predict_mean(True) class MyCustomError(Exception): pass class EarlyStoppingCustomError(EarlyStopping): def on_epoch_end(self, net, **kwargs): current_score = net.history[-1, self.monitor] if not self._is_score_improved(current_score): self.misses_ += 1 else: self.misses_ = 0 self.dynamic_threshold_ = self._calc_new_threshold(current_score) self.best_epoch_ = net.history[-1, "epoch"] if self.load_best: self.best_model_weights_ = deepcopy(net.module_.state_dict()) if self.misses_ == self.patience: if net.verbose: self._sink("Stopping since {} has not improved in the last " "{} epochs.".format(self.monitor, self.patience), verbose=net.verbose) raise MyCustomError class NeuralNetRegressorWrapped(NeuralNetRegressor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.categorical_indicator = None self.predict_mean = False # whether to predict y_train mean if # the network predictions are nan or too bad self.y_train_mean = None def set_categorical_indicator(self, categorical_indicator): self.categorical_indicator = categorical_indicator def set_predict_mean(self, predict_mean): self.predict_mean = predict_mean def set_y_train_mean(self, y_train_mean): self.y_train_mean = y_train_mean def get_default_callbacks(self): callbacks = [cb for cb in super().get_default_callbacks() if not isinstance(cb[1], PrintLog)] callbacks.append(('print_log', PrintLog(sink=print_but_serializable))) print(callbacks) return callbacks def fit(self, X, y): if y.ndim == 1: y = y.reshape(-1, 1) self.set_y_train_mean(np.mean(y)) return super().fit(X, y) def predict(self, X): if self.predict_mean: return np.ones((X.shape[0], 1)) * self.y_train_mean else: return super().predict(X) # adapted from skorch code # to remove ignoring keyboard interrupt # as it can be dangerous for benchmarking # pylint: disable=unused-argument def partial_fit(self, X, y=None, classes=None, **fit_params): if not self.initialized_: self.initialize() self.notify('on_train_begin', X=X, y=y) try: self.fit_loop(X, y, **fit_params) # except KeyboardInterrupt: except MyCustomError: pass self.notify('on_train_end', X=X, y=y) return self class NeuralNetClassifierWrapped(NeuralNetClassifier): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.categorical_indicator = None self.n_classes = None # automatically inferred from train if not set def set_categorical_indicator(self, categorical_indicator): self.categorical_indicator = categorical_indicator def set_n_classes(self, n_classes): self.n_classes = n_classes def 
fit(self, X, y): y = y.astype(np.int64) return super().fit(X, y) def get_default_callbacks(self): callbacks = [cb for cb in super().get_default_callbacks() if not isinstance(cb[1], PrintLog)] callbacks.append(('print_log', PrintLog(sink=print_but_serializable))) print(callbacks) return callbacks # adapted from skorch code # to remove ignoring keyboard interrupt # as it can be dangerous for benchmarking # pylint: disable=unused-argument def partial_fit(self, X, y=None, classes=None, **fit_params): if not self.initialized_: self.initialize() self.notify('on_train_begin', X=X, y=y) try: self.fit_loop(X, y, **fit_params) # except KeyboardInterrupt: except MyCustomError: pass self.notify('on_train_end', X=X, y=y) return self # for FT-Transformer, we extend the NeuralNet class to allow different weight decay for different # parts of the network def initialize_optimizer_ft_transformer(self, triggered_directly=None): """Initialize the model optimizer. If ``self.optimizer__lr`` is not set, use ``self.lr`` instead. Parameters ---------- triggered_directly Deprecated, don't use it anymore. """ # handle deprecated parameter # if triggered_directly is not None: # warnings.warn( # "The 'triggered_directly' argument to 'initialize_optimizer' is " # "deprecated, please don't use it anymore.", DeprecationWarning) named_parameters = list(self.get_all_learnable_params()) # print no_wd_names = ['tokenizer', '.norm', '.bias'] for x in ['tokenizer', '.norm', '.bias']: assert any(x in a for a in (b[0] for b in named_parameters)) #TODO improve this def needs_wd(name): return all(x not in name for x in no_wd_names) named_parameters_grouped = [ {'params': [v for k, v in named_parameters if needs_wd(k)]}, { 'params': [v for k, v in named_parameters if not needs_wd(k)], 'weight_decay': 0.0, }] args, kwargs = self.get_params_for_optimizer( 'optimizer', named_parameters) # pylint: disable=attribute-defined-outside-init self.optimizer_ = self.optimizer(named_parameters_grouped, **kwargs) return self class NeuralNetClassifierCustomOptim(NeuralNetClassifierWrapped): def initialize_optimizer(self, triggered_directly=None): return initialize_optimizer_ft_transformer(self, triggered_directly) class NeuralNetRegressorCustomOptim(NeuralNetRegressorWrapped): def initialize_optimizer(self, triggered_directly=None): return initialize_optimizer_ft_transformer(self, triggered_directly) def mse_constant_predictor(model, X, y): return np.mean((y - model.y_train_mean) ** 2) def create_regressor_skorch( id=None, wandb_run=None, use_checkpoints=True, cat_features=None, model_name="resnet", checkpoint_dir="skorch_cp", **kwargs ): print("RTDL regressor") if "lr_scheduler" not in kwargs: lr_scheduler = False else: lr_scheduler = kwargs.pop("lr_scheduler") if "es_patience" not in kwargs.keys(): es_patience = 40 else: es_patience = kwargs.pop("es_patience") if "lr_patience" not in kwargs.keys(): lr_patience = 30 else: lr_patience = kwargs.pop("lr_patience") if "optimizer" not in kwargs.keys(): optimizer = "adamw" else: optimizer = kwargs.pop("optimizer") if optimizer == "adam": optimizer = Adam elif optimizer == "adamw": optimizer = AdamW elif optimizer == "sgd": optimizer = SGD if "batch_size" not in kwargs.keys(): batch_size = 128 else: batch_size = kwargs.pop("batch_size") if "categories" not in kwargs.keys(): categories = None else: categories = kwargs.pop("categories") callbacks = [ InputShapeSetterResnet( regression=True, cat_features=cat_features, categories=categories, batch_size=batch_size ), 
EpochScoring(scoring=mse_constant_predictor, name="constant_val_mse", on_train=False), EarlyStoppingCustomError(monitor="valid_loss", patience=es_patience, sink=print_but_serializable), ] if lr_scheduler: callbacks.append( LRScheduler( policy=ReduceLROnPlateau, patience=lr_patience, min_lr=2e-5, factor=0.2 ) ) # FIXME make customizable if use_checkpoints: callbacks.append( UniquePrefixCheckpoint( dirname=checkpoint_dir, f_params=r"params.pt", f_optimizer=None, f_criterion=None, f_history=None, load_best=True, monitor="valid_loss_best", sink=print_but_serializable, ) ) if not wandb_run is None: callbacks.append(WandbLogger(wandb_run, save_model=False)) callbacks.append(LearningRateLogger()) nn_class = NeuralNetRegressorCustomOptim if model_name == "ft_transformer" else NeuralNetRegressorWrapped if model_name == "ft_transformer": model_class = FT_Transformer elif model_name == "resnet": model_class = ResNet elif model_name == "mlp": model_class = RTDL_MLP else: raise ValueError(f'Model {model_name} not implemented here! Choose from "ft_transformer", "resnet", "mlp"') new_kwargs = dict(optimizer=optimizer, batch_size=max( batch_size, 1 ), # if batch size is float, it will be reset during fit iterator_train__shuffle=True, module__d_in=1, # will be change when fitted module__categories=None, # will be change when fitted module__d_out=1, # idem module__regression=True, module__categorical_indicator=None, # will be change when fitted callbacks=callbacks, **kwargs) # cannot do the try/catch here because params are validated in fit() # try: # # try the torch_load_kwargs but it's only available in newer versions of skorch # model = nn_class( # model_class, # # Shuffle training data on each epoch # **new_kwargs, # torch_load_kwargs={'weights_only': False}, # quick-fix for pickling errors in torch>=2.6 # ) # except ValueError: # model = nn_class( # model_class, # # Shuffle training data on each epoch # **new_kwargs, # ) model = nn_class( model_class, # Shuffle training data on each epoch **new_kwargs, ) return model def create_classifier_skorch( id=None, wandb_run=None, use_checkpoints=True, cat_features=None, model_name="resnet", checkpoint_dir="skorch_cp", val_metric_name: str = 'class_error', **kwargs ): print("RTDL classifier") if "lr_scheduler" not in kwargs: lr_scheduler = False else: lr_scheduler = kwargs.pop("lr_scheduler") if "es_patience" not in kwargs.keys(): es_patience = 40 else: es_patience = kwargs.pop("es_patience") if "lr_patience" not in kwargs.keys(): lr_patience = 30 else: lr_patience = kwargs.pop("lr_patience") if "optimizer" not in kwargs.keys(): optimizer = "adamw" else: optimizer = kwargs.pop("optimizer") if optimizer == "adam": optimizer = Adam elif optimizer == "adamw": optimizer = AdamW elif optimizer == "sgd": optimizer = SGD if "batch_size" not in kwargs.keys(): batch_size = 128 else: batch_size = kwargs.pop("batch_size") if "categories" not in kwargs.keys(): categories = None else: categories = kwargs.pop("categories") callbacks = [ InputShapeSetterResnet( regression=False, cat_features=cat_features, categories=categories, batch_size=batch_size ), EpochScoring(scoring="accuracy", name="train_accuracy", on_train=True), ] if val_metric_name == 'class_error': callbacks.append(EarlyStoppingCustomError(monitor="valid_acc", patience=es_patience, lower_is_better=False, sink=print_but_serializable)) elif val_metric_name == 'cross_entropy': print(f'Using early stopping on cross-entropy loss') callbacks.append(EarlyStoppingCustomError(monitor='valid_loss', patience=es_patience, 
lower_is_better=True, sink=print_but_serializable)) else: raise ValueError(f'Validation metric {val_metric_name} not implemented here!') if lr_scheduler: callbacks.append( LRScheduler( policy=ReduceLROnPlateau, patience=lr_patience, min_lr=2e-5, factor=0.2 ) ) # FIXME make customizable if use_checkpoints: callbacks.append( UniquePrefixCheckpoint( dirname=checkpoint_dir, f_params=r"params.pt", f_optimizer=None, f_criterion=None, f_history=None, load_best=True, monitor="valid_acc_best" if val_metric_name == 'class_error' else 'valid_loss_best', sink=print_but_serializable, ) ) if not wandb_run is None: callbacks.append(WandbLogger(wandb_run, save_model=False)) callbacks.append(LearningRateLogger()) nn_class = NeuralNetClassifierCustomOptim if model_name == "ft_transformer" else NeuralNetClassifierWrapped if model_name == "ft_transformer": model_class = FT_Transformer elif model_name == "resnet": model_class = ResNet elif model_name == "mlp": model_class = RTDL_MLP else: raise ValueError(f'Model {model_name} not implemented here! Choose from "ft_transformer", "resnet", "mlp"') model = nn_class( model_class, # Shuffle training data on each epoch criterion=nn.CrossEntropyLoss, optimizer=optimizer, batch_size=max( batch_size, 1 ), # if batch size is float, it will be reset during fit iterator_train__shuffle=True, module__d_in=1, # will be change when fitted module__categories=None, # will be change when fitted module__d_out=1, # idem module__regression=False, module__categorical_indicator=None, # will be change when fitted callbacks=callbacks, **kwargs, ) return model create_resnet_regressor_skorch = partial(create_regressor_skorch, model_name="resnet", use_checkpoints=True) create_resnet_classifier_skorch = partial(create_classifier_skorch, model_name="resnet", use_checkpoints=True) create_mlp_regressor_skorch = partial(create_regressor_skorch, model_name="mlp", use_checkpoints=True) create_mlp_classifier_skorch = partial(create_classifier_skorch, model_name="mlp", use_checkpoints=True) create_ft_transformer_regressor_skorch = partial(create_regressor_skorch, model_name="ft_transformer", use_checkpoints=True) create_ft_transformer_classifier_skorch = partial(create_classifier_skorch, model_name="ft_transformer", use_checkpoints=True) ================================================ FILE: pytabkit/models/nn_models/tabm.py ================================================ # License: https://github.com/yandex-research/tabm/blob/main/LICENSE # NOTE # The minimum required versions of the dependencies are specified in README.md. 
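# Editor's note: illustrative usage sketch, not part of the original file.
# The `Model` class defined below implements the MLP/TabM variants. Assuming a toy
# binary-classification task with 3 numerical features and one categorical feature
# of cardinality 4, a TabM-mini model with k=8 ensemble members could be built
# roughly as follows (all hyperparameter values here are made up for illustration):
#
#     import torch
#     model = Model(
#         n_num_features=3,
#         cat_cardinalities=[4],
#         n_classes=2,
#         backbone={'type': 'MLP', 'n_blocks': 2, 'd_block': 64, 'dropout': 0.1},
#         bins=None,
#         num_embeddings=None,
#         arch_type='tabm-mini',
#         k=8,
#     )
#     x_num = torch.randn(16, 3)
#     x_cat = torch.randint(0, 4, (16, 1))
#     out = model(x_num, x_cat)  # shape (16, 8, 2): one prediction per member
#
# The k per-member predictions are typically averaged (after softmax, for
# classification) to obtain the final ensemble prediction.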
from __future__ import annotations import itertools from typing import Any, Literal, Optional, Union, List, Dict from pytabkit.models.nn_models import rtdl_num_embeddings import pytabkit.models.nn_models.rtdl_num_embeddings import torch import torch.nn as nn from torch import Tensor # ====================================================================================== # Initialization # ====================================================================================== def init_rsqrt_uniform_(x: Tensor, d: int) -> Tensor: assert d > 0 d_rsqrt = d**-0.5 return nn.init.uniform_(x, -d_rsqrt, d_rsqrt) @torch.inference_mode() def init_random_signs_(x: Tensor) -> Tensor: return x.bernoulli_(0.5).mul_(2).add_(-1) # ====================================================================================== # Modules # ====================================================================================== class NLinear(nn.Module): """N linear layers applied in parallel to N disjoint parts of the input. **Shape** - Input: ``(B, N, in_features)`` - Output: ``(B, N, out_features)`` The i-th linear layer is applied to the i-th matrix of the shape (B, in_features). Technically, this is a simplified version of delu.nn.NLinear: https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html. The difference is that this layer supports only 3D inputs with exactly one batch dimension. By contrast, delu.nn.NLinear supports any number of batch dimensions. """ def __init__( self, n: int, in_features: int, out_features: int, bias: bool = True ) -> None: super().__init__() self.weight = nn.Parameter(torch.empty(n, in_features, out_features)) self.bias = nn.Parameter(torch.empty(n, out_features)) if bias else None self.reset_parameters() def reset_parameters(self): d = self.weight.shape[-2] init_rsqrt_uniform_(self.weight, d) if self.bias is not None: init_rsqrt_uniform_(self.bias, d) def forward(self, x: torch.Tensor) -> torch.Tensor: assert x.ndim == 3 assert x.shape[-(self.weight.ndim - 1) :] == self.weight.shape[:-1] x = x.transpose(0, 1) x = x @ self.weight x = x.transpose(0, 1) if self.bias is not None: x = x + self.bias return x class OneHotEncoding0d(nn.Module): # Input: (*, n_cat_features=len(cardinalities)) # Output: (*, sum(cardinalities)) def __init__(self, cardinalities: List[int]) -> None: super().__init__() self._cardinalities = cardinalities def forward(self, x: Tensor) -> Tensor: assert x.ndim >= 1 assert x.shape[-1] == len(self._cardinalities) return torch.cat( [ # NOTE # This is a quick hack to support out-of-vocabulary categories. # # Recall that lib.data.transform_cat encodes categorical features # as follows: # - In-vocabulary values receive indices from `range(cardinality)`. # - All out-of-vocabulary values (i.e. new categories in validation # and test data that are not presented in the training data) # receive the index `cardinality`. # # As such, the line below will produce the standard one-hot encoding for # known categories, and the all-zeros encoding for unknown categories. # This may not be the best approach to deal with unknown values, # but should be enough for our purposes. 
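# Editor's note (concrete example): with cardinality 3, the line below maps the
# in-vocabulary value 1 to one_hot(1, 4)[:-1] = [0, 1, 0], while the
# out-of-vocabulary index 3 becomes one_hot(3, 4)[:-1] = [0, 0, 0].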
nn.functional.one_hot(x[..., i], cardinality + 1)[..., :-1] for i, cardinality in enumerate(self._cardinalities) ], -1, ) class ScaleEnsemble(nn.Module): def __init__( self, k: int, d: int, *, init: Literal['ones', 'normal', 'random-signs'], ) -> None: super().__init__() self.weight = nn.Parameter(torch.empty(k, d)) self._weight_init = init self.reset_parameters() def reset_parameters(self) -> None: if self._weight_init == 'ones': nn.init.ones_(self.weight) elif self._weight_init == 'normal': nn.init.normal_(self.weight) elif self._weight_init == 'random-signs': init_random_signs_(self.weight) else: raise ValueError(f'Unknown weight_init: {self._weight_init}') def forward(self, x: Tensor) -> Tensor: assert x.ndim >= 2 return x * self.weight class LinearEfficientEnsemble(nn.Module): """ This layer is a more configurable version of the "BatchEnsemble" layer from the paper "BatchEnsemble: An Alternative Approach to Efficient Ensemble and Lifelong Learning" (link: https://arxiv.org/abs/2002.06715). First, this layer allows to select only some of the "ensembled" parts: - the input scaling (r_i in the BatchEnsemble paper) - the output scaling (s_i in the BatchEnsemble paper) - the output bias (not mentioned in the BatchEnsemble paper, but is presented in public implementations) Second, the initialization of the scaling weights is configurable through the `scaling_init` argument. NOTE The term "adapter" is used in the TabM paper only to tell the story. The original BatchEnsemble paper does NOT use this term. So this class also avoids the term "adapter". """ r: Optional[Tensor] s: Optional[Tensor] bias: Optional[Tensor] def __init__( self, in_features: int, out_features: int, bias: bool = True, *, k: int, ensemble_scaling_in: bool, ensemble_scaling_out: bool, ensemble_bias: bool, scaling_init: Literal['ones', 'random-signs'], ): assert k > 0 if ensemble_bias: assert bias super().__init__() self.weight = nn.Parameter(torch.empty(out_features, in_features)) self.register_parameter( 'r', ( nn.Parameter(torch.empty(k, in_features)) if ensemble_scaling_in else None ), # type: ignore[code] ) self.register_parameter( 's', ( nn.Parameter(torch.empty(k, out_features)) if ensemble_scaling_out else None ), # type: ignore[code] ) self.register_parameter( 'bias', ( nn.Parameter(torch.empty(out_features)) # type: ignore[code] if bias and not ensemble_bias else nn.Parameter(torch.empty(k, out_features)) if ensemble_bias else None ), ) self.in_features = in_features self.out_features = out_features self.k = k self.scaling_init = scaling_init self.reset_parameters() def reset_parameters(self): init_rsqrt_uniform_(self.weight, self.in_features) scaling_init_fn = {'ones': nn.init.ones_, 'random-signs': init_random_signs_}[ self.scaling_init ] if self.r is not None: scaling_init_fn(self.r) if self.s is not None: scaling_init_fn(self.s) if self.bias is not None: bias_init = torch.empty( # NOTE: the shape of bias_init is (out_features,) not (k, out_features). # It means that all biases have the same initialization. # This is similar to having one shared bias plus # k zero-initialized non-shared biases. self.out_features, dtype=self.weight.dtype, device=self.weight.device, ) bias_init = init_rsqrt_uniform_(bias_init, self.in_features) with torch.inference_mode(): self.bias.copy_(bias_init) def forward(self, x: Tensor) -> Tensor: # x.shape == (B, K, D) assert x.ndim == 3 # >>> The equation (5) from the BatchEnsemble paper (arXiv v2). 
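# Editor's note (illustrative): for ensemble member i, the code below computes
#     y_i = s_i * (W @ (x_i * r_i)) + b_i
# i.e. one weight matrix W shared across all k members, combined with cheap
# per-member input scalings r_i, output scalings s_i, and biases b_i.
# A quick shape check (hypothetical values; batch B=2, members k=3):
#
#     import torch
#     layer = LinearEfficientEnsemble(
#         8, 4, k=3, ensemble_scaling_in=True, ensemble_scaling_out=True,
#         ensemble_bias=True, scaling_init='ones')
#     layer(torch.randn(2, 3, 8)).shape  # -> torch.Size([2, 3, 4])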
if self.r is not None: x = x * self.r x = x @ self.weight.T if self.s is not None: x = x * self.s # <<< if self.bias is not None: x = x + self.bias return x class MLP(nn.Module): def __init__( self, *, d_in: Optional[int] = None, d_out: Optional[int] = None, n_blocks: int, d_block: int, dropout: float, activation: str = 'ReLU', ) -> None: super().__init__() d_first = d_block if d_in is None else d_in self.blocks = nn.ModuleList( [ nn.Sequential( nn.Linear(d_first if i == 0 else d_block, d_block), getattr(nn, activation)(), nn.Dropout(dropout), ) for i in range(n_blocks) ] ) self.output = None if d_out is None else nn.Linear(d_block, d_out) def forward(self, x: Tensor) -> Tensor: for block in self.blocks: x = block(x) if self.output is not None: x = self.output(x) return x def make_efficient_ensemble(module: nn.Module, EnsembleLayer, **kwargs) -> None: """Replace linear layers with efficient ensembles of linear layers. NOTE In the paper, there are no experiments with networks with normalization layers. Perhaps, their trainable weights (the affine transformations) also need "ensemblification" as in the paper about "FiLM-Ensemble". Additional experiments are required to make conclusions. """ for name, submodule in list(module.named_children()): if isinstance(submodule, nn.Linear): module.add_module( name, EnsembleLayer( in_features=submodule.in_features, out_features=submodule.out_features, bias=submodule.bias is not None, **kwargs, ), ) else: make_efficient_ensemble(submodule, EnsembleLayer, **kwargs) def _get_first_ensemble_layer(backbone: MLP) -> LinearEfficientEnsemble: if isinstance(backbone, MLP): return backbone.blocks[0][0] # type: ignore[code] else: raise RuntimeError(f'Unsupported backbone: {backbone}') @torch.inference_mode() def _init_first_adapter( weight: Tensor, distribution: Literal['normal', 'random-signs'], init_sections: List[int], ) -> None: """Initialize the first adapter. NOTE The `init_sections` argument is a historical artifact that accidentally leaked from irrelevant experiments to the final models. Perhaps, the code related to `init_sections` can be simply removed, but this was not tested. """ assert weight.ndim == 2 assert weight.shape[1] == sum(init_sections) if distribution == 'normal': init_fn_ = nn.init.normal_ elif distribution == 'random-signs': init_fn_ = init_random_signs_ else: raise ValueError(f'Unknown distribution: {distribution}') section_bounds = [0, *torch.tensor(init_sections).cumsum(0).tolist()] for i in range(len(init_sections)): # NOTE # As noted above, this section-based initialization is an arbitrary historical # artifact. Consider the first adapter of one ensemble member. # This adapter vector is implicitly split into "sections", # where one section corresponds to one feature. The code below ensures that # the adapter weights in one section are initialized with the same random value # from the given distribution. 
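# Editor's note (concrete example): with init_sections = [1, 1, 4] (two raw
# numerical features followed by a one-hot feature of cardinality 4),
# section_bounds = [0, 1, 2, 6], so columns [0:1], [1:2] and [2:6] of each
# adapter row are each filled with a single value drawn from the distribution.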
w = torch.empty((len(weight), 1), dtype=weight.dtype, device=weight.device) init_fn_(w) weight[:, section_bounds[i] : section_bounds[i + 1]] = w _CUSTOM_MODULES = { # https://docs.python.org/3/library/stdtypes.html#definition.__name__ CustomModule.__name__: CustomModule for CustomModule in [ rtdl_num_embeddings.LinearEmbeddings, rtdl_num_embeddings.LinearReLUEmbeddings, rtdl_num_embeddings.PeriodicEmbeddings, rtdl_num_embeddings.PiecewiseLinearEmbeddings, MLP, ] } def make_module(type: str, *args, **kwargs) -> nn.Module: Module = getattr(nn, type, None) if Module is None: Module = _CUSTOM_MODULES[type] return Module(*args, **kwargs) # ====================================================================================== # Optimization # ====================================================================================== def default_zero_weight_decay_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: nn.Parameter ): del module_name, parameter return parameter_name.endswith('bias') or isinstance( module, (nn.BatchNorm1d, nn.LayerNorm, nn.InstanceNorm1d, rtdl_num_embeddings.LinearEmbeddings, rtdl_num_embeddings.LinearReLUEmbeddings, rtdl_num_embeddings._Periodic), ) def make_parameter_groups( module: nn.Module, zero_weight_decay_condition=default_zero_weight_decay_condition, custom_groups: Optional[List[Dict[str, Any]]] = None, ) -> List[Dict[str, Any]]: if custom_groups is None: custom_groups = [] custom_params = frozenset( itertools.chain.from_iterable(group['params'] for group in custom_groups) ) assert len(custom_params) == sum( len(group['params']) for group in custom_groups ), 'Parameters in custom_groups must not intersect' zero_wd_params = frozenset( p for mn, m in module.named_modules() for pn, p in m.named_parameters() if p not in custom_params and zero_weight_decay_condition(mn, m, pn, p) ) default_group = { 'params': [ p for p in module.parameters() if p not in custom_params and p not in zero_wd_params ] } return [ default_group, {'params': list(zero_wd_params), 'weight_decay': 0.0}, *custom_groups, ] # ====================================================================================== # The model # ====================================================================================== class Model(nn.Module): """MLP & TabM.""" def __init__( self, *, n_num_features: int, cat_cardinalities: List[int], n_classes: Optional[int], backbone: dict, bins: Optional[List[Tensor]], # For piecewise-linear encoding/embeddings. num_embeddings: Optional[Dict] = None, arch_type: Literal[ # Plain feed-forward network without any kind of ensembling. 'plain', # # TabM 'tabm', # # TabM-mini 'tabm-mini', # # TabM-packed 'tabm-packed', # # TabM. The first adapter is initialized from the normal distribution. # This variant was not used in the paper, but it may be useful in practice. 'tabm-normal', # # TabM-mini. The adapter is initialized from the normal distribution. # This variant was not used in the paper. 'tabm-mini-normal', ], k: Optional[int] = None, share_training_batches: bool = True, ) -> None: # >>> Validate arguments. assert n_num_features >= 0 assert n_num_features or cat_cardinalities if arch_type == 'plain': assert k is None assert ( share_training_batches ), 'If `arch_type` is set to "plain", then `simple` must remain True' else: assert k is not None assert k > 0 super().__init__() # >>> Continuous (numerical) features first_adapter_sections = [] # See the comment in `_init_first_adapter`. 
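# Editor's note: one section per input feature, whose width equals the feature's
# width after encoding (1 for a raw numerical feature, d_embedding for an
# embedded one, and the cardinality for a one-hot-encoded categorical feature).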
if n_num_features == 0: assert bins is None self.num_module = None d_num = 0 elif num_embeddings is None: assert bins is None self.num_module = None d_num = n_num_features first_adapter_sections.extend(1 for _ in range(n_num_features)) else: if bins is None: self.num_module = make_module( **num_embeddings, n_features=n_num_features ) else: assert num_embeddings['type'].startswith('PiecewiseLinearEmbeddings') self.num_module = make_module(**num_embeddings, bins=bins) d_num = n_num_features * num_embeddings['d_embedding'] first_adapter_sections.extend( num_embeddings['d_embedding'] for _ in range(n_num_features) ) # >>> Categorical features self.cat_module = ( OneHotEncoding0d(cat_cardinalities) if cat_cardinalities else None ) first_adapter_sections.extend(cat_cardinalities) d_cat = sum(cat_cardinalities) # >>> Backbone d_flat = d_num + d_cat self.minimal_ensemble_adapter = None # Any backbone can be here but we provide only MLP self.backbone = make_module(d_in=d_flat, **backbone) if arch_type != 'plain': assert k is not None first_adapter_init = ( None if arch_type == 'tabm-packed' else 'normal' if arch_type in ('tabm-mini-normal', 'tabm-normal') # For other arch_types, the initialization depends # on the presence of num_embeddings. else 'random-signs' if num_embeddings is None else 'normal' ) if arch_type in ('tabm', 'tabm-normal'): # Like BatchEnsemble, but all multiplicative adapters, # except for the very first one, are initialized with ones. assert first_adapter_init is not None make_efficient_ensemble( self.backbone, LinearEfficientEnsemble, k=k, ensemble_scaling_in=True, ensemble_scaling_out=True, ensemble_bias=True, scaling_init='ones', ) _init_first_adapter( _get_first_ensemble_layer(self.backbone).r, # type: ignore[code] first_adapter_init, first_adapter_sections, ) elif arch_type in ('tabm-mini', 'tabm-mini-normal'): # MiniEnsemble assert first_adapter_init is not None self.minimal_ensemble_adapter = ScaleEnsemble( k, d_flat, init='random-signs' if num_embeddings is None else 'normal', ) _init_first_adapter( self.minimal_ensemble_adapter.weight, # type: ignore[code] first_adapter_init, first_adapter_sections, ) elif arch_type == 'tabm-packed': # Packed ensemble. # In terms of the Packed Ensembles paper by Laurent et al., # TabM-packed is PackedEnsemble(alpha=k, M=k, gamma=1). 
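# Editor's note (illustrative): 'tabm-packed' replaces every nn.Linear in the
# backbone with NLinear(n=k, ...), i.e. k fully independent weight matrices and
# biases, so unlike 'tabm'/'tabm-mini' nothing is shared between the k members
# (and the backbone parameter count grows roughly k-fold).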
assert first_adapter_init is None make_efficient_ensemble(self.backbone, NLinear, n=k) else: raise ValueError(f'Unknown arch_type: {arch_type}') # >>> Output d_block = backbone['d_block'] d_out = 1 if n_classes is None else n_classes self.output = ( nn.Linear(d_block, d_out) if arch_type == 'plain' else NLinear(k, d_block, d_out) # type: ignore[code] ) # >>> self.arch_type = arch_type self.k = k self.share_training_batches = share_training_batches def forward( self, x_num: Optional[Tensor] = None, x_cat: Optional[Tensor] = None ) -> Tensor: x = [] if x_num is not None: x.append(x_num if self.num_module is None else self.num_module(x_num)) if x_cat is None: assert self.cat_module is None else: assert self.cat_module is not None x.append(self.cat_module(x_cat).float()) x = torch.column_stack([x_.flatten(1, -1) for x_ in x]) if self.k is not None: if self.share_training_batches or not self.training: # (B, D) -> (B, K, D) x = x[:, None].expand(-1, self.k, -1) else: # (B * K, D) -> (B, K, D) x = x.reshape(len(x) // self.k, self.k, *x.shape[1:]) if self.minimal_ensemble_adapter is not None: x = self.minimal_ensemble_adapter(x) else: assert self.minimal_ensemble_adapter is None x = self.backbone(x) x = self.output(x) if self.k is None: # Adjust the output shape for plain networks to make them compatible # with the rest of the script (loss, metrics, predictions, ...). # (B, D_OUT) -> (B, 1, D_OUT) x = x[:, None] return x ================================================ FILE: pytabkit/models/nn_models/tabr.py ================================================ import os import inspect import warnings import math from functools import partial import numpy as np import torch from torch import Tensor import torch.optim as optim from torch.utils.data import DataLoader, Dataset import torch.nn.functional as F from pytabkit.models.nn_models import tabr_lib as lib import torch.nn as nn from torchmetrics import Accuracy, Precision, Recall, F1Score, MeanSquaredError, AUROC, MeanAbsoluteError from typing import Any, Optional, Union, Literal, Callable try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl class NTPLinearLayer(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool = True, bias_factor: float = 0.1, linear_init_type: str = 'default'): super().__init__() self.use_bias = bias if linear_init_type == 'default': self.weight = nn.Parameter(-1+2*torch.rand(in_features, out_features)) if self.use_bias: self.bias = nn.Parameter((-1+2*torch.rand(1, out_features)) / np.sqrt(in_features)) elif linear_init_type == 'normal': self.weight = nn.Parameter(torch.randn(in_features, out_features)) if self.use_bias: self.bias = nn.Parameter(torch.randn(1, out_features)) else: raise ValueError(f'Unknown linear_init_type "{linear_init_type}"') self.bias_factor = bias_factor self.weight_factor = 1./np.sqrt(in_features) def forward(self, x): x = self.weight_factor * x @ self.weight if self.use_bias: x = x + self.bias_factor * self.bias return x class ParametricMishActivationLayer(nn.Module): def __init__(self, n_features: int, lr_factor: float = 1.0): super().__init__() self.weight = nn.Parameter((1. 
/ lr_factor) * torch.ones(n_features)) self.lr_factor = lr_factor def f(self, x): return x.mul(torch.tanh(F.softplus(x))) def forward(self, x): # print(f'{self.weight.mean().item()=:g}') return x + self.lr_factor * (self.f(x) - x) * self.weight class ParametricReluActivationLayer(nn.Module): def __init__(self, n_features: int, lr_factor: float = 1.0): super().__init__() self.weight = nn.Parameter((1. / lr_factor) * torch.ones(n_features)) self.lr_factor = lr_factor def f(self, x): return torch.relu(x) def forward(self, x): # print(f'{self.weight.mean().item()=:g}') return x + self.lr_factor * (self.f(x) - x) * self.weight class ScalingLayer(nn.Module): def __init__(self, n_features: int, lr_factor: float = 6.0): super().__init__() self.weight = nn.Parameter((1. / lr_factor) * torch.ones(n_features)) self.lr_factor = lr_factor def forward(self, x): return self.lr_factor * x * self.weight[None, :] def bce_with_logits_and_label_smoothing(inputs, *args, ls_eps: float, **kwargs): return (1 - 0.5 * ls_eps) * F.binary_cross_entropy_with_logits(inputs, *args, **kwargs) \ + 0.5 * ls_eps * F.binary_cross_entropy_with_logits(-inputs, *args, **kwargs) # adapted from https://github.com/yandex-research/tabular-dl-tabr/tree/main/bin class TabrModel(nn.Module): def __init__( self, *, # n_num_features: int, n_bin_features: int, cat_cardinalities: list[int], n_classes: Optional[int], # num_embeddings: Optional[dict], # lib.deep.ModuleSpec d_main: int, d_multiplier: float, encoder_n_blocks: int, predictor_n_blocks: int, mixer_normalization: Union[bool, Literal['auto']], context_dropout: float, dropout0: float, dropout1: Union[float, Literal['dropout0']], normalization: str, activation: str, # # The following options should be used only when truly needed. memory_efficient: bool = False, candidate_encoding_batch_size: Optional[int] = None, # extra options not in the original tabr add_scaling_layer: bool = False, scale_lr_factor: float = 6.0, use_ntp_linear: bool = False, linear_init_type: str = 'default', # only relevant if use_ntp_linear=True use_ntp_encoder: bool = False, ) -> None: # import locally so importing this file doesn't cause problems if faiss is not installed # import in constructor as well to make model fail earlier if not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch if not memory_efficient: assert candidate_encoding_batch_size is None if mixer_normalization == 'auto': mixer_normalization = encoder_n_blocks > 0 if encoder_n_blocks == 0: assert not mixer_normalization super().__init__() if dropout1 == 'dropout0': dropout1 = dropout0 self.one_hot_encoder = ( lib.OneHotEncoder(cat_cardinalities) if cat_cardinalities else None ) self.num_embeddings = ( None if num_embeddings is None else lib.make_module(num_embeddings, n_features=n_num_features) ) print(f'{add_scaling_layer=}') print(f'{activation=}') print(f'{scale_lr_factor=}') # >>> E d_in = ( n_num_features * (1 if num_embeddings is None else num_embeddings['d_embedding']) + n_bin_features + sum(cat_cardinalities) ) d_block = int(d_main * d_multiplier) Normalization = getattr(nn, normalization) if activation == 'pmish': Activation = lambda n_features: ParametricMishActivationLayer(n_features=n_features) elif activation == 'prelu': Activation = lambda n_features: ParametricReluActivationLayer(n_features=n_features) else: Activation = lambda n_features: getattr(nn, activation)() if use_ntp_linear: print(f'Using NTP linear layer with init {linear_init_type}') Linear = lambda in_features, 
out_features, bias=True: NTPLinearLayer(in_features, out_features, bias=bias, bias_factor=0.1, linear_init_type=linear_init_type) else: Linear = nn.Linear def make_block(prenorm: bool) -> nn.Sequential: return nn.Sequential( *([Normalization(d_main)] if prenorm else []), Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), Linear(d_block, d_main), nn.Dropout(dropout1), ) self.scale = ScalingLayer(d_in, lr_factor=scale_lr_factor) if add_scaling_layer else nn.Identity() self.linear = Linear(d_in, d_main) self.blocks0 = nn.ModuleList( [make_block(i > 0) for i in range(encoder_n_blocks)] ) # >>> R self.normalization = Normalization(d_main) if mixer_normalization else None self.label_encoder = ( Linear(1, d_main) if use_ntp_encoder else nn.Linear(1, d_main) if n_classes is None else nn.Sequential( nn.Embedding(n_classes, d_main), lib.Lambda(lambda x: x.squeeze(-2)) ) ) self.K = Linear(d_main, d_main) self.T = nn.Sequential( Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), Linear(d_block, d_main, bias=False), ) self.dropout = nn.Dropout(context_dropout) # >>> P self.blocks1 = nn.ModuleList( [make_block(True) for _ in range(predictor_n_blocks)] ) self.head = nn.Sequential( Normalization(d_main), Activation(d_main), Linear(d_main, lib.get_d_out(n_classes)), ) # >>> self.search_index = None self.memory_efficient = memory_efficient self.candidate_encoding_batch_size = candidate_encoding_batch_size self.reset_parameters() def reset_parameters(self): if isinstance(self.label_encoder, nn.Linear) or isinstance(self.label_encoder, NTPLinearLayer): bound = 1 / math.sqrt(2.0) nn.init.uniform_(self.label_encoder.weight, -bound, bound) # type: ignore[code] # noqa: E501 nn.init.uniform_(self.label_encoder.bias, -bound, bound) # type: ignore[code] # noqa: E501 else: assert isinstance(self.label_encoder[0], nn.Embedding) nn.init.uniform_(self.label_encoder[0].weight, -1.0, 1.0) # type: ignore[code] # noqa: E501 def _encode(self, x_: dict[str, Tensor]) -> tuple[Tensor, Tensor]: x_num = x_.get('num') x_bin = x_.get('bin') x_cat = x_.get('cat') del x_ x = [] if x_num is None: # assert self.num_embeddings is None pass # changed to make it easier to use with all-categorical datasets else: x.append( x_num if self.num_embeddings is None else self.num_embeddings(x_num).flatten(1) ) if x_bin is not None: x.append(x_bin) if x_cat is None: assert self.one_hot_encoder is None else: assert self.one_hot_encoder is not None x.append(self.one_hot_encoder(x_cat)) assert x x = torch.cat(x, dim=1).float() x = self.scale(x) x = self.linear(x) for block in self.blocks0: x = x + block(x) k = self.K(x if self.normalization is None else self.normalization(x)) return x, k def forward( self, *, x_: dict[str, Tensor], y: Optional[Tensor], candidate_x_: dict[str, Tensor], candidate_y: Tensor, context_size: int, is_train: bool, ) -> Tensor: # print('forward()') # import locally so importing this file doesn't cause problems if faiss is not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch # >>> with torch.set_grad_enabled( torch.is_grad_enabled() and not self.memory_efficient ): # NOTE: during evaluation, candidate keys can be computed just once, which # looks like an easy opportunity for optimization. However: # - if your dataset is small or/and the encoder is just a linear layer # (no embeddings and encoder_n_blocks=0), then encoding candidates # is not a bottleneck. 
# - implementing this optimization makes the code complex and/or unobvious, # because there are many things that should be taken into account: # - is the input coming from the "train" part? # - is self.training True or False? # - is PyTorch autograd enabled? # - is saving and loading checkpoints handled correctly? # This is why we do not implement this optimization. # When memory_efficient is True, this potentially heavy computation is # performed without gradients. # Later, it is recomputed with gradients only for the context objects. candidate_k = ( self._encode(candidate_x_)[1] if self.candidate_encoding_batch_size is None else torch.cat( [ self._encode(x)[1] for x in lib.iter_batches( candidate_x_, self.candidate_encoding_batch_size ) ] ) ) x, k = self._encode(x_) if is_train: # NOTE: here, we add the training batch back to the candidates after the # function `apply_model` removed them. The further code relies # on the fact that the first batch_size candidates come from the # training batch. assert y is not None candidate_k = torch.cat([k, candidate_k]) candidate_y = torch.cat([y, candidate_y]) else: assert y is None # >>> # The search below is optimized for larger datasets and is significantly faster # than the naive solution (keep autograd on + manually compute all pairwise # squared L2 distances + torch.topk). # For smaller datasets, however, the naive solution can actually be faster. batch_size, d_main = k.shape device = k.device with torch.no_grad(): if self.search_index is None: # self.search_index = ( # faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main) # if device.type == 'cuda' # else faiss.IndexFlatL2(d_main) # ) if device.type == 'cpu': self.search_index = faiss.IndexFlatL2(d_main) elif device.type == 'cuda': gpu_index = 0 if device.index is None else device.index cfg = faiss.GpuIndexFlatConfig() cfg.device = gpu_index self.search_index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main, cfg) else: raise ValueError() # Updating the index is much faster than creating a new one. self.search_index.reset() self.search_index.add(candidate_k) # type: ignore[code] distances: Tensor context_idx: Tensor distances, context_idx = self.search_index.search( # type: ignore[code] k, context_size + (1 if is_train else 0) ) if is_train: # NOTE: to avoid leakage, the index i must be removed from the i-th row, # (because of how candidate_k is constructed). distances[ context_idx == torch.arange(batch_size, device=device)[:, None] ] = torch.inf # Not the most elegant solution to remove the argmax, but anyway. context_idx = context_idx.gather(-1, distances.argsort()[:, :-1]) if self.memory_efficient and torch.is_grad_enabled(): assert is_train # Repeating the same computation, # but now only for the context objects and with autograd on. context_k = self._encode( { ftype: torch.cat([x_[ftype], candidate_x_[ftype]])[ context_idx ].flatten(0, 1) for ftype in x_ } )[1].reshape(batch_size, context_size, -1) else: context_k = candidate_k[context_idx] # In theory, when autograd is off, the distances obtained during the search # can be reused. However, this is not a bottleneck, so let's keep it simple # and use the same code to compute `similarities` during both # training and evaluation. 
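# Editor's note: the expression below is the batched expansion of the negative
# squared L2 distance between the query key and each context key,
#     -||k - c||^2 = -||k||^2 + 2 <k, c> - ||c||^2,
# so softmax(similarities) weights context objects by proximity in key space.
# An equivalent (but less memory-friendly) reference computation would be:
#
#     similarities_ref = -torch.cdist(k[:, None], context_k).squeeze(1) ** 2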
similarities = ( -k.square().sum(-1, keepdim=True) + (2 * (k[..., None, :] @ context_k.transpose(-1, -2))).squeeze(-2) - context_k.square().sum(-1) ) probs = F.softmax(similarities, dim=-1) probs = self.dropout(probs) context_y_emb = self.label_encoder(candidate_y[context_idx][..., None]) values = context_y_emb + self.T(k[:, None] - context_k) context_x = (probs[:, None] @ values).squeeze(1) x = x + context_x # >>> for block in self.blocks1: x = x + block(x) x = self.head(x) return x def zero_wd_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: nn.parameter.Parameter, ): return ( 'label_encoder' in module_name or 'label_encoder' in parameter_name or lib.default_zero_weight_decay_condition( module_name, module, parameter_name, parameter ) ) class TabrLightning(pl.LightningModule): def __init__(self, model, train_dataset, val_dataset, C, n_classes): super().__init__() self.model = model self.dataset = train_dataset self.val_dataset = val_dataset self.C = C if n_classes == 2: self.task_type = "binary" elif n_classes > 2: self.task_type = "multiclass" else: self.task_type = "regression" ls_eps = self.C.get('ls_eps', 0.0) print(f'{ls_eps=}') self.loss_fn = ( partial(bce_with_logits_and_label_smoothing, ls_eps=ls_eps) if self.task_type == "binary" else partial(F.cross_entropy, label_smoothing=ls_eps) if self.task_type == "multiclass" else F.mse_loss ) # Define metrics for binary and multiclass classification if self.task_type in ["binary", "multiclass"]: self.train_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.train_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.train_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.train_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) self.val_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.val_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.val_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.val_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) # Define metrics for regression elif self.task_type == "regression": self.train_mse = MeanSquaredError() self.val_mse = MeanSquaredError() self.train_mae = MeanAbsoluteError() self.val_mae = MeanAbsoluteError() def setup(self, stage=None): self.train_size = len(self.dataset) self.train_indices = torch.arange(self.train_size, device=self.device) # move the dataset to the device # I think that's what tabr does, but # we could also keep it on the cpu for key in self.dataset.data: if self.dataset.data[key] is not None: self.dataset.data[key] = self.dataset.data[key].to(self.device) for key in self.val_dataset.data: if self.val_dataset.data[key] is not None: self.val_dataset.data[key] = self.val_dataset.data[key].to(self.device) def get_Xy(self, part: str, idx) -> tuple[dict[str, Tensor], Tensor]: if self.val_dataset.data['Y'].get_device() == -1: # is still on CPU self.setup() if part == "train": dataset = self.dataset elif part == "val": dataset = self.val_dataset batch = ( { key[2:]: dataset.data[key] for key in dataset.data if key.startswith('X_') }, dataset.data["Y"], ) return ( batch if idx is None else ({k: v[idx] for k, v in batch[0].items()}, batch[1][idx]) ) def training_step(self, batch, batch_idx): # batch should contain dictionaries with keys # "x_num", "x_bin", "x_cat", "y" and "indices" batch_indices = batch["indices"] # batch_idx 
is the id of the batch itself # batch_indices contains the ids of the samples in the batch x, y = self.get_Xy('train', batch_indices) # we're in training mode # Remove the training batch from the candidates candidate_indices = self.train_indices[~torch.isin(self.train_indices, batch_indices)] candidate_x, candidate_y = self.get_Xy('train', candidate_indices) # Call the model's forward method output = self.model( x_=x, y=y, candidate_x_=candidate_x, candidate_y=candidate_y, context_size=self.C["context_size"], is_train=True ).squeeze(-1) y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) # Log the loss and return it self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True) if self.task_type in ["binary", "multiclass"]: self.train_accuracy.update(output, y) self.train_precision.update(output, y) self.train_recall.update(output, y) self.train_f1_score.update(output, y) self.log('train_accuracy', self.train_accuracy, on_epoch=True, prog_bar=True) self.log('train_precision', self.train_precision, on_epoch=True) self.log('train_recall', self.train_recall, on_epoch=True) self.log('train_f1_score', self.train_f1_score, on_epoch=True) elif self.task_type == "regression": self.train_mse.update(output, y) self.train_mae.update(output, y) self.log('train_mse', self.train_mse, on_epoch=True) self.log('train_mae', self.train_mae, on_epoch=True, prog_bar=True) return loss def validation_step(self, batch, batch_idx): if batch_idx == 0: print(f'Validation in epoch {self.current_epoch}', flush=True) # print(f'Validation step', flush=True) # TODO: do like test to save gpu memory? batch_indices = batch["indices"] # batch_idx is the idxs of the batch samples x, y = self.get_Xy("val", batch_indices) candidate_indices = self.train_indices candidate_x, candidate_y = self.get_Xy('train', candidate_indices) output = self.model( x_=x, y=None, candidate_x_=candidate_x, candidate_y=candidate_y, context_size=self.C["context_size"], is_train=False, ).squeeze(-1) y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) self.log('val_loss', loss, on_epoch=True, prog_bar=True) # Log validation loss if self.task_type in ["binary", "multiclass"]: self.val_accuracy.update(output, y) self.val_precision.update(output, y) self.val_recall.update(output, y) self.val_f1_score(output, y) self.log('val_accuracy', self.val_accuracy, on_epoch=True, prog_bar=True) self.log('val_precision', self.val_precision, on_epoch=True) self.log('val_recall', self.val_recall, on_epoch=True) self.log('val_f1_score', self.val_f1_score, on_epoch=True) elif self.task_type == "regression": self.val_mse.update(output, y) self.log('val_mse', self.val_mse, on_epoch=True) self.val_mae.update(output, y) self.log('val_mae', self.val_mae, on_epoch=True, prog_bar=True) return loss def predict_step(self, batch, batch_idx, dataloader_idx=None): # here batch shouldn't contain indices nor y x = { key[2:]: batch[key] for key in batch if key.startswith('X_') } candidate_indices = self.train_indices candidate_x, candidate_y = self.get_Xy('train', candidate_indices) output = self.model( x_=x, y=None, candidate_x_=candidate_x, candidate_y=candidate_y, context_size=self.C["context_size"], is_train=False, ).squeeze(-1) # in binary case, we need to convert it to 2-class logits if self.task_type 
== "binary": # it will be passed to a softmax, so we need to add a 0 # to make the probabilities right output = torch.stack([torch.zeros_like(output), output], dim=1) elif self.task_type == "regression": output = output.unsqueeze(1) return output def configure_optimizers(self): optimizer_config = self.C["optimizer"].copy() optimizer = lib.make_optimizer( self.model, **optimizer_config, zero_weight_decay_condition=zero_wd_condition ) return optimizer def train_dataloader(self): return DataLoader(self.dataset, batch_size=self.C["batch_size"], shuffle=True, num_workers=0, #max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=False) def val_dataloader(self): return DataLoader(self.val_dataset, batch_size=self.C["eval_batch_size"], shuffle=False, num_workers=0, #max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=False) ================================================ FILE: pytabkit/models/nn_models/tabr_context_freeze.py ================================================ import os import inspect import warnings import math from functools import partial import torch from torch import Tensor import torch.optim as optim from torch.utils.data import DataLoader, Dataset import torch.nn.functional as F from pytabkit.models.nn_models import tabr_lib as lib import torch.nn as nn from torchmetrics import Accuracy, Precision, Recall, F1Score, MeanSquaredError, AUROC, MeanAbsoluteError from typing import Any, Optional, Union, Literal, Callable, NamedTuple from tqdm import tqdm try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl from pytabkit.models.nn_models.tabr import ParametricMishActivationLayer, ParametricReluActivationLayer, ScalingLayer, \ bce_with_logits_and_label_smoothing # taken from https://github.com/yandex-research/tabular-dl-tabr/tree/main/bin # and https://github.com/yandex-research/tabular-dl-tabr/blob/main/bin/tabr_scaling.py class TabrModelContextFreeze(nn.Module): class ForwardOutput(NamedTuple): y_pred: Tensor context_idx: Tensor context_probs: Tensor def __init__( self, *, # n_num_features: int, n_bin_features: int, cat_cardinalities: list[int], n_classes: Optional[int], # num_embeddings: Optional[dict], # lib.deep.ModuleSpec d_main: int, d_multiplier: float, encoder_n_blocks: int, predictor_n_blocks: int, mixer_normalization: Union[bool, Literal['auto']], context_dropout: float, dropout0: float, dropout1: Union[float, Literal['dropout0']], normalization: str, activation: str, # # The following options should be used only when truly needed. 
memory_efficient: bool = False, candidate_encoding_batch_size: Optional[int] = None, add_scaling_layer: bool = False, scale_lr_factor: float = 6.0, ) -> None: # import locally so importing this file doesn't cause problems if faiss is not installed # import in constructor as well to make model fail earlier if not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch if not memory_efficient: assert candidate_encoding_batch_size is None if mixer_normalization == 'auto': mixer_normalization = encoder_n_blocks > 0 if encoder_n_blocks == 0: assert not mixer_normalization super().__init__() if dropout1 == 'dropout0': dropout1 = dropout0 self.one_hot_encoder = ( lib.OneHotEncoder(cat_cardinalities) if cat_cardinalities else None ) self.num_embeddings = ( None if num_embeddings is None else lib.make_module(num_embeddings, n_features=n_num_features) ) print(f'{add_scaling_layer=}') print(f'{activation=}') print(f'{scale_lr_factor=}') # >>> E d_in = ( n_num_features * (1 if num_embeddings is None else num_embeddings['d_embedding']) + n_bin_features + sum(cat_cardinalities) ) d_block = int(d_main * d_multiplier) Normalization = getattr(nn, normalization) if activation == 'pmish': Activation = lambda n_features: ParametricMishActivationLayer(n_features=n_features) elif activation == 'prelu': Activation = lambda n_features: ParametricReluActivationLayer(n_features=n_features) else: Activation = lambda n_features: getattr(nn, activation)() def make_block(prenorm: bool) -> nn.Sequential: return nn.Sequential( *([Normalization(d_main)] if prenorm else []), nn.Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), nn.Linear(d_block, d_main), nn.Dropout(dropout1), ) self.scale = ScalingLayer(d_in, lr_factor=scale_lr_factor) if add_scaling_layer else nn.Identity() self.linear = nn.Linear(d_in, d_main) self.blocks0 = nn.ModuleList( [make_block(i > 0) for i in range(encoder_n_blocks)] ) # >>> R self.normalization = Normalization(d_main) if mixer_normalization else None self.label_encoder = ( nn.Linear(1, d_main) if n_classes is None else nn.Sequential( nn.Embedding(n_classes, d_main), lib.Lambda(lambda x: x.squeeze(-2)) ) ) self.K = nn.Linear(d_main, d_main) self.T = nn.Sequential( nn.Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), nn.Linear(d_block, d_main, bias=False), ) self.dropout = nn.Dropout(context_dropout) # >>> P self.blocks1 = nn.ModuleList( [make_block(True) for _ in range(predictor_n_blocks)] ) self.head = nn.Sequential( Normalization(d_main), Activation(d_main), nn.Linear(d_main, lib.get_d_out(n_classes)), ) # >>> self.search_index = None self.memory_efficient = memory_efficient self.candidate_encoding_batch_size = candidate_encoding_batch_size self.reset_parameters() def reset_parameters(self): if isinstance(self.label_encoder, nn.Linear): bound = 1 / math.sqrt(2.0) nn.init.uniform_(self.label_encoder.weight, -bound, bound) # type: ignore[code] # noqa: E501 nn.init.uniform_(self.label_encoder.bias, -bound, bound) # type: ignore[code] # noqa: E501 else: assert isinstance(self.label_encoder[0], nn.Embedding) nn.init.uniform_(self.label_encoder[0].weight, -1.0, 1.0) # type: ignore[code] # noqa: E501 def _encode(self, x_: dict[str, Tensor]) -> tuple[Tensor, Tensor]: x_num = x_.get('num') x_bin = x_.get('bin') x_cat = x_.get('cat') del x_ x = [] if x_num is None: # assert self.num_embeddings is None pass # changed to make it easier to use with all-categorical datasets else: x.append( x_num if self.num_embeddings is 
None else self.num_embeddings(x_num).flatten(1) ) if x_bin is not None: x.append(x_bin) if x_cat is None: assert self.one_hot_encoder is None else: assert self.one_hot_encoder is not None x.append(self.one_hot_encoder(x_cat)) assert x x = torch.cat(x, dim=1).float() x = self.scale(x) x = self.linear(x) for block in self.blocks0: x = x + block(x) k = self.K(x if self.normalization is None else self.normalization(x)) return x, k def forward( self, *, x_: dict[str, Tensor], y: Optional[Tensor], idx: Optional[Tensor], candidate_x_: dict[str, Tensor], candidate_y: Tensor, candidate_idx: Tensor, context_size: int, context_idx: Optional[Tensor], is_train: bool, ): # import locally so importing this file doesn't cause problems if faiss is not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch # >>> E with torch.set_grad_enabled( torch.is_grad_enabled() and not self.memory_efficient ): candidate_k = ( self._encode(candidate_x_)[1] if self.candidate_encoding_batch_size is None else torch.cat( [ self._encode(x)[1] for x in lib.iter_batches( candidate_x_, self.candidate_encoding_batch_size ) ] ) ) x, k = self._encode(x_) if is_train: assert y is not None assert idx is not None if context_idx is None: candidate_k = torch.cat([k, candidate_k]) candidate_y = torch.cat([y, candidate_y]) candidate_idx = torch.cat([idx, candidate_idx]) else: assert y is None assert idx is None # >>> batch_size, d_main = k.shape device = k.device if context_idx is None: with torch.no_grad(): if self.search_index is None: # self.search_index = ( # faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main) # if device.type == 'cuda' # else faiss.IndexFlatL2(d_main) # ) if device.type == 'cpu': self.search_index = faiss.IndexFlatL2(d_main) elif device.type == 'cuda': gpu_index = 0 if device.index is None else device.index cfg = faiss.GpuIndexFlatConfig() cfg.device = gpu_index self.search_index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main, cfg) else: raise ValueError() self.search_index.reset() self.search_index.add(candidate_k) # type: ignore[code] distances: Tensor distances, context_idx = self.search_index.search( # type: ignore[code] k, context_size + (1 if is_train else 0) ) assert isinstance(context_idx, Tensor) if is_train: distances[ context_idx == torch.arange(batch_size, device=device)[:, None] ] = torch.inf context_idx = context_idx.gather(-1, distances.argsort()[:, :-1]) # print("context_idx", context_idx) # "absolute" means "not relative", i.e. the original indices in the train set. 
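# A minimal sketch of this index translation (values are made up):
#   candidate_idx = torch.tensor([7, 2, 9, 4])  # original train-set rows of the candidates
#   context_idx = torch.tensor([[1, 3]])        # positions within the candidate pool
#   candidate_idx[context_idx]                  # -> tensor([[2, 4]]), the original train-set rows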
absolute_context_idx = candidate_idx[context_idx] if self.memory_efficient and torch.is_grad_enabled(): assert is_train context_k = self._encode( { ftype: torch.cat([x_[ftype], candidate_x_[ftype]])[ context_idx ].flatten(0, 1) for ftype in x_ } )[1].reshape(batch_size, context_size, -1) else: context_k = candidate_k[context_idx] similarities = ( -k.square().sum(-1, keepdim=True) + (2 * (k[..., None, :] @ context_k.transpose(-1, -2))).squeeze(-2) - context_k.square().sum(-1) ) raw_probs = F.softmax(similarities, dim=-1) probs = self.dropout(raw_probs) context_y_emb: Tensor = self.label_encoder(candidate_y[context_idx][..., None]) values: Tensor = context_y_emb + self.T(k[:, None] - context_k) context_x = (probs[:, None] @ values).squeeze(1) x = x + context_x # >>> for block in self.blocks1: x: Tensor = x + block(x) x: Tensor = self.head(x) return TabrModelContextFreeze.ForwardOutput(x, absolute_context_idx, raw_probs) def zero_wd_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: nn.parameter.Parameter, ): return ( 'label_encoder' in module_name or 'label_encoder' in parameter_name or lib.default_zero_weight_decay_condition( module_name, module, parameter_name, parameter ) ) class TabrLightningContextFreeze(pl.LightningModule): def __init__(self, model, train_dataset, val_dataset, C, n_classes): super().__init__() self.model = model self.dataset = train_dataset self.val_dataset = val_dataset self.C = C if n_classes == 2: self.task_type = "binary" elif n_classes > 2: self.task_type = "multiclass" else: self.task_type = "regression" ls_eps = self.C.get('ls_eps', 0.0) print(f'{ls_eps=}') self.loss_fn = ( partial(bce_with_logits_and_label_smoothing, ls_eps=ls_eps) if self.task_type == "binary" else partial(F.cross_entropy, label_smoothing=ls_eps) if self.task_type == "multiclass" else F.mse_loss ) # Define metrics for binary and multiclass classification if self.task_type in ["binary", "multiclass"]: self.train_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.train_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.train_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.train_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) self.val_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.val_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.val_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.val_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) # Define metrics for regression elif self.task_type == "regression": self.train_mse = MeanSquaredError() self.val_mse = MeanSquaredError() self.train_mae = MeanAbsoluteError() self.val_mae = MeanAbsoluteError() self.frozen_contexts = None def setup(self, stage=None): self.train_size = len(self.dataset) self.train_indices = torch.arange(self.train_size, device=self.device) # move the dataset to the device # I think that's what tabr does, but # we could also keep it on the cpu for key in self.dataset.data: if self.dataset.data[key] is not None: self.dataset.data[key] = self.dataset.data[key].to(self.device) for key in self.val_dataset.data: if self.val_dataset.data[key] is not None: self.val_dataset.data[key] = self.val_dataset.data[key].to(self.device) def get_Xy(self, part: str, idx) -> tuple[dict[str, Tensor], Tensor]: if self.val_dataset.data['Y'].get_device() == -1: # is still on CPU 
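# Tensor.get_device() returns -1 for CPU tensors, so this lazily (re-)runs setup() to move the datasets onto the current device before indexing them.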
self.setup() if part == "train": dataset = self.dataset elif part == "val": dataset = self.val_dataset batch = ( { key[2:]: dataset.data[key] for key in dataset.data if key.startswith('X_') }, dataset.data["Y"], ) return ( batch if idx is None else ({k: v[idx] for k, v in batch[0].items()}, batch[1][idx]) ) def apply_model(self, part, batch, batch_idx, training): # batch should contain dictionaries with keys # "x_num", "x_bin", "x_cat", "y" and "indices" batch_indices = batch["indices"].to(self.device) # batch_idx is the id of the batch itself # batch_indices contains the ids of the samples in the batch x, y = self.get_Xy(part, batch_indices) is_train = part == 'train' if training and self.frozen_contexts is not None: candidate_indices, context_idx = self.frozen_contexts[batch_indices].unique( return_inverse=True ) else: # Importantly, `training`, not `is_train`, should be used to choose the queue candidate_indices = self.train_indices context_idx = None if is_train: # This is not done when there are frozen contexts, because they are # already valid. candidate_indices = candidate_indices[ ~torch.isin(candidate_indices, batch_indices) ] candidate_x, candidate_y = self.get_Xy( 'train', candidate_indices, # TODO check ) fwd_out = self.model( x_=x, y=y if is_train else None, idx=batch_indices if is_train else None, candidate_x_=candidate_x, candidate_y=candidate_y, candidate_idx=candidate_indices, context_idx=context_idx, context_size=self.C["context_size"], is_train=is_train, ) return fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)), y def training_step(self, batch, batch_idx): if batch_idx == 0 and self.current_epoch == self.C["freeze_contexts_after_n_epochs"]: # freeze the contexts print(f'Freezing contexts after {self.current_epoch} epochs', flush=True) # Get context_ids using evaluate? _, _, context_idx, _, _ = self.evaluate(self.C["eval_batch_size"], progress_bar=True # TODO ) self.frozen_contexts = torch.tensor(context_idx['train'], device=self.device) # # batch should contain dictionaries with keys # # "x_num", "x_bin", "x_cat", "y" and "indices" # batch_indices = batch["indices"] # batch_idx is the id of the batch itself # # batch_indices contains the ids of the samples in the batch # x, y = self.get_Xy('train', batch_indices) # if self.frozen_contexts is not None: # candidate_indices, context_idx = self.frozen_contexts[batch_indices].unique( # return_inverse=True # ) # else: # context_idx = None # # we're in training mode # # Remove the training batch from the candidates # # This is not done when there are frozen contexts, because they are # # already valid.
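# (On the frozen-context path in apply_model above: `frozen_contexts[batch_indices]` holds absolute train-set rows, and unique(return_inverse=True) splits them into a deduplicated candidate list plus per-sample positions into it. A sketch with made-up values:
#   frozen = torch.tensor([[7, 2], [2, 9]])
#   cand, ctx = frozen.unique(return_inverse=True)  # cand=[2, 7, 9], ctx=[[1, 0], [0, 2]]
# `cand` is then passed as the candidates and `ctx` as context_idx.)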
# candidate_indices = self.train_indices[~torch.isin(self.train_indices, batch_indices)] # candidate_x, candidate_y = self.get_Xy('train', candidate_indices) #TODO check # fwd_out = self.model( # x_=x, # y=y, # idx=batch_indices, # candidate_x_=candidate_x, # candidate_y=candidate_y, # candidate_idx=candidate_indices, # context_idx=context_idx, # context_size=self.C["context_size"], # is_train=True # ) # fwd_out = fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)) fwd_out, y = self.apply_model("train", batch, batch_idx, training=True) output, _, _ = fwd_out y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) # Log the loss and return it self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True) if self.task_type in ["binary", "multiclass"]: self.train_accuracy.update(output, y) self.train_precision.update(output, y) self.train_recall.update(output, y) self.train_f1_score.update(output, y) self.log('train_accuracy', self.train_accuracy, on_epoch=True, prog_bar=True) self.log('train_precision', self.train_precision, on_epoch=True) self.log('train_recall', self.train_recall, on_epoch=True) self.log('train_f1_score', self.train_f1_score, on_epoch=True) elif self.task_type == "regression": self.train_mse.update(output, y) self.train_mae.update(output, y) self.log('train_mse', self.train_mse, on_epoch=True) self.log('train_mae', self.train_mae, on_epoch=True, prog_bar=True) return loss def validation_step(self, batch, batch_idx): if batch_idx == 0: print(f'Validation in epoch {self.current_epoch}', flush=True) # print(f'Validation step', flush=True) # TODO: do like test to save gpu memory? # batch_indices = batch["indices"] # batch_idx is the idxs of the batch samples # x, y = self.get_Xy("val", batch_indices) # if self.frozen_contexts is not None: # candidate_indices, context_idx = self.frozen_contexts[batch_indices].unique( # return_inverse=True # ) # else: # context_idx = None # candidate_indices = self.train_indices # candidate_x, candidate_y = self.get_Xy('train', candidate_indices) # fwd_out = self.model( # x_=x, # y=None, # idx=None, # candidate_x_=candidate_x, # candidate_y=candidate_y, # candidate_idx=candidate_indices, # context_idx=context_idx, # context_size=self.C["context_size"], # is_train=False # ) # fwd_out = fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)) fwd_out, y = self.apply_model("val", batch, batch_idx, training=False) output, _, _ = fwd_out y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) self.log('val_loss', loss, on_epoch=True, prog_bar=True) # Log validation loss if self.task_type in ["binary", "multiclass"]: self.val_accuracy.update(output, y) self.val_precision.update(output, y) self.val_recall.update(output, y) self.val_f1_score.update(output, y) self.log('val_accuracy', self.val_accuracy, on_epoch=True, prog_bar=True) self.log('val_precision', self.val_precision, on_epoch=True) self.log('val_recall', self.val_recall, on_epoch=True) self.log('val_f1_score', self.val_f1_score, on_epoch=True) elif self.task_type == "regression": self.val_mse.update(output, y) self.log('val_mse', self.val_mse, on_epoch=True) self.val_mae.update(output, y) self.log('val_mae', self.val_mae, on_epoch=True, prog_bar=True) return loss def predict_step(self, batch, batch_idx, dataloader_idx=None): # here
batch shouldn't contain indices nor y # TODO: use apply_model x = { key[2:]: batch[key] for key in batch if key.startswith('X_') } context_idx = None candidate_indices = self.train_indices candidate_x, candidate_y = self.get_Xy('train', candidate_indices) fwd_out = self.model( x_=x, y=None, idx=None, candidate_x_=candidate_x, candidate_y=candidate_y, candidate_idx=candidate_indices, context_idx=context_idx, context_size=self.C["context_size"], is_train=False ) fwd_out = fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)) # fwd_out, y = self.apply_model("test", batch, batch_idx, training=False) output, _, _ = fwd_out # in binary case, we need to convert it to 2-class logits if self.task_type == "binary": # it will be passed to a softmax, so we need to add a 0 # to make the probabilities right output = torch.stack([torch.zeros_like(output), output], dim=1) elif self.task_type == "regression": output = output.unsqueeze(1) return output # here we only use it to get context_idx for the frozen contexts # so we only need to do it on train @torch.inference_mode() def evaluate(self, eval_batch_size: int, *, progress_bar: bool = False): self.eval() predictions = {} context_idx = {} context_probs = {} while eval_batch_size: try: # fwd_out = [] # for idx in tqdm( # torch.arange(len(self.dataset), device=self.device).split( # eval_batch_size # ), # desc=f'Evaluation ("train"))', # disable=not progress_bar, # ): # batch = { # key: self.dataset.data[key][idx] # for key in self.dataset.data # } # x = { # key[2:]: batch[key] # for key in batch # if key.startswith('X_') # } # #TODO check # fwd_out.append( # self.model( # x_=x, # y=None, # idx=None, # candidate_x_=x, # candidate_y=batch['Y'], # candidate_idx=idx, # context_idx=None, # context_size=self.C["context_size"], # is_train=False # ) # ) fwd_out = lib.cat( [ self.apply_model("train", batch, batch_idx, training=False)[0] for batch_idx, batch in enumerate( DataLoader( self.dataset, batch_size=eval_batch_size, shuffle=False ) ) ] ) # fwd_out = lib.cat(fwd_out) predictions["train"], context_idx["train"], context_probs["train"] = ( e.cpu().numpy() for e in fwd_out ) except RuntimeError as err: if not lib.is_oom_exception(err): raise eval_batch_size //= 2 print(f'eval_batch_size = {eval_batch_size}') else: break if not eval_batch_size: raise RuntimeError('Not enough memory even for eval_batch_size=1') metrics = None self.train() return metrics, predictions, context_idx, context_probs, eval_batch_size def configure_optimizers(self): optimizer_config = self.C["optimizer"].copy() optimizer = lib.make_optimizer( self.model, **optimizer_config, zero_weight_decay_condition=zero_wd_condition ) return optimizer def train_dataloader(self): return DataLoader(self.dataset, batch_size=self.C["batch_size"], shuffle=True, num_workers=max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=True) def val_dataloader(self): return DataLoader(self.val_dataset, batch_size=self.C["eval_batch_size"], shuffle=False, num_workers=max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=True) ================================================ FILE: pytabkit/models/nn_models/tabr_lib.py ================================================ import math import inspect import warnings import dataclasses from typing import Any, Callable, Optional, Union, cast, Iterator, Iterable, List, TypeVar import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch import Tensor from torch.nn.parameter import Parameter # we copied this file
from https://github.com/yandex-research/tabular-dl-tabr/blob/main/lib/deep.py # to limit the number of dependencies # ====================================================================================== # >>> modules <<< # ====================================================================================== # When an instance of ModuleSpec is a dict, # it must contain the key "type" with a string value ModuleSpec = Union[str, dict[str, Any], Callable[..., nn.Module]] T = TypeVar('T') def _initialize_embeddings(weight: Tensor, d: Optional[int]) -> None: if d is None: d = weight.shape[-1] d_sqrt_inv = 1 / math.sqrt(d) nn.init.uniform_(weight, a=-d_sqrt_inv, b=d_sqrt_inv) def make_trainable_vector(d: int) -> Parameter: x = torch.empty(d) _initialize_embeddings(x, None) return Parameter(x) # class OneHotEncoder(nn.Module): # cardinalities: Tensor # def __init__(self, cardinalities: list[int]) -> None: # # cardinalities[i]`` is the number of unique values for the i-th categorical feature. # super().__init__() # self.register_buffer('cardinalities', torch.tensor(cardinalities)) # def forward(self, x: Tensor) -> Tensor: # encoded_columns = [ # F.one_hot(x[..., column], cardinality) # for column, cardinality in zip(range(x.shape[-1]), self.cardinalities) # ] # return torch.cat(encoded_columns, -1) # This is modified to allow to encode unknown categories with zeros class OneHotEncoder(nn.Module): cardinalities: torch.Tensor def __init__(self, cardinalities: list[int]) -> None: super().__init__() self.register_buffer('cardinalities', torch.tensor(cardinalities)) def forward(self, x: torch.Tensor) -> torch.Tensor: encoded_columns = [] for column, cardinality in enumerate(self.cardinalities): column_values = x[..., column] # Replace -1 with a temporary valid index (e.g., 0) temp_index = torch.where(column_values == -1, 0, column_values) # Perform one-hot encoding one_hot = F.one_hot(temp_index, cardinality) # Zero out the vectors where original value was -1 mask = column_values == -1 one_hot[mask] = 0 encoded_columns.append(one_hot) return torch.cat(encoded_columns, -1) class CLSEmbedding(nn.Module): def __init__(self, d_embedding: int) -> None: super().__init__() self.weight = make_trainable_vector(d_embedding) def forward(self, x: Tensor) -> Tensor: assert x.ndim == 3 assert x.shape[-1] == len(self.weight) return torch.cat([self.weight.expand(len(x), 1, -1), x], dim=1) class CatEmbeddings(nn.Module): def __init__( self, _cardinalities_and_maybe_dimensions: Union[list[int], list[tuple[int, int]]], d_embedding: Optional[int] = None, *, stack: bool = False, ) -> None: assert _cardinalities_and_maybe_dimensions spec = _cardinalities_and_maybe_dimensions if not ( (isinstance(spec[0], tuple) and d_embedding is None) or (isinstance(spec[0], int) and d_embedding is not None) ): raise ValueError( 'Invalid arguments. 
Valid combinations are:' ' (1) the first argument is a list of (cardinality, embedding)-tuples AND d_embedding is None' ' (2) the first argument is a list of cardinalities AND d_embedding is an integer' ) if stack and d_embedding is None: raise ValueError('stack can be True only when d_embedding is not None') super().__init__() spec_ = cast( list[tuple[int, int]], spec if d_embedding is None else [(x, d_embedding) for x in spec], ) self._embeddings = nn.ModuleList() for cardinality, d_embedding in spec_: self._embeddings.append(nn.Embedding(cardinality, d_embedding)) self.stack = stack self.reset_parameters() def reset_parameters(self) -> None: for module in self._embeddings: _initialize_embeddings(module.weight, None) # type: ignore[code] def forward(self, x: Tensor) -> Tensor: assert x.ndim == 2 assert x.shape[1] == len(self._embeddings) out = [module(column) for module, column in zip(self._embeddings, x.T)] return torch.stack(out, dim=1) if self.stack else torch.cat(out, dim=1) class LinearEmbeddings(nn.Module): def __init__(self, n_features: int, d_embedding: int, bias: bool = True): super().__init__() self.weight = Parameter(Tensor(n_features, d_embedding)) self.bias = Parameter(Tensor(n_features, d_embedding)) if bias else None self.reset_parameters() def reset_parameters(self) -> None: for parameter in [self.weight, self.bias]: if parameter is not None: _initialize_embeddings(parameter, parameter.shape[-1]) def forward(self, x: Tensor) -> Tensor: assert x.ndim == 2 x = self.weight[None] * x[..., None] if self.bias is not None: x = x + self.bias[None] return x class PeriodicEmbeddings(nn.Module): def __init__( self, n_features: int, n_frequencies: int, frequency_scale: float ) -> None: super().__init__() self.frequencies = Parameter( torch.normal(0.0, frequency_scale, (n_features, n_frequencies)) ) def forward(self, x: Tensor) -> Tensor: assert x.ndim == 2 x = 2 * torch.pi * self.frequencies[None] * x[..., None] x = torch.cat([torch.cos(x), torch.sin(x)], -1) return x class NLinear(nn.Module): def __init__( self, n_features: int, d_in: int, d_out: int, bias: bool = True ) -> None: super().__init__() self.weight = Parameter(Tensor(n_features, d_in, d_out)) self.bias = Parameter(Tensor(n_features, d_out)) if bias else None with torch.no_grad(): for i in range(n_features): layer = nn.Linear(d_in, d_out) self.weight[i] = layer.weight.T if self.bias is not None: self.bias[i] = layer.bias def forward(self, x): assert x.ndim == 3 x = x[..., None] * self.weight[None] x = x.sum(-2) if self.bias is not None: x = x + self.bias[None] return x class LREmbeddings(nn.Sequential): """The LR embeddings from the paper 'On Embeddings for Numerical Features in Tabular Deep Learning'.""" # noqa: E501 def __init__(self, n_features: int, d_embedding: int) -> None: super().__init__(LinearEmbeddings(n_features, d_embedding), nn.ReLU()) class PLREmbeddings(nn.Sequential): """The PLR embeddings from the paper 'On Embeddings for Numerical Features in Tabular Deep Learning'. Additionally, the 'lite' option is added. Setting it to `False` gives you the original PLR embedding from the above paper. We noticed that `lite=True` makes the embeddings noticeably more lightweight without critical performance loss, and we used that for our model. 
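A shape sketch (illustrative, not part of the original code):

    emb = PLREmbeddings(n_features=3, n_frequencies=16, frequency_scale=0.1,
                        d_embedding=8, lite=True)
    x = torch.randn(32, 3)  # (batch, n_features)
    emb(x).shape            # torch.Size([32, 3, 8]) = (batch, n_features, d_embedding)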
""" # noqa: E501 def __init__( self, n_features: int, n_frequencies: int, frequency_scale: float, d_embedding: int, lite: bool, ) -> None: super().__init__( PeriodicEmbeddings(n_features, n_frequencies, frequency_scale), ( nn.Linear(2 * n_frequencies, d_embedding) if lite else NLinear(n_features, 2 * n_frequencies, d_embedding) ), nn.ReLU(), ) class PBLDEmbeddings(nn.Module): def __init__(self, n_features: int, n_frequencies: int, frequency_scale: float, d_embedding: int, plr_act_name: str = 'linear', plr_use_densenet: bool = True): super().__init__() print(f'Constructing PBLD embeddings') hidden_2 = d_embedding-1 if plr_use_densenet else d_embedding self.weight_1 = nn.Parameter(frequency_scale * torch.randn(n_features, 1, n_frequencies)) self.weight_2 = nn.Parameter((-1 + 2 * torch.rand(n_features, n_frequencies, hidden_2)) / np.sqrt(n_frequencies)) self.bias_1 = nn.Parameter(np.pi * (-1 + 2 * torch.rand(n_features, 1, n_frequencies))) self.bias_2 = nn.Parameter((-1 + 2 * torch.rand(n_features, 1, hidden_2)) / np.sqrt(n_frequencies)) self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet def forward(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight_1) # matmul is automatically batched x = x + self.bias_1 # x = torch.sin(x) x = torch.cos(x) x = x.matmul(self.weight_2) # matmul is automatically batched x = x + self.bias_2 if self.plr_act_name == 'relu': x = torch.relu(x) elif self.plr_act_name == 'linear': pass else: raise ValueError(f'Unknown plr_act_name "{self.plr_act_name}"') # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.plr_use_densenet: x = torch.cat([x, x_orig], dim=-1) return x class MLP(nn.Module): class Block(nn.Module): def __init__( self, *, d_in: int, d_out: int, bias: bool, activation: str, dropout: float, ) -> None: super().__init__() self.linear = nn.Linear(d_in, d_out, bias) self.activation = make_module(activation) self.dropout = nn.Dropout(dropout) def forward(self, x: Tensor) -> Tensor: return self.dropout(self.activation(self.linear(x))) Head = nn.Linear def __init__( self, *, d_in: int, d_out: Optional[int], n_blocks: int, d_layer: int, activation: str, dropout: float, ) -> None: assert n_blocks > 0 super().__init__() self.blocks = nn.Sequential( *[ MLP.Block( d_in=d_layer if block_i else d_in, d_out=d_layer, bias=True, activation=activation, dropout=dropout, ) for block_i in range(n_blocks) ] ) self.head = None if d_out is None else MLP.Head(d_layer, d_out) @property def d_out(self) -> int: return ( self.blocks[-1].linear.out_features # type: ignore[code] if self.head is None else self.head.out_features ) def forward(self, x: Tensor) -> Tensor: x = self.blocks(x) if self.head is not None: x = self.head(x) return x _CUSTOM_MODULES = { x.__name__: x for x in [ LinearEmbeddings, LREmbeddings, PLREmbeddings, PBLDEmbeddings, MLP, ] } def register_module(key: str, f: Callable[..., nn.Module]) -> None: assert key not in _CUSTOM_MODULES _CUSTOM_MODULES[key] = f def make_module(spec: ModuleSpec, *args, **kwargs) -> nn.Module: """ >>> make_module('ReLU') >>> make_module(nn.ReLU) >>> make_module('Linear', 1, out_features=2) >>> make_module((lambda *args: nn.Linear(*args)), 1, out_features=2) >>> make_module({'type': 'Linear', 
'in_features': 1}, out_features=2) """ if isinstance(spec, str): Module = getattr(nn, spec, None) if Module is None: Module = _CUSTOM_MODULES[spec] else: assert spec not in _CUSTOM_MODULES return make_module(Module, *args, **kwargs) elif isinstance(spec, dict): assert not (set(spec) & set(kwargs)) spec = spec.copy() return make_module(spec.pop('type'), *args, **spec, **kwargs) elif callable(spec): return spec(*args, **kwargs) else: raise ValueError() def get_n_parameters(m: nn.Module): return sum(x.numel() for x in m.parameters() if x.requires_grad) def get_d_out(n_classes: Optional[int]) -> int: return 1 if n_classes is None or n_classes == 2 else n_classes # ====================================================================================== # >>> optimization <<< # ====================================================================================== def default_zero_weight_decay_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: Parameter ): del module_name, parameter return parameter_name.endswith('bias') or isinstance( module, ( nn.BatchNorm1d, nn.LayerNorm, nn.InstanceNorm1d, LinearEmbeddings, PeriodicEmbeddings, ), ) def make_parameter_groups( model: nn.Module, zero_weight_decay_condition, custom_groups: dict[tuple[str], dict], # [(fullnames, options), ...] ) -> list[dict[str, Any]]: custom_fullnames = set() custom_fullnames.update(*custom_groups) assert sum(map(len, custom_groups)) == len( custom_fullnames ), 'Custom parameter groups must not intersect' parameters_info = {} # fullname -> (parameter, needs_wd) for module_name, module in model.named_modules(): for name, parameter in module.named_parameters(): fullname = f'{module_name}.{name}' if module_name else name parameters_info.setdefault(fullname, (parameter, []))[1].append( not zero_weight_decay_condition(module_name, module, name, parameter) ) parameters_info = {k: (v[0], all(v[1])) for k, v in parameters_info.items()} params_with_wd = {'params': []} params_without_wd = {'params': [], 'weight_decay': 0.0} custom_params = {k: {'params': []} | v for k, v in custom_groups.items()} for fullname, (parameter, needs_wd) in parameters_info.items(): for fullnames, group in custom_params.items(): if fullname in fullnames: custom_fullnames.remove(fullname) group['params'].append(parameter) break else: (params_with_wd if needs_wd else params_without_wd)['params'].append(parameter) assert ( not custom_fullnames ), f'Some of the custom parameters were not found in the model: {custom_fullnames}' return [params_with_wd, params_without_wd] + list(custom_params.values()) def make_optimizer( module: nn.Module, type: str, *, zero_weight_decay_condition=default_zero_weight_decay_condition, custom_parameter_groups: Optional[dict[tuple[str], dict]] = None, **optimizer_kwargs, ) -> torch.optim.Optimizer: if custom_parameter_groups is None: custom_parameter_groups = {} Optimizer = getattr(optim, type) parameter_groups = make_parameter_groups( module, zero_weight_decay_condition, custom_parameter_groups ) print(f'{optimizer_kwargs=}') return Optimizer(parameter_groups, **optimizer_kwargs) def get_lr(optimizer: optim.Optimizer) -> float: return next(iter(optimizer.param_groups))['lr'] def set_lr(optimizer: optim.Optimizer, lr: float) -> None: for group in optimizer.param_groups: group['lr'] = lr ## We also package useful delu functions to limit the number of dependencies # copied from https://github.com/Yura52/delu/blob/5f0015cbdff86f64aff8199123012a9663538fcf/delu/nn.py class Lambda(torch.nn.Module): """A wrapper for
functions from `torch` and methods of `torch.Tensor`. An important "feature" of this module is that it is intentionally limited: - Only the functions from the `torch` module and the methods of `torch.Tensor` are allowed. - The passed callable must accept a single `torch.Tensor` and return a single `torch.Tensor`. - The allowed keyword arguments must be of simple types (see the docstring). **Usage** >>> m = delu.nn.Lambda(torch.squeeze) >>> m(torch.randn(2, 1, 3, 1)).shape torch.Size([2, 3]) >>> m = delu.nn.Lambda(torch.squeeze, dim=1) >>> m(torch.randn(2, 1, 3, 1)).shape torch.Size([2, 3, 1]) >>> m = delu.nn.Lambda(torch.Tensor.abs_) >>> m(torch.tensor(-1.0)) tensor(1.) Custom functions are not allowed (technically, they are **temporarily** allowed, but this functionality is deprecated and will be removed in future releases): >>> # xdoctest: +SKIP >>> m = delu.nn.Lambda(lambda x: torch.abs(x)) Traceback (most recent call last): ... ValueError: fn must be a function from `torch` or a method of `torch.Tensor`, but ... Non-trivial keyword arguments are not allowed: >>> m = delu.nn.Lambda(torch.mul, other=torch.tensor(2.0)) Traceback (most recent call last): ... ValueError: For kwargs, the allowed value types include: ... """ # noqa: E501 def __init__(self, fn: Callable[..., torch.Tensor], /, **kwargs) -> None: """ Args: fn: the callable. kwargs: the keyword arguments for ``fn``. The allowed values types include: None, bool, int, float, bytes, str and (nested) tuples of these simple types. """ super().__init__() if not callable(fn) or ( fn not in vars(torch).values() and ( fn not in (member for _, member in inspect.getmembers(torch.Tensor)) or inspect.ismethod(fn) # Check if fn is a @classmethod ) ): warnings.warn( 'Passing custom functions to delu.nn.Lambda is deprecated' ' and will be removed in future releases.' ' Only functions from the `torch` module and methods of `torch.Tensor`' ' are allowed', DeprecationWarning, ) # NOTE: in future releases, replace the above warning with this exception: # raise ValueError( # 'fn must be a function from `torch` or a method of `torch.Tensor`,' # f' but this is not true for the passed {fn=}' # ) def is_valid_value(x): return ( x is None or isinstance(x, (bool, int, float, bytes, str)) or isinstance(x, tuple) and all(map(is_valid_value, x)) ) for k, v in kwargs.items(): if not is_valid_value(v): raise ValueError( 'For kwargs, the allowed value types include:' ' None, bool, int, float, bytes, str and (nested) tuples containing' ' values of these simple types. 
This is not true for the passed' f' argument {k} with the value {v}' ) self._function = fn self._function_kwargs = kwargs def forward(self, x: torch.Tensor) -> torch.Tensor: """Do the forward pass.""" return self._function(x, **self._function_kwargs) # copied from https://github.com/Yura52/delu/blob/5f0015cbdff86f64aff8199123012a9663538fcf/delu/_tensor_ops.py#L339 def _make_index_batches( x: torch.Tensor, batch_size: int, shuffle: bool, generator: Optional[torch.Generator], drop_last: bool, ) -> Iterable[torch.Tensor]: size = len(x) if not size: raise ValueError('data must not contain empty tensors') batch_indices = ( torch.randperm(size, generator=generator, device=x.device) if shuffle else torch.arange(size, device=x.device) ).split(batch_size) return ( batch_indices[:-1] if batch_indices and drop_last and len(batch_indices[-1]) < batch_size else batch_indices ) def iter_batches( data: T, /, batch_size: int, *, shuffle: bool = False, generator: Optional[torch.Generator] = None, drop_last: bool = False, ) -> Iterator[T]: """Iterate over a tensor or a collection of tensors by (random) batches. The function makes batches along the first dimension of the tensors in ``data``. TL;DR (assuming that ``X`` and ``Y`` denote full tensors and ``xi`` and ``yi`` denote batches): - ``delu.iter_batches: X -> [x1, x2, ..., xN]`` - ``delu.iter_batches: (X, Y) -> [(x1, y1), (x2, y2), ..., (xN, yN)]`` - ``delu.iter_batches: {'x': X, 'y': Y} -> [{'x': x1, 'y': y1}, ...]`` - Same for named tuples. - Same for dataclasses. .. note:: `delu.iter_batches` is significantly faster for in-memory tensors than `torch.utils.data.DataLoader`, because, when building batches, it uses batched indexing instead of one-by-one indexing. **Usage** >>> X = torch.randn(12, 32) >>> Y = torch.randn(12) `delu.iter_batches` can be applied to tensors: >>> for x in delu.iter_batches(X, batch_size=5): ... print(len(x)) 5 5 2 `delu.iter_batches` can be applied to tuples: >>> # shuffle=True can be useful for training. >>> dataset = (X, Y) >>> for x, y in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(len(x), len(y)) 5 5 5 5 2 2 >>> # Drop the last incomplete batch. >>> for x, y in delu.iter_batches( ... dataset, batch_size=5, shuffle=True, drop_last=True ... ): ... print(len(x), len(y)) 5 5 5 5 >>> # The last batch is complete, so drop_last=True does not have any effect. >>> batches = [] >>> for x, y in delu.iter_batches(dataset, batch_size=6, drop_last=True): ... print(len(x), len(y)) ... batches.append((x, y)) 6 6 6 6 By default, ``shuffle`` is set to `False`, i.e. the order of items is preserved: >>> X2, Y2 = delu.cat(list(delu.iter_batches((X, Y), batch_size=5))) >>> print((X == X2).all().item(), (Y == Y2).all().item()) True True `delu.iter_batches` can be applied to dictionaries: >>> dataset = {'x': X, 'y': Y} >>> for batch in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(isinstance(batch, dict), len(batch['x']), len(batch['y'])) True 5 5 True 5 5 True 2 2 `delu.iter_batches` can be applied to named tuples: >>> from typing import NamedTuple >>> class Data(NamedTuple): ... x: torch.Tensor ... y: torch.Tensor >>> dataset = Data(X, Y) >>> for batch in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(isinstance(batch, Data), len(batch.x), len(batch.y)) True 5 5 True 5 5 True 2 2 `delu.iter_batches` can be applied to dataclasses: >>> from dataclasses import dataclass >>> @dataclass ... class Data: ... x: torch.Tensor ... 
y: torch.Tensor >>> dataset = Data(X, Y) >>> for batch in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(isinstance(batch, Data), len(batch.x), len(batch.y)) True 5 5 True 5 5 True 2 2 Args: data: the tensor or the non-empty collection of tensors. If data is a collection, then the tensors must be of the same size along the first dimension. batch_size: the batch size. If ``drop_last`` is False, then the last batch can be smaller than ``batch_size``. shuffle: if True, iterate over random batches (without replacement), not sequentially. generator: when ``shuffle`` is True, passing ``generator`` makes the function reproducible. drop_last: when ``True`` and the last batch is smaller then ``batch_size``, then this last batch is not returned (in other words, same as the ``drop_last`` argument for `torch.utils.data.DataLoader`). Returns: the iterator over batches. """ if not shuffle and generator is not None: raise ValueError('When shuffle is False, generator must be None.') constructor: Callable[[Any], T] args = (batch_size, shuffle, generator, drop_last) if isinstance(data, torch.Tensor): item = data for idx in _make_index_batches(item, *args): yield data[idx] # type: ignore elif isinstance(data, tuple): if not data: raise ValueError('data must be non-empty') item = data[0] for x in data: if not isinstance(x, torch.Tensor) or len(x) != len(item): raise ValueError( 'If data is a tuple, it must contain only tensors,' ' and they must have the same first dimension' ) constructor = type(data) # type: ignore constructor = getattr(constructor, '_make', constructor) # Handle named tuples. for idx in _make_index_batches(item, *args): yield constructor(x[idx] for x in data) elif isinstance(data, dict): if not data: raise ValueError('data must be non-empty') item = next(iter(data.values())) for x in data.values(): if not isinstance(x, torch.Tensor) or len(x) != len(item): raise ValueError( 'If data is a dict, it must contain only tensors,' ' and they must have the same first dimension' ) constructor = type(data) # type: ignore for idx in _make_index_batches(item, *args): yield constructor((k, v[idx]) for k, v in data.items()) elif dataclasses.is_dataclass(data): fields = list(dataclasses.fields(data)) if not fields: raise ValueError('data must be non-empty') item = getattr(data, fields[0].name) for field in fields: if field.type is not torch.Tensor: raise ValueError('All dataclass fields must be tensors.') if len(getattr(data, field.name)) != len(item): raise ValueError( 'All dataclass tensors must have the same first dimension.' ) constructor = type(data) # type: ignore for idx in _make_index_batches(item, *args): yield constructor( **{field.name: getattr(data, field.name)[idx] for field in fields} # type: ignore ) else: raise ValueError(f'The collection {type(data)} is not supported.') def cat(data: List[T], /, dim: int = 0) -> T: """Concatenate a sequence of collections of tensors. `delu.cat` is a generalized version of `torch.cat` for concatenating not only tensors, but also (nested) collections of tensors. **Usage** Let's see how a sequence of model outputs for batches can be concatenated into a output tuple for the whole dataset: >>> from torch.utils.data import DataLoader, TensorDataset >>> dataset = TensorDataset(torch.randn(320, 24)) >>> batch_size = 32 >>> >>> # The model returns not only predictions, but also embeddings. >>> def model(x_batch): ... # A dummy forward pass. ... embeddings_batch = torch.randn(batch_size, 16) ... y_pred_batch = torch.randn(batch_size) ... 
return (y_pred_batch, embeddings_batch) ... >>> y_pred, embeddings = delu.cat( ... [model(batch) for batch in DataLoader(dataset, batch_size, shuffle=True)] ... ) >>> len(y_pred) == len(dataset) True >>> len(embeddings) == len(dataset) True The same works for dictionaries: >>> def model(x_batch): ... return { ... 'y_pred': torch.randn(batch_size), ... 'embeddings': torch.randn(batch_size, 16) ... } ... >>> outputs = delu.cat( ... [model(batch) for batch in DataLoader(dataset, batch_size, shuffle=True)] ... ) >>> len(outputs['y_pred']) == len(dataset) True >>> len(outputs['embeddings']) == len(dataset) True The same works for sequences of named tuples, dataclasses, tensors and nested combinations of all mentioned collection types. *Below, additional technical examples are provided.* The common setup: >>> # First batch. >>> x1 = torch.randn(64, 10) >>> y1 = torch.randn(64) >>> # Second batch. >>> x2 = torch.randn(64, 10) >>> y2 = torch.randn(64) >>> # The last (incomplete) batch. >>> x3 = torch.randn(7, 10) >>> y3 = torch.randn(7) >>> total_size = len(x1) + len(x2) + len(x3) `delu.cat` can be applied to tuples: >>> batches = [(x1, y1), (x2, y2), (x3, y3)] >>> X, Y = delu.cat(batches) >>> len(X) == total_size and len(Y) == total_size True `delu.cat` can be applied to dictionaries: >>> batches = [ ... {'x': x1, 'y': y1}, ... {'x': x2, 'y': y2}, ... {'x': x3, 'y': y3}, ... ] >>> result = delu.cat(batches) >>> isinstance(result, dict) True >>> len(result['x']) == total_size and len(result['y']) == total_size True `delu.cat` can be applied to named tuples: >>> from typing import NamedTuple >>> class Data(NamedTuple): ... x: torch.Tensor ... y: torch.Tensor ... >>> batches = [Data(x1, y1), Data(x2, y2), Data(x3, y3)] >>> result = delu.cat(batches) >>> isinstance(result, Data) True >>> len(result.x) == total_size and len(result.y) == total_size True `delu.cat` can be applied to dataclasses: >>> from dataclasses import dataclass >>> @dataclass ... class Data: ... x: torch.Tensor ... y: torch.Tensor ... >>> batches = [Data(x1, y1), Data(x2, y2), Data(x3, y3)] >>> result = delu.cat(batches) >>> isinstance(result, Data) True >>> len(result.x) == total_size and len(result.y) == total_size True `delu.cat` can be applied to nested collections: >>> batches = [ ... (x1, {'a': {'b': y1}}), ... (x2, {'a': {'b': y2}}), ... (x3, {'a': {'b': y3}}), ... ] >>> X, Y_nested = delu.cat(batches) >>> len(X) == total_size and len(Y_nested['a']['b']) == total_size True **Lists are not supported:** >>> # This does not work. Instead, use tuples. >>> # batches = [[x1, y1], [x2, y2], [x3, y3]] >>> # delu.cat(batches) # Error Args: data: the list of collections of tensors. All items of the list must be of the same type, structure and layout, only the ``dim`` dimension can vary (same as for `torch.cat`). All the "leaf" values must be of the type `torch.Tensor`. dim: the dimension along which the tensors are concatenated. Returns: The concatenated items of the list. """ if not isinstance(data, list): raise ValueError('The input must be a list') if not data: raise ValueError('The input must be non-empty') first = data[0] if isinstance(first, torch.Tensor): return torch.cat(data, dim=dim) # type: ignore elif isinstance(first, tuple): constructor = type(first) constructor = getattr(constructor, '_make', constructor) # Handle named tuples. 
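# For a NamedTuple subclass, type(first)(<iterable>) would fail because its __new__ expects one positional argument per field; the _make classmethod builds an instance from a single iterable instead. Plain tuples have no _make, so tuple(<iterable>) is used directly.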
return constructor( cat([x[i] for x in data], dim=dim) for i in range(len(first)) # type: ignore ) elif isinstance(first, dict): return type(first)((key, cat([x[key] for x in data], dim=dim)) for key in first) # type: ignore elif dataclasses.is_dataclass(first): return type(first)( **{ field.name: cat([getattr(x, field.name) for x in data], dim=dim) for field in dataclasses.fields(first) } ) # type: ignore else: raise ValueError(f'The collection type {type(first)} is not supported.') def is_oom_exception(err: RuntimeError) -> bool: return isinstance(err, torch.cuda.OutOfMemoryError) or any( x in str(err) for x in [ 'CUDA out of memory', 'CUBLAS_STATUS_ALLOC_FAILED', 'CUDA error: out of memory', ] ) ================================================ FILE: pytabkit/models/optim/__init__.py ================================================ ================================================ FILE: pytabkit/models/optim/adopt.py ================================================ # taken from https://github.com/iShohei220/adopt/blob/main/adopt.py # Apache 2.0 license # requires torch >= 2.4 # mypy: allow-untyped-decorators # mypy: allow-untyped-defs from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor from torch.optim.optimizer import ( _capturable_doc, _default_to_fused_or_foreach, _device_dtype_check_for_fused, _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, _fused_doc, _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, DeviceDict, Optimizer, ParamsT, ) __all__ = ["ADOPT", "adopt"] class ADOPT(Optimizer): def __init__( self, params: ParamsT, lr: Union[float, Tensor] = 1e-3, betas: Tuple[float, float] = (0.9, 0.9999), eps: float = 1e-6, weight_decay: float = 0.0, decoupled: bool = False, *, foreach: Optional[bool] = None, maximize: bool = False, capturable: bool = False, differentiable: bool = False, fused: Optional[bool] = None, ): if isinstance(lr, Tensor): if foreach and not capturable: raise ValueError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) if lr.numel() != 1: raise ValueError("Tensor lr must be 1-element") if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") if not 0.0 <= weight_decay: raise ValueError(f"Invalid weight_decay value: {weight_decay}") defaults = dict( lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, decoupled=decoupled, maximize=maximize, foreach=foreach, capturable=capturable, differentiable=differentiable, fused=fused, ) super().__init__(params, defaults) if fused: # TODO: support fused raise RuntimeError("`fused` is not currently supported") if differentiable: raise RuntimeError("`fused` does not support `differentiable`") self._step_supports_amp_scaling = True # TODO(crcrpar): [low prec params & their higher prec copy] # Support AMP with FP16/BF16 model params which would need # higher prec copy of params to do update math in higher prec to # alleviate the loss of information. 
if foreach: raise RuntimeError("`fused` and `foreach` cannot be `True` together.") def __setstate__(self, state): super().__setstate__(state) for group in self.param_groups: group.setdefault("maximize", False) group.setdefault("foreach", None) group.setdefault("capturable", False) group.setdefault("differentiable", False) fused = group.setdefault("fused", None) for p in group["params"]: p_state = self.state.get(p, []) if len(p_state) != 0 and not torch.is_tensor(p_state["step"]): step_val = float(p_state["step"]) p_state["step"] = ( torch.tensor( step_val, dtype=_get_scalar_dtype(is_fused=fused), device=p.device, ) if group["capturable"] or group["fused"] else torch.tensor(step_val, dtype=_get_scalar_dtype()) ) def _init_group( self, group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, ): has_complex = False for p in group["params"]: if p.grad is not None: has_complex |= torch.is_complex(p) params_with_grad.append(p) if p.grad.is_sparse: raise RuntimeError( "ADOPT does not support sparse gradients" ) grads.append(p.grad) state = self.state[p] # Lazy state initialization if len(state) == 0: if group["fused"]: _device_dtype_check_for_fused(p) # note(crcrpar): [special device hosting for step] # Deliberately host `step` on CPU if both capturable and fused are off. # This is because kernel launches are costly on CUDA and XLA. state["step"] = ( torch.zeros( (), dtype=_get_scalar_dtype(is_fused=group["fused"]), device=p.device, ) if group["capturable"] or group["fused"] else torch.tensor(0.0, dtype=_get_scalar_dtype()) ) # Exponential moving average of gradient values state["exp_avg"] = torch.zeros_like( p, memory_format=torch.preserve_format ) # Exponential moving average of squared gradient values state["exp_avg_sq"] = torch.zeros_like( p, memory_format=torch.preserve_format ) exp_avgs.append(state["exp_avg"]) exp_avg_sqs.append(state["exp_avg_sq"]) if group["differentiable"] and state["step"].requires_grad: raise RuntimeError( "`requires_grad` is not supported for `step` in differentiable mode" ) # Foreach without capturable does not support a tensor lr if ( group["foreach"] and torch.is_tensor(group["lr"]) and not group["capturable"] ): raise RuntimeError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) state_steps.append(state["step"]) return has_complex @_use_grad_for_differentiable def step(self, closure=None): """Perform a single optimization step. Args: closure (Callable, optional): A closure that reevaluates the model and returns the loss. 
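Illustrative usage (a sketch, not from the original repo):

    model = torch.nn.Linear(10, 1)
    opt = ADOPT(model.parameters(), lr=1e-3, decoupled=True)
    loss = model(torch.randn(8, 10)).pow(2).mean()
    loss.backward()
    opt.step()
    opt.zero_grad()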
""" self._cuda_graph_capture_health_check() loss = None if closure is not None: with torch.enable_grad(): loss = closure() for group in self.param_groups: params_with_grad: List[Tensor] = [] grads: List[Tensor] = [] exp_avgs: List[Tensor] = [] exp_avg_sqs: List[Tensor] = [] state_steps: List[Tensor] = [] beta1, beta2 = group["betas"] has_complex = self._init_group( group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, ) adopt( params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, has_complex=has_complex, beta1=beta1, beta2=beta2, lr=group["lr"], weight_decay=group["weight_decay"], decoupled=group["decoupled"], eps=group["eps"], maximize=group["maximize"], foreach=group["foreach"], capturable=group["capturable"], differentiable=group["differentiable"], fused=group["fused"], grad_scale=getattr(self, "grad_scale", None), found_inf=getattr(self, "found_inf", None), ) return loss def _single_tensor_adopt( params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], state_steps: List[Tensor], grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *, has_complex: bool, beta1: float, beta2: float, lr: Union[float, Tensor], weight_decay: float, decoupled: bool, eps: float, maximize: bool, capturable: bool, differentiable: bool, ): assert grad_scale is None and found_inf is None if torch.jit.is_scripting(): # this assert is due to JIT being dumb and not realizing that the ops below # have overloads to handle both float and Tensor lrs, so we just assert it's # a float since most people using JIT are using floats assert isinstance(lr, float) for i, param in enumerate(params): grad = grads[i] if not maximize else -grads[i] exp_avg = exp_avgs[i] exp_avg_sq = exp_avg_sqs[i] step_t = state_steps[i] # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: capturable_supported_devices = _get_capturable_supported_devices() assert ( param.device.type == step_t.device.type and param.device.type in capturable_supported_devices ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
# update step step_t += 1 if weight_decay != 0: if decoupled: param.add_(param, alpha=-lr*weight_decay) else: grad = grad.add(param, alpha=weight_decay) if torch.is_complex(param): grad = torch.view_as_real(grad) if exp_avg is not None: exp_avg = torch.view_as_real(exp_avg) if exp_avg_sq is not None: exp_avg_sq = torch.view_as_real(exp_avg_sq) param = torch.view_as_real(param) step = step_t if capturable or differentiable else _get_value(step_t) if step == 1: exp_avg_sq.addcmul_(grad, grad.conj()) continue denom = torch.clamp(exp_avg_sq.sqrt(), eps) if step == 2: exp_avg.addcdiv_(grad, denom) else: exp_avg.mul_(beta1).addcdiv_(grad, denom, value=1 - beta1) param.add_(exp_avg, alpha=-lr) exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2) def _multi_tensor_adopt( params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], state_steps: List[Tensor], grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *, has_complex: bool, beta1: float, beta2: float, lr: Union[float, Tensor], weight_decay: float, decoupled: bool, eps: float, maximize: bool, capturable: bool, differentiable: bool, ): if len(params) == 0: return if isinstance(lr, Tensor) and not capturable: raise RuntimeError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: capturable_supported_devices = _get_capturable_supported_devices( supports_xla=False ) assert all( p.device.type == step.device.type and p.device.type in capturable_supported_devices for p, step in zip(params, state_steps) ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." assert grad_scale is None and found_inf is None assert not differentiable, "_foreach ops don't support autograd" grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, state_steps] # type: ignore[list-item] ) for ( device_params_, device_grads_, device_exp_avgs_, device_exp_avg_sqs_, device_state_steps_, ), _ in grouped_tensors.values(): device_params = cast(List[Tensor], device_params_) device_grads = cast(List[Tensor], device_grads_) device_exp_avgs = cast(List[Tensor], device_exp_avgs_) device_exp_avg_sqs = cast(List[Tensor], device_exp_avg_sqs_) device_state_steps = cast(List[Tensor], device_state_steps_) # Handle complex parameters if has_complex: _view_as_real( device_params, device_grads, device_exp_avgs, device_exp_avg_sqs ) if maximize: device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just # wrapped it once now. The alpha is required to assure we go to the right overload. 
if not torch._utils.is_compiling() and device_state_steps[0].is_cpu: torch._foreach_add_( device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0 ) else: torch._foreach_add_(device_state_steps, 1) if weight_decay != 0: if decoupled: torch._foreach_add_(device_params, device_params, alpha=-lr*weight_decay) else: # Re-use the intermediate memory (device_grads) already allocated for maximize if maximize: torch._foreach_add_(device_grads, device_params, alpha=weight_decay) else: device_grads = torch._foreach_add( # type: ignore[assignment] device_grads, device_params, alpha=weight_decay ) if device_state_steps[0] == 1: torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads) continue exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs) exp_avg_sq_sqrt = torch._foreach_maximum(exp_avg_sq_sqrt, eps) if device_state_steps[0] == 2: torch._foreach_addcdiv_(device_exp_avgs, device_grads, exp_avg_sq_sqrt) else: torch._foreach_mul_(device_exp_avgs, beta1) torch._foreach_addcdiv_( device_exp_avgs, device_grads, exp_avg_sq_sqrt, value=1 - beta1 ) torch._foreach_add_(device_params, device_exp_avgs, alpha=-lr) torch._foreach_mul_(device_exp_avg_sqs, beta2) torch._foreach_addcmul_( device_exp_avg_sqs, device_grads, device_grads, value=1 - beta2 ) @_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adopt) def adopt( params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], state_steps: List[Tensor], # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting this as kwarg for now as functional API is compiled by torch/distributed/optim foreach: Optional[bool] = None, capturable: bool = False, differentiable: bool = False, fused: Optional[bool] = None, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None, has_complex: bool = False, *, beta1: float, beta2: float, lr: Union[float, Tensor], weight_decay: float, decoupled: bool, eps: float, maximize: bool, ): r"""Functional API that performs ADOPT algorithm computation. """ # Respect when the user inputs False/True for foreach or fused. We only want to change # the default when neither have been user-specified. Note that we default to foreach # and pass False to use_fused. This is not a mistake--we want to give the fused impl # bake-in time before making it the default, even if it is typically faster. if fused is None and foreach is None: _, foreach = _default_to_fused_or_foreach( params, differentiable, use_fused=False ) # Do not flip on foreach for the unsupported case where lr is a Tensor and capturable=False. 
if foreach and isinstance(lr, Tensor) and not capturable: foreach = False if fused is None: fused = False if foreach is None: foreach = False # this check is slow during compilation, so we skip it # if it's strictly needed we can add this check back in dynamo if not torch._utils.is_compiling() and not all( isinstance(t, torch.Tensor) for t in state_steps ): raise RuntimeError( "API has changed, `state_steps` argument must contain a list of singleton tensors" ) if foreach and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with foreach optimizers") if fused and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with fused optimizers") if fused and not torch.jit.is_scripting(): func = _fused_adopt elif foreach and not torch.jit.is_scripting(): func = _multi_tensor_adopt else: func = _single_tensor_adopt func( params, grads, exp_avgs, exp_avg_sqs, state_steps, has_complex=has_complex, beta1=beta1, beta2=beta2, lr=lr, weight_decay=weight_decay, decoupled=decoupled, eps=eps, maximize=maximize, capturable=capturable, differentiable=differentiable, grad_scale=grad_scale, found_inf=found_inf, ) ================================================ FILE: pytabkit/models/optim/optimizers.py ================================================ import warnings from collections import defaultdict from copy import deepcopy from itertools import chain from typing import Optional, Dict, Any, Set, DefaultDict, Iterable import torch import torch.optim as optim from torch.optim.optimizer import required, StateDict from pytabkit.models.training.coord import HyperparamManager from pytabkit.models.optim.scheduling_adam import SchedulingAdam class OptimizerBase(torch.optim.Optimizer): def __init__(self, opt, hyper_mappings, hp_manager: HyperparamManager): self.hp_manager = hp_manager self.hyper_getters = {} self.n_groups = len(opt.param_groups) for names, opt_name, defaults in hyper_mappings: if isinstance(names, str): names = (names,) defaults = (defaults,) for name, default in zip(names, defaults): self.hyper_getters[name] = [self.hp_manager.register_hyper(name, group['params'][0].context.scope, default=default) for group in opt.param_groups] super().__init__(opt.param_groups, defaults={}) self.hyper_mappings = hyper_mappings self.opt = opt def get_hyper_values(self, name, i, use_hyper_factor=True): value = self.hyper_getters[name][i]() param = self.opt.param_groups[i]['params'][0] # should only be one param if use_hyper_factor and name in param.hyper_factors: value *= param.hyper_factors[name] return value def step(self, closure=None, loss: Optional[torch.Tensor] = None): unhandled_mappings = [] for names, opt_name, defaults in self.hyper_mappings: if opt_name is None: unhandled_mappings.append((names, opt_name, defaults)) continue if isinstance(names, tuple): for i, group in enumerate(self.opt.param_groups): group[opt_name] = tuple(self.get_hyper_values(name, i) for name in names) elif isinstance(names, str): for i, group in enumerate(self.opt.param_groups): group[opt_name] = self.get_hyper_values(names, i) else: raise RuntimeError('Could not understand mapping key {}'.format(names)) for names, opt_name, defaults in unhandled_mappings: if names == 'wd': with torch.no_grad(): for i, group in enumerate(self.opt.param_groups): wd = self.get_hyper_values('wd', i) lr = self.get_hyper_values('lr', i) if wd != 0.0: for p in group['params']: p.mul_(1.0 - wd * lr * p.hyper_factors.get('wd', 1.0) * p.hyper_factors.get('lr', 1.0)) else: raise RuntimeError('Could not 
understand mapping {}'.format((names, opt_name, defaults))) self._opt_step_with_loss(loss) def train(self): if hasattr(self.opt, 'train') and callable(self.opt.train): # print('opt train') self.opt.train() def eval(self): if hasattr(self.opt, 'eval') and callable(self.opt.eval): # print('opt eval') self.opt.eval() def _opt_step_with_loss(self, loss: Optional[torch.Tensor]): self.opt.step() def __getstate__(self) -> Dict[str, Any]: # override the pickling method since otherwise self.opt is not restored return {'__dict__': self.__dict__} def __setstate__(self, state: Dict[str, Any]) -> None: # override the pickling method since otherwise self.opt is not restored self.__dict__ = state['__dict__'] class AdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.Adam(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class SchedulingAdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(SchedulingAdam(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class AMSGradOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.Adam(param_groups, amsgrad=True), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class AdamaxOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.Adamax(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class SGDOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.SGD(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), ('mom', 'momentum', 0.0), ('wd', None, 0.0)], hp_manager=hp_manager) class SFAdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager: HyperparamManager): from schedulefree import AdamWScheduleFree super().__init__(AdamWScheduleFree(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0), ('weight_decay', 'weight_decay', 0.0), ('warmup_steps', 'warmup_steps', 0)], hp_manager=hp_manager) class MoMoAdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager: HyperparamManager): from momo import MomoAdam super().__init__(MomoAdam(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) def _opt_step_with_loss(self, loss: Optional[torch.Tensor]): self.opt.step(loss=loss) class AdoptOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager: HyperparamManager): from .adopt import ADOPT super().__init__(ADOPT(param_groups, decoupled=True), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) def get_opt_class(opt_name): if opt_name == 'adam': return AdamOptimizer elif opt_name == 'adamax': return AdamaxOptimizer elif opt_name == 'sgd': return SGDOptimizer elif opt_name == 'amsgrad': return AMSGradOptimizer elif opt_name == 'sched_adam': return SchedulingAdamOptimizer elif opt_name == 'sfadam': return SFAdamOptimizer 
elif opt_name == 'momoadam': return MoMoAdamOptimizer elif opt_name == 'adopt': return AdoptOptimizer else: raise ValueError(f'Unknown optimizer "{opt_name}"') ================================================ FILE: pytabkit/models/optim/scheduling_adam.py ================================================ import torch from torch.optim import Optimizer import math # modification of normal adam to properly handle varying betas class SchedulingAdam(Optimizer): r"""Implements Adam algorithm. It has been proposed in `Adam: A Method for Stochastic Optimization`_. The implementation of the L2 penalty follows changes proposed in `Decoupled Weight Decay Regularization`_. Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101 .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) super(SchedulingAdam, self).__init__(params, defaults) def __setstate__(self, state): super(SchedulingAdam, self).__setstate__(state) for group in self.param_groups: group.setdefault('amsgrad', False) @torch.no_grad() def step(self, closure=None): """Performs a single optimization step. Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: with torch.enable_grad(): loss = closure() for group in self.param_groups: params_with_grad = [] grads = [] exp_avgs = [] exp_avg_sqs = [] state_sums = [] max_exp_avg_sqs = [] state_steps = [] beta1, beta2 = group['betas'] for p in group['params']: if p.grad is not None: params_with_grad.append(p) if p.grad.is_sparse: raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') grads.append(p.grad) state = self.state[p] # Lazy state initialization if len(state) == 0: state['step'] = 0 state['beta1_prod'] = 1.0 state['beta2_prod'] = 1.0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) if group['amsgrad']: # Maintains max of all exp. moving avg. of sq. grad. 
values state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) exp_avgs.append(state['exp_avg']) exp_avg_sqs.append(state['exp_avg_sq']) if group['amsgrad']: max_exp_avg_sqs.append(state['max_exp_avg_sq']) # update the steps for each param group update state['step'] += 1 state['beta1_prod'] *= beta1 state['beta2_prod'] *= beta2 # record the step after step update state_steps.append(state['step']) lr = group['lr'] weight_decay = group['weight_decay'] eps = group['eps'] amsgrad = group['amsgrad'] for i, param in enumerate(params_with_grad): grad = grads[i] exp_avg = exp_avgs[i] exp_avg_sq = exp_avg_sqs[i] bias_correction1 = 1 - self.state[param]['beta1_prod'] bias_correction2 = 1 - self.state[param]['beta2_prod'] if weight_decay != 0: grad = grad.add(param, alpha=weight_decay) # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i]) # Use the max. for normalizing running avg. of gradient denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps) else: denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) step_size = lr / bias_correction1 param.addcdiv_(exp_avg, denom, value=-step_size) return loss ================================================ FILE: pytabkit/models/sklearn/__init__.py ================================================ ================================================ FILE: pytabkit/models/sklearn/default_params.py ================================================ import numpy as np from pytabkit.models import utils class DefaultParams: RealMLP_TD_CLASS = dict( hidden_sizes=[256] * 3, max_one_hot_cat_size=9, embedding_size=8, weight_param='ntk', bias_lr_factor=0.1, act='selu', use_parametric_act=True, act_lr_factor=0.1, block_str='w-b-a-d', p_drop=0.15, p_drop_sched='flat_cos', add_front_scale=True, scale_lr_factor=6.0, bias_init_mode='he+5', weight_init_mode='std', wd=2e-2, wd_sched='flat_cos', bias_wd_factor=0.0, use_ls=True, ls_eps=0.1, num_emb_type='pbld', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, lr=4e-2, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], n_epochs=256, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) RealMLP_TD_S_CLASS = dict( hidden_sizes=[256] * 3, weight_param='ntk', bias_lr_factor=0.1, act='selu', block_str='w-b-a', add_front_scale=True, scale_lr_factor=6.0, bias_init_mode='normal', weight_init_mode='normal', last_layer_config=dict(bias_init_mode='zeros', weight_init_mode='zeros'), use_ls=True, ls_eps=0.1, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip'], n_epochs=256, lr=4e-2, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) RealMLP_TD_REG = dict( hidden_sizes=[256] * 3, max_one_hot_cat_size=9, embedding_size=8, weight_param='ntk', weight_init_mode='std', bias_init_mode='he+5', bias_lr_factor=0.1, act='mish', use_parametric_act=True, act_lr_factor=0.1, wd=2e-2, wd_sched='flat_cos', bias_wd_factor=0.0, block_str='w-b-a-d', p_drop=0.15, p_drop_sched='flat_cos', add_front_scale=True, scale_lr_factor=6.0, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], num_emb_type='pbld', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, clamp_output=True, normalize_output=True, lr=0.2, n_epochs=256, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) 
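    # Note (sketch, not part of the original file): these dicts are not applied directly; the
    # scikit-learn wrappers return them from _get_default_params(), and get_config() in
    # sklearn_base.py only falls back to a default where the corresponding constructor argument
    # is None or missing. A hypothetical usage:
    #   RealMLP_TD_Classifier(lr=1e-2)  # keeps lr=1e-2, takes the remaining values from RealMLP_TD_CLASS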
RealMLP_TD_S_REG = dict( hidden_sizes=[256] * 3, weight_param='ntk', bias_lr_factor=0.1, bias_init_mode='normal', weight_init_mode='normal', last_layer_config=dict(bias_init_mode='zeros', weight_init_mode='zeros'), act='mish', normalize_output=True, block_str='w-b-a', add_front_scale=True, scale_lr_factor=6.0, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip'], n_epochs=256, lr=7e-2, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) # -------- GBDTs ------------ LGBM_TD_CLASS = dict( n_estimators=1000, lr=4e-2, subsample=0.75, colsample_bytree=1.0, num_leaves=50, bagging_freq=1, min_data_in_leaf=40, min_sum_hessian_in_leaf=1e-7, max_bin=255, early_stopping_rounds=300, ) LGBM_TD_REG = dict( n_estimators=1000, lr=5e-2, subsample=0.7, colsample_bytree=1.0, num_leaves=100, max_bin=255, bagging_freq=1, min_data_in_leaf=3, min_sum_hessian_in_leaf=1e-7, early_stopping_rounds=300, ) XGB_TD_CLASS = dict( n_estimators=1000, lr=8e-2, min_child_weight=5e-6, reg_lambda=0.0, max_depth=6, colsample_bylevel=0.9, subsample=0.65, tree_method='hist', max_bin=256, early_stopping_rounds=300, ) XGB_TD_REG = dict( n_estimators=1000, max_depth=9, tree_method='hist', max_bin=256, lr=5e-2, min_child_weight=2.0, reg_lambda=0.0, subsample=0.7, early_stopping_rounds=300, ) # from Probst, Boulestix, and Bischl, "Tunability: Importance of ..." XGB_PBB_CLASS = dict( n_estimators=4168, lr=0.018, min_child_weight=2.06, max_depth=13, reg_lambda=0.982, reg_alpha=1.113, subsample=0.839, colsample_bytree=0.752, colsample_bylevel=0.585, tree_method='hist', max_n_threads=64, tfms=['one_hot'], max_one_hot_cat_size=20 ) CB_TD_CLASS = dict( n_estimators=1000, lr=8e-2, l2_leaf_reg=1e-5, boosting_type='Plain', bootstrap_type='Bernoulli', subsample=0.9, max_depth=7, random_strength=0.8, one_hot_max_size=15, leaf_estimation_iterations=1, max_bin=254, early_stopping_rounds=300, ) CB_TD_REG = dict( n_estimators=1000, lr=9e-2, l2_leaf_reg=1e-5, boosting_type='Plain', bootstrap_type='Bernoulli', subsample=0.9, max_depth=9, random_strength=0.0, max_bin=254, one_hot_max_size=20, leaf_estimation_iterations=20, early_stopping_rounds=300, ) # RTDL params RESNET_RTDL_D_CLASS_Grinsztajn = { "lr_scheduler": False, "module_activation": "reglu", "module_normalization": "batchnorm", "module_n_layers": 8, "module_d": 256, "module_d_hidden_factor": 2, "module_hidden_dropout": 0.2, "module_residual_dropout": 0.2, "lr": 1e-3, "optimizer_weight_decay": 1e-7, "optimizer": "adamw", "module_d_embedding": 128, "batch_size": 256, "max_epochs": 300, "use_checkpoints": True, "es_patience": 40, "lr_patience": 30, "verbose": 0, 'tfms': ['quantile'], } RESNET_RTDL_D_REG_Grinsztajn = {**RESNET_RTDL_D_CLASS_Grinsztajn, "transformed_target": True} MLP_RTDL_D_CLASS_Grinsztajn = { "lr_scheduler": False, "module_n_layers": 8, "module_d_layers": 256, "module_d_first_layer": 128, "module_d_last_layer": 128, "module_dropout": 0.2, "lr": 1e-3, "optimizer": "adamw", "module_d_embedding": 128, "batch_size": 256, "max_epochs": 300, "use_checkpoints": True, "es_patience": 40, "lr_patience": 30, "verbose": 0, 'tfms': ['quantile'], } MLP_RTDL_D_REG_Grinsztajn = {**MLP_RTDL_D_CLASS_Grinsztajn, "transformed_target": True} FTT_D_CLASS = { "lr_scheduler": False, "module_d_token": 192, "module_d_ffn_factor": 4. 
/ 3., "module_n_layers": 3, "module_n_heads": 8, "module_activation": "reglu", "module_token_bias": True, "module_attention_dropout": 0.2, "module_initialization": "kaiming", "module_ffn_dropout": 0.1, "module_residual_dropout": 0.0, "module_prenormalization": True, "module_kv_compression": None, "module_kv_compression_sharing": None, "lr": 1e-4, "optimizer": "adamw", "optimizer_weight_decay": 1e-5, "batch_size": 256, # default in Grinsztajn is 512? "max_epochs": 300, # todo: keep it? "use_checkpoints": True, "es_patience": 16, # value from Gorishniy et al. "lr_patience": 30, "verbose": 0, "tfms": ['quantile_tabr'], } FTT_D_REG = {**FTT_D_CLASS, "transformed_target": True} # Default parameters for rtdl models based on https://github.com/naszilla/tabzilla/blob/main/TabZilla/models/rtdl.py RESNET_RTDL_D_CLASS_TabZilla = { "lr_scheduler": False, "module_activation": "relu", "module_normalization": "batchnorm", "module_n_layers": 2, "module_d": 128, "module_d_hidden_factor": 2, "module_hidden_dropout": 0.25, # DROPOUT_FIRST "module_residual_dropout": 0.1, # DROPOUT_SECOND "lr": 1e-3, "optimizer_weight_decay": 0.01, # for tabzilla they don't set it which means 0.01 (which seems high compared # to rtdl hp space?) "optimizer": "adamw", "module_d_embedding": 8, "batch_size": 128, # default param in https://github.com/naszilla/tabzilla/blob/4949a1dea3255c1a794d89aa2422ef1f8c9ae265/README.md?plain=1#L129 "max_epochs": 1000, # same "use_checkpoints": True, "es_patience": 20, # same "lr_patience": 30, "verbose": 0, 'tfms': ['quantile_tabr'], } RESNET_RTDL_D_REG_TabZilla = {**RESNET_RTDL_D_CLASS_TabZilla, "transformed_target": True} MLP_RTDL_D_CLASS_TabZilla = { "lr_scheduler": False, "module_n_layers": 3, "module_d_first_layer": 128, # ignored by the code since d_layers is a list "module_d_last_layer": 128, # ignored by the code since d_layers is a list "module_d_layers": [128, 256, 128], "module_dropout": 0.1, # module_activation # module_dropout # optimizer_weight_decay "lr": 1e-3, "optimizer": "adamw", "module_d_embedding": 8, "batch_size": 128, # default param in https://github.com/naszilla/tabzilla/blob/4949a1dea3255c1a794d89aa2422ef1f8c9ae265/README.md?plain=1#L129 "max_epochs": 1000, # same "use_checkpoints": True, "es_patience": 20, # same "lr_patience": 30, "verbose": 0, 'tfms': ['quantile_tabr'], } MLP_RTDL_D_REG_TabZilla = {**MLP_RTDL_D_CLASS_TabZilla, "transformed_target": True} MLP_PLR_D_CLASS = { # adapted from TabZilla version of MLP_RTDL_D and the defaults of the rtdl_num_embeddings library "lr_scheduler": False, "module_n_layers": 3, "module_d_first_layer": 128, # ignored by the code since d_layers is a list "module_d_last_layer": 128, # ignored by the code since d_layers is a list "module_d_layers": [128, 256, 128], "module_dropout": 0.1, "lr": 1e-3, "optimizer": "adamw", "module_d_embedding": 8, "batch_size": 128, # default param in https://github.com/naszilla/tabzilla/blob/4949a1dea3255c1a794d89aa2422ef1f8c9ae265/README.md?plain=1#L129 "max_epochs": 1000, # same "use_checkpoints": True, "es_patience": 20, # same "lr_patience": 30, "verbose": 0, 'tfms': ['quantile_tabr'], "module_num_emb_type": 'plr', "module_num_emb_dim": 24, "module_num_emb_hidden_dim": 48, "module_num_emb_sigma": 0.01, "module_num_emb_lite": False } MLP_PLR_D_REG = {**MLP_PLR_D_CLASS, "transformed_target": True} TABR_S_D_CLASS = { "num_embeddings": None, "d_main": 265, "context_dropout": 0.38920071545944357, # named mixer_dropout sometimes I think "d_multiplier": 2.0, "encoder_n_blocks": 0, "predictor_n_blocks": 
1, "mixer_normalization": "auto", "dropout0": 0.38852797479169876, "dropout1": 0.0, "normalization": "LayerNorm", "activation": "ReLU", "batch_size": "auto", # adapt given the dataset size "eval_batch_size": 4096, # TODO: automatically infer given memory "patience": 16, "n_epochs": 100_000, # inf in paper "context_size": 96, "freeze_contexts_after_n_epochs": None, "optimizer": { "type": "AdamW", "lr": 0.0003121273641315169, "weight_decay": 1.2260352006404615e-06 }, 'tfms': ['quantile_tabr'], } TABR_S_D_REG = {**TABR_S_D_CLASS, "transformed_target": True} TABR_S_D_CLASS_FREEZE = { **TABR_S_D_CLASS, "freeze_contexts_after_n_epochs": 4, } TABR_S_D_REG_FREEZE = { **TABR_S_D_REG, "freeze_contexts_after_n_epochs": 4, } RealTABR_D_CLASS = { "d_main": 265, "context_dropout": 0.38920071545944357, # named mixer_dropout sometimes I think "d_multiplier": 2.0, "encoder_n_blocks": 0, "predictor_n_blocks": 1, "mixer_normalization": "auto", "dropout0": 0.38852797479169876, "dropout1": 0.0, "normalization": "LayerNorm", "activation": "ReLU", "batch_size": "auto", # adapt given the dataset size "eval_batch_size": 4096, "patience": 16, "n_epochs": 100_000, # inf in paper "context_size": 96, "freeze_contexts_after_n_epochs": None, 'num_embeddings': { 'type': "PBLDEmbeddings", 'n_frequencies': 8, # not 16 because of RAM issues on meta-test 'd_embedding': 4, 'frequency_scale': 0.1, }, 'tfms': ['median_center', 'robust_scale', 'smooth_clip'], 'optimizer': { "type": "AdamW", "lr": 0.0003121273641315169, "weight_decay": 1.2260352006404615e-06, "betas": (0.9, 0.95), }, 'add_scaling_layer': True, 'scale_lr_factor': 96, 'ls_eps': 0.1, } RealTABR_D_REG = {**RealTABR_D_CLASS, "transformed_target": True} TABM_D_CLASS = { # from https://github.com/yandex-research/tabm/blob/main/example.ipynb 'arch_type': 'tabm', 'tabm_k': 32, 'num_emb_type': 'none', 'num_emb_n_bins': 48, 'batch_size': 256, 'lr': 2e-3, 'weight_decay': 0.0, 'n_epochs': 1_000_000_000, 'patience': 16, 'd_embedding': 16, 'd_block': 512, 'n_blocks': 'auto', 'dropout': 0.1, 'compile_model': False, 'allow_amp': False, 'tfms': ['quantile_tabr'], 'gradient_clipping_norm': None, # set to 1.0 in TabR paper experiments } TABM_D_REG = TABM_D_CLASS VANILLA_MLP_CLASS = dict( hidden_sizes=[256] * 3, p_drop=0.0, wd=0.0, block_str='w-b-a-d', opt='adam', tfms=['quantile', 'one_hot'], batch_size=256, n_epochs=256, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. 
/ np.sqrt(3.), bias_init_mode='pytorch-default', lr=1e-3, lr_sched='constant', max_n_vectorized=1, # this is because of the preprocessing use_last_best_epoch=False, ) VANILLA_MLP_REG = utils.join_dicts(VANILLA_MLP_CLASS, dict(normalize_output=True)) XRFM_D_CLASS = dict( bandwidth=10.0, p_interp=1.0, exponent=1.0, reg=1e-3, iters=5, diag=True, bandwidth_mode='constant', kernel_type='l2', max_leaf_samples=60_000, early_stop_rfm=True, early_stop_multiplier=1.1, classification_mode='prevalence', M_batch_size=8000, ) XRFM_D_REG = XRFM_D_CLASS # ----- sklearn versions ------ LGBM_D = dict( n_estimators=100, ) XGB_D = dict( n_estimators=100, tree_method='hist', ) CB_D = dict( n_estimators=1000, ) RF_SKL_D = dict( tfms=['ordinal_encoding'], permute_ordinal_encoding=True, ) MLP_SKL_D = dict( tfms=['mean_center', 'l2_normalize', 'one_hot'] ) ================================================ FILE: pytabkit/models/sklearn/sklearn_base.py ================================================ import copy from pathlib import Path from typing import Dict, Any, Optional, Union, List from warnings import warn from packaging.version import Version import numpy as np import pandas as pd import scipy.sparse import sklearn import torch import multiprocessing as mp from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.exceptions import DataConversionWarning from sklearn.metrics._dist_metrics import check_array from sklearn.preprocessing import OrdinalEncoder from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_is_fitted, check_X_y from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources from pytabkit.models.data.data import DictDataset, TensorInfo from pytabkit.models.data.splits import RandomSplitter, KFoldSplitter from pytabkit.models.data.conversion import ToDictDatasetConverter from pytabkit.models.torch_utils import get_available_device_names from pytabkit.models.training.logging import StdoutLogger def to_df(x) -> pd.DataFrame: try: return pd.DataFrame(x) except: pass return pd.DataFrame(np.array(x)) def to_normal_type(x) -> Any: if isinstance(x, pd.DataFrame) or isinstance(x, list) or isinstance(x, np.ndarray) or isinstance(x, pd.Series): return x return np.asarray(x) def concat_arrays(x1, x2) -> Any: if type(x1) != type(x2): raise ValueError(f'Arrays must have the same type, but got {type(x1)=} and {type(x2)=}') if isinstance(x1, pd.DataFrame) or isinstance(x1, pd.Series): return pd.concat([x1, x2], axis=0, ignore_index=True) return np.concatenate([x1, x2], axis=0) def check_X_y_wrapper(*args, **kwargs): if Version(sklearn.__version__) >= Version("1.8.0"): if 'force_all_finite' in kwargs: kwargs['ensure_all_finite'] = kwargs['force_all_finite'] del kwargs['force_all_finite'] else: if 'ensure_all_finite' in kwargs: kwargs['force_all_finite'] = kwargs['ensure_all_finite'] del kwargs['ensure_all_finite'] check_X_y(*args, **kwargs) def check_array_wrapper(*args, **kwargs): if Version(sklearn.__version__) >= Version("1.8.0"): if 'force_all_finite' in kwargs: kwargs['ensure_all_finite'] = kwargs['force_all_finite'] del kwargs['force_all_finite'] else: if 'ensure_all_finite' in kwargs: kwargs['force_all_finite'] = kwargs['ensure_all_finite'] del kwargs['ensure_all_finite'] check_array(*args, **kwargs) class AlgInterfaceEstimator(BaseEstimator): """ Base class for wrapping AlgInterface subclasses with a scikit-learn 
compatible interface. """ def _create_alg_interface(self, n_cv: int) -> AlgInterface: # override this raise NotImplementedError() def _supports_multioutput(self) -> bool: # only relevant for regression, override this if multioutput is not supported return True def _supports_single_class(self) -> bool: # only relevant for classification, # override this if training with only a single class in the training set is not supported return True def _supports_single_sample(self) -> bool: return True def _non_deterministic_tag(self) -> bool: return False def _is_classification(self) -> bool: raise NotImplementedError() def _get_default_params(self) -> Dict[str, Any]: # override this in subclasses to handle default parameters that should not be treated in the constructor # e.g. because their default values are mutable (list/dict/...) return dict() def _allowed_device_names(self) -> List[str]: # override in subclasses that allow to run on a GPU or mps return ['cpu'] def _more_tags(self): return dict(non_deterministic=self._non_deterministic_tag()) def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.non_deterministic = self._non_deterministic_tag() return tags def get_config(self) -> Dict[str, Any]: """ Augments the result from self.get_params() with the parameters from self._get_default_params(). Uses _preprocess_config_key() to change the names from self.get_params() if implemented. Default parameters are used if the value in get_params() is either None or not present. :return: Dictionary of parameters augmented with default parameters. """ params = {key: value for key, value in self.get_params(deep=False).items()} default_params = self._get_default_params() for key, value in default_params.items(): if key not in params or params[key] is None: params[key] = value # print(f'{params=}') # return params # remove None values return {key: value for key, value in params.items() if value is not None} def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Optional[np.ndarray] = None, cat_indicator: Optional[Union[List[bool], np.ndarray]] = None, cat_col_names: Optional[List[str]] = None, time_to_fit_in_seconds: Optional[int] = None) -> BaseEstimator: """ Fit the estimator. :param X: Inputs (covariates). pandas DataFrame, numpy array, or similar array-like. :param y: Labels (targets, variates). pandas DataFrame/Series, numpy array, or similar array-like. :param X_val: Inputs for validation set. Can only be used if n_cv is not set to a value other than 1, and if val_idxs is not used. If X_val is used, X will be used for the training set only, instead of getting validation data from X. :param y_val: Labels for the validation set. :param val_idxs: Indices of validation set elements within X and y (optional). Can be an array of shape (n_val_samples,) or (n_val_splits,n_val_samples_per_split). In the latter case, the results of the models on the validation splits will be ensembled. :param cat_indicator: Which features/columns are categorical, specified as a list or array of booleans. If this is not specified, all columns with category/string/object dtypes are interpreted as categorical and all others as numerical. :param cat_col_names: List of column names that should be treated as categorical (if X is a pd.DataFrame). Can be specified instead of cat_indicator. :param time_to_fit_in_seconds: Time limit in seconds for fitting. Currently only implemented for RealMLP (default=None). If None, no time limit will be applied. :return: Returns self. 
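        Example (a minimal sketch; ``X_train``, ``y_train``, ``X_val``, ``y_val`` are hypothetical
        array-likes)::

            est = est.fit(X_train, y_train, X_val=X_val, y_val=y_val)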
""" # do a first check, this includes to check if X or y are not None before other things are done to them check_X_y_wrapper(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None) # if X is None: # raise ValueError(f'This estimator requires X to be passed, but X is None') # if y is None: # # this message has to include the special text # # "requires y to be passed, but the target y is None" # # or one of the other particular messages # # for the estimator test "check_requires_y_none" to not fail # # it doesn't work automatically because of the to_normal_type(y) before the check_X_y # raise ValueError(f'This estimator requires y to be passed, but the target y is None') for arr in [X, y, X_val, y_val]: if scipy.sparse.issparse(arr): raise ValueError(f'Sparse arrays are not supported!') # print(f'{X=}') # print(f'{y=}') X = to_normal_type(X) y = to_normal_type(y) # need to convert array-like objects to arrays for self.is_y_1d_ params = self.get_config() n_cv = params.get('n_cv', 1) n_repeats = params.get('n_repeats', 1) # val_fraction is only relevant for n_cv == 1 val_fraction = params.get('val_fraction', 0.2) n_refit = params.get('n_refit', 0) if X_val is not None and y_val is None: raise ValueError(f'X_val is not None but y_val is None') elif X_val is None and y_val is not None: raise ValueError(f'X_val is None but y_val is not None') if X_val is not None and y_val is not None: if val_idxs is not None: raise ValueError(f'both val_idxs and X_val, y_val were provided') X_val = to_normal_type(X_val) y_val = to_normal_type(y_val) val_idxs = np.arange(len(X), len(X) + len(X_val)) X = concat_arrays(X, X_val) y = concat_arrays(y, y_val) # check again with the validation set concatenated check_X_y_wrapper(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None) if self._is_classification(): # classes_ is overridden later, but this raises an error when y is a regression target, so it is useful self.classes_ = unique_labels(y) self.is_y_1d_ = isinstance(y, pd.Series) or (isinstance(y, np.ndarray) and len(y.shape) == 1) if isinstance(y, list): if len(np.asarray(y).shape) == 1: self.is_y_1d_ = True # if not (isinstance(y, np.ndarray) or isinstance(y, list) # or isinstance(y, pd.DataFrame) or isinstance(y, pd.Series)): # raise ValueError(f'y has type {type(y)}, but should be one of np.ndarray, list, pd.DataFrame, or pd.Series') # y_df = pd.DataFrame(y) X_df = to_df(X).copy() y_df = to_df(y).copy() # self.y_encoder_.fit_transform(y) if cat_col_names is not None: if cat_indicator is not None: raise ValueError(f'Specified both cat_col_names and cat_indicator') cat_indicator = [col_name in cat_col_names for col_name in X_df.columns] self.x_converter_ = ToDictDatasetConverter(cat_features=cat_indicator, verbosity=params.get('verbosity', 0)) self.y_encoder_ = OrdinalEncoder(dtype=np.int64) # only used for classification if not self._supports_single_sample() and len(X_df) == 1: raise ValueError('Training with one sample is not supported!') x_ds = self.x_converter_.fit_transform(X_df) if torch.any(torch.isnan(x_ds.tensors['x_cont'])): raise ValueError('NaN values in continuous columns are currently not allowed!') self.is_y_float64_ = False # checked later in the regression case # convert y if self._is_classification(): self.y_encoder_ = OrdinalEncoder(dtype=np.int64) y_tfmd = self.y_encoder_.fit_transform(y_df) if len(y_tfmd.shape) == 1: y_tfmd = y_tfmd[:, None] if len(y_tfmd.shape) != 2: raise ValueError('len(y.shape) != 2') if y_tfmd.shape[1] != 1: raise ValueError('Multilabel 
classification is not supported!') if not self.is_y_1d_: warn( ( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel()." ), DataConversionWarning, stacklevel=2, ) y_ds = DictDataset(tensors={'y': torch.as_tensor(y_tfmd, dtype=torch.long)}, tensor_infos={'y': TensorInfo(cat_sizes=[int(np.max(y_tfmd) + 1)])}) self.classes_ = self.y_encoder_.categories_[0] if not self._supports_single_class() and len(self.classes_) == 1: raise ValueError(f'Training with only one class in the training set is not supported!') else: # regression if y_df[y_df.columns[0]].dtype == np.float64: self.is_y_float64_ = True y_tfmd = y_df.to_numpy(dtype=np.float32) if len(y_tfmd.shape) == 1: y_tfmd = y_tfmd[:, None] if len(y_tfmd.shape) != 2: raise ValueError('len(y.shape) != 2') y_ds = DictDataset(tensors={'y': torch.as_tensor(y_tfmd, dtype=torch.float32)}, tensor_infos={'y': TensorInfo(feat_shape=[y_tfmd.shape[1]])}) if not self._supports_multioutput() and not self.is_y_1d_: warn( ( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel()." ), DataConversionWarning, stacklevel=2, ) if not self._supports_multioutput() and y_ds.tensor_infos['y'].get_n_features() > 1: raise ValueError('Multioutput regression is not supported, ' 'please wrap this estimator with the MultiOutputRegressor ' 'from scikit-learn.') ds = DictDataset.join(x_ds, y_ds) # set n_features_in_ as required by https://scikit-learn.org/stable/developers/develop.html self.n_features_in_ = ds.tensor_infos['x_cont'].get_n_features() + ds.tensor_infos['x_cat'].get_n_features() self.cv_alg_interface_ = self._create_alg_interface(n_cv=n_cv) # ----- get random seeds ----- random_state = params.get('random_state', None) if isinstance(random_state, int): seed = random_state elif random_state is None: seed = int(np.random.randint(0, 2 ** 31 - 1)) elif isinstance(random_state, np.random.RandomState): seed = int(random_state.randint(0, 2 ** 31 - 1)) else: raise ValueError(f'random_state type {type(random_state)} ' f'is not one of [NoneType, int, np.random.RandomState]') split_seed = seed refit_split_seed = seed + 1 sub_split_seeds = list(np.random.RandomState(split_seed).randint(0, 2 ** 31 - 1, size=n_cv * n_repeats)) sub_split_seeds = [int(seed) for seed in sub_split_seeds] refit_sub_split_seeds = list( np.random.RandomState(refit_split_seed).randint(0, 2 ** 31 - 1, size=n_refit)) refit_sub_split_seeds = [int(seed) for seed in refit_sub_split_seeds] # ----- get train/val split ----- if not isinstance(n_cv, int) or n_cv <= 0: raise ValueError(f'Expected n_cv to be an int >= 1, but got {n_cv=}') if val_idxs is not None: if n_repeats != 1: raise ValueError(f'Providing a validation split requires n_repeats=1, but got {n_repeats=}') # provided split val_idxs = torch.as_tensor(val_idxs, dtype=torch.long) if len(val_idxs.shape) == 1: val_idxs = val_idxs[None, :] train_idxs_list = [] for i in range(val_idxs.shape[0]): is_val_idx = torch.zeros(ds.n_samples, dtype=torch.bool) is_val_idx[val_idxs[i]] = True train_idxs_list.append(torch.argwhere(~is_val_idx).squeeze(-1)) train_idxs = torch.stack(train_idxs_list, dim=0) if val_idxs.shape[0] == 1 and n_cv > 1: # replicate according to n_cv, such that an ensemble can be created train_idxs = train_idxs.expand(n_cv, -1) val_idxs = val_idxs.expand(n_cv, -1) elif n_cv != val_idxs.shape[0]: raise ValueError(f'Value provided for {n_cv=} is not equal to {val_idxs.shape[0]=}') 
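        # else below (added note): no explicit validation split was provided, so we fall back to a
        # random holdout split (n_cv == 1) or to (stratified) k-fold cross-validation, repeated
        # n_repeats times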
else: train_idxs_list = [] val_idxs_list = [] for i in range(n_repeats): if n_cv == 1: # random split splitter = RandomSplitter(seed=split_seed + i, first_fraction=1.0 - val_fraction) train_idxs, val_idxs = splitter.get_idxs(ds) train_idxs_list.append(train_idxs[None, :]) val_idxs_list.append(val_idxs[None, :]) else: splitter = KFoldSplitter(k=n_cv, seed=split_seed + i, stratified=self._is_classification()) idxs_tuples = splitter.get_idxs(ds) train_idxs_list.append(torch.stack([t[0] for t in idxs_tuples], dim=0)) val_idxs_list.append(torch.stack([t[1] for t in idxs_tuples], dim=0)) train_idxs = torch.cat(train_idxs_list, dim=0) val_idxs = torch.cat(val_idxs_list, dim=0) if val_idxs.shape[1] == 0: val_idxs = None # no validation set # print(f'{val_idxs=}') # print(f'{np.mean(X / (1e-8 + np.linalg.norm(X, axis=0, keepdims=True)))=}') idxs_list = [SplitIdxs(train_idxs=train_idxs, val_idxs=val_idxs, test_idxs=None, split_seed=split_seed, sub_split_seeds=sub_split_seeds, split_id=0)] # ----- resources ----- try: import psutil n_physical_threads = psutil.cpu_count(logical=False) except ImportError: # this assumes that there are 2 logical threads per physical thread n_logical_threads = mp.cpu_count() n_physical_threads = max(1, n_logical_threads // 2) device = params.get('device', None) if device == 'cuda': device = 'cuda:0' # 'cuda' doesn't work with some of the code n_threads = params.get('n_threads', n_physical_threads) self.n_threads_ = n_threads old_torch_n_threads = torch.get_num_threads() torch.set_num_threads(n_threads) gpu_devices = [] device_names = get_available_device_names() if device is None: allowed_device_names = [name for name in device_names if name.split(':')[0] in self._allowed_device_names()] if 'cuda:0' in allowed_device_names: gpu_devices.append('cuda:0') elif 'mps' in allowed_device_names: gpu_devices.append('mps') # print(f'{gpu_devices=}') # print(f'{self._allowed_device_names()=}') # print(f'{allowed_device_names=}') # print(f'{device_names=}') elif device != 'cpu': if device not in device_names: raise ValueError(f'Unknown device name "{device}", known device names are {device_names}') gpu_devices.append(device) tmp_folder: Optional[str] = params.get('tmp_folder', None) if tmp_folder is None: tmp_folders = [None] refit_tmp_folders = [None] else: tmp_path = Path(tmp_folder) # make sure that the refit stage doesn't load the models from the cv stage tmp_folders = [tmp_path / 'cv'] refit_tmp_folders = [tmp_path / 'refit'] logger = StdoutLogger(verbosity_level=params.get('verbosity', 0)) interface_resources = InterfaceResources(n_threads=n_threads, gpu_devices=gpu_devices, time_in_seconds=time_to_fit_in_seconds) self.cv_alg_interface_.fit(ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folders=tmp_folders, name=self.__class__.__name__) # todo: put alg_interface on the CPU after fit() (for saving)? How to do it? 
        # todo: currently, there is only one alg_interface which may fit in parallel (for the NNs),
        # but we could add an option to make them fit sequentially for RAM reasons or so
        # (maybe this is best done via a MultiSplitWrapper or so)
        if n_refit > 0:
            self.refit_alg_interface_ = self.cv_alg_interface_.get_refit_interface(n_refit=n_refit)
            train_idxs = torch.arange(ds.n_samples, dtype=torch.long)[None, :].expand(n_refit, -1)
            refit_idxs_list = [SplitIdxs(train_idxs=train_idxs, val_idxs=None, test_idxs=None,
                                         split_seed=refit_split_seed, sub_split_seeds=refit_sub_split_seeds,
                                         split_id=0)]
            self.refit_alg_interface_.fit(ds=ds, idxs_list=refit_idxs_list,
                                          interface_resources=interface_resources, logger=logger,
                                          tmp_folders=refit_tmp_folders, name=self.__class__.__name__ + ' [refit]')
            self.alg_interface_ = self.refit_alg_interface_
        else:
            self.alg_interface_ = self.cv_alg_interface_
        if hasattr(self.alg_interface_, 'fit_params') and len(self.alg_interface_.fit_params) > 0:
            self.fit_params_ = self.alg_interface_.fit_params[0]
        torch.set_num_threads(old_torch_n_threads)
        return self

    def _predict_raw(self, X) -> torch.Tensor:
        """
        Predicts logits (for classification) or mean outputs (for regression).

        :param X: Input data.
        :return: Returns a tensor of shape [n_ensemble, n_samples, output_dim].
        """
        # Check that fit has been called
        check_is_fitted(self, ['alg_interface_', 'x_converter_'])
        old_torch_n_threads = torch.get_num_threads()
        torch.set_num_threads(self.n_threads_)
        # Input validation
        # if isinstance(X, np.ndarray):
        check_array_wrapper(X, force_all_finite='allow-nan', dtype=None)
        x_ds = self.x_converter_.transform(to_df(X))
        if torch.any(torch.isnan(x_ds.tensors['x_cont'])):
            raise ValueError('NaN values in continuous columns are currently not allowed!')
        y_preds = self.alg_interface_.predict(x_ds).detach().cpu()
        torch.set_num_threads(old_torch_n_threads)
        return y_preds

    def to(self, device: str) -> None:
        """
        Move the model (only implemented for RealMLP at the moment) to the specified device.

        :param device: PyTorch-compatible device name.
        """
        self.cv_alg_interface_.to(device)
        if hasattr(self, 'refit_alg_interface_'):
            self.refit_alg_interface_.to(device)


class AlgInterfaceClassifier(ClassifierMixin, AlgInterfaceEstimator):
    # inheritance order is important in scikit-learn 1.6
    # otherwise sklearn.base.is_classifier(...) returns False
    def _is_classification(self) -> bool:
        return True

    def predict_proba(self, X) -> np.ndarray:
        y_preds = self._predict_raw(X)
        # y_preds are logits, so take the softmax and then the mean over the ensemble dimension
        y_probs = torch.softmax(y_preds, dim=-1).mean(dim=0)
        return y_probs.numpy()

    def predict_proba_ensemble(self, X) -> np.ndarray:
        # same as predict_proba but does not average over ensemble members
        y_preds = self._predict_raw(X)
        # y_preds are logits, so take the softmax but keep the ensemble dimension
        y_probs = torch.softmax(y_preds, dim=-1)
        return y_probs.numpy()

    def predict(self, X):
        """ Predict labels.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The predicted class label for each sample.
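        Examples
        --------
        A minimal sketch (``clf`` is a fitted classifier, ``X_test`` is a hypothetical array-like)::

            y_pred = clf.predict(X_test)       # shape (n_samples,)
            proba = clf.predict_proba(X_test)  # shape (n_samples, n_classes)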
""" y_probs = self.predict_proba(X) class_idxs = np.argmax(y_probs, axis=-1) return np.asarray(self.classes_)[class_idxs] def predict_ensemble(self, X): y_probs = self.predict_proba_ensemble(X) class_idxs = np.argmax(y_probs, axis=-1) return np.asarray(self.classes_)[class_idxs] class AlgInterfaceRegressor(RegressorMixin, AlgInterfaceEstimator): # inheritance order is important in scikit-learn 1.6 # otherwise sklearn.base.regressor(...) or is_regressor(...) returns False def _is_classification(self) -> bool: return False def _more_tags(self): return utils.join_dicts(super()._more_tags(), {'multioutput': self._supports_multioutput()}) def __sklearn_tags__(self): # from sklearn version 1.6+ tags = super().__sklearn_tags__() tags.target_tags.multi_output = self._supports_multioutput() return tags def predict(self, X): y_preds = self._predict_raw(X) y_np = y_preds.mean(dim=0).numpy() # print(f'{self.is_y_1d_=}') if self.is_y_1d_ and y_np.shape[1] == 1: y_np = y_np[:, 0] if self.is_y_float64_: y_np = y_np.astype(np.float64) return y_np def predict_ensemble(self, X): y_preds = self._predict_raw(X) y_np = y_preds.numpy() # print(f'{self.is_y_1d_=}') if self.is_y_1d_: y_np = y_np[:, :, 0] if self.is_y_float64_: y_np = y_np.astype(np.float64) return y_np ================================================ FILE: pytabkit/models/sklearn/sklearn_interfaces.py ================================================ import pathlib from typing import Optional, Any, Union, List, Dict, Literal import numpy as np from pytabkit.models import utils from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.sklearn.sklearn_base import AlgInterfaceRegressor, AlgInterfaceClassifier from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface, RandomParamsNNAlgInterface from pytabkit.models.alg_interfaces.ensemble_interfaces import AlgorithmSelectionAlgInterface, \ CaruanaEnsembleAlgInterface # the list of methods can be auto-generated using scripts/get_sklearn_names.py __all__ = ["CatBoost_D_Classifier", "CatBoost_D_Regressor", "CatBoost_HPO_Classifier", "CatBoost_HPO_Regressor", "CatBoost_HPO_TPE_Classifier", "CatBoost_HPO_TPE_Regressor", "CatBoost_TD_Classifier", "CatBoost_TD_Regressor", "Ensemble_TD_Classifier", "Ensemble_TD_Regressor", "Ensemble_HPO_Classifier", "Ensemble_HPO_Regressor", "FTT_D_Classifier", "FTT_D_Regressor", "FTT_HPO_Classifier", "FTT_HPO_Regressor", "LGBM_D_Classifier", "LGBM_D_Regressor", "LGBM_HPO_Classifier", "LGBM_HPO_Regressor", "LGBM_HPO_TPE_Classifier", "LGBM_HPO_TPE_Regressor", "LGBM_TD_Classifier", "LGBM_TD_Regressor", "MLP_PLR_D_Classifier", "MLP_PLR_D_Regressor", "MLP_PLR_HPO_Classifier", "MLP_PLR_HPO_Regressor", "MLP_RTDL_D_Classifier", "MLP_RTDL_D_Regressor", "MLP_RTDL_HPO_Classifier", "MLP_RTDL_HPO_Regressor", "MLP_SKL_D_Classifier", "MLP_SKL_D_Regressor", "RF_HPO_Classifier", "RF_HPO_Regressor", "RF_SKL_D_Classifier", "RF_SKL_D_Regressor", "RealMLP_HPO_Classifier", "RealMLP_HPO_Regressor", "RealMLP_TD_Classifier", "RealMLP_TD_Regressor", "RealMLP_TD_S_Classifier", "RealMLP_TD_S_Regressor", "RealTabR_D_Classifier", "RealTabR_D_Regressor", "Resnet_RTDL_D_Classifier", "Resnet_RTDL_D_Regressor", "Resnet_RTDL_HPO_Classifier", "Resnet_RTDL_HPO_Regressor", "TabR_HPO_Classifier", "TabR_HPO_Regressor", "TabR_S_D_Classifier", "TabR_S_D_Regressor", "TabM_D_Classifier", "TabM_D_Regressor", 
"TabM_HPO_Classifier", "TabM_HPO_Regressor", "XRFM_D_Classifier", "XRFM_D_Regressor", "XRFM_HPO_Classifier", "XRFM_HPO_Regressor", "XGB_D_Classifier", "XGB_D_Regressor", "XGB_HPO_Classifier", "XGB_HPO_Regressor", "XGB_HPO_TPE_Classifier", "XGB_HPO_TPE_Regressor", "XGB_PBB_D_Classifier", "XGB_TD_Classifier", "XGB_TD_Regressor"] class RealMLPConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, train_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, n_epochs: Optional[int] = None, batch_size: Optional[int] = None, predict_batch_size: Optional[int] = None, hidden_sizes: Optional[Union[List[int], Literal['rectangular']]] = None, n_hidden_layers: Optional[int] = None, hidden_width: Optional[int] = None, tfms: Optional[List[str]] = None, num_emb_type: Optional[str] = None, use_plr_embeddings: Optional[bool] = None, plr_sigma: Optional[float] = None, plr_hidden_1: Optional[int] = None, plr_hidden_2: Optional[int] = None, plr_act_name: Optional[str] = None, plr_use_densenet: Optional[bool] = None, plr_use_cos_bias: Optional[bool] = None, plr_lr_factor: Optional[float] = None, max_one_hot_cat_size: Optional[int] = None, embedding_size: Optional[int] = None, act: Optional[str] = None, use_parametric_act: Optional[bool] = None, act_lr_factor: Optional[float] = None, weight_param: Optional[str] = None, weight_init_mode: Optional[str] = None, weight_init_gain: Optional[str] = None, weight_lr_factor: Optional[float] = None, bias_init_mode: Optional[str] = None, bias_lr_factor: Optional[float] = None, bias_wd_factor: Optional[float] = None, add_front_scale: Optional[bool] = None, scale_lr_factor: Optional[float] = None, first_layer_lr_factor: Optional[float] = None, block_str: Optional[str] = None, first_layer_config: Optional[Dict[str, Any]] = None, last_layer_config: Optional[Dict[str, Any]] = None, middle_layer_config: Optional[Dict[str, Any]] = None, p_drop: Optional[float] = None, p_drop_sched: Optional[str] = None, wd: Optional[float] = None, wd_sched: Optional[str] = None, opt: Optional[str] = None, lr: Optional[Union[float, Dict[str, float]]] = None, lr_sched: Optional[str] = None, mom: Optional[float] = None, mom_sched: Optional[str] = None, sq_mom: Optional[float] = None, sq_mom_sched: Optional[str] = None, opt_eps: Optional[float] = None, opt_eps_sched: Optional[str] = None, normalize_output: Optional[bool] = None, clamp_output: Optional[bool] = None, use_ls: Optional[bool] = None, ls_eps: Optional[float] = None, ls_eps_sched: Optional[str] = None, use_early_stopping: Optional[bool] = None, early_stopping_additive_patience: Optional[int] = None, early_stopping_multiplicative_patience: Optional[float] = None, calibration_method: Optional[str] = None, sort_quantile_predictions: Optional[bool] = None, stop_epoch: Optional[int] = None, use_best_mean_epoch_for_cv: Optional[bool] = None, n_ens: Optional[int] = None, ens_av_before_softmax: Optional[int] = None, ): """ Constructor for RealMLP, using the default parameters from RealMLP-TD. For lists of default parameters, we refer to pytabkit.models.sklearn.default_params.DefaultParams. RealMLP-TD does automatic preprocessing, so no manual preprocessing is necessary except for imputing missing numerical values. 
Tips for modifications: * For faster training: For large datasets (say >50K samples), especially on GPUs, increase batch_size. It can also help to decrease n_epochs, set use_plr_embeddings=False (in case of many numerical features), increase max_one_hot_cat_size (in case of large-cardinality categories), or set use_parametric_act=False * For more accuracy: You can try increasing n_epochs or hidden_sizes while also decreasing lr. * For classification, if you care about metrics like cross-entropy or AUC instead of accuracy, we recommend setting val_metric_name='cross_entropy' and use_ls=False. :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices or an explicit validation set are given in fit(), `n_cv` models will be fitted using different random seeds. Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). For n_cv=1, a single train-validation split will be used, where `val_fraction` controls the fraction of validation samples. If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger number of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Temporary folder in which data can be stored during fit(). (Currently unused for RealMLP-TD and variants.) If None, methods generally try to not store intermediate data. Note that HPO and ensemble methods can use this to reduce RAM usage by storing fitted models, and will need this folder to be available whenever they are used. :param verbosity: Verbosity level (default=0, higher means more verbose). Set to 2 to see logs from intermediate epochs. :param train_metric_name: Name of the training metric (default='cross_entropy' for classification and 'mse' for regression). Currently most other metrics are not available for training. :param val_metric_name: Name of the validation metric (used for selecting the best epoch). Defaults are 'class_error' for classification and 'rmse' for regression. Main available classification metrics (all to be minimized): 'class_error', 'cross_entropy', '1-auc_ovo', '1-auc_ovr', '1-auc_mu', 'brier', '1-balanced_accuracy', '1-mcc', 'ece'. 
            Main available regression metrics: 'rmse', 'mae', 'max_error', 'pinball(0.95)'
            (also works with other quantiles specified directly in the string).
            For more metrics, we refer to `models.training.metrics.Metrics.apply()`.
        :param n_epochs: Number of epochs to train the model for (default=256).
        :param batch_size: Batch size to be used for fit() (default=256).
        :param predict_batch_size: Batch size to be used for predict() (default=1024).
        :param hidden_sizes: List of numbers of neurons for each hidden layer (default=[256, 256, 256]).
            If this is set to 'rectangular', then [hidden_width] * n_hidden_layers will be used instead.
        :param n_hidden_layers: Number of hidden layers (default=3). Only used if hidden_sizes=='rectangular'.
        :param hidden_width: Width of each hidden layer (default=256). Only used if hidden_sizes=='rectangular'.
        :param tfms: List of preprocessing transformations,
            default=`['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding']`.
            Other possible transformations include: 'median_center', 'l2_normalize', 'l1_normalize',
            'quantile', 'kdi'.
        :param num_emb_type: Type of numerical embeddings used (default='pbld').
            If not set to 'ignore', it overrides the parameters `use_plr_embeddings`, `plr_act_name`,
            `plr_use_densenet`, `plr_use_cos_bias`.
            Possible values: 'ignore', 'none' (no numerical embeddings), 'pl', 'plr', 'pbld', 'pblrd'.
        :param use_plr_embeddings: Whether PLR (or PL) numerical embeddings should be used (default=True).
        :param plr_sigma: Initialization standard deviation for the first PLR embedding layer (default=0.1).
        :param plr_hidden_1: (Half of the) number of hidden neurons in the first PLR hidden layer (default=8).
            This number will be doubled since there are sin() and cos() versions for each hidden neuron.
        :param plr_hidden_2: Number of output neurons of the PLR hidden layer,
            excluding the optional densenet connection (default=7).
        :param plr_act_name: Name of the PLR activation function (default='linear').
            Use 'relu' for the PLR version and 'linear' for the PL version.
        :param plr_use_densenet: Whether to append the original feature to the numerical embeddings (default=True).
        :param plr_use_cos_bias: Whether to use the cos(wx+b) version for the periodic embeddings
            instead of the (sin(wx), cos(wx)) version (default=True).
        :param plr_lr_factor: Learning rate factor for PLR embeddings (default=0.1).
            It gets multiplied with lr and with the value of the schedule.
        :param max_one_hot_cat_size: Maximum category size that one-hot encoding should be applied to,
            including the category for missing/unknown values (default=9).
        :param embedding_size: Number of output features of categorical embedding layers (default=8).
        :param act: Activation function (default='selu' for classification and 'mish' for regression).
            Can also be 'relu' or 'silu'.
        :param use_parametric_act: Whether to use a parametric activation as described in the paper (default=True).
        :param act_lr_factor: Learning rate factor for the parametric activation (default=0.1).
        :param weight_param: Weight parametrization (default='ntk'). See models.nn.WeightFitter() for more options.
        :param weight_init_mode: Weight initialization mode (default='std').
            See models.nn.WeightFitter() for more options.
        :param weight_init_gain: Multiplier for the weight initialization standard deviation.
            (Does not apply to the 'std' initialization mode.)
        :param weight_lr_factor: Learning rate factor for weights.
        :param bias_init_mode: Bias initialization mode (default='he+5'). See models.nn.BiasFitter() for more options.
        :param bias_lr_factor: Bias learning rate factor.
        :param bias_wd_factor: Bias weight decay factor.
        :param add_front_scale: Whether to add a scaling layer (diagonal weight matrix)
            before the linear layers (default=True). If set to True and a scaling layer is already configured
            in the block_str, this will create an additional scaling layer.
        :param scale_lr_factor: Scaling layer learning rate factor
            (default=1.0, but overridden by default for the first layer in first_layer_config).
        :param first_layer_lr_factor: First layer learning rate factor (default=1.0).
        :param block_str: String describing the default hidden layer components.
            The default is 'w-b-a-d' for weight, bias, activation, dropout.
            By default, the last layer config will override it with 'w-b' and the first layer config
            will override it with 's-w-b-a-d', where the 's' stands for the scaling layer.
        :param first_layer_config: Dictionary with more options that can override the other options
            for the construction of the first MLP layer specifically.
            The default is dict(block_str='s-w-b-a-d', scale_lr_factor=6.0),
            using a scaling layer at the beginning of the first layer with lr factor 6.0.
        :param last_layer_config: Dictionary with more options that can override the other options
            for the construction of the last MLP layer specifically.
            The default is an empty dict, in which case the block_str will still be overridden by 'w-b'.
        :param middle_layer_config: Dictionary with more options that can override the other options
            for the construction of all layers except the first and last MLP layer.
            The default is an empty dict.
        :param p_drop: Dropout probability (default=0.15). Needs to be in [0, 1).
        :param p_drop_sched: Dropout schedule (default='flat_cos').
        :param wd: Weight decay, implemented as in PyTorch AdamW but working with all optimizers
            (default=0.0 for regression and 1e-2 for classification). Weight decay is implemented as
            param -= current_lr_value * current_wd_value * param,
            where the current lr and wd values are determined using the base values (lr and wd),
            factors for the given parameter if available, and the respective schedule.
            Note that this is not identical to the original AdamW paper,
            where the lr base value is not included in the update equation.
        :param wd_sched: Weight decay schedule.
        :param opt: Optimizer (default='adam'). See optim.optimizers.get_opt_class().
        :param lr: Learning rate base value (default=0.04 for classification and 0.14 for regression).
        :param lr_sched: Learning rate schedule (default='coslog4'). See training.scheduling.get_schedule().
        :param mom: Momentum parameter, aka :math:`\\beta_1` for Adam (default=0.9).
        :param mom_sched: Momentum schedule (default='constant').
        :param sq_mom: Momentum of squared gradients, aka :math:`\\beta_2` for Adam (default=0.95).
        :param sq_mom_sched: Schedule for sq_mom (default='constant').
        :param opt_eps: Epsilon parameter of the optimizer (default=1e-8 for Adam).
        :param opt_eps_sched: Schedule for opt_eps (default='constant').
        :param normalize_output: Whether to standardize the target for regression (default=True for regression).
        :param clamp_output: Whether to clamp the output of predict() for regression
            to the min/max range seen during training (default=True for regression).
        :param use_ls: Whether to use label smoothing for classification (default=True for classification).
        :param ls_eps: Epsilon parameter for label smoothing (default=0.1 for classification).
        :param ls_eps_sched: Schedule for ls_eps (default='constant').
        :param use_early_stopping: Whether to use early stopping (default=False).
            Note that even without early stopping, the best epoch on the validation set is selected
            if there is a validation set. Training is stopped if the epoch exceeds
            early_stopping_multiplicative_patience * best_epoch + early_stopping_additive_patience.
        :param early_stopping_additive_patience: See use_early_stopping (default=20).
        :param early_stopping_multiplicative_patience: See use_early_stopping (default=2).
            We recommend setting it to 1 for monotone learning rate schedules
            but keeping it at 2 for the default schedule.
        :param calibration_method: Post-hoc calibration method (only for classification).
            We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods,
            see the get_calibrator method in https://github.com/dholzmueller/probmetrics.
        :param sort_quantile_predictions: If val_metric_name=='multi_pinball(...)', decides whether
            the predicted quantiles will be sorted to avoid quantile crossover (default=True).
        :param stop_epoch: Epoch at which training should be stopped (for refitting).
            The total length of training used for the schedules will be determined by n_epochs,
            but the stopping epoch will be min(stop_epoch, n_epochs).
        :param use_best_mean_epoch_for_cv: When training an ensemble, whether all members should use
            a checkpoint from the same epoch (the one with the best average loss)
            instead of their individually best epochs (default=False).
        :param n_ens: Number of ensemble members that should be used per train-validation split (default=1).
            For best-epoch selection, the validation scores of the averaged predictions will be used.
        :param ens_av_before_softmax: When using classification with n_ens>1, whether to average
            the ensemble predictions on each train-val split before taking the softmax (default=False).
            We recommend using False, as it is representative of the averaging of models across train-val splits.
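
        Example: a minimal usage sketch (the top-level import path follows the README;
        the data variables X_train, y_train, X_test are assumed placeholders)::

            from pytabkit import RealMLP_TD_Classifier

            clf = RealMLP_TD_Classifier(n_cv=1, device='cpu', random_state=0)
            clf.fit(X_train, y_train)
            proba = clf.predict_proba(X_test)  # averaged over CV models if n_cv > 1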
""" super().__init__() # call the constructor of the other superclass for multiple inheritance self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.train_metric_name = train_metric_name self.val_metric_name = val_metric_name self.n_epochs = n_epochs self.batch_size = batch_size self.predict_batch_size = predict_batch_size self.hidden_sizes = hidden_sizes self.n_hidden_layers = n_hidden_layers self.hidden_width = hidden_width self.tfms = tfms self.max_one_hot_cat_size = max_one_hot_cat_size self.embedding_size = embedding_size self.num_emb_type = num_emb_type self.use_plr_embeddings = use_plr_embeddings self.plr_sigma = plr_sigma self.plr_hidden_1 = plr_hidden_1 self.plr_hidden_2 = plr_hidden_2 self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet self.plr_use_cos_bias = plr_use_cos_bias self.plr_lr_factor = plr_lr_factor self.act = act self.use_parametric_act = use_parametric_act self.act_lr_factor = act_lr_factor self.weight_param = weight_param self.weight_init_mode = weight_init_mode self.weight_init_gain = weight_init_gain self.weight_lr_factor = weight_lr_factor self.bias_init_mode = bias_init_mode self.bias_lr_factor = bias_lr_factor self.bias_wd_factor = bias_wd_factor self.add_front_scale = add_front_scale self.scale_lr_factor = scale_lr_factor self.first_layer_lr_factor = first_layer_lr_factor self.block_str = block_str self.first_layer_config = first_layer_config self.last_layer_config = last_layer_config self.middle_layer_config = middle_layer_config self.p_drop = p_drop self.p_drop_sched = p_drop_sched self.wd = wd self.wd_sched = wd_sched self.opt = opt self.lr = lr self.lr_sched = lr_sched self.mom = mom self.mom_sched = mom_sched self.sq_mom = sq_mom self.sq_mom_sched = sq_mom_sched self.opt_eps = opt_eps self.opt_eps_sched = opt_eps_sched self.normalize_output = normalize_output self.clamp_output = clamp_output self.use_ls = use_ls self.ls_eps = ls_eps self.ls_eps_sched = ls_eps_sched self.use_early_stopping = use_early_stopping self.early_stopping_additive_patience = early_stopping_additive_patience self.early_stopping_multiplicative_patience = early_stopping_multiplicative_patience self.calibration_method = calibration_method self.sort_quantile_predictions = sort_quantile_predictions self.stop_epoch = stop_epoch self.use_best_mean_epoch_for_cv = use_best_mean_epoch_for_cv self.n_ens = n_ens self.ens_av_before_softmax = ens_av_before_softmax class RealMLP_TD_Classifier(RealMLPConstructorMixin, AlgInterfaceClassifier): """ MLP-TD classifier. For constructor parameters, see `MLPConstructorMixin`. """ def _get_default_params(self): return DefaultParams.RealMLP_TD_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: return NNAlgInterface(**self.get_config()) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealMLP_TD_S_Classifier(RealMLPConstructorMixin, AlgInterfaceClassifier): """ MLP-TD-S classifier. For constructor parameters, see `MLPConstructorMixin`. """ def _get_default_params(self): return DefaultParams.RealMLP_TD_S_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: return NNAlgInterface(**self.get_config()) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealMLP_TD_Regressor(RealMLPConstructorMixin, AlgInterfaceRegressor): """ MLP-TD regressor. 

class RealMLP_TD_Regressor(RealMLPConstructorMixin, AlgInterfaceRegressor):
    """
    MLP-TD regressor. For constructor parameters, see `RealMLPConstructorMixin`.
    """

    def _get_default_params(self):
        return DefaultParams.RealMLP_TD_REG

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        return NNAlgInterface(**self.get_config())

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']


class RealMLP_TD_S_Regressor(RealMLPConstructorMixin, AlgInterfaceRegressor):
    """
    MLP-TD-S regressor. For constructor parameters, see `RealMLPConstructorMixin`.
    """

    def _get_default_params(self):
        return DefaultParams.RealMLP_TD_S_REG

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        return NNAlgInterface(**self.get_config())

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']


# --------------------------------- GBDTs -----------------------------------

class LGBMConstructorMixin:
    def __init__(self,
                 device: Optional[str] = None,
                 random_state: Optional[Union[int, np.random.RandomState]] = None,
                 n_cv: int = 1,
                 n_refit: int = 0,
                 n_repeats: int = 1,
                 val_fraction: float = 0.2,
                 n_threads: Optional[int] = None,
                 tmp_folder: Optional[Union[str, pathlib.Path]] = None,
                 verbosity: int = 0,
                 n_estimators: Optional[int] = None,
                 max_depth: Optional[int] = None,
                 num_leaves: Optional[int] = None,
                 lr: Optional[float] = None,
                 subsample: Optional[float] = None,
                 colsample_bytree: Optional[float] = None,
                 bagging_freq: Optional[float] = None,
                 min_data_in_leaf: Optional[int] = None,
                 min_sum_hessian_in_leaf: Optional[int] = None,
                 lambda_l1: Optional[float] = None,
                 lambda_l2: Optional[float] = None,
                 boosting: Optional[str] = None,
                 max_bin: Optional[int] = None,
                 cat_smooth: Optional[float] = None,
                 cat_l2: Optional[float] = None,
                 val_metric_name: Optional[str] = None,
                 calibration_method: Optional[str] = None,
                 ):
        self.device = device
        self.random_state = random_state
        self.n_cv = n_cv
        self.n_refit = n_refit
        self.n_repeats = n_repeats
        self.val_fraction = val_fraction
        self.n_threads = n_threads
        self.tmp_folder = tmp_folder
        self.verbosity = verbosity
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.num_leaves = num_leaves
        self.lr = lr
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.bagging_freq = bagging_freq
        self.min_data_in_leaf = min_data_in_leaf
        self.min_sum_hessian_in_leaf = min_sum_hessian_in_leaf
        self.lambda_l1 = lambda_l1
        self.lambda_l2 = lambda_l2
        self.boosting = boosting
        self.max_bin = max_bin
        self.cat_smooth = cat_smooth
        self.cat_l2 = cat_l2
        self.val_metric_name = val_metric_name
        self.calibration_method = calibration_method


class LGBM_TD_Classifier(LGBMConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return DefaultParams.LGBM_TD_CLASS

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface
        return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)])


class LGBM_D_Classifier(LGBMConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return DefaultParams.LGBM_D

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface
        return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)])


class LGBM_TD_Regressor(LGBMConstructorMixin, AlgInterfaceRegressor):
    def _get_default_params(self):
        return DefaultParams.LGBM_TD_REG

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.lightgbm_interfaces import
LGBMSubSplitInterface return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False class LGBM_D_Regressor(LGBMConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.LGBM_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False class XGBConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, train_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, n_estimators: Optional[int] = None, max_depth: Optional[int] = None, lr: Optional[float] = None, subsample: Optional[float] = None, colsample_bytree: Optional[float] = None, colsample_bylevel: Optional[float] = None, colsample_bynode: Optional[float] = None, min_child_weight: Optional[float] = None, alpha: Optional[float] = None, reg_lambda: Optional[float] = None, gamma: Optional[float] = None, tree_method: Optional[str] = None, max_delta_step: Optional[float] = None, max_cat_to_onehot: Optional[int] = None, num_parallel_tree: Optional[int] = None, max_bin: Optional[int] = None, multi_strategy: Optional[str] = None, calibration_method: Optional[str] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.train_metric_name = train_metric_name self.val_metric_name = val_metric_name self.n_estimators = n_estimators self.max_depth = max_depth self.lr = lr self.subsample = subsample self.colsample_bytree = colsample_bytree self.colsample_bylevel = colsample_bylevel self.colsample_bynode = colsample_bynode self.min_child_weight = min_child_weight self.alpha = alpha self.reg_lambda = reg_lambda self.gamma = gamma self.tree_method = tree_method self.max_delta_step = max_delta_step self.max_cat_to_onehot = max_cat_to_onehot self.num_parallel_tree = num_parallel_tree self.max_bin = max_bin self.multi_strategy = multi_strategy self.calibration_method = calibration_method class XGB_TD_Classifier(XGBConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XGB_TD_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_D_Classifier(XGBConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XGB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_PBB_D_Classifier(XGBConstructorMixin, 
AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XGB_PBB_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_TD_Regressor(XGBConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.XGB_TD_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _supports_multioutput(self) -> bool: return False class XGB_D_Regressor(XGBConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.XGB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _supports_multioutput(self) -> bool: return False class CatBoostConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_estimators: Optional[int] = None, max_depth: Optional[int] = None, lr: Optional[float] = None, subsample: Optional[float] = None, colsample_bylevel: Optional[float] = None, random_strength: Optional[float] = None, bagging_temperature: Optional[float] = None, leaf_estimation_iterations: Optional[int] = None, bootstrap_type: Optional[str] = None, boosting_type: Optional[str] = None, min_data_in_leaf: Optional[int] = None, grow_policy: Optional[str] = None, num_leaves: Optional[int] = None, max_bin: Optional[int] = None, # renamed from border_count since it is named max_bin in the default parameters l2_leaf_reg: Optional[float] = None, one_hot_max_size: Optional[int] = None, val_metric_name: Optional[str] = None, train_metric_name: Optional[str] = None, calibration_method: Optional[str] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_estimators = n_estimators self.max_depth = max_depth self.lr = lr self.subsample = subsample self.colsample_bylevel = colsample_bylevel self.random_strength = random_strength self.bagging_temperature = bagging_temperature self.leaf_estimation_iterations = leaf_estimation_iterations self.bootstrap_type = bootstrap_type self.boosting_type = boosting_type self.min_data_in_leaf = min_data_in_leaf self.grow_policy = grow_policy self.num_leaves = num_leaves self.max_bin = max_bin self.l2_leaf_reg = l2_leaf_reg self.one_hot_max_size = one_hot_max_size self.val_metric_name = val_metric_name self.train_metric_name = train_metric_name self.calibration_method = calibration_method class CatBoost_TD_Classifier(CatBoostConstructorMixin, AlgInterfaceClassifier): def 
_get_default_params(self): return DefaultParams.CB_TD_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_D_Classifier(CatBoostConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.CB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_TD_Regressor(CatBoostConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.CB_TD_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_D_Regressor(CatBoostConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.CB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class RFConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_estimators: Optional[int] = None, calibration_method: Optional[str] = None, ): """ Validation set is not used. :param device: :param random_state: :param n_cv: :param n_refit: :param n_repeats: :param val_fraction: :param n_threads: :param tmp_folder: :param verbosity: :param n_estimators: :param calibration_method: Post-hoc calibration method (only for classification). We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. 
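
        Example: an illustrative sketch (data variables are assumed placeholders;
        the import path matches this module)::

            from pytabkit.models.sklearn.sklearn_interfaces import RF_SKL_D_Classifier

            clf = RF_SKL_D_Classifier(n_estimators=500, calibration_method='ts-mix')
            clf.fit(X_train, y_train)
            proba = clf.predict_proba(X_test)  # probabilities after 'ts-mix' calibration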
""" self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_estimators = n_estimators self.calibration_method = calibration_method class RF_SKL_D_Classifier(RFConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.RF_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface return SingleSplitWrapperAlgInterface([RFSubSplitInterface(**self.get_config()) for i in range(n_cv)]) class RF_SKL_D_Regressor(RFConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.RF_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface return SingleSplitWrapperAlgInterface([RFSubSplitInterface(**self.get_config()) for i in range(n_cv)]) class MLPSKLConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = calibration_method class MLP_SKL_D_Classifier(MLPSKLConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.MLP_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import SklearnMLPSubSplitInterface return SingleSplitWrapperAlgInterface([SklearnMLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) class MLP_SKL_D_Regressor(MLPSKLConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.MLP_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import SklearnMLPSubSplitInterface return SingleSplitWrapperAlgInterface([SklearnMLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) # HPO methods class GBDTHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_estimators: Optional[int] = None, hpo_space_name: Optional[str] = None, n_hyperopt_steps: Optional[int] = None, calibration_method: Optional[str] = None, use_caruana_ensembling: Optional[bool] = None, time_limit_s: Optional[float] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_estimators = n_estimators self.hpo_space_name = hpo_space_name self.n_hyperopt_steps = n_hyperopt_steps self.calibration_method = calibration_method self.use_caruana_ensembling = 
use_caruana_ensembling self.time_limit_s = time_limit_s class XGB_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsXGBAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_HPO_TPE_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, tree_method='hist', hpo_space_name='grinsztajn') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBHyperoptAlgInterface return XGBHyperoptAlgInterface(**self.get_config()) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsXGBAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False class XGB_HPO_TPE_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, tree_method='hist', hpo_space_name='grinsztajn') def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBHyperoptAlgInterface return XGBHyperoptAlgInterface(**self.get_config()) def _supports_multioutput(self) -> bool: return False class LGBM_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsLGBMAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) class LGBM_HPO_TPE_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return 
dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='catboost_quality_benchmarks') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMHyperoptAlgInterface return LGBMHyperoptAlgInterface(**self.get_config()) class LGBM_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsLGBMAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False class LGBM_HPO_TPE_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='catboost_quality_benchmarks') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMHyperoptAlgInterface return LGBMHyperoptAlgInterface(**self.get_config()) def _supports_multioutput(self) -> bool: return False class CatBoost_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsCatBoostAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_HPO_TPE_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='shwartz-ziv') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostHyperoptAlgInterface return CatBoostHyperoptAlgInterface(**self.get_config()) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', 
False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsCatBoostAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_HPO_TPE_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='shwartz-ziv') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostHyperoptAlgInterface return CatBoostHyperoptAlgInterface(**self.get_config()) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class RF_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RandomParamsRFAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsRFAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class RF_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RandomParamsRFAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsRFAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class RealMLPHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_hyperopt_steps: Optional[int] = None, val_metric_name: Optional[str] = None, calibration_method: Optional[str] = None, hpo_space_name: Optional[str] = None, n_caruana_steps: Optional[int] = None, n_epochs: Optional[int] = None, use_caruana_ensembling: Optional[bool] = None, train_metric_name: Optional[str] = None, time_limit_s: Optional[float] = None, ): """ :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. 
        :param random_state: Random state to use for random number generation
            (splitting, initialization, batch shuffling). If None, the behavior is not deterministic.
        :param n_cv: Number of cross-validation splits to use (default=1).
            If validation set indices or an explicit validation set are given in fit(),
            `n_cv` models will be fitted using different random seeds.
            Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification).
            For n_cv=1, a single train-validation split will be used,
            where `val_fraction` controls the fraction of validation samples.
            If `n_refit=0` is set, the prediction will use the average of the models fitted
            during cross-validation. (Averaging is over probabilities for classification,
            and over outputs for regression.) Otherwise, refitted models will be used.
        :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0).
            If zero, only the models from the cross-validation stage are used.
            If positive, `n_refit` models will be fitted on the training+validation dataset
            (all data given in fit()) and their predictions will be averaged during predict().
        :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1).
            Values != 1 are only allowed when no custom validation split is provided.
            A larger number of repeats makes things slower but reduces the potential
            for validation set overfitting, especially on smaller datasets.
        :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1).
            Only used if `n_cv==1` and no validation split is provided in fit().
        :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores).
        :param tmp_folder: Folder in which models can be stored. Setting this allows reducing RAM/VRAM usage
            by not having all models in RAM at the same time. In this case, the folder needs to be preserved
            as long as the model exists (including when the model is pickled to disk).
        :param verbosity: Verbosity level (default=0, higher means more verbose).
            Set to 2 to see logs from intermediate epochs.
        :param n_hyperopt_steps: Number of random hyperparameter configs
            that should be used to train models (default=50).
        :param val_metric_name: Name of the validation metric (used for selecting the best epoch).
            Not used by all models, but used at least by RealMLP and probably TabM.
            Defaults are 'class_error' for classification and 'rmse' for regression.
            Main available classification metrics (all to be minimized): 'class_error', 'cross_entropy',
            '1-auc_ovo', '1-auc_ovr', '1-auc_mu', 'brier', '1-balanced_accuracy', '1-mcc', 'ece'.
            Main available regression metrics: 'rmse', 'mae', 'max_error', 'pinball(0.95)'
            (also works with other quantiles specified directly in the string).
            For more metrics, we refer to `models.training.metrics.Metrics.apply()`.
        :param calibration_method: Post-hoc calibration method (only for classification) (default=None).
            We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods,
            see the get_calibrator method in https://github.com/dholzmueller/probmetrics.
        :param hpo_space_name: Name of the HPO space (default='default').
            The search space used in the paper for RealMLP is 'default'.
            However, we recommend using 'tabarena' for the best results.
        :param n_caruana_steps: Number of weight update iterations for Caruana et al.
            weighted ensembling (default=40). Only used when use_caruana_ensembling=True.
        :param n_epochs: Number of epochs to train for each NN (default=None).
            If set, it will override the values from the search space.
            (Might be ignored for non-RealMLP methods.)
        :param use_caruana_ensembling: Whether to use the algorithm by Caruana et al. (2004)
            to select a weighted ensemble of models instead of only selecting the best model (default=False).
            A usage sketch follows the HPO classes below.
        :param train_metric_name: Name of the training metric
            (default is 'cross_entropy' for classification and 'mse' for regression).
            For regression, pinball/multi_pinball can be used instead.
            (Might be ignored for non-RealMLP methods.)
        :param time_limit_s: Time limit in seconds (default=None).
        """
        self.device = device
        self.random_state = random_state
        self.n_cv = n_cv
        self.n_refit = n_refit
        self.n_repeats = n_repeats
        self.val_fraction = val_fraction
        self.n_threads = n_threads
        self.tmp_folder = tmp_folder
        self.verbosity = verbosity
        self.n_hyperopt_steps = n_hyperopt_steps
        self.val_metric_name = val_metric_name
        self.calibration_method = calibration_method
        self.hpo_space_name = hpo_space_name
        self.n_caruana_steps = n_caruana_steps
        self.n_epochs = n_epochs
        self.use_caruana_ensembling = use_caruana_ensembling
        self.train_metric_name = train_metric_name
        self.time_limit_s = time_limit_s


class RealMLP_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return dict(n_hyperopt_steps=50)

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        config = self.get_config()
        n_hyperopt_steps = config['n_hyperopt_steps']
        interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) \
            else AlgorithmSelectionAlgInterface
        return interface_type([RandomParamsNNAlgInterface(model_idx=i, **config)
                               for i in range(n_hyperopt_steps)], **config)

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']


class RealMLP_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor):
    def _get_default_params(self):
        return dict(n_hyperopt_steps=50)

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        config = self.get_config()
        n_hyperopt_steps = config['n_hyperopt_steps']
        interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) \
            else AlgorithmSelectionAlgInterface
        return interface_type([RandomParamsNNAlgInterface(model_idx=i, **config)
                               for i in range(n_hyperopt_steps)], **config)

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']
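
# Illustrative sketch (comment only; values are assumptions, not recommendations): random-search
# HPO with Caruana et al. weighted ensembling and a wall-clock budget, using the parameters
# documented in `RealMLPHPOConstructorMixin` above.
#
#     clf = RealMLP_HPO_Classifier(n_hyperopt_steps=30, use_caruana_ensembling=True,
#                                  time_limit_s=3600, val_metric_name='cross_entropy')
#     clf.fit(X_train, y_train)
#     proba = clf.predict_proba(X_test)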

class ResnetConstructorMixin:
    def __init__(self,
                 module_d_embedding: Optional[int] = None,
                 module_d: Optional[int] = None,
                 module_d_hidden_factor: Optional[float] = None,
                 module_n_layers: Optional[int] = None,
                 module_activation: Optional[str] = None,
                 module_normalization: Optional[str] = None,
                 module_hidden_dropout: Optional[float] = None,
                 module_residual_dropout: Optional[float] = None,
                 verbose: Optional[int] = None,
                 max_epochs: Optional[int] = None,
                 batch_size: Optional[int] = None,
                 optimizer: Optional[str] = None,
                 es_patience: Optional[int] = None,
                 lr: Optional[float] = None,
                 lr_scheduler: Optional[bool] = None,
                 lr_patience: Optional[int] = None,
                 optimizer_weight_decay: Optional[float] = None,
                 use_checkpoints: Optional[bool] = None,
                 transformed_target: Optional[bool] = None,
                 tfms: Optional[List[str]] = None,
                 quantile_output_distribution: Optional[str] = None,
                 val_metric_name: Optional[str] = None,
                 device: Optional[str] = None,
                 random_state: Optional[Union[int, np.random.RandomState]] = None,
                 n_cv: int = 1,
                 n_refit: int = 0,
                 n_repeats: int = 1,
                 val_fraction: float = 0.2,
                 n_threads: Optional[int] = None,
                 tmp_folder: Optional[Union[str, pathlib.Path]] = None,
                 verbosity: int = 0,
                 calibration_method: Optional[str] = None,
                 ):
        self.module_d_embedding = module_d_embedding
        self.module_d = module_d
        self.module_d_hidden_factor = module_d_hidden_factor
        self.module_n_layers = module_n_layers
        self.module_activation = module_activation
        self.module_normalization = module_normalization
        self.module_hidden_dropout = module_hidden_dropout
        self.module_residual_dropout = module_residual_dropout
        self.verbose = verbose
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.es_patience = es_patience
        self.lr_scheduler = lr_scheduler
        self.lr_patience = lr_patience
        self.lr = lr
        self.optimizer_weight_decay = optimizer_weight_decay
        self.use_checkpoints = use_checkpoints
        self.transformed_target = transformed_target
        self.tfms = tfms
        self.quantile_output_distribution = quantile_output_distribution
        self.val_metric_name = val_metric_name
        self.device = device
        self.random_state = random_state
        self.n_cv = n_cv
        self.n_refit = n_refit
        self.n_repeats = n_repeats
        self.val_fraction = val_fraction
        self.n_threads = n_threads
        self.tmp_folder = tmp_folder
        self.verbosity = verbosity
        self.calibration_method = calibration_method


class Resnet_RTDL_D_Classifier(ResnetConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return DefaultParams.RESNET_RTDL_D_CLASS_TabZilla

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.rtdl_interfaces import ResnetSubSplitInterface
        return SingleSplitWrapperAlgInterface([ResnetSubSplitInterface(**self.get_config()) for i in range(n_cv)])

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']

    def _supports_single_class(self) -> bool:
        return False

    def _supports_single_sample(self) -> bool:
        return False

    def _non_deterministic_tag(self) -> bool:
        # set non-deterministic since this class can otherwise fail the
        # check_methods_subset_invariance test due to low precision (?)
        # only on windows, only recently?? probably a skorch problem?
        return True


class Resnet_RTDL_D_Regressor(ResnetConstructorMixin, AlgInterfaceRegressor):
    def _get_default_params(self):
        return DefaultParams.RESNET_RTDL_D_REG_TabZilla

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.rtdl_interfaces import ResnetSubSplitInterface
        return SingleSplitWrapperAlgInterface([ResnetSubSplitInterface(**self.get_config()) for i in range(n_cv)])

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']

    def _supports_single_sample(self) -> bool:
        return False

    def _supports_multioutput(self) -> bool:
        return False

    def _non_deterministic_tag(self) -> bool:
        # set non-deterministic since this class can otherwise fail the
        # check_methods_subset_invariance test due to low precision (?)
return True class FTTransformerConstructorMixin: def __init__(self, module_d_token: Optional[int] = None, module_d_ffn_factor: Optional[float] = None, module_n_layers: Optional[int] = None, module_n_heads: Optional[int] = None, module_token_bias: Optional[bool] = None, module_attention_dropout: Optional[float] = None, module_ffn_dropout: Optional[float] = None, module_residual_dropout: Optional[float] = None, module_activation: Optional[str] = None, module_prenormalization: Optional[bool] = None, module_initialization: Optional[str] = None, module_kv_compression: Optional[str] = None, module_kv_compression_sharing: Optional[str] = None, verbose: Optional[int] = None, max_epochs: Optional[int] = None, batch_size: Optional[int] = None, optimizer: Optional[str] = None, es_patience: Optional[int] = None, lr: Optional[float] = None, lr_scheduler: Optional[bool] = None, lr_patience: Optional[int] = None, optimizer_weight_decay: Optional[float] = None, use_checkpoints: Optional[bool] = None, transformed_target: Optional[bool] = None, tfms: Optional[List[str]] = None, quantile_output_distribution: Optional[str] = None, val_metric_name: Optional[str] = None, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.module_d_token = module_d_token self.module_d_ffn_factor = module_d_ffn_factor self.module_n_layers = module_n_layers self.module_n_heads = module_n_heads self.module_token_bias = module_token_bias self.module_attention_dropout = module_attention_dropout self.module_ffn_dropout = module_ffn_dropout self.module_residual_dropout = module_residual_dropout self.module_activation = module_activation self.module_prenormalization = module_prenormalization self.module_initialization = module_initialization self.module_kv_compression = module_kv_compression self.module_kv_compression_sharing = module_kv_compression_sharing self.verbose = verbose self.max_epochs = max_epochs self.batch_size = batch_size self.optimizer = optimizer self.es_patience = es_patience self.lr_scheduler = lr_scheduler self.lr_patience = lr_patience self.lr = lr self.optimizer_weight_decay = optimizer_weight_decay self.use_checkpoints = use_checkpoints self.transformed_target = transformed_target self.tfms = tfms self.quantile_output_distribution = quantile_output_distribution self.val_metric_name = val_metric_name self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = calibration_method class FTT_D_Classifier(FTTransformerConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.FTT_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import FTTransformerSubSplitInterface return SingleSplitWrapperAlgInterface( [FTTransformerSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this 
class can otherwise fail the check_methods_subset_invariance test due to low precision (?) # only on windows, only recently?? probably a skorch problem? return True class FTT_D_Regressor(FTTransformerConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.FTT_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import FTTransformerSubSplitInterface return SingleSplitWrapperAlgInterface( [FTTransformerSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True class RTDL_MLPConstructorMixin: def __init__(self, module_d_embedding: Optional[int] = None, module_d_layers: Optional[int] = None, module_d_first_layer: Optional[int] = None, module_d_last_layer: Optional[int] = None, module_n_layers: Optional[int] = None, module_dropout: Optional[float] = None, verbose: Optional[int] = None, max_epochs: Optional[int] = None, batch_size: Optional[int] = None, optimizer: Optional[str] = None, es_patience: Optional[int] = None, lr: Optional[float] = None, lr_scheduler: Optional[bool] = None, lr_patience: Optional[int] = None, optimizer_weight_decay: Optional[float] = None, use_checkpoints: Optional[bool] = None, transformed_target: Optional[bool] = None, tfms: Optional[List[str]] = None, quantile_output_distribution: Optional[str] = None, val_metric_name: Optional[str] = None, module_num_emb_type: Optional[str] = None, module_num_emb_dim: Optional[int] = None, module_num_emb_hidden_dim: Optional[int] = None, module_num_emb_sigma: Optional[float] = None, module_num_emb_lite: Optional[bool] = None, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.module_d_embedding = module_d_embedding self.module_d_layers = module_d_layers self.module_d_first_layer = module_d_first_layer self.module_d_last_layer = module_d_last_layer self.module_n_layers = module_n_layers self.module_dropout = module_dropout self.verbose = verbose self.max_epochs = max_epochs self.batch_size = batch_size self.optimizer = optimizer self.es_patience = es_patience self.lr_scheduler = lr_scheduler self.lr_patience = lr_patience self.lr = lr self.optimizer_weight_decay = optimizer_weight_decay self.use_checkpoints = use_checkpoints self.transformed_target = transformed_target self.tfms = tfms self.quantile_output_distribution = quantile_output_distribution self.module_num_emb_type = module_num_emb_type self.module_num_emb_dim = module_num_emb_dim self.module_num_emb_hidden_dim = module_num_emb_hidden_dim self.module_num_emb_sigma = module_num_emb_sigma self.module_num_emb_lite = module_num_emb_lite self.val_metric_name = val_metric_name self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = 
calibration_method class MLP_RTDL_D_Classifier(RTDL_MLPConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.MLP_RTDL_D_CLASS_TabZilla def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) # only on windows, only recently?? probably a skorch problem? return True class MLP_RTDL_D_Regressor(RTDL_MLPConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.MLP_RTDL_D_REG_TabZilla def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True class MLP_PLR_D_Classifier(RTDL_MLPConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.MLP_PLR_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) # only on windows, only recently?? probably a skorch problem? return True class MLP_PLR_D_Regressor(RTDL_MLPConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.MLP_PLR_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) 
return True class TabrConstructorMixin: def __init__(self, num_embeddings: Optional[int] = None, d_main: Optional[int] = None, d_multiplier: Optional[int] = None, encoder_n_blocks: Optional[int] = None, predictor_n_blocks: Optional[int] = None, mixer_normalization: Optional[Union[bool, Literal['auto']]] = None, context_dropout: Optional[float] = None, dropout0: Optional[float] = None, dropout1: Optional[float] = None, normalization: Optional[str] = None, activation: Optional[str] = None, memory_efficient: Optional[bool] = None, candidate_encoding_batch_size: Optional[int] = None, n_epochs: Optional[int] = None, batch_size: Optional[int] = None, eval_batch_size: Optional[int] = None, context_size: Optional[int] = None, freeze_contexts_after_n_epochs: Optional[int] = None, optimizer: Optional[Dict] = None, patience: Optional[int] = None, transformed_target: Optional[bool] = None, tfms: Optional[List[str]] = None, quantile_output_distribution: Optional[str] = None, val_metric_name: Optional[str] = None, add_scaling_layer: Optional[bool] = None, scale_lr_factor: Optional[float] = None, use_ntp_linear: Optional[bool] = None, linear_init_type: Optional[str] = None, # only relevant if use_ntp_linear=True use_ntp_encoder: Optional[bool] = None, ls_eps: Optional[float] = None, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.num_embeddings = num_embeddings self.d_main = d_main self.d_multiplier = d_multiplier self.encoder_n_blocks = encoder_n_blocks self.predictor_n_blocks = predictor_n_blocks self.mixer_normalization = mixer_normalization self.context_dropout = context_dropout self.dropout0 = dropout0 self.dropout1 = dropout1 self.normalization = normalization self.activation = activation self.memory_efficient = memory_efficient self.candidate_encoding_batch_size = candidate_encoding_batch_size self.n_epochs = n_epochs self.batch_size = batch_size self.eval_batch_size = eval_batch_size self.context_size = context_size self.freeze_contexts_after_n_epochs = freeze_contexts_after_n_epochs self.optimizer = optimizer self.patience = patience self.transformed_target = transformed_target self.tfms = tfms self.quantile_output_distribution = quantile_output_distribution self.val_metric_name = val_metric_name self.add_scaling_layer = add_scaling_layer self.scale_lr_factor = scale_lr_factor self.use_ntp_linear = use_ntp_linear self.linear_init_type = linear_init_type self.use_ntp_encoder = use_ntp_encoder self.ls_eps = ls_eps self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = calibration_method class TabR_S_D_Classifier(TabrConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.TABR_S_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabR_S_D_Regressor(TabrConstructorMixin, AlgInterfaceRegressor): def 
_get_default_params(self): return DefaultParams.TABR_S_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealTabR_D_Classifier(TabrConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.RealTABR_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealTabR_D_Regressor(TabrConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.RealTABR_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabMConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, arch_type: Optional[str] = None, tabm_k: Optional[int] = None, num_emb_type: Optional[str] = None, num_emb_n_bins: Optional[int] = None, batch_size: Optional[int] = None, lr: Optional[float] = None, weight_decay: Optional[float] = None, n_epochs: Optional[int] = None, patience: Optional[int] = None, d_embedding: Optional[int] = None, d_block: Optional[int] = None, n_blocks: Optional[Union[str, int]] = None, dropout: Optional[float] = None, compile_model: Optional[bool] = None, allow_amp: Optional[bool] = None, tfms: Optional[List[str]] = None, gradient_clipping_norm: Optional[Union[float, Literal['none']]] = None, calibration_method: Optional[str] = None, share_training_batches: Optional[bool] = None, val_metric_name: Optional[str] = None, train_metric_name: Optional[str] = None, ): """ :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices are given in fit(), `n_cv` models will be fitted using different random seeds. Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. 
If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger numbers of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Temporary folder in which data can be stored during fit(). (Currently unused for TabM and variants.) If None, methods generally try to not store intermediate data. :param verbosity: Verbosity level (default=0, higher means more verbose). Set to 2 to see logs from intermediate epochs. :param arch_type: Architecture type for TabM, one of ['tabm', 'tabm-mini', 'tabm-normal', 'tabm-mini-normal', 'plain']. :param tabm_k: Value of $k$ (number of memory-efficient ensemble members). Default is 32. :param num_emb_type: Type of numerical embedding, one of ['none', 'pwl']. Default is 'none'. 'pwl' stands for piecewise linear embeddings. :param num_emb_n_bins: Number of bins for piecewise linear embeddings (default=48). Only used when piecewise linear numerical embeddings are used. Must be at most the number of training samples, but >1. :param batch_size: Batch size, default is 256. :param lr: Learning rate, default is 2e-3. :param weight_decay: Weight decay, default is 0. :param n_epochs: Maximum number of epochs (if early stopping doesn't apply). Default is 1 billion. :param patience: Patience for early stopping. Default is 16. :param d_embedding: Embedding dimension for numerical embeddings. :param d_block: Hidden layer size. :param n_blocks: Number of linear layers, or 'auto'. Default is 'auto', which will use 3 when num_emb_type=='none' and 2 otherwise. :param dropout: Dropout probability. Default is 0.1. :param compile_model: Whether torch.compile should be applied to the model (default=False). :param allow_amp: Whether automatic mixed precision should be used if the device is a GPU (default=False). :param tfms: Preprocessing transformations, see models.nn_models.models.PreprocessingFactory. Default is ['quantile_tabr']. Categorical values will be one-hot encoded by the model. Note that in the original TabM experiments, when cat_policy='ordinal' is used, the ordinal-encoded categorical values appear to be one-hot encoded by the model afterwards. :param gradient_clipping_norm: Norm for gradient clipping. Default is None from the example code (no gradient clipping), but the experiments from the paper use 1.0. :param calibration_method: Post-hoc calibration method (only for classification). We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. :param share_training_batches: New in v1.4.1: Whether TabM should use the same training samples for each model in the batch (default=False). We adopt the default value False from the newer version of TabM, while the old code (prior to 1.4.1) was equivalent to share_training_batches=True, except that the new code also excludes certain parameters from weight decay.
:param val_metric_name: Name of the validation metric used for early stopping. For classification, the default is 'class_error' but could be 'cross_entropy', 'brier', '1-auc_ovr' etc. For regression, the default is 'rmse' but could be 'mae'. :param train_metric_name: Name of the metric (loss) used for training. For classification, the default is 'cross_entropy'. For regression, it is 'mse' but could be set to something like 'multi_pinball(0.05,0.95)'. """ self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.arch_type = arch_type self.num_emb_type = num_emb_type self.num_emb_n_bins = num_emb_n_bins self.n_epochs = n_epochs self.patience = patience self.batch_size = batch_size self.compile_model = compile_model self.lr = lr self.weight_decay = weight_decay self.d_embedding = d_embedding self.d_block = d_block self.n_blocks = n_blocks self.dropout = dropout self.tabm_k = tabm_k self.allow_amp = allow_amp self.tfms = tfms self.gradient_clipping_norm = gradient_clipping_norm self.calibration_method = calibration_method self.share_training_batches = share_training_batches self.val_metric_name = val_metric_name self.train_metric_name = train_metric_name class TabM_D_Classifier(TabMConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.TABM_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabm_interface import TabMSubSplitInterface return SingleSplitWrapperAlgInterface([TabMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class TabM_D_Regressor(TabMConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.TABM_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabm_interface import TabMSubSplitInterface return SingleSplitWrapperAlgInterface([TabMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class TabM_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): """ HPO spaces ('default', 'tabarena') use TabM-mini with numerical embeddings """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabm_interface import RandomParamsTabMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabM_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): """ HPO spaces ('default', 'tabarena') use TabM-mini with numerical embeddings """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from 
pytabkit.models.alg_interfaces.tabm_interface import RandomParamsTabMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] # ------------------------------ class XRFMConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, bandwidth: Optional[float] = None, p_interp: Optional[float] = None, exponent: Optional[float] = None, reg: Optional[float] = None, iters: Optional[int] = None, diag: Optional[bool] = None, bandwidth_mode: Optional[str] = None, kernel_type: Optional[str] = None, max_leaf_samples: Optional[int] = None, val_metric_name: Optional[str] = None, early_stop_rfm: Optional[bool] = None, early_stop_multiplier: Optional[float] = None, classification_mode: Optional[str] = None, calibration_method: Optional[str] = None, time_limit_s: Optional[float] = None, M_batch_size: Optional[int] = None, ): """ xRFM. In case of out-of-memory, try reducing M_batch_size and/or max_leaf_samples. Some parameters generally benefit a lot from tuning, such as the regularization (reg). :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices are given in fit(), `n_cv` models will be fitted using different random seeds. Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger numbers of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Temporary folder in which data can be stored during fit(). (Currently unused for xRFM and variants.) If None, methods generally try to not store intermediate data.
:param verbosity: Verbosity level (default=0, higher means more verbose). :param bandwidth: Bandwidth of the kernel, i.e., how wide the kernel is (default=10). :param p_interp: For kernel_type='lpq', this parameter controls the parameter p of the L_p norm in the exponent of the kernel. Specifically, we set p = 2 * p_interp + exponent * (1 - p_interp). Should be in [0, 1]. :param exponent: Exponent of the norm inside the kernel (default=1). Should be in (0, 2]. Recommended values are in [0.7, 1.4]. :param reg: Regularization parameter lambda in the kernel ridge regression (default=1e-3). :param iters: How many iterations (fitting the regressor, updating the AGOP matrix) should be done (default=5). The default should be good for most cases. :param diag: Whether to only fit a diagonal AGOP matrix (default=True). :param bandwidth_mode: How to set the bandwidth (default='constant'). For 'constant', the specified bandwidth will be used directly. For 'adaptive', it will be scaled relative to the median distance between samples. We recommend 'constant' for smaller datasets (< max_leaf_samples) where only a single RFM is fit. For larger datasets, 'adaptive' may be more suited since it can adapt the bandwidth to the data in the leaf. :param kernel_type: Type of kernel (default='l2'). For 'l2', the L_2-norm will be used in the generalized Laplace kernel exp(-||x - x'||_2^q), where q is the exponent. This is the fastest kernel and a good default. For 'lpq', the slower exp(-||x - x'||_p^q) will be used, where p is determined from q and p_interp. It will use the kermac implementation if kermac is installed. :param max_leaf_samples: Maximum number of samples in a leaf of xRFM (default=60_000). For datasets with more than max_leaf_samples samples, the memory usage is O(max_leaf_samples**2) and the time complexity is roughly O(n_samples * max_leaf_samples**2). The default is optimized for GPUs with ~40 GB of VRAM. Reduce this number to reduce the RAM usage. On GPUs with less VRAM, this number can be automatically lowered to avoid exceeding the maximum RAM. :param val_metric_name: Name of the validation metric (used for selecting the best iteration). Defaults are 'class_error' for classification and 'rmse' for regression. Available classification metrics (all to be minimized): 'class_error', 'cross_entropy', '1-auc_ovr', 'brier'. Available regression metrics: 'rmse'. :param early_stop_rfm: Whether to stop the iterations early if the error stops decreasing (default=False). :param early_stop_multiplier: Tolerance for early stopping, should be larger than one (default=1.1). Larger values will early-stop less aggressively. :param classification_mode: How to convert classification problems to regression problems internally (default='zero_one'). 'zero_one' uses a one-hot encoding, while 'prevalence' uses a simplex encoding with zero corresponding to the marginal class ratio. :param calibration_method: Post-hoc calibration method (only for classification) (default=None). We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. :param time_limit_s: Time limit in seconds (default=None). :param M_batch_size: Batch size used to construct the AGOP matrix M (default=8000). Higher values can speed up the computation but may lead to out-of-memory (esp. for the 'lpq' kernel).
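Example (an illustrative sketch, not an official snippet from the xRFM docs; it assumes the optional xRFM dependency is installed and uses a small synthetic scikit-learn dataset):

>>> from sklearn.datasets import make_classification
>>> X, y = make_classification(n_samples=200, n_features=10, random_state=0)
>>> clf = XRFM_D_Classifier(device='cpu', random_state=0)
>>> clf.fit(X, y)  # doctest: +SKIP
>>> proba = clf.predict_proba(X)  # doctest: +SKIP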
""" self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.bandwidth = bandwidth self.p_interp = p_interp self.exponent = exponent self.reg = reg self.iters = iters self.diag = diag self.bandwidth_mode = bandwidth_mode self.kernel_type = kernel_type self.max_leaf_samples = max_leaf_samples self.val_metric_name = val_metric_name self.early_stop_rfm = early_stop_rfm self.early_stop_multiplier = early_stop_multiplier self.classification_mode = classification_mode self.calibration_method = calibration_method self.time_limit_s = time_limit_s self.M_batch_size = M_batch_size class XRFM_D_Classifier(XRFMConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XRFM_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import xRFMSubSplitInterface return SingleSplitWrapperAlgInterface([xRFMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False class XRFM_D_Regressor(XRFMConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.XRFM_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import xRFMSubSplitInterface return SingleSplitWrapperAlgInterface([xRFMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False class XRFMHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_hyperopt_steps: Optional[int] = None, val_metric_name: Optional[str] = None, max_leaf_samples: Optional[int] = None, M_batch_size: Optional[int] = None, bandwidth_mode: Optional[str] = None, calibration_method: Optional[str] = None, hpo_space_name: Optional[str] = None, n_caruana_steps: Optional[int] = None, use_caruana_ensembling: Optional[bool] = None, time_limit_s: Optional[float] = None, ): """ :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices or an explicit validation set are given in fit(), `n_cv` models will be fitted using different random seeds. 
Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). For n_cv=1, a single train-validation split will be used, where `val_fraction` controls the fraction of validation samples. If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger numbers of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Folder in which models can be stored. Setting this allows reducing RAM/VRAM usage by not having all models in RAM at the same time. In this case, the folder needs to be preserved as long as the model exists (including when the model is pickled to disk). :param verbosity: Verbosity level (default=0, higher means more verbose). Set to 2 to see logs from intermediate epochs. :param n_hyperopt_steps: Number of random hyperparameter configs that should be used to train models (default=50). :param val_metric_name: Name of the validation metric (used for selecting the best epoch). Defaults are 'class_error' for classification and 'rmse' for regression. Main available classification metrics (all to be minimized): 'class_error', 'cross_entropy', '1-auc_ovo', '1-auc_ovr', '1-auc_mu', 'brier', '1-balanced_accuracy', '1-mcc', 'ece'. Main available regression metrics: 'rmse', 'mae', 'max_error', 'pinball(0.95)' (also works with other quantiles specified directly in the string). For more metrics, we refer to `models.training.metrics.Metrics.apply()`. :param max_leaf_samples: Maximum number of samples in a leaf of xRFM. For datasets with more than max_leaf_samples samples, the memory usage is O(max_leaf_samples**2) and the time complexity is roughly O(n_samples * max_leaf_samples**2). The default is around 60000, which is optimized for GPUs with ~40 GB of VRAM. Reduce this number to reduce the RAM usage. :param M_batch_size: Batch size used to construct the AGOP matrix M (default=8000). Higher values can speed up the computation but may lead to out-of-memory (esp. for the 'lpq' kernel). :param bandwidth_mode: How to set the bandwidth (default='constant'). For 'constant', the specified bandwidth will be used directly. For 'adaptive', it will be scaled relative to the median distance between samples. We recommend 'constant' for smaller datasets (< max_leaf_samples) where only a single RFM is fit. For larger datasets, 'adaptive' may be more suited since it can adapt the bandwidth to the data in the leaf. :param calibration_method: Post-hoc calibration method (only for classification) (default=None).
We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. :param hpo_space_name: Name of the HPO space. We recommend using 'tabarena' (the default) for the best results. :param n_caruana_steps: Number of weight update iterations for Caruana et al. weighted ensembling (default=40). This parameter is only used when use_caruana_ensembling=True. :param use_caruana_ensembling: Whether to use the algorithm by Caruana et al. (2004) to select a weighted ensemble of models instead of only selecting the best model (default=False). :param time_limit_s: Time limit in seconds (default=None). """ self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_hyperopt_steps = n_hyperopt_steps self.val_metric_name = val_metric_name self.max_leaf_samples = max_leaf_samples self.M_batch_size = M_batch_size self.bandwidth_mode = bandwidth_mode self.calibration_method = calibration_method self.hpo_space_name = hpo_space_name self.n_caruana_steps = n_caruana_steps self.use_caruana_ensembling = use_caruana_ensembling self.time_limit_s = time_limit_s class XRFM_HPO_Classifier(XRFMHPOConstructorMixin, AlgInterfaceClassifier): """ HPO spaces ('default') use xRFM """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import RandomParamsxRFMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsxRFMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False class XRFM_HPO_Regressor(XRFMHPOConstructorMixin, AlgInterfaceRegressor): """ HPO spaces ('default') use xRFM """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import RandomParamsxRFMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsxRFMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False # ------------------------------ class MLP_RTDL_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else
AlgorithmSelectionAlgInterface return interface_type([RandomParamsRTDLMLPAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class MLP_RTDL_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsRTDLMLPAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class MLP_PLR_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsRTDLMLPAlgInterface(model_idx=i, num_emb_type='plr', **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class MLP_PLR_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [RandomParamsRTDLMLPAlgInterface(model_idx=i, num_emb_type='plr', **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Resnet_RTDL_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsResnetAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsResnetAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Resnet_RTDL_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsResnetAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else 
AlgorithmSelectionAlgInterface return interface_type([RandomParamsResnetAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class FTT_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsFTTransformerAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsFTTransformerAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class FTT_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsFTTransformerAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsFTTransformerAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabR_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import RandomParamsTabRAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabRAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabR_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import RandomParamsTabRAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabRAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] # Ensemble-TD class Ensemble_TD_Classifier(AlgInterfaceClassifier): def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, val_metric_name: Optional[str] = None, use_ls: Optional[bool] = None, calibration_method: Optional[str] = None): self.device = device self.random_state = random_state 
self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.val_metric_name = val_metric_name self.use_ls = use_ls self.calibration_method = calibration_method def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name if self.use_ls is not None: extra_params['use_ls'] = self.use_ls if self.calibration_method is not None: extra_params['calibration_method'] = self.calibration_method td_interfaces = [ SingleSplitWrapperAlgInterface( [LGBMSubSplitInterface(**DefaultParams.LGBM_TD_CLASS, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [XGBSubSplitInterface(**DefaultParams.XGB_TD_CLASS, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [CatBoostSubSplitInterface(**DefaultParams.CB_TD_CLASS, **extra_params, allow_gpu=False) for i in range(n_cv)]), NNAlgInterface(**utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, extra_params)), ] return CaruanaEnsembleAlgInterface(td_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Ensemble_TD_Regressor(AlgInterfaceRegressor): def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, val_metric_name: Optional[str] = None): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.val_metric_name = val_metric_name def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name td_interfaces = [ SingleSplitWrapperAlgInterface( [LGBMSubSplitInterface(**DefaultParams.LGBM_TD_REG, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [XGBSubSplitInterface(**DefaultParams.XGB_TD_REG, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [CatBoostSubSplitInterface(**DefaultParams.CB_TD_REG, **extra_params, allow_gpu=False) for i in range(n_cv)]), NNAlgInterface(**DefaultParams.RealMLP_TD_REG, **extra_params), ] return CaruanaEnsembleAlgInterface(td_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class EnsembleHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: 
Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, val_metric_name: Optional[str] = None, n_hpo_steps: int = 50, calibration_method: Optional[str] = None, use_full_caruana_ensembling: bool = False, n_caruana_steps: int = 40, use_tabarena_spaces: bool = False, time_limit_s: Optional[float] = None, ): """ :param device: :param random_state: :param n_cv: :param n_refit: :param n_repeats: :param val_fraction: :param n_threads: :param tmp_folder: :param verbosity: :param val_metric_name: :param n_hpo_steps: Number of HPO configs per method. :param calibration_method: Calibration method (only for classification). :param use_full_caruana_ensembling: Whether to also ensemble different hyperparameter configs of the same method (default=False). False corresponds to the method used in the paper, True should give better results (with larger inference time). :param n_caruana_steps: How many iterations to use for Caruana et al. (2004) weighted ensembling. :param use_tabarena_spaces: Whether to use search spaces from TabArena instead of from the RealMLP paper. :param time_limit_s: Time limit in seconds (default=None). """ self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.val_metric_name = val_metric_name self.n_hpo_steps = n_hpo_steps self.calibration_method = calibration_method self.use_full_caruana_ensembling = use_full_caruana_ensembling self.n_caruana_steps = n_caruana_steps self.use_tabarena_spaces = use_tabarena_spaces self.time_limit_s = time_limit_s class Ensemble_HPO_Classifier(EnsembleHPOConstructorMixin, AlgInterfaceClassifier): def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name if self.calibration_method is not None: extra_params['calibration_method'] = self.calibration_method if self.use_tabarena_spaces: extra_params['hpo_space_name'] = 'tabarena' extra_params['n_caruana_steps'] = self.n_caruana_steps extra_params['time_limit_s'] = self.time_limit_s n_hpo_steps = self.n_hpo_steps or 50 hpo_configs = [ [RandomParamsLGBMAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsXGBAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsCatBoostAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsNNAlgInterface(model_idx=i, **extra_params) for i in range(n_hpo_steps)], ] if self.use_full_caruana_ensembling: hpo_interfaces = sum(hpo_configs, []) else: hpo_interfaces = [AlgorithmSelectionAlgInterface(lst, **extra_params) for lst in hpo_configs] return CaruanaEnsembleAlgInterface(hpo_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Ensemble_HPO_Regressor(EnsembleHPOConstructorMixin, AlgInterfaceRegressor): def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface from 
pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name if self.use_tabarena_spaces: extra_params['hpo_space_name'] = 'tabarena' extra_params['n_caruana_steps'] = self.n_caruana_steps extra_params['time_limit_s'] = self.time_limit_s n_hpo_steps = self.n_hpo_steps or 50 hpo_configs = [ [RandomParamsLGBMAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsXGBAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsCatBoostAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsNNAlgInterface(model_idx=i, **extra_params) for i in range(n_hpo_steps)], ] if self.use_full_caruana_ensembling: hpo_interfaces = sum(hpo_configs, []) else: hpo_interfaces = [AlgorithmSelectionAlgInterface(lst, **extra_params) for lst in hpo_configs] return CaruanaEnsembleAlgInterface(hpo_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] ================================================ FILE: pytabkit/models/torch_utils.py ================================================ from typing import List, Union, Optional import torch import numpy as np def get_available_device_names() -> List['str']: device_names = ['cpu'] + [f'cuda:{i}' for i in range(torch.cuda.device_count())] if torch.backends.mps.is_available(): device_names.append('mps') return device_names def seeded_randperm(n, device, seed): generator = torch.Generator() generator.manual_seed(seed) # todo: can this not be generated directly on the device? return torch.randperm(n, generator=generator).to(device) def permute_idxs(idxs, seed): return idxs[seeded_randperm(idxs.shape[0], idxs.device, seed)] def batch_randperm(n_batch, n, device='cpu'): # batched randperm: # https://discuss.pytorch.org/t/batched-shuffling-of-feature-vectors/30188/4 # https://github.com/pytorch/pytorch/issues/42502 return torch.stack([torch.randperm(n, device=device) for i in range(n_batch)], dim=0) # from https://github.com/runopti/stg/blob/9f630968c4f14cff6da4e54421c497f24ac1e08e/python/stg/layers.py#L10 def gauss_cdf(x): return 0.5 * (1 + torch.erf(x / np.sqrt(2))) class ClampWithIdentityGradientFunc(torch.autograd.Function): @staticmethod def forward(ctx, input: torch.Tensor, low: torch.Tensor, high: torch.Tensor): return torch.minimum(torch.maximum(input, low), high) @staticmethod def backward(ctx, grad_output: torch.Tensor): return grad_output, None, None def clamp_with_identity_gradient_func(x, low, high): return ClampWithIdentityGradientFunc.apply(x, low, high) def cat_if_necessary(tensors: List[torch.Tensor], dim: int): """ Implements torch.cat() but doesn't copy if only one tensor is provided. This can make it faster if no copying behavior is needed. :param tensors: Tensors to be concatenated. :param dim: Dimension in which the tensor should be concatenated. :return: The concatenated tensor. 
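Example (illustrative; the single-tensor case returns the input tensor object itself, so no copy is made):

>>> import torch
>>> a = torch.zeros(2, 3)
>>> cat_if_necessary([a], dim=0) is a
True
>>> cat_if_necessary([a, a], dim=0).shape
torch.Size([4, 3])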
""" if len(tensors) == 1: return tensors[0] return torch.cat(tensors, dim=dim) def hash_tensor(tensor: torch.Tensor) -> int: # for debugging purposes, to print two tensor's hashes to see if they are equal # from https://discuss.pytorch.org/t/defining-hash-function-for-multi-dimensional-tensor/107531 import pickle # the .numpy() appears to be necessary for equal tensors to have equal hashes return hash(pickle.dumps(tensor.detach().cpu().numpy())) def torch_np_quantile(tensor: torch.Tensor, q: float, dim: int, keepdim: bool = False) -> torch.Tensor: """ Alternative implementation for torch.quantile() using np.quantile() since the implementation of torch.quantile() uses too much RAM (extreme for Airlines_DepDelay_10M) and can fail for too large tensors. See also https://github.com/pytorch/pytorch/issues/64947 :param tensor: tensor :param q: Quantile value. :param dim: As in torch.quantile() :param keepdim: As in torch.quantile() :return: Tensor with quantiles. """ x_np = tensor.detach().cpu().numpy() q_np = np.quantile(x_np, q=q, axis=dim, keepdims=keepdim) return torch.as_tensor(q_np, device=tensor.device, dtype=tensor.dtype) from time import perf_counter import torch def _cuda_in_use() -> bool: """Return True if CUDA is available and initialized.""" if not torch.cuda.is_available(): return False # is_initialized exists in recent PyTorch; fall back to True if missing is_initialized = getattr(torch.cuda, "is_initialized", None) if is_initialized is None: return True return is_initialized() class TorchTimer: """ Timer for measuring code blocks, with optional CUDA synchronization. Usage: with TorchTimer() as t: y = model(x) print(t.elapsed) # Or manual start/stop: t = TorchTimer() t.start() y = model(x) t.stop() print(t.elapsed) """ def __init__(self, use_cuda: Optional[bool] = None, record_history: bool = False): """ Args: use_cuda: - None (default): auto-detect; sync only if CUDA is in use. - True: force CUDA sync (if available). - False: never sync CUDA. record_history: If True, every measurement is appended to `self.history`. """ self._user_use_cuda = use_cuda self.record_history = record_history self.elapsed = None self.history = [] if record_history else None self._start = None @property def _do_cuda_sync(self) -> bool: if self._user_use_cuda is False: return False if self._user_use_cuda is True: return torch.cuda.is_available() # Auto mode: only if CUDA is available *and* initialized return _cuda_in_use() # ------- context manager API ------- def __enter__(self): self.start() return self def __exit__(self, exc_type, exc_val, exc_tb): self.stop() # ------- manual API ------- def start(self): if self._do_cuda_sync: torch.cuda.synchronize() self._start = perf_counter() def stop(self): if self._start is None: raise RuntimeError("TorchTimer.stop() called before start().") if self._do_cuda_sync: torch.cuda.synchronize() self.elapsed = perf_counter() - self._start if self.record_history: self.history.append(self.elapsed) return self.elapsed def get_available_memory_gb(device: Union[str, torch.device]) -> float: """ Return the available memory (in GB) on the given device. Parameters ---------- device : str or torch.device Device identifier, e.g. "cuda", "cuda:0", or torch.device("cuda:0"). Returns ------- float Available memory in gigabytes. Notes ----- - For CUDA devices, this uses torch.cuda.mem_get_info if available. - For CPU, it uses psutil.virtual_memory().available. - For other device types, NotImplementedError is raised. 
""" dev = torch.device(device) if dev.type == "cuda": if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available, but a CUDA device was requested.") # Ensure we are querying the correct device torch.cuda.synchronize(dev) if hasattr(torch.cuda, "mem_get_info"): free_bytes, total_bytes = torch.cuda.mem_get_info(dev) else: # Fallback: approximate using total_memory - reserved_by_pytorch props = torch.cuda.get_device_properties(dev) total_bytes = props.total_memory reserved_bytes = torch.cuda.memory_reserved(dev) free_bytes = max(total_bytes - reserved_bytes, 0) return free_bytes / (1024 ** 3) # bytes -> GiB elif dev.type == "cpu": try: import psutil except ImportError as e: raise ImportError( "psutil is required to query CPU memory. Install via `pip install psutil`." ) from e mem = psutil.virtual_memory() return mem.available / (1024 ** 3) else: raise NotImplementedError(f"Memory query not implemented for device type '{dev.type}'") ================================================ FILE: pytabkit/models/training/__init__.py ================================================ ================================================ FILE: pytabkit/models/training/auc_mu.py ================================================ # taken from https://github.com/kleimanr/auc_mu/blob/master/auc_mu.py """ Computation of the measure 'AUC Mu'. This measure requires installation of the numpy and sklearn libraries. This code corresponds to the paper: Kleiman, R., Page, D. ``AUC Mu: A Performance Metric for Multi-Class Machine Learning Models``, Proceedings of the 2019 International Conference on Machine Learning (ICML). """ __author__ = "Ross Kleiman" __copyright__ = "Copyright 2019" __credits__ = ["Ross Kleiman"] __license__ = "MIT" __version__ = "1.0" __maintainer__ = "Ross Kleiman" __email__ = "rkleiman@cs.wisc.edu" __status__ = "Production" import numpy as np from sklearn.metrics import roc_auc_score # ---------------------------------------------------------------------- def auc_mu_impl(y_true, y_score, A=None, W=None): """ Compute the multi-class measure AUC Mu from prediction scores and labels. Parameters ---------- y_true : array, shape = [n_samples] The true class labels in the range [0, n_samples-1] y_score : array, shape = [n_samples, n_classes] Target scores, where each row is a categorical distribution over the n_classes. A : array, shape = [n_classes, n_classes], optional The partition (or misclassification cost) matrix. If ``None`` A is the argmax partition matrix. Entry A_{i,j} is the cost of classifying an instance as class i when the true class is j. It is expected that diagonal entries in A are zero and off-diagonal entries are positive. W : array, shape = [n_classes, n_classes], optional The weight matrix for incorporating class skew into AUC Mu. If ``None``, the standard AUC Mu is calculated. If W is specified, it is expected to be a lower triangular matrix where entrix W_{i,j} is a positive float from 0 to 1 for the partial score between classes i and j. Entries not in the lower triangular portion of W must be 0 and the sum of all entries in W must be 1. Returns ------- auc_mu : float References ---------- .. [1] Kleiman, R., Page, D. ``AUC Mu: A Performance Metric for Multi-Class Machine Learning Models``, Proceedings of the 2019 International Conference on Machine Learning (ICML). 
""" # Validate input arguments if not isinstance(y_score, np.ndarray): raise TypeError("Expected y_score to be np.ndarray, got: %s" % type(y_score)) if not y_score.ndim == 2: raise ValueError("Expected y_score to be 2 dimensional, got: %s" % y_score.ndim) n_samples, n_classes = y_score.shape if not isinstance(y_true, np.ndarray): raise TypeError("Expected y_true to be np.ndarray, got: %s" % type(y_true)) if not y_true.ndim == 1: raise ValueError("Expected y_true to be 1 dimensional, got: %s" % y_true.ndim) if not y_true.shape[0] == n_samples: raise ValueError("Expected y_true to be shape %s, got: %s" % (str(y_score.shape), str(y_true.shape))) unique_labels = np.unique(y_true) if not np.all(unique_labels == np.arange(n_classes)): raise ValueError("Expected y_true values in range 0..%i, got: %s" % (n_classes - 1, str(unique_labels))) if A is None: A = np.ones((n_classes, n_classes)) - np.eye(n_classes) if not isinstance(A, np.ndarray): raise TypeError("Expected A to be np.ndarray, got: %s" % type(A)) if not A.ndim == 2: raise ValueError("Expected A to be 2 dimensional, got: %s" % A.ndim) if not A.shape == (n_classes, n_classes): raise ValueError("Expected A to be shape (%i, %i), got: %s" % (n_classes, n_classes, str(A.shape))) if not np.all(A.diagonal() == np.zeros(n_classes)): raise ValueError("Expected A to be zero on the diagonals") if not np.all(A >= 0): raise ValueError("Expected A to be non-negative") if W is None: W = np.tri(n_classes, k=-1) W /= W.sum() if not isinstance(W, np.ndarray): raise TypeError("Expected W to be np.ndarray, got: %s" % type(W)) if not W.ndim == 2: raise ValueError("Expected W to be 2 dimensional, got: %s" % W.ndim) if not W.shape == (n_classes, n_classes): raise ValueError("Expected W to be shape (%i, %i), got: %s" % (n_classes, n_classes, str(W.shape))) auc_total = 0.0 for class_i in range(n_classes): preds_i = y_score[y_true == class_i] n_i = preds_i.shape[0] for class_j in range(class_i): preds_j = y_score[y_true == class_j] temp_preds = np.vstack((preds_i, preds_j)) n_j = preds_j.shape[0] n = n_i + n_j temp_labels = np.zeros(n, dtype=int) temp_labels[n_i:n] = 1 v = A[class_i, :] - A[class_j, :] scores = np.dot(temp_preds, v) score_i_j = roc_auc_score(temp_labels, scores) auc_total += W[class_i, class_j] * score_i_j return auc_total ================================================ FILE: pytabkit/models/training/coord.py ================================================ from typing import Dict from pytabkit.models.training.scheduling import ConstantSchedule, get_schedule # layers are created multiple times => either only register after stacking or allow to register multiple times class HyperparamManager: class HyperGetter: def __init__(self, tc: 'HyperparamManager', hyper_name: str, base_value_pattern: str, sched_pattern: str): self.tc = tc self.hyper_name = hyper_name self.base_value_pattern = base_value_pattern self.sched_pattern = sched_pattern def __call__(self): return self.tc.hyper_base_values[self.hyper_name][self.base_value_pattern] * \ self.tc.get_hyper_sched_values()[self.hyper_name][self.sched_pattern] def __init__(self, **config): self.config = config self.hyper_base_values = {} self.hyper_scheds = {} self.hyper_sched_values = None # regularization terms self.reg_terms = [] self.needs_update = True # indicates whether self.hyper_sched_values needs to be updated self.more_info_dict = {} # can be set from outside def get_more_info_dict(self) -> Dict: return self.more_info_dict def _find_pattern(self, d: dict, scope): pattern = None for key in d: if 
scope.matches(key): #print(d, scope, key) pattern = key if pattern is None: # no pattern was found raise ValueError(f'No key in dict {d} matches scope {str(scope)}') return pattern def register_hyper(self, name: str, scope, default=None, default_sched=lambda: ConstantSchedule(1.0)): if name not in self.hyper_scheds: base_dict = self.config.get(name, default) if not isinstance(base_dict, dict): base_dict = {'': base_dict} sched_dict = self.config.get(name + '_sched', default_sched) if not isinstance(sched_dict, dict): sched_dict = {'': sched_dict} sched_dict = {key: get_schedule(sched) if isinstance(sched, str) else sched() for key, sched in sched_dict.items()} self.hyper_scheds[name] = sched_dict self.hyper_base_values[name] = base_dict self.needs_update = True return HyperparamManager.HyperGetter(self, name, base_value_pattern=self._find_pattern(self.hyper_base_values[name], scope), sched_pattern=self._find_pattern(self.hyper_scheds[name], scope)) # def _to_array(self, value, name: str, length: int) -> torch.Tensor: # if hasattr(value, "__len__"): # # result is already a list or a numpy array # if len(value) != length: # raise ValueError(f'Hyperparameter {name} has {len(value)} values but should have {length} values') # return torch.as_tensor(value) # else: # return torch.as_tensor([value] * length) def get_hyper_sched_values(self): self.update_hyper_sched_values() return self.hyper_sched_values def update_hyper_sched_values(self): if self.needs_update: # print(f'update') self.hyper_sched_values = {name: {pattern: sched.get_value() for pattern, sched in sched_dict.items()} for name, sched_dict in self.hyper_scheds.items()} self.needs_update = False def add_reg_term(self, loss): self.reg_terms.append(loss) def update_hypers(self, learner): # reset regularization terms self.reg_terms = [] self.needs_update = True for name, sched_dict in self.hyper_scheds.items(): for pattern, sched in sched_dict.items(): sched.update(learner) self.update_hyper_sched_values() ================================================ FILE: pytabkit/models/training/lightning_callbacks.py ================================================ from typing import List, Any, Optional, Union, Dict import numpy as np import torch try: from lightning.pytorch.callbacks import Callback import lightning.pytorch as pl except ImportError: from pytorch_lightning.callbacks import Callback import pytorch_lightning as pl from torch import Tensor from pytabkit.models.nn_models.base import Variable, Layer from pytabkit.models.training.coord import HyperparamManager from pytabkit.models.training.logging import Logger class ParamCheckpointer: def __init__(self, n_tv_splits: int, n_tt_splits: int, n_ens: int): self.n_tv_splits = n_tv_splits self.n_tt_splits = n_tt_splits self.n_ens = n_ens self.ckpt_params = [None] * (self.n_tt_splits * self.n_tv_splits) self.ckpt_buffers = [None] * (self.n_tt_splits * self.n_tv_splits) def save(self, parallel_idx: int, model_idx: int, model: Layer): idx = self.n_tv_splits * parallel_idx + model_idx with torch.no_grad(): for ckpt, values in [(self.ckpt_params, model.parameters()), (self.ckpt_buffers, model.buffers())]: if ckpt[idx] is None: ckpt[idx] = [v[idx*self.n_ens:(idx+1)*self.n_ens].clone() for v in values] else: for c, v in zip(ckpt[idx], values): c.copy_(v[idx*self.n_ens:(idx+1)*self.n_ens]) def restore(self, parallel_idx: int, model_idx: int, model: Layer): idx = self.n_tv_splits * parallel_idx + model_idx with torch.no_grad(): for ckpt, values in [(self.ckpt_params, model.parameters()), 
(self.ckpt_buffers, model.buffers())]: if ckpt[idx] is not None: for c, v in zip(ckpt[idx], values): # print(f'Restore diff: {v[start:end]-c}') v[idx*self.n_ens:(idx+1)*self.n_ens] = c def save_all(self, model: Layer): for parallel_idx in range(self.n_tt_splits): for model_idx in range(self.n_tv_splits): self.save(parallel_idx, model_idx, model) def restore_all(self, model: Layer): for parallel_idx in range(self.n_tt_splits): for model_idx in range(self.n_tv_splits): self.restore(parallel_idx, model_idx, model) class HyperparamCallback(Callback): def __init__(self, hp_manager): self.hp_manager = hp_manager def on_train_batch_start( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int ) -> None: # print(list(pl_module.model.parameters())[-1][0, -1].item()) self.hp_manager.update_hypers(pl_module) def on_before_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", loss: Tensor) -> None: reg_terms = self.hp_manager.reg_terms if len(reg_terms) > 0: pl_module.loss += sum(reg_terms) def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: del self.hp_manager # todo: added class L1L2RegCallback(Callback): def __init__(self, hp_manager: HyperparamManager, model: Layer): self.hp_manager = hp_manager self.params: List[Variable] = list(model.parameters()) self.l1_getters = [self.hp_manager.register_hyper('l1_reg', p.context.scope, default=0.0) for p in self.params] self.l2_getters = [self.hp_manager.register_hyper('l2_reg', p.context.scope, default=0.0) for p in self.params] def on_after_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for l1_getter, l2_getter, p in zip(self.l1_getters, self.l2_getters, self.params): l1_reg = l1_getter() * p.hyper_factors.get('l1_reg', 1.0) l2_reg = l2_getter() * p.hyper_factors.get('l2_reg', 1.0) if l1_reg != 0.0: p.grad += l1_reg * torch.sign(p) if l2_reg != 0.0: p.grad += (2.0 * l2_reg) * p self.hp_manager.update_hypers(pl_module) class ModelCheckpointCallback(Callback): def __init__(self, n_tt_splits: int, n_tv_splits: int, n_ens: int, use_best_mean_epoch: bool, val_metric_name: str, restore_best: bool = False): self.n_tt_splits = n_tt_splits self.n_tv_splits = n_tv_splits self.n_ens = n_ens self.val_metric_name = val_metric_name self.restore_best = restore_best self.use_best_mean_epoch = use_best_mean_epoch self.ckpt = ParamCheckpointer(n_tv_splits=n_tv_splits, n_tt_splits=self.n_tt_splits, n_ens=n_ens) def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self.ckpt.save_all(pl_module.model) def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for tt_split_idx in range(self.n_tt_splits): for tv_split_idx in range(self.n_tv_splits): if self.use_best_mean_epoch: if pl_module.best_mean_val_epochs[self.val_metric_name][tt_split_idx] == pl_module.progress.epoch: # if this is the best epoch, save the model self.ckpt.save(tt_split_idx, tv_split_idx, pl_module.model) else: if pl_module.best_val_epochs[self.val_metric_name][tt_split_idx][tv_split_idx] == pl_module.progress.epoch: # print(f'found improvement') # if this is the best epoch, save the model self.ckpt.save(tt_split_idx, tv_split_idx, pl_module.model) def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: # restore at the end. In case of multiple val metrics, can use restore() separately to restore the desired one. 
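# (Illustration, inferred from NNCreator.create_callbacks(): one ModelCheckpointCallback is created per
# entry of val_metric_names, and TabNNModule.restore_ckpt_for_val_metric_name('cross_entropy') dispatches
# to the callback that was constructed with val_metric_name='cross_entropy'.)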
self.restore(pl_module) def restore(self, pl_module: "pl.LightningModule") -> None: # restore best params for key, state in pl_module.optimizers().opt.state.items(): # lightning automatically moves the model to the CPU after training, # so we have to do the same for the optimizer state # for sfadam if 'z' in state: state['z'] = state['z'].cpu() pl_module.optimizers().eval() # todo: bit of a hack because ideally the optimizer state should also be restored if not self.restore_best: raise RuntimeError('ValidationCallback: Cannot restore best params when using save_best_params=False') self.ckpt.restore_all(pl_module.model) class StopAtEpochsCallback(Callback): def __init__(self, stop_epochs: List[List[Union[Dict[str, int], int]]], n_models: int, n_ens: int, model: Layer, logger: Optional[Logger] = None): print(f'Refit: {stop_epochs=}') # stop_epochs now has a dict with {metric_name: stop_epoch}, so we need to extract just the stop_epoch def get_epoch(value: Union[Dict[str, int], int]): if isinstance(value, dict): values = list(value.values()) if len(values) != 1: raise ValueError(f'Got stop epochs for multiple metrics, which is not supported in refitting!') return values[0] return value self.stop_epochs = [[get_epoch(ep) for ep in lst] for lst in stop_epochs] self.final_stop_epoch = np.max(sum(self.stop_epochs, [])) self.model = model self.ckpt = ParamCheckpointer(n_tv_splits=n_models, n_tt_splits=len(stop_epochs), n_ens=n_ens) self.logger = logger self.n_models = n_models def _handle_epoch(self, trainer: "pl.Trainer", epoch: int) -> None: if self.logger: self.logger.log(2, f'Refit Epoch {epoch}/{self.final_stop_epoch}') if epoch == self.final_stop_epoch: # print(f'Stopping the training at epoch {epoch}') self.ckpt.restore_all(self.model) trainer.should_stop = True return for tt_split_idx, tv_stop_epochs in enumerate(self.stop_epochs): for tv_split_idx, ep in enumerate(tv_stop_epochs): if ep == epoch: # print(f'Saving checkpoint for model {i}') self.ckpt.save(tt_split_idx, tv_split_idx, self.model) # def on_train_batch_start( # self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int # ) -> None: # print('train batch') def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self._handle_epoch(trainer, epoch=0) def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self._handle_epoch(trainer, epoch=trainer.current_epoch + 1) ================================================ FILE: pytabkit/models/training/lightning_modules.py ================================================ from pytabkit.models.training.lightning_callbacks import ModelCheckpointCallback try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl from typing import List, Optional, Dict, Any import numpy as np import torch from pytabkit.models.data.data import ParallelDictDataLoader, DictDataset from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources from pytabkit.models.nn_models.base import Layer from pytabkit.models.optim.optimizers import get_opt_class from pytabkit.models.training.nn_creator import NNCreator from pytabkit.models.training.logging import StdoutLogger, Logger from pytabkit.models.training.metrics import Metrics from pytabkit.models.training.scheduling import LearnerProgress def postprocess_multiquantile(y_pred: torch.Tensor, val_metric_name: Optional[str] = None, sort_quantile_predictions: bool = True, **config): if val_metric_name is None or not 
val_metric_name.startswith('multi_pinball(') or not sort_quantile_predictions: return y_pred quantiles = [float(q_str) for q_str in val_metric_name[len('multi_pinball('):-1].split(',')] if not all([a <= b for a, b in zip(quantiles[:-1], quantiles[1:])]): raise ValueError(f'Quantiles {quantiles} must be sorted') return y_pred.sort(dim=-1)[0] class TabNNModule(pl.LightningModule): def __init__(self, n_epochs: int = 256, logger: Optional[Logger] = None, fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ Pytorch Lightning Module for building and training a pytorch NN for tabular data. The core of the module is the NNCreatorInterface, which is used to create the model, the callbacks, the hyperparameter manager and the dataloaders. The TabNNModule is responsible for the training loop, (optional) validation and inference. """ super().__init__() self.my_logger = logger or StdoutLogger(verbosity_level=config.get('verbosity', 0)) # todo: improve this self.creator = NNCreator( n_epochs=n_epochs, fit_params=fit_params, **config ) self.hp_manager = self.creator.hp_manager self.model: Optional[Layer] = None self.criterion = None self.train_dl = None self.progress = LearnerProgress() self.progress.max_epochs = n_epochs self.fit_params = fit_params # Validation self.val_preds = [] self.old_training = None self.val_dl = None self.save_best_params = True self.val_metric_names = None self.epoch_mean_val_errors = None self.best_mean_val_errors = None self.best_mean_val_epochs = None self.best_val_errors = None self.best_val_epochs = None self.has_stopped_list = None self.callbacks = None # will contain {val_metric_name: ModelCheckpointCallback(..., val_metric_name)} self.ckpt_callbacks = dict() # LightningModule self.automatic_optimization = False self.config = config def compile_model(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources): """ Method to create the model and all other training dependencies given the dataset and the assigned resources. Once this is called, the module is ready for training. 
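A hedged sketch of the intended call order (the pl.Trainer arguments are inferred from
create_callbacks() and the dataloader attributes below, not prescribed by this class):

    module = TabNNModule(n_epochs=256, **config)
    module.compile_model(ds, idxs_list, interface_resources)
    trainer = pl.Trainer(max_epochs=256, callbacks=module.create_callbacks())
    trainer.fit(module, module.train_dl, module.val_dl)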
""" self.creator.setup_from_dataset( ds, idxs_list=idxs_list, interface_resources=interface_resources ) self.is_classification_ = ds.tensor_infos['y'].is_cat() self.model = self.creator.create_model(ds, idxs_list=idxs_list) self.train_dl, self.val_dl = self.creator.create_dataloaders(ds) self.criterion, self.val_metric_names = self.creator.get_criterions() def create_callbacks(self): """ Helper method to return callbacks for the trainer.fit callback argument.""" assert self.val_metric_names is not None self.callbacks = self.creator.create_callbacks(self.model, self.my_logger, self.val_metric_names) self.ckpt_callbacks = {} for callback in self.callbacks: if isinstance(callback, ModelCheckpointCallback): self.ckpt_callbacks[callback.val_metric_name] = callback return self.callbacks def get_predict_dataloader(self, ds: DictDataset): """ Helper method to create a dataloader for inference.""" ds_x, _ = ds.split_xy() ds_x = self.creator.static_model.forward_ds(ds_x) # ds_x = self.static_model.forward_ds(ds_x) idxs_single = torch.arange(ds.n_samples, dtype=torch.long) n_ens = self.config.get('n_ens', 1) idxs = idxs_single[None, :].expand( self.creator.n_tt_splits * self.creator.n_tv_splits * n_ens, -1 ) return ParallelDictDataLoader(ds=ds_x, idxs=idxs, batch_size=self.creator.config.get("predict_batch_size", 1024)) # ----- Start LightningModule Methods ----- def on_fit_start(self): self.model.train() self.optimizers().train() # mean val errors will not be accurate if all epochs after this yield NaN self.best_mean_val_errors = {val_metric_name: [np.inf] * self.creator.n_tt_splits for val_metric_name in self.val_metric_names} # epoch 0 counts as before training, epoch 1 is first epoch self.best_mean_val_epochs = {val_metric_name: [0] * self.creator.n_tt_splits for val_metric_name in self.val_metric_names} # don't use simpler notation of the form [[]] * 2 because this will have two references to the same inner array! 
self.best_val_errors = { val_metric_name: [[np.inf] * self.creator.n_tv_splits for i in range(self.creator.n_tt_splits)] for val_metric_name in self.val_metric_names} self.best_val_epochs = { val_metric_name: [[0] * self.creator.n_tv_splits for i in range(self.creator.n_tt_splits)] for val_metric_name in self.val_metric_names} self.has_stopped_list = { val_metric_name: [[False] * self.creator.n_tv_splits for i in range(self.creator.n_tt_splits)] for val_metric_name in self.val_metric_names} def training_step(self, batch, batch_idx): # x = batch["x_cont"] # x = x / (1e-8 + x.std(dim=-2, keepdim=True)) # print(f'{x.mean().item()=}') # print(f'{list(self.model.parameters())[0].mean().item()=}') # print(f'{list(self.model.parameters())[-1].mean().item()=}') output = self.model(batch) opt = self.optimizers() # do sum() over models dimension loss = self.criterion(output["x_cont"], output["y"]).sum() # print(f'{loss.item()=}') # Callbacks for regularization are called before the backward pass self.manual_backward(loss) opt.step(loss=loss) opt.zero_grad() self.progress.total_samples += batch["y"].shape[-2] self.progress.epoch_float = ( self.progress.total_samples / self.train_dl.get_num_iterated_samples() ) return loss def on_validation_start(self): self.old_training = self.model.training self.val_preds = [] self.model.eval() def validation_step(self, batch, batch_idx): self.val_preds.append(self.model(batch)["x_cont"]) def on_validation_epoch_end(self): self.model.train(self.old_training) self.old_training = None y_pred = self._postprocess_ens_pred(torch.cat(self.val_preds, dim=-2)) y_pred = postprocess_multiquantile(y_pred, **self.config) n_ens = self.config.get('n_ens', 1) # y is duplicated by the dataloader as well in the ensemble case, deduplicate it y = self.val_dl.val_y[::n_ens] use_early_stopping = self.config.get('use_early_stopping', False) early_stopping_additive_patience = self.config.get('early_stopping_additive_patience', 20) early_stopping_multiplicative_patience = self.config.get('early_stopping_multiplicative_patience', 2) for val_metric_name in self.val_metric_names: val_errors = torch.as_tensor( [ Metrics.apply( y_pred[i, :, :], y[i, :, :], val_metric_name ) for i in range(y_pred.shape[0]) ] ) val_errors = val_errors.view( self.creator.n_tt_splits, self.creator.n_tv_splits ) mean_val_errors = val_errors.mean(dim=-1) # mean over cv/refit dimension mean_val_error = mean_val_errors.mean().item() self.my_logger.log( 2, f"Epoch {self.progress.epoch + 1}/{self.progress.max_epochs}: val {val_metric_name} = {mean_val_error:6.6f}", ) current_epoch = self.progress.epoch + 1 for tt_split_idx in range(self.creator.n_tt_splits): use_last_best_epoch = self.config.get('use_last_best_epoch', True) has_stopped = self.has_stopped_list[val_metric_name][tt_split_idx] # compute best single-split validation errors for tv_split_idx in range(self.creator.n_tv_splits): if use_early_stopping and not has_stopped[tv_split_idx]: if current_epoch > early_stopping_multiplicative_patience \ * self.best_val_epochs[val_metric_name][tt_split_idx][tv_split_idx] \ + early_stopping_additive_patience: has_stopped[tv_split_idx] = True if not has_stopped[tv_split_idx]: # compute best validation errors current_err = val_errors[tt_split_idx, tv_split_idx].item() best_err = self.best_val_errors[val_metric_name][tt_split_idx][tv_split_idx] # use <= on purpose such that latest epoch among tied best epochs is kept # this has been slightly beneficial for accuracy in previous experiments improved = current_err <= best_err if 
use_last_best_epoch \ else current_err < best_err if improved: self.best_val_errors[val_metric_name][tt_split_idx][tv_split_idx] = current_err self.best_val_epochs[val_metric_name][tt_split_idx][tv_split_idx] = ( self.progress.epoch + 1 ) if not any(has_stopped): # compute best mean validation errors (averaged over sub-splits (cv/refit)) # use <= on purpose such that latest epoch among tied best epochs is kept # this has been slightly beneficial for accuracy in previous experiments improved = mean_val_errors[tt_split_idx] <= self.best_mean_val_errors[val_metric_name][ tt_split_idx] if use_last_best_epoch \ else mean_val_errors[tt_split_idx] < self.best_mean_val_errors[val_metric_name][tt_split_idx] if improved: self.best_mean_val_errors[val_metric_name][tt_split_idx] = mean_val_errors[tt_split_idx] self.best_mean_val_epochs[val_metric_name][tt_split_idx] = ( self.progress.epoch + 1 ) self.progress.epoch += 1 if use_early_stopping and all(all([all(sub_lst) for sub_lst in lst]) for lst in self.has_stopped_list.values()): self.trainer.should_stop = True def on_fit_end(self): # if self.creator.config.get("use_best_epoch", True): # self.fit_params = [{'stop_epoch': mean_ep, 'best_indiv_stop_epochs': single_eps} # for mean_ep, single_eps in zip(self.best_mean_val_epochs, self.best_val_epochs)] # else: # self.fit_params = [ # {"stop_epoch": self.progress.max_epochs} # for i in range(self.creator.n_tt_splits) # ] if self.creator.config.get("use_best_epoch", True): self.fit_params = [{'stop_epoch': {val_metric_name: self.best_mean_val_epochs[val_metric_name][i] for val_metric_name in self.val_metric_names}, 'best_indiv_stop_epochs': {val_metric_name: self.best_val_epochs[val_metric_name][i] for val_metric_name in self.val_metric_names}} for i in range(self.creator.n_tt_splits)] else: self.fit_params = [ {"stop_epoch": {val_metric_name: self.progress.max_epochs for val_metric_name in self.val_metric_names}} for i in range(self.creator.n_tt_splits) ] # put in eval() mode for predict(), so we don't need to save the trainer and the optimizer state self.optimizers(use_pl_optimizer=False).eval() # delete stuff so we don't save the dataset when pickling RealMLP del self.creator.train_idxs del self.creator.val_idxs del self.train_dl del self.val_dl del self.val_preds del self.callbacks del self.ckpt_callbacks def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any: self.model.eval() with torch.no_grad(): return self._postprocess_ens_pred(self.model(batch)["x_cont"].to("cpu")) def _postprocess_ens_pred(self, y_pred: torch.Tensor) -> torch.Tensor: # if n_ens > 1, we need to average the predictions of the ensemble members n_ens = self.config.get('n_ens', 1) if n_ens == 1: return y_pred y_pred = y_pred.reshape(y_pred.shape[0] // n_ens, n_ens, *y_pred.shape[1:]) if self.is_classification_ and not self.config.get('ens_av_before_softmax', False): y_pred = torch.softmax(y_pred, dim=-1) y_pred = y_pred.mean(dim=1) y_pred = torch.log(y_pred + 1e-30) else: y_pred = y_pred.mean(dim=1) return y_pred def configure_optimizers(self): param_groups = [{"params": [p], "lr": 0.01} for p in self.model.parameters()] return get_opt_class(self.config.get('opt', 'adam'))(param_groups, self.hp_manager) def restore_ckpt_for_val_metric_name(self, val_metric_name: str): self.ckpt_callbacks[val_metric_name].restore(self) # from https://github.com/Lightning-AI/pytorch-lightning/discussions/19759 # def on_fit_start(self) -> None: # self.optimizers().train() # already above # def on_predict_start(self) -> None: 
# print(f'predict start') # self.optimizers(use_pl_optimizer=False).eval() def on_validation_model_eval(self) -> None: self.model.eval() self.optimizers(use_pl_optimizer=False).eval() def on_validation_model_train(self) -> None: self.model.train() self.optimizers(use_pl_optimizer=False).train() def on_test_model_eval(self) -> None: self.model.eval() self.optimizers(use_pl_optimizer=False).eval() def on_test_model_train(self) -> None: self.model.train() self.optimizers(use_pl_optimizer=False).train() def on_predict_model_eval(self) -> None: # redundant with on_predict_start() self.model.eval() # don't do it here in case we don't have the optimizers at predict time # self.optimizers(use_pl_optimizer=False).eval() def to(self, *args: Any, **kwargs: Any) -> 'TabNNModule': super().to(*args, **kwargs) # print(f'moving static model to {args} {kwargs}') self.creator.static_model.to(*args, **kwargs) ================================================ FILE: pytabkit/models/training/logging.py ================================================ class Logger: def __init__(self, verbosity_level): # higher verbosity level means more verbose self.verbosity_level = verbosity_level def get_verbosity_level(self): return self.verbosity_level def log(self, verbosity: int, content: str): if verbosity <= self.verbosity_level: self.force_log(content) def force_log(self, content: str): raise NotImplementedError() class StdoutLogger(Logger): def __init__(self, verbosity_level=0): super().__init__(verbosity_level) def force_log(self, content: str): print(content, flush=True) ================================================ FILE: pytabkit/models/training/metrics.py ================================================ import traceback from typing import Dict, Any, List, Optional, Tuple, Callable import numpy as np from sklearn.metrics import roc_auc_score, balanced_accuracy_score, matthews_corrcoef import torch.nn.functional as F import torch import copy from pytabkit.models.data.data import DictDataset, TaskType from pytabkit.models.data.nested_dict import NestedDict from pytabkit.models.torch_utils import cat_if_necessary, torch_np_quantile from pytabkit.models.training.auc_mu import auc_mu_impl # see also: https://scikit-learn.org/stable/modules/model_evaluation.html def to_one_hot(y, num_classes, label_smoothing_eps=0.0): one_hot = F.one_hot(y, num_classes).float() if label_smoothing_eps > 0.0: low = label_smoothing_eps / num_classes high = 1.0 - label_smoothing_eps + low return low + (high - low) * one_hot else: return one_hot def apply_reduction(res, reduction): if reduction == 'mean': return res.mean(dim=-1) elif reduction is None: return res elif reduction == 'sum': return res.sum(dim=-1) return None def cross_entropy(y_pred: torch.Tensor, y: torch.Tensor, reduction='mean'): if torch.is_floating_point(y): res = (-F.log_softmax(y_pred, dim=-1) * y).sum(dim=-1) else: res = -F.log_softmax(y_pred, dim=-1).gather(-1, y).squeeze(-1) return apply_reduction(res, reduction) def softmax_kldiv(y_pred: torch.Tensor, y: torch.Tensor, reduction='mean'): if torch.is_floating_point(y): # add 1e-30 to prevent taking the log of 0 -> it gets then multiplied by 0 anyway res = (((y + 1e-30).log() - F.log_softmax(y_pred, dim=-1)) * y).sum(dim=-1) else: res = -F.log_softmax(y_pred, dim=-1).gather(-1, y).squeeze(-1) return apply_reduction(res, reduction) def brier_loss(y_pred: torch.Tensor, y: torch.Tensor, reduction='mean'): if not torch.is_floating_point(y): y = F.one_hot(y.squeeze(-1), num_classes=y_pred.shape[-1]) res = (F.softmax(y_pred, 
dim=-1) - y).square().sum(dim=-1) result = apply_reduction(res, reduction) # print(f'{result.item()=}, {y_pred[4]=}') return result def cos_loss(y_pred, y, reduction='mean'): if not torch.is_floating_point(y): y = F.one_hot(y.squeeze(-1), num_classes=y_pred.shape[-1]) res = 1.0 - (y_pred * y).sum(dim=-1) / (y_pred.norm(dim=-1) + 1e-3) return apply_reduction(res, reduction) def mse(y_pred, y, reduction='mean'): if not torch.is_floating_point(y): # in case mse should be used for classification y = F.one_hot(y.squeeze(-1), num_classes=y_pred.shape[-1]) if y_pred.dim() != y.dim(): raise RuntimeError('MSE: y_pred.dim() != y.dim(): could lead to broadcasting errors') res = ((y_pred - y) ** 2).mean(dim=-1) return apply_reduction(res, reduction) def pinball_loss(y_pred: torch.Tensor, y: torch.Tensor, quantile: float, reduction='mean'): if y_pred.dim() != y.dim(): raise RuntimeError('Pinball loss: y_pred.dim() != y.dim(): could lead to broadcasting errors') err = y_pred - y # print(f'{quantile*err=}') res = torch.maximum((1 - quantile) * err, -quantile * err).mean(dim=-1) return apply_reduction(res, reduction) def multi_pinball_loss(y_pred: torch.Tensor, y: torch.Tensor, quantiles: List[float], reduction='mean'): if y_pred.dim() != y.dim(): raise RuntimeError('Multi-Pinball loss: y_pred.dim() != y.dim(): could lead to broadcasting errors') # print(f'{y_pred.shape=}, {y.shape=}') err = y_pred - y assert y.shape[-1] == 1 assert err.shape[-1] == len(quantiles) # print(f'{quantile*err=}') # print(f'{y_pred[:5]=}, {y[:5]=}') quantiles = torch.as_tensor(quantiles, dtype=torch.float32, device=err.device) res = torch.maximum((1 - quantiles) * err, -quantiles * err).mean(dim=-1) return apply_reduction(res, reduction) def mean_interleave(input, repeats, dim): assert input.shape[dim] % repeats == 0 new_shape = list(input.shape[:dim]) + [input.shape[dim] // repeats, repeats] + list(input.shape[dim + 1:]) # convert torch.Size to list before concatenating, since tuple + list raises a TypeError return input.view(new_shape).mean(dim=dim + 1) def get_y_probs(y: torch.Tensor, n_classes: int) -> torch.Tensor: """ Returns the empirical probabilities of all classes in y. :param y: Tensor of shape [..., n_batch, 1] and dtype torch.long or another integer dtype, containing class labels in {0, 1, ..., n_classes-1} :param n_classes: Total number of classes :return: returns a tensor of shape [..., n_classes] """ if y.shape[-1] != 1: raise ValueError(f'get_y_probs() only supports single-label classification') if torch.is_floating_point(y): raise ValueError(f'get_y_probs() expects y with non-floating dtype') if len(y.shape) > 2: # recursion return cat_if_necessary([get_y_probs(y[i], n_classes) for i in range(y.shape[0])], dim=0) return torch.bincount(y.squeeze(-1), minlength=n_classes).to(torch.float32) / y.shape[0] def insert_missing_class_columns(y_pred: torch.Tensor, train_ds: DictDataset) -> torch.Tensor: """ If train_ds.tensors['y'] does not contain some of the classes specified in train_ds.tensor_infos['y'] and if y_pred does not contain columns for these missing classes, add columns for the missing classes to y_pred, with small probabilities. :param y_pred: Tensor of logits, shape [n_batch, n_classes] :param train_ds: Dataset used for training the model that produced y_pred. :return: Returns y_pred with possibly some columns inserted.
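Example (sketch): with n_classes = 4 and class 2 absent from the training set, a y_pred of shape
[n_batch, 3] is expanded to shape [n_batch, 4]; the inserted column 2 receives logits chosen so that
its softmax probability is exactly 1 / (train_ds.n_samples + n_classes), the posterior probability
of an unseen class under a uniform Dirichlet prior (see the computation of posterior_prob below).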
""" n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() if y_pred.shape[-1] >= n_classes: return y_pred # already all columns # assume that the missing classes/columns in y_pred are exactly those that are not represented in the training set train_class_counts = torch.bincount(train_ds.tensors['y'].squeeze(-1), minlength=n_classes).cpu() n_missing = n_classes - y_pred.shape[-1] pred_col_idx = 0 new_cols = [] logsumexp = torch.logsumexp(y_pred, dim=-1) # expected posterior probability of the class under uniform prior # (expected value of corresponding Dirichlet distribution, which is conjugate prior to "multinoulli" distribution) posterior_prob = 1 / (train_ds.n_samples + n_classes) # ensure that the probability of missing classes is posterior_prob if y_pred are the logits missing_values = logsumexp + np.log(posterior_prob / (1 - posterior_prob * n_missing)) for i in range(n_classes): if train_class_counts[i] > 0: # this column should be represented new_cols.append(y_pred[:, pred_col_idx]) pred_col_idx += 1 else: new_cols.append(missing_values) return torch.stack(new_cols, dim=-1) def remove_missing_classes(y_pred: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Removes missing classes from y_pred and y. For example, if y_pred.shape[-1] == 4 but y only contains the values 0 and 2, the columns y_pred[..., 1] and y_pred[..., 3] will be removed and the values (0, 2) will be mapped to (0, 1). :param y_pred: Predictions of shape (n_samples, n_classes) (should be logits because probabilities will not be normalized anymore after removing columns). :param y: classes of shape (n_samples,) :return: y_pred and y with missing classes removed """ # shapes: y_pred should be n_samples x n_classes, y should be n_samples n_classes = y_pred.shape[-1] counts = torch.bincount(y, minlength=n_classes) is_present = counts > 0 if torch.all(is_present).item(): # all classes are present, nothing needs to be removed return y_pred, y num_present = is_present.sum().item() reduced_y_pred = y_pred[..., is_present] class_mapping = torch.zeros(n_classes, dtype=torch.long, device=y.device) class_mapping[is_present] = torch.arange(num_present, dtype=torch.long, device=y.device) reduced_y = class_mapping[y] # print(f'{is_present=}, {reduced_y_pred.shape=}, {torch.unique(reduced_y)=}') return reduced_y_pred, reduced_y def expected_calibration_error(y_pred: torch.Tensor, y: torch.Tensor): if y.is_floating_point(): y = y.argmax(dim=-1) else: y = y.squeeze(-1) if len(y_pred.shape) == 3: # contains a n_models dimension y_pred_models = [y_pred[i] for i in range(y_pred.shape[0])] y_models = [y[i] for i in range(y.shape[0])] else: y_pred_models = [y_pred] y_models = [y] model_scores = [] # evaluate separately for each model for y_pred_indiv, y_indiv in zip(y_pred_models, y_models): # handle classes that don't occur in the test set y_pred_indiv, y_indiv = remove_missing_classes(y_pred_indiv, y_indiv) # convert logits to probabilities y_pred_indiv_probs = F.softmax(y_pred_indiv, dim=-1) # ensure that no probabilities are zero or one to circumvent some problems # https://github.com/Lightning-AI/torchmetrics/issues/1646 y_pred_indiv_probs = y_pred_indiv_probs.clamp(1e-7, 1 - 1e-7) y_pred_indiv_probs = y_pred_indiv_probs / y_pred_indiv_probs.sum(dim=-1, keepdim=True) num_classes = y_pred_indiv_probs.shape[-1] is_binary = num_classes == 2 if is_binary: # binary classification, torchmetrics expects only probabilities of the positive class y_pred_indiv_probs = y_pred_indiv_probs[..., 1] # 
print(f'{torch.unique(y_indiv)=}') # print(f'{torch.unique(y_pred_indiv_probs)=}') # print(f'{y_indiv.shape=}, {y_pred_indiv_probs.shape=}') # print(f'{torch.min(y_pred_indiv_probs)=}') # print(f'{torch.max(y_pred_indiv_probs)=}') import torchmetrics metric = torchmetrics.CalibrationError(task='binary' if is_binary else 'multiclass', num_classes=num_classes) model_scores.append(metric.forward(y_pred_indiv_probs, y_indiv)) if len(y_pred.shape) == 3: # input had n_models dimension, so output should have it, too return torch.as_tensor(model_scores, dtype=torch.float32) else: return torch.as_tensor(model_scores[0], dtype=torch.float32) def auc_ovr_torchmetrics(y_pred: torch.Tensor, y: torch.Tensor): if y.is_floating_point(): y = y.argmax(dim=-1) else: y = y.squeeze(-1) if len(y_pred.shape) == 3: # contains a n_models dimension y_pred_models = [y_pred[i] for i in range(y_pred.shape[0])] y_models = [y[i] for i in range(y.shape[0])] else: y_pred_models = [y_pred] y_models = [y] model_scores = [] # evaluate separately for each model for y_pred_indiv, y_indiv in zip(y_pred_models, y_models): # handle classes that don't occur in the test set y_pred_indiv, y_indiv = remove_missing_classes(y_pred_indiv, y_indiv) # convert logits to probabilities y_pred_indiv_probs = F.softmax(y_pred_indiv, dim=-1) # ensure that no probabilities are zero or one to circumvent some problems # https://github.com/Lightning-AI/torchmetrics/issues/1646 y_pred_indiv_probs = y_pred_indiv_probs.clamp(1e-7, 1 - 1e-7) y_pred_indiv_probs = y_pred_indiv_probs / y_pred_indiv_probs.sum(dim=-1, keepdim=True) num_classes = y_pred_indiv_probs.shape[-1] is_binary = num_classes == 2 if is_binary: # binary classification, torchmetrics expects only probabilities of the positive class y_pred_indiv_probs = y_pred_indiv_probs[..., 1] # print(f'{torch.unique(y_indiv)=}') # print(f'{torch.unique(y_pred_indiv_probs)=}') # print(f'{y_indiv.shape=}, {y_pred_indiv_probs.shape=}') # print(f'{torch.min(y_pred_indiv_probs)=}') # print(f'{torch.max(y_pred_indiv_probs)=}') import torchmetrics metric = torchmetrics.AUROC(task='binary' if is_binary else 'multiclass', num_classes=num_classes) model_scores.append(metric.forward(y_pred_indiv_probs, y_indiv)) if len(y_pred.shape) == 3: # input had n_models dimension, so output should have it, too return torch.as_tensor(model_scores, dtype=torch.float32) else: return torch.as_tensor(model_scores[0], dtype=torch.float32) class Metrics: def __init__(self, metric_names, val_metric_name, task_type): self.metric_names = metric_names self.val_metric_name = val_metric_name self.task_type = task_type if val_metric_name not in metric_names: self.metric_names.append(val_metric_name) def compute_metrics_dict(self, y_preds: List[torch.Tensor], y: torch.Tensor, use_ens: bool) -> NestedDict: """ :param y_preds: y predictions by (possibly multiple) ensemble members :param y: actual labels (one-hot encoded in case of classification) :param use_ens: Whether to also compute metrics for ensembled predictions :return: Returns a NestedDict indexed by [str(n_models), str(start_idx), metric_name] containing the respective metric values (float) for an ensemble using y_preds[start_idx:start_idx+n_models] In the ensembling case, n_models > 1 is also used, but only with start_idx = 0 """ if np.any([y_pred.dim() != 2 for y_pred in y_preds]): raise RuntimeError('Not all y_preds have dim 2') if y.dim() != 2: raise RuntimeError('y.dim() != 2') results_dict = NestedDict() # individual results for start_idx, y_pred in enumerate(y_preds): for 
metric_name in self.metric_names: result = Metrics.apply(y_pred, y, metric_name).item() results_dict[str(1), str(start_idx), metric_name] = float(result) # ensemble results if len(y_preds) > 1 and use_ens: for n_models in range(2, len(y_preds) + 1): y_pred = Metrics.avg_preds(y_preds[:n_models], self.task_type) for metric_name in self.metric_names: result = Metrics.apply(y_pred, y, metric_name).cpu().numpy() results_dict[str(n_models), str(0), metric_name] = float(result) return results_dict def compute_val_score(self, val_metrics_dict: NestedDict) -> float: # ['1'] refers to ensemble with 1 member # values() contains the results for the different individual models individual_val_scores = [indiv_dict[self.val_metric_name] for indiv_dict in val_metrics_dict['1'].values()] return float(np.mean(individual_val_scores)) @staticmethod def apply(y_pred: torch.Tensor, y: torch.Tensor, metric_name: str) -> torch.Tensor: # shapes in general: n_models x n_samples x output_dim # for some classification metrics, y should contain the class numbers, # be of type torch.long and have output_dim = 1 # for other classification metrics like cross_entropy, y can also be soft labels with output_dim = n_classes # in the classification case, y_pred are assumed to be logits invalid = torch.logical_or(torch.isnan(y_pred), torch.isinf(y_pred)) if torch.any(invalid): if y.is_floating_point(): # regression y_pred = torch.clone(y_pred) y_pred[torch.any(invalid, dim=-1), :] = 0.0 else: # classification # y_pred[invalid] = -np.inf # leads to NaN after softmax() y_pred = torch.clone(y_pred) not_invalid = y_pred[~invalid] if len(not_invalid) == 0: y_pred[invalid] = 0.0 else: y_pred[invalid] = torch.min(not_invalid) - 100 # a very small value, basically zero probability y_pred_probs = torch.softmax(y_pred, dim=-1) y_pred = torch.log(y_pred_probs + 1e-30) def get_y_categorical(): if y.is_floating_point(): return y.argmax(dim=-1) return y.squeeze(-1) if metric_name == 'class_error': return torch.count_nonzero(y_pred.argmax(dim=-1) != get_y_categorical(), dim=-1) / y_pred.shape[-2] elif metric_name == 'cos_loss': return cos_loss(y_pred, y) elif metric_name == 'cross_entropy': return cross_entropy(y_pred, y) elif metric_name == 'n_cross_entropy': n_classes = y_pred.shape[-1] y_avg_log = torch.log(get_y_probs(y, n_classes) + 1e-30) # insert batch dimension and expand along batch dimension y_avg_log = y_avg_log.unsqueeze(-2).expand(*y_pred.shape) return cross_entropy(y_pred, y) / cross_entropy(y_avg_log, y) elif metric_name == 'ce_unif': return (-F.softmax(y_pred, dim=-1).log()).mean(dim=-1).mean(dim=-1) elif metric_name == '1-auc_ovo': return 1.0 - Metrics.apply_sklearn_classification_metric( y_pred, y, lambda y1, y2: roc_auc_score(y1, y2, multi_class='ovo'), needs_pred_probs=True) elif metric_name == '1-auc_ovr': return 1.0 - Metrics.apply_sklearn_classification_metric( y_pred, y, lambda y1, y2: roc_auc_score(y1, y2, multi_class='ovr'), needs_pred_probs=True) elif metric_name == '1-auc_ovr_alt': return 1.0 - auc_ovr_torchmetrics(y_pred, y) elif metric_name == '1-auc_mu': return 1.0 - Metrics.apply_sklearn_classification_metric( y_pred, y, auc_mu_impl, needs_pred_probs=True, two_class_single_column=False) elif metric_name == 'brier': return brier_loss(y_pred, y) elif metric_name == 'n_brier': n_classes = y_pred.shape[-1] y_avg_log = torch.log(get_y_probs(y, n_classes) + 1e-30) # insert batch dimension and expand along batch dimension y_avg_log = y_avg_log.unsqueeze(-2).expand(*y_pred.shape) return brier_loss(y_pred, y) / 
brier_loss(y_avg_log, y) elif metric_name == '1-balanced_accuracy': return 1.0 - Metrics.apply_sklearn_classification_metric(y_pred, y, balanced_accuracy_score, needs_pred_probs=False) elif metric_name == '1-mcc': return 1.0 - Metrics.apply_sklearn_classification_metric(y_pred, y, matthews_corrcoef, needs_pred_probs=False) elif metric_name == 'ece': return expected_calibration_error(y_pred, y) elif metric_name == 'rmse': return mse(y_pred, y).sqrt() elif metric_name == 'nrmse': # rmse relative to rmse of the best constant predictor rmse = mse(y_pred, y).sqrt() den = y.std(correction=0) return rmse / den elif metric_name == 'mae': return (y_pred - y).abs().mean(dim=-1).mean(dim=-1) elif metric_name == 'nmae': # mae relative to mae of the best constant predictor median = torch.median(y) mae = (y_pred - y).abs().mean(dim=-1).mean(dim=-1) den = (median - y).abs().mean(dim=-1).mean(dim=-1) return mae / den elif metric_name == 'max_error': return (y_pred - y).abs().max(dim=-1)[0].max(dim=-1)[0] elif metric_name == 'n_max_error': # max error relative to the max error of the best constant predictor max_error = (y_pred - y).abs().max(dim=-1)[0].max(dim=-1)[0] max = y.max(dim=-1)[0].max(dim=-1)[0] min = y.min(dim=-1)[0].min(dim=-1)[0] ref_error = (0.5 * (max - min)) return max_error / (ref_error + 1e-30) elif metric_name.startswith('pinball('): # expected format: pinball(number), e.g. pinball(0.95) quantile = float(metric_name[len('pinball('):-1]) result = pinball_loss(y_pred, y, quantile) # print(f'pinball loss: {result:g}') return result elif metric_name.startswith('n_pinball('): # expected format: n_pinball(number), e.g. n_pinball(0.95) # compute loss divided by loss of the best constant predictor quantile = float(metric_name[len('n_pinball('):-1]) raw_loss = pinball_loss(y_pred, y, quantile) best_constant_y_pred = torch_np_quantile(y, quantile, dim=-2, keepdim=True).expand(*y_pred.shape) best_constant_loss = pinball_loss(best_constant_y_pred, y, quantile) return raw_loss / (best_constant_loss + 1e-30) elif metric_name.startswith('c_pinball('): # expected format: c_pinball(number), e.g. c_pinball(0.95) # compute pinball loss after post-hoc calibration quantile = float(metric_name[len('c_pinball('):-1]) err_quantile = torch_np_quantile(y - y_pred, quantile, dim=-2, keepdim=True) raw_loss = pinball_loss(y_pred + err_quantile, y, quantile) return raw_loss elif metric_name.startswith('multi_pinball('): # expected format: multi_pinball(number1, ..., numberk), e.g. 
multi_pinball(0.25, 0.5, 0.75) quantiles = [float(nbr) for nbr in metric_name[len('multi_pinball('):-1].split(',')] result = multi_pinball_loss(y_pred, y, quantiles) # print(f'pinball loss: {result:g}') return result else: try: import probmetrics.metrics except ImportError: raise ValueError(f'Unknown metric {metric_name}') try: y_cat = get_y_categorical() y_pred = y_pred calref_dict = { f'{cr_short}-{loss_short}-{posthoc_short}{cv_short}': f'{cr_long}_{loss_long}_{posthoc_long}_{cv_long}' for cr_short, cr_long in [('ref', 'refinement'), ('cal', 'calib-err')] for loss_short, loss_long in [('ll', 'logloss'), ('br', 'brier')] for posthoc_short, posthoc_long in [('ts', 'ts-mix'), ('is', 'isotonic-mix')] for cv_short, cv_long in [('', 'all'), ('-cv5', 'cv-5')] } prob_metric_name = metric_name if metric_name in calref_dict: prob_metric_name = calref_dict[metric_name] if 'ts-mix' in prob_metric_name: # run temperature scaling on CPU, it's more efficient (at least for smaller datasets) y_cat = y_cat.cpu() y_pred = y_pred.cpu() metric = probmetrics.metrics.Metric.from_name(prob_metric_name) # todo: doesn't work with soft target distributions for now if len(y_pred.shape) == 2: return metric.compute_from_labels_logits(y_cat, y_pred) elif len(y_pred.shape) == 3: return torch.stack( [metric.compute_from_labels_logits(y_cat[i], y_pred[i]) for i in range(y_pred.shape[0])], dim=0) else: raise AssertionError(f'{len(y_pred.shape)=}, but must be 2 or 3') except ImportError: pass except ValueError as e: # can be thrown if the name is unknown to Metric traceback.print_exc() raise ValueError(f'Unknown metric {metric_name}') @staticmethod def apply_sklearn_classification_metric(y_pred: torch.Tensor, y: torch.Tensor, metric_function: Callable, needs_pred_probs: bool, two_class_single_column: bool = True): if y.is_floating_point(): y = y.argmax(dim=-1) else: y = y.squeeze(-1) if len(y_pred.shape) == 3: # contains a n_models dimension y_pred_models = [y_pred[i] for i in range(y_pred.shape[0])] y_models = [y[i] for i in range(y.shape[0])] else: y_pred_models = [y_pred] y_models = [y] model_scores = [] # evaluate separately for each model for y_pred_indiv, y_indiv in zip(y_pred_models, y_models): # handle classes that don't occur in the test set y_pred_indiv, y_indiv = remove_missing_classes(y_pred_indiv, y_indiv) if needs_pred_probs: # convert logits to probabilities y_pred_np = F.softmax(y_pred_indiv, dim=-1).cpu().numpy() if y_pred_np.shape[-1] == 2 and two_class_single_column: # binary classification, scikit-learn expects only probabilities of the positive class y_pred_np = y_pred_np[..., 1] else: # convert logits to predicted class y_pred_np = torch.argmax(y_pred_indiv, dim=-1).cpu().numpy() y_np = y_indiv.cpu().numpy() model_scores.append(metric_function(y_np, y_pred_np)) if len(y_pred.shape) == 3: # input had n_models dimension, so output should have it, too return torch.as_tensor(model_scores, dtype=torch.float32) else: return torch.as_tensor(model_scores[0], dtype=torch.float32) @staticmethod def avg_preds(y_preds: List[torch.Tensor], task_type): if task_type == TaskType.CLASSIFICATION: # it should be logmeanexp, but doesn't matter because it is normalized by softmax # y_pred = torch.logsumexp(torch.stack(y_preds, dim=0), dim=0) probs = [F.softmax(y_pred, dim=-1) for y_pred in y_preds] avg_probs = sum(probs) / len(probs) y_pred = torch.log(avg_probs + 1e-30) else: y_pred = sum(y_preds) / len(y_preds) return y_pred @staticmethod def defaults(y_cat_sizes, val_metric_name: Optional[str] = None) -> 'Metrics': if 
val_metric_name is None: val_metric_name = 'class_error' if y_cat_sizes[0] > 0 else 'rmse' # removed cos_loss default_class_metrics = ['class_error', 'cross_entropy', 'ce_unif', 'brier', 'n_cross_entropy', 'n_brier', '1-balanced_accuracy', '1-mcc', 'ece', '1-auc_ovo', '1-auc_ovr'] if len(y_cat_sizes) == 1 and y_cat_sizes[0] == 2: # bin class return Metrics(default_class_metrics, val_metric_name, TaskType.CLASSIFICATION) elif y_cat_sizes[0] > 0: if y_cat_sizes[0] > 100: default_class_metrics = [m for m in default_class_metrics if m != '1-auc_ovo'] # multi-class (or multi-label classification) return Metrics(default_class_metrics, val_metric_name, TaskType.CLASSIFICATION) else: # regression return Metrics(['rmse', 'mae', 'max_error', 'nrmse', 'nmae', 'n_max_error', 'pinball(0.95)', 'n_pinball(0.95)'], val_metric_name, TaskType.REGRESSION) @staticmethod def default_val_metric_name(task_type): if task_type == TaskType.CLASSIFICATION: return 'class_error' elif task_type == TaskType.REGRESSION: return 'rmse' else: raise ValueError(f'Unknown task type {task_type}') @staticmethod def default_eval_metric_name(task_type): if task_type == TaskType.CLASSIFICATION: return 'class_error' elif task_type == TaskType.REGRESSION: return 'nrmse' else: raise ValueError(f'Unknown task type {task_type}') ================================================ FILE: pytabkit/models/training/nn_creator.py ================================================ import functools from typing import List, Optional, Tuple, Callable, Dict, Any import numpy as np import torch from pytabkit.models import utils from pytabkit.models.data.data import DictDataset, ParallelDictDataLoader, TaskType, ValDictDataLoader from pytabkit.models.nn_models.base import set_hp_context, SequentialLayer, Layer, Variable from pytabkit.models.nn_models.models import NNFactory from pytabkit.models.training.coord import HyperparamManager from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics, mse, cross_entropy from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources def get_realmlp_auto_batch_size(n_train: int): # if n_train <= 2**6: # 64 # return 2**4 # 16 # elif n_train <= 2**8: # return 2**5 # elif n_train <= 2**10: # return 2**6 # elif n_train <= 2**12: # return 2**7 # elif n_train <= 2**15: # return 2**8 # elif n_train <= 2**17: # return 2**9 # # return 2**10 if n_train <= 1024: return 64 elif n_train <= 8192: return 128 elif n_train <= 30_000: return 256 elif n_train <= 100_000: return 512 return 1024 class NNCreator: def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): self.fit_params = fit_params self.config = config self.device_info = None # todo: allow better configurability, including mps? self.n_tt_splits = None self.n_tv_splits = None self.static_model = None self.factory = self.config.get('factory', None) if self.factory is None: self.factory = NNFactory(**self.config) self.hp_manager = HyperparamManager(**self.config) # Data Info self.is_cv = None self.train_idxs = None self.val_idxs = None self.n_classes = None def setup_from_dataset(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources): torch.backends.cuda.matmul.allow_tf32 = False # todo: should we do this? # todo: allow preprocessing on CPU and then only put batches on GPU in data loader? 
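# Illustration of the split bookkeeping below (numbers are made up): with 2 trainval-test splits and
# 5-fold cross-validation per split, idxs_list has length 2 and each split_idxs.train_idxs has shape
# [5, n_train], so n_tt_splits = 2 and n_tv_splits = 5.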
gpu_devices = interface_resources.gpu_devices self.device_info = gpu_devices[0] if len(gpu_devices) > 0 else 'cpu' # the code below requires all splits to have the same number of sub-splits assert np.all([idxs_list[i].train_idxs.shape[0] == idxs_list[0].train_idxs.shape[0] for i in range(len(idxs_list))]) # we can then decompose the overall number of sub-splits into the number of splits # and the number of sub-splits per split self.n_tt_splits = len(idxs_list) self.n_tv_splits = idxs_list[0].train_idxs.shape[0] self.is_cv = idxs_list[0].val_idxs is not None assert np.all([(split_idxs.val_idxs is not None) == self.is_cv for split_idxs in idxs_list]) y_cat_sizes = ds.tensor_infos['y'].get_cat_sizes().numpy() self.n_classes = y_cat_sizes[0] self.train_idxs = torch.cat([split_idxs.train_idxs for split_idxs in idxs_list], dim=0) self.val_idxs = torch.cat([split_idxs.val_idxs for split_idxs in idxs_list], dim=0) if self.is_cv else None def get_criterions(self) -> Tuple[Callable, List[str]]: task_type = TaskType.REGRESSION if self.n_classes == 0 else TaskType.CLASSIFICATION # train criterion # todo: add more options? train_metric_name = self.config.get('train_metric_name', None) if train_metric_name is None: train_criterion = mse if self.n_classes == 0 else cross_entropy # defaults elif train_metric_name == 'mse': train_criterion = mse elif train_metric_name == 'cross_entropy': train_criterion = cross_entropy else: train_criterion = functools.partial(Metrics.apply, metric_name=train_metric_name) # train_criterion = lambda y_pred, y, mn=train_metric_name: Metrics.apply(y_pred, y, mn) # else: # raise ValueError(f'{train_metric_name=} is currently not supported') val_metric_name = self.config.get('val_metric_name', Metrics.default_val_metric_name(task_type)) val_metric_names = self.config.get('val_metric_names', [val_metric_name]) return train_criterion, val_metric_names def create_model(self, ds: DictDataset, idxs_list: List[SplitIdxs]): ds = ds.to(self.device_info) # Create static model model_fitter = self.factory.create(ds.tensor_infos) static_fitter, dynamic_fitter = model_fitter.split_off_dynamic() self.static_model, ds = static_fitter.fit_transform(ds) # in the single split case, we can already apply static fitters to the dataset is_single_split = len(idxs_list) == 1 and idxs_list[0].n_trainval_splits == 1 n_ens = self.config.get('n_ens', 1) models = [] # Build non-static models for split_idx, split_idxs in enumerate(idxs_list): # loop over different trainval-test splits # fit initial values only on train model_idx = 0 with torch.no_grad(): # fit initial values on train_ds for sub_idx in range(split_idxs.n_trainval_splits): for ens_idx in range(n_ens): # loop over different train-val splits if 'feature_importances' in self.config: assert n_ens == 1 # don't know if model_idx is handled correctly otherwise self.hp_manager.get_more_info_dict()['feature_importances'] = \ self.config['feature_importances'][model_idx] if 'fixed_weight' in self.config: assert n_ens == 1 # don't know if model_idx is handled correctly otherwise self.hp_manager.get_more_info_dict()['fixed_weight'] = \ self.config['fixed_weight'][model_idx] train_ds = ds.get_sub_dataset(split_idxs.train_idxs[sub_idx, :]) # still call it 'trainval_ds' # because that's what the clipping and output standardization layers use self.hp_manager.get_more_info_dict()['trainval_ds'] = train_ds data_fitter, individual_fitter = dynamic_fitter.split_off_individual() ram_limit_gb = self.config.get('init_ram_limit_gb', 1.0) with 
set_hp_context(self.hp_manager): torch.manual_seed(utils.combine_seeds(split_idxs.split_seed, ens_idx)) # should not be necessary, but just in case # torch.manual_seed(split_idxs.split_seed + ens_idx) # should not be necessary, but just in case data_tfm, tfmd_ds = data_fitter.fit_transform_subsample( train_ds, ram_limit_gb, needs_tensors=individual_fitter.needs_tensors) torch.manual_seed(utils.combine_seeds(split_idxs.sub_split_seeds[sub_idx], ens_idx)) # torch.manual_seed(split_idxs.sub_split_seeds[sub_idx] + ens_idx) with set_hp_context(self.hp_manager): individual_tfm = individual_fitter.fit_transform_subsample( tfmd_ds, ram_limit_gb=ram_limit_gb, needs_tensors=False)[0] if is_single_split and self.config.get('allow_single_split_opt', True): self.static_model = SequentialLayer([self.static_model, data_tfm]) models.append(individual_tfm) else: models.append(SequentialLayer([data_tfm, individual_tfm])) self.hp_manager.get_more_info_dict()['trainval_ds'] = None model_idx += 1 # print(f'{models[0]=}') # for p in models[0].parameters(): # print(str(p.context.scope)) vectorized_model = models[0].stack(models).to(self.device_info) fixed_init_params: Optional[List[Variable]] = self.config.get('fixed_init_params', None) if fixed_init_params is not None: assert n_ens == 1 fixed_init_param_patterns = self.config['fixed_init_param_patterns'] reinit_lr_factor = self.config.get('reinit_lr_factor', 1.0) for param, fixed_init_param in zip(vectorized_model.parameters(), fixed_init_params): scope_str = str(param.context.scope) # print(scope_str) if any(pattern in scope_str for pattern in fixed_init_param_patterns): print(f'Initializing {scope_str} from fixed parameters') with torch.no_grad(): param.copy_(fixed_init_param) param: Variable = param param.hyper_factors['lr'] = reinit_lr_factor * fixed_init_param.hyper_factors.get('lr', 1.0) # param.hyper_factors['wd'] = 0.0 else: print(f'Initializing {scope_str} newly') return vectorized_model def create_callbacks(self, model: Layer, logger: Logger, val_metric_names: List[str]): from pytabkit.models.training.lightning_callbacks import StopAtEpochsCallback, HyperparamCallback, \ L1L2RegCallback, \ ModelCheckpointCallback callbacks = [HyperparamCallback(self.hp_manager), L1L2RegCallback(self.hp_manager, model)] n_ens = self.config.get('n_ens', 1) # if validation if self.is_cv and self.fit_params is None and self.config.get('use_best_epoch', True): for val_metric_name in val_metric_names: callbacks.append(ModelCheckpointCallback(n_tt_splits=self.n_tt_splits, n_tv_splits=self.n_tv_splits, n_ens=n_ens, use_best_mean_epoch=self.config.get('use_best_mean_epoch_for_cv', False), val_metric_name=val_metric_name, restore_best=self.config.get('use_best_epoch', True))) elif self.fit_params is not None: if self.config.get('use_best_mean_epoch_for_refit', True): stop_epochs = [[params['stop_epoch']] * self.n_tv_splits for params in self.fit_params] else: if 'best_indiv_stop_epochs' not in self.fit_params[0] \ or len(self.fit_params[0]['best_indiv_stop_epochs']) != self.n_tv_splits: raise ValueError(f'Setting use_best_mean_epoch_for_refit=False ' f'requires setting use_best_epoch=True and n_cv==n_refit') stop_epochs = [params['best_indiv_stop_epochs'] for params in self.fit_params] callbacks.append( StopAtEpochsCallback(stop_epochs=stop_epochs, n_models=self.n_tv_splits, n_ens=n_ens, model=model, logger=logger)) # only for debugging: # callbacks.append(ValidationCallback(ds=ds, val_idxs=test_idxs, # metric_name=Metrics.default_metric_name(task_type), # logger=logger, 
n_models=n_models, n_parallel=n_parallel, # save_best_params=False, # val_batch_size=self.config.get('predict_batch_size', 256))) return callbacks def create_dataloaders(self, ds: DictDataset): ds = ds.to(self.device_info) ds = self.static_model(ds) batch_size = self.config.get('batch_size', 256) n_ens = self.config.get('n_ens', 1) if batch_size == 'auto': batch_size = get_realmlp_auto_batch_size(self.train_idxs.shape[0]) train_dl = ParallelDictDataLoader(ds, self.train_idxs.repeat_interleave(n_ens, dim=0), batch_size=batch_size, shuffle=True, drop_last=True, adjust_bs=self.config.get('adjust_bs', False)) val_dl = None if self.is_cv and self.fit_params is None: val_dl = ValDictDataLoader(ds, self.val_idxs.repeat_interleave(n_ens, dim=0), val_batch_size=self.config.get('predict_batch_size', 1024)) return train_dl, val_dl ================================================ FILE: pytabkit/models/training/scheduling.py ================================================ import numpy as np import math class LearnerProgress: def __init__(self): self.epoch = 0 self.epoch_steps = 0 self.total_steps = 0 self.epoch_samples = 0 self.total_samples = 0 self.epoch_float = 0.0 self.max_epochs = 0 def get_fit_progress(self): return None if self.max_epochs is None else self.epoch_float / self.max_epochs def sched_prod(first, second): if not isinstance(first, Schedule): first = ConstantSchedule(first) if not isinstance(second, Schedule): second = ConstantSchedule(second) if isinstance(first, TimeSchedule) and isinstance(second, TimeSchedule): return ProductTimeSchedule_(first, second) return ProductSchedule_(first, second) def sched_sum(first, second): if not isinstance(first, Schedule): first = ConstantSchedule(first) if not isinstance(second, Schedule): second = ConstantSchedule(second) if isinstance(first, TimeSchedule) and isinstance(second, TimeSchedule): return SumTimeSchedule_(first, second) return SumSchedule_(first, second) class Schedule: def get_value(self): raise NotImplementedError() def update(self, learner): raise NotImplementedError() def __mul__(self, other): return sched_prod(self, other) def __rmul__(self, other): return sched_prod(other, self) def __add__(self, other): return sched_sum(self, other) def __radd__(self, other): return sched_sum(other, self) def __neg__(self): return -1.0 * self def __sub__(self, other): return self + (-other) def __rsub__(self, other): return other + (-self) class TimeSchedule(Schedule): def __init__(self): self.t = 0.0 def call_time_(self, t: float): raise NotImplementedError() def get_value(self): return self.call_time_(self.t) def update(self, learner): self.t = learner.progress.get_fit_progress() def scaled(self, ymin=0., ymax=1., tmin=0., tmax=1.): return ScaledSchedule(self, ymin, ymax, tmin, tmax) def reversed(self): return self.scaled(tmin=1., tmax=0.) 
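# Hedged usage sketch (not part of the original module; the helper name _schedule_composition_demo is
# illustrative only): shows how TimeSchedule.scaled() and the operator overloads on Schedule compose
# concrete schedules such as the ones defined below.
def _schedule_composition_demo():
    # piecewise schedule: cosine warmup from 0.04 to 1 over the first quarter of training,
    # then cosine annealing from 1 down to 1e-5; combine_scheds() and get_cos_sched() are
    # defined later in this module, which is fine since the names are resolved at call time
    warmup_anneal = combine_scheds([0.25, 0.75],
                                   [get_cos_sched().scaled(0.04, 1.0),
                                    get_cos_sched().scaled(1.0, 1e-5)])
    # 0.5 * schedule goes through __rmul__ -> sched_prod -> ProductTimeSchedule_
    halved = 0.5 * warmup_anneal
    for t in [0.0, 0.25, 0.5, 1.0]:
        print(f'{t=}: {halved.call_time_(t)=:.6g}')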
class ConstantSchedule(TimeSchedule): def __init__(self, val): super().__init__() self.val = val def call_time_(self, t: float): return self.val class FunctionSchedule(TimeSchedule): def __init__(self, f): super().__init__() self.f = f def call_time_(self, t: float): return self.f(t) class ScaledSchedule(TimeSchedule): def __init__(self, base_schedule: TimeSchedule, ymin=0., ymax=1., tmin=0., tmax=1.): super().__init__() self.base_schedule = base_schedule self.ymin = ymin self.ymax = ymax self.tmin = tmin self.tmax = tmax def call_time_(self, t: float): return self.ymin + (self.ymax - self.ymin) * self.base_schedule.call_time_( self.tmin + (self.tmax - self.tmin) * t) class ProductSchedule_(Schedule): def __init__(self, first: Schedule, second: Schedule): super().__init__() self.first = first self.second = second def get_value(self): return self.first.get_value() * self.second.get_value() def update(self, learner): self.first.update(learner) self.second.update(learner) class ProductTimeSchedule_(TimeSchedule): def __init__(self, first: TimeSchedule, second: TimeSchedule): super().__init__() self.first = first self.second = second def call_time_(self, t: float): return self.first.call_time_(t) * self.second.call_time_(t) class SumSchedule_(Schedule): def __init__(self, first: Schedule, second: Schedule): super().__init__() self.first = first self.second = second def get_value(self): return self.first.get_value() + self.second.get_value() def update(self, learner): self.first.update(learner) self.second.update(learner) class SumTimeSchedule_(TimeSchedule): def __init__(self, first: TimeSchedule, second: TimeSchedule): super().__init__() self.first = first self.second = second def call_time_(self, t: float): return self.first.call_time_(t) + self.second.call_time_(t) class ScheduleSequence(TimeSchedule): def __init__(self, lengths, schedules): super().__init__() self.lengths = np.array(lengths) self.event_times = np.hstack([[0.], np.cumsum(self.lengths)]) self.schedules = schedules def call_time_(self, t: float): idx = np.max(np.argwhere(self.event_times <= t)) idx = min(idx, len(self.schedules)-1) start = self.event_times[idx] end = self.event_times[idx+1] return self.schedules[idx].call_time_((t-start)/(end-start)) class ExponentialSchedule(TimeSchedule): def __init__(self, start, end): super().__init__() self.log_start = np.log(start) self.log_end = np.log(end) def call_time_(self, t: float): return np.exp(self.log_start + t * (self.log_end - self.log_start)) def cos_warm_func(x): if x < 2 ** (-10): return 1.0 else: base_x = 2**(int(np.log2(x))-1) # negative float values are rounded up return 0.5 + 0.5*np.cos(np.pi * (x/base_x - 1)) def combine_scheds(lengths, schedules): return ScheduleSequence(lengths, schedules) def get_cos_sched() -> FunctionSchedule: return FunctionSchedule(lambda x: 0.5 * (1.0 - math.cos(math.pi * x))) def get_id_sched() -> FunctionSchedule: return FunctionSchedule(lambda x: x) def get_lin_sched() -> FunctionSchedule: return FunctionSchedule(lambda x: 1.-x) def get_cos_warm_sched() -> FunctionSchedule: return FunctionSchedule(cos_warm_func) def connect_cos_scheds(times, values): return combine_scheds([t2 - t1 for t1, t2 in zip(times[:-1], times[1:])], [get_cos_sched().scaled(v1, v2) for v1, v2 in zip(values[:-1], values[1:])]) def connect_lin_scheds(times, values): return combine_scheds([t2 - t1 for t1, t2 in zip(times[:-1], times[1:])], [get_id_sched().scaled(v1, v2) for v1, v2 in zip(values[:-1], values[1:])]) class FirstToLastSchedule(TimeSchedule): def
__init__(self, n_params): super().__init__() argmax_points = np.linspace(0.2, 0.6, n_params) self.scheds = [combine_scheds([t, 1.-t], [get_cos_sched().scaled(0.04, 1.), get_cos_sched().scaled(1., 1e-5)]) for t in argmax_points] def call_time_(self, t: float): return np.array([s.call_time_(t) for s in self.scheds]) class StepFunctionSchedule(Schedule): def __init__(self, f): self.step = 0 self.f = f def update(self, learner): self.step = learner.progress.total_steps def get_value(self): return self.f(self.step) class EpochLengthSqMomSchedule(Schedule): def __init__(self, min_value: float = 0.95, base_value: float = 0.5): self.value = min_value self.min_value = min_value self.base_value = base_value def update(self, learner): n_batches_per_epoch = len(learner.data_loader) self.value = max(self.min_value, self.base_value ** (1 / n_batches_per_epoch)) def get_value(self): return self.value class CoslogFunc: def __init__(self, n_cycles: int): self.n_cycles = n_cycles def __call__(self, t): return 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2 ** self.n_cycles - 1) * t))) class GenCoslogFunc: def __init__(self, n_cycles: int, base: float): self.n_cycles = n_cycles self.base = base def __call__(self, t): return 0.5 * (1 - np.cos(2 * np.pi * np.log(1 + (self.base ** self.n_cycles - 1) * t) / np.log(self.base))) class AltCoslogFunc: def __init__(self, n_cycles: int): self.n_cycles = n_cycles def __call__(self, t): return 0.5 * (1 - np.cos(2 * np.pi * np.log2(np.sqrt(2) + (2 ** self.n_cycles - np.sqrt(2)) * t))) def cos_func(x): return 0.5 * (1.0 - math.cos(math.pi * x)) def identity_func(x): return x def lin_func(x): return 1 - x def get_schedule(sched_name: str) -> Schedule: sched_type = sched_name base_sched = None cos_sched = FunctionSchedule(cos_func) # from 0 to 1 # id_sched = FunctionSchedule(identity_func) lin_sched = FunctionSchedule(lin_func) # from 1 to 0 cos_warm_sched = FunctionSchedule(cos_warm_func) identity_sched = FunctionSchedule(identity_func) constant_sched = ConstantSchedule(1.0) one_cycle_lr_sched = combine_scheds([0.25, 0.75], [cos_sched.scaled(0.04, 1.), cos_sched.scaled(1., 1e-5)]) fastai1_lr_sched = combine_scheds([0.3, 0.7], [cos_sched.scaled(0.04, 1.), cos_sched.scaled(1., 4e-6)]) mod_one_cycle_lr_sched = combine_scheds([0.25, 0.75], [cos_sched.scaled(1e-5, 1.), cos_sched.scaled(1., 1e-5)]) if not isinstance(sched_type, str): base_sched = sched_type elif sched_type == 'linear': return lin_sched elif sched_type == 'constant' or sched_type == 'flat': return ConstantSchedule(1.0) elif sched_type == 'one_cycle': base_sched = one_cycle_lr_sched elif sched_type == 'two_cycle': base_sched = combine_scheds([0.5, 0.5], [one_cycle_lr_sched] * 2) elif sched_type == 'three_cycle': base_sched = combine_scheds([0.25, 0.25, 0.5], [one_cycle_lr_sched] * 3) elif sched_type == 'four_cycle': base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [one_cycle_lr_sched] * 4) elif sched_type == 'c4': base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [mod_one_cycle_lr_sched] * 4) elif sched_type == 'c5': base_sched = combine_scheds([0.0625, 0.0625, 0.125, 0.25, 0.5], [mod_one_cycle_lr_sched] * 5) elif sched_type == 'long_plateau': base_sched = combine_scheds([0.2, 0.6, 0.2], [cos_sched.scaled(0.04, 1), ConstantSchedule(1.0), cos_sched.scaled(1, 1e-5)]) elif sched_type == 'sched1': base_sched = connect_cos_scheds([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], [0.04, 1.0, 0.01, 1.0, 1.0, 1e-5]) elif sched_type == 'sched2': base_sched = connect_cos_scheds([0.0, 0.125, 0.375, 0.5, 0.75, 1.0], [0.04, 1.0, 0.05, 
1.0, 1.0, 1e-5]) elif sched_type == 'sched3': base_sched = connect_cos_scheds([0.0, 8 / 64, 16 / 64, 24 / 64, 32 / 64, 56 / 64, 1.0], [1e-3, 1.0, 1.0, 1e-3, 1.0, 1.0, 1e-3]) elif sched_type == 'sched4': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) elif sched_type == 'sched5': base_sched = connect_cos_scheds([0.0, 0.75, 1.0], [0.04, 1.0, 1e-5]) elif sched_type == 'sched6': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.5, 0.5], [base_sched] * 2) elif sched_type == 'sched7': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.25, 0.25, 0.5], [base_sched] * 3) elif sched_type == 'sched8': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [base_sched] * 4) elif sched_type == 'sched9': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125]*8, [base_sched] * 8) elif sched_type == 'sched10': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.0625, 0.0625, 0.125, 0.25, 0.5], [base_sched] * 5) elif sched_type == 'sched11': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [ConstantSchedule(lr) * base_sched for lr in [0.6, 0.8, 1.0, 1.5]]) elif sched_type == 'sched12': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [ConstantSchedule(lr) * base_sched for lr in [1.0, 1.0, 1.0, 1.5]]) elif sched_type == 'custom1': sched = connect_cos_scheds([0.0, 0.5, 1.0], [4e-2, 1.0, 1e-5]) base_sched = combine_scheds([0.5, 0.5], [sched] * 2) elif sched_type == 'flat_anneal': base_sched = combine_scheds([0.6, 0.4], [ConstantSchedule(1.0), cos_sched.scaled(1., 1e-5)]) elif sched_type == 'flat_cos': base_sched = combine_scheds([0.5, 0.5], [ConstantSchedule(1.0), cos_sched.scaled(1., 0.)]) elif sched_type == 'cos_anneal': base_sched = cos_sched.scaled(1.0, 1e-4) elif sched_type == 'fastai1': base_sched = fastai1_lr_sched elif sched_type == 'cos_warm': base_sched = cos_warm_sched elif sched_type == 'cos_warm_4': base_sched = connect_cos_scheds([0.0, 1/15, 1/15, 3/15, 3/15, 7/15, 7/15, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]) elif sched_type == 'datarobot': # described in https://www.youtube.com/watch?v=WPQOkoXhdBQ base_sched = combine_scheds([0.25, 0.5, 0.25], [cos_sched.scaled(0.1, 1.), cos_sched.scaled(1., 0.1), cos_sched.scaled(0.1, 0.003)]) elif sched_type == 'one_cycle_0.1': base_sched = combine_scheds([0.1, 0.9], [cos_sched.scaled(0.04, 1.), cos_sched.scaled(1., 1e-5)]) elif sched_type == 'one_cycle_mom': base_sched = combine_scheds([0.25, 0.75], [cos_sched.scaled(0.95, 0.85), cos_sched.scaled(0.85, 0.95)]) elif sched_type == '1-1/step': base_sched = StepFunctionSchedule(lambda step: 1-1/(step+1)) elif sched_type == 'epoch_length': base_sched = EpochLengthSqMomSchedule() elif sched_type == 'epoch_length_2': base_sched = EpochLengthSqMomSchedule(base_value=0.1) elif sched_type == 'epoch_length_3': base_sched = EpochLengthSqMomSchedule(base_value=0.05) elif sched_type == 'cos_log_15': base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif sched_type == 'cos_log_31': base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 31 * t)))) elif sched_type == 'cos_log_63': base_sched = FunctionSchedule(lambda t: 0.5 * 
(1 - np.cos(2 * np.pi * np.log2(1 + 63 * t)))) elif sched_type == 'cos_log_31_sq_mom': base_sched = FunctionSchedule(lambda t: np.exp(-0.05 * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 31 * t)))) - 1e-8) elif sched_type == 'cos_sched': base_sched = cos_sched.scaled(1., 0.) elif sched_type == 'cos': base_sched = cos_sched.scaled(1., 0.) elif sched_type == 'cos_increasing': base_sched = cos_sched.scaled(0., 1.) elif sched_type == 'quad': base_sched = FunctionSchedule(lambda t: (1-t)**2) elif sched_type == 'cubic': base_sched = FunctionSchedule(lambda t: (1 - t) ** 3) elif sched_type == 'lin_cos_log_15': base_sched = FunctionSchedule(lambda t: 2 * t * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif sched_type == 'lin2_cos_log_15': base_sched = FunctionSchedule(lambda t: (0.5 + t) * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif sched_type == 'lin3_cos_log_15': base_sched = FunctionSchedule(lambda t: (1.5 - t) * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif isinstance(sched_type, str) and sched_type.startswith('coslin'): n_cycles = int(sched_type[len('coslin')]) base_sched = FunctionSchedule(lambda t, c=n_cycles: 0.5 * (1 - np.cos(2 * np.pi * c * t))) # base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2**n_cycles-1) * t)))) elif isinstance(sched_type, str) and sched_type.startswith('coslog'): n_cycles = int(sched_type[len('coslog'):]) base_sched = FunctionSchedule(CoslogFunc(n_cycles)) elif isinstance(sched_type, str) and sched_type.startswith('gencoslog'): components = sched_type[len('gencoslog'):].split('-') assert len(components)==2 n_cycles = int(components[0]) base = float(components[1]) base_sched = FunctionSchedule(GenCoslogFunc(n_cycles, base)) elif sched_type == 'warmup_0.05_cos': base_sched = connect_cos_scheds([0.0, 0.05, 1.0], [0.0, 1.0, 0.0]) elif sched_type == 'expm4t': base_sched = FunctionSchedule(lambda t: np.exp(-4*t)) elif sched_type == 'expm3t': base_sched = FunctionSchedule(lambda t: np.exp(-3*t)) elif sched_type == 'expm5t': base_sched = FunctionSchedule(lambda t: np.exp(-5*t)) elif sched_type == 'expm6t': base_sched = FunctionSchedule(lambda t: np.exp(-6*t)) elif sched_type == 'expm8t': base_sched = FunctionSchedule(lambda t: np.exp(-8 * t)) elif sched_type == 'invp1e-2': base_sched = FunctionSchedule(lambda t: 1e-2 / (t + 1e-2)) elif sched_type == 'invsqrtp1e-3': base_sched = FunctionSchedule(lambda t: np.sqrt(1e-3) / np.sqrt(t + 1e-3)) elif sched_type == 'quartic': base_sched = FunctionSchedule(lambda t: (1.-t)**4) elif sched_type == 'pow5': base_sched = FunctionSchedule(lambda t: (1.-t)**5) elif sched_type == 'pow6': base_sched = FunctionSchedule(lambda t: (1.-t)**6) elif sched_type == 'warmup_inv': # base_sched = FunctionSchedule(lambda t: min(20*t, np.sqrt(0.05)/(np.sqrt(t)+1e-8))) base_sched = FunctionSchedule(lambda t: min(20*t, 0.05/(t+1e-8))) elif sched_type == 'sqrt_cos': # base_sched = FunctionSchedule(lambda t: min(20*t, np.sqrt(0.05)/(np.sqrt(t)+1e-8))) base_sched = FunctionSchedule(lambda t: 0.05/(np.sqrt(t)+0.05) * (0.5 - 0.5*np.cos(5 * 2 * np.pi * t))) elif sched_type == 'lin_cos': # base_sched = FunctionSchedule(lambda t: min(20*t, np.sqrt(0.05)/(np.sqrt(t)+1e-8))) base_sched = FunctionSchedule(lambda t: (1-t) * (0.5 - 0.5*np.cos(20 * 2 * np.pi * t))) elif sched_type == 'linwarm.05eps': base_sched = combine_scheds([0.05, 0.95], [identity_sched.scaled(1e-3, 1.0), constant_sched]) elif isinstance(sched_type, str) and sched_type.startswith('altcoslog'): n_cycles = 
int(sched_type[len('altcoslog')]) base_sched = FunctionSchedule(AltCoslogFunc(n_cycles)) # base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2**n_cycles-1) * t)))) elif isinstance(sched_type, str) and sched_type.startswith('altquadcyc'): n_cycles = int(sched_type[len('altquadcyc')]) single_cycle = FunctionSchedule(lambda t: 4 * (t-0.5)**2) cycle_sched = single_cycle for i in range(n_cycles-1): cycle_sched = combine_scheds([0.5, 0.5], [cycle_sched, single_cycle]) cycle_sched = cycle_sched.scaled(tmax=0.75) base_sched = cycle_sched # base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2**n_cycles-1) * t)))) if base_sched is None: raise ValueError(f'Unknown schedule type "{sched_type}"') return base_sched ================================================ FILE: pytabkit/models/utils.py ================================================ import multiprocessing as mp import os import os.path import heapq import glob import gzip import shutil import timeit from pathlib import Path from typing import List, Tuple, Any, Dict, Union, Optional, Callable import copy import uuid import multiprocessing import time import json from torch import multiprocessing as mp try: from yaml import CLoader as Loader, CDumper as Dumper except ImportError: from yaml import Loader, Dumper from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import QuantileTransformer from sklearn.base import check_is_fitted import numpy as np def select_from_config(config: Dict, keys: List): selected = {} for key in keys: if key in config: selected[key] = config[key] return selected def adapt_config(config, **kwargs): new_config = copy.deepcopy(config) for key, value in kwargs.items(): new_config[key] = value return new_config def existsDir(directory): if directory != '': if not os.path.exists(directory): return False return True def existsFile(file_path): return os.path.isfile(file_path) def ensureDir(file_path): directory = os.path.dirname(file_path) if directory != '': if not os.path.exists(directory): os.makedirs(directory) def matchFiles(file_matcher): return glob.glob(file_matcher) def newDirname(prefix): i = 0 name = prefix if existsDir(prefix): while existsDir(prefix + "_" + str(i)): i += 1 name = prefix + "_" + str(i) os.makedirs(name) return name def getSubfolderNames(folder): return [os.path.basename(name) for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))] def getSubfolders(folder): return [os.path.join(folder, name) for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))] def writeToFile(filename, content): ensureDir(filename) file = open(filename, 'w') file.truncate() file.write(content) file.close() def readFromFile(filename): if not os.path.isfile(filename): return '' file = open(filename, 'r') result = file.read() file.close() return result def create_dir(path): os.makedirs(path) def delete_file(path): os.remove(path) def serialize(filename: Union[Path, str], obj: Any, compressed: bool = False, use_json: bool = False, use_yaml: bool = False, use_msgpack: bool = False, use_pickle: bool = False): # json only works for nested dicts ensureDir(filename) if compressed: file = gzip.open(filename, 'wt' if (use_json or use_yaml) else 'wb', compresslevel=5) else: file = open(filename, 'w' if (use_json or use_yaml) else 'wb') if use_json: json.dump(obj, file) elif use_yaml: import yaml yaml.dump(obj, file, Dumper=Dumper) elif use_msgpack: import msgpack msgpack.dump(obj, file) elif use_pickle: 
import pickle pickle.dump(obj, file) else: # dill can dump lambdas, and dill also dumps the class and not only the contents import dill dill.dump(obj, file) file.close() def deserialize(filename: Union[Path, str], compressed: bool = False, use_json: bool = False, use_yaml: bool = False, use_msgpack: bool = False, use_pickle: bool = False): # json only works for nested dicts if compressed: file = gzip.open(filename, 'rt' if (use_json or use_yaml) else 'rb') else: file = open(filename, 'r' if (use_json or use_yaml) else 'rb') if use_json: result = json.load(file) elif use_yaml: import yaml result = yaml.load(file, Loader=Loader) elif use_msgpack: import msgpack result = msgpack.load(file) elif use_pickle: import pickle result = pickle.load(file) else: import dill result = dill.load(file) file.close() return result def copyFile(src, dst): ensureDir(dst) shutil.copyfile(src, dst) def nsmallest(n, inputList): return heapq.nsmallest(n, inputList)[-1] def identity(x): return x def set_none_except(lst, idxs): for i in range(len(lst)): if i not in idxs: lst[i] = None def argsort(lst): # from https://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python return sorted(range(len(lst)), key=lst.__getitem__) def join_dicts(*dicts): # Attention: arguments do not commute since later dicts can override entries from earlier dicts! result = copy.copy(dicts[0]) for d in dicts[1:]: result.update(d) return result def update_dict(d: dict, update: Optional[dict] = None, remove_keys: Optional[Union[Any, List[Any]]] = None): d = copy.copy(d) if update is not None: d.update(update) if remove_keys is not None: if isinstance(remove_keys, List): for key in remove_keys: if key in d: d.pop(key) else: if remove_keys in d: d.pop(remove_keys) return d def map_nested(obj: Union[List, Dict, Any], f: Callable, dim: int): """ dim=0 will apply f to obj directly, dim=1 to all elements in obj, etc. 
""" if dim <= 0: return f(obj) elif isinstance(obj, dict): return {key: map_nested(value, f, dim-1) for key, value in obj.items()} elif isinstance(obj, list): return [map_nested(value, f, dim-1) for value in obj] def select_nested(obj: Union[List, Dict], idx: Any, dim: int): return map_nested(obj, lambda x: x[idx], dim) def shift_dim_nested(obj: Union[List, Dict], dim1: int, dim2: int): # in a nested combination of lists and dicts, shift the indexing dimension dim1 to dim2 # example: if d = {'a': [{'b': 1}, {'b': 2}]}, dim1 = 1, dim2 = 2, then the result should be # {'a': {'b': [1, 2]}} if dim1 < 0 or dim2 < 0: raise ValueError(f'expected dim1 >= 0 and dim2 >= 0, but got {dim1=} and {dim2=}') # if dim2 <= dim1: # raise ValueError(f'expected dim2 > dim1, but got {dim1=} and {dim2=}') if dim1 > 0 and dim2 > 0: if isinstance(obj, dict): return {key: shift_dim_nested(value, dim1-1, dim2-1) for key, value in obj.items()} else: # assume that value is a list return [shift_dim_nested(value, dim1-1, dim2-1) for value in obj] elif dim1 > 1: # dim1 > dim2, shift backwards return shift_dim_nested(shift_dim_nested(obj, dim1, dim1 - 1), dim1 - 1, dim2) elif dim2 > 1: # dim2 > dim1, shift forwards return shift_dim_nested(shift_dim_nested(obj, dim1, dim1 + 1), dim1 + 1, dim2) else: # switch dimensions 0 and 1 if isinstance(obj, dict): first = next(iter(obj.values())) if isinstance(first, dict): # swap two dicts return {key2: {key1: obj[key1][key2] for key1 in obj} for key2 in first} else: # assume it is a list return [{key1: obj[key1][i] for key1 in obj} for i in range(len(first))] else: first = obj[0] if isinstance(first, dict): return {key2: [obj[i][key2] for i in range(len(obj))] for key2 in first} else: # assume it is a list return [[obj[i][j] for i in range(len(obj))] for j in range(len(first))] pass pass def pretty_table_str(str_table): if len(str_table) == 0: return '' max_lens = [np.max([len(row[i]) for row in str_table]) for i in range(len(str_table[0]))] whole_str = '' for row in str_table: for i, entry in enumerate(row): whole_str += entry + (' ' * (max_lens[i] - len(entry))) whole_str += '\n' return whole_str[:-1] # remove last newline def get_uuid_str(): pid_str = str(multiprocessing.current_process().pid) time_str = str(time.time_ns()) rand_str = str(uuid.UUID(bytes=os.urandom(16), version=4)) return '_'.join([time_str, pid_str, rand_str]) def get_batch_intervals(n_total: int, batch_size: int) -> List[Tuple[int, int]]: boundaries = [i * batch_size for i in range(1 + n_total // batch_size)] if boundaries[-1] != n_total: boundaries.append(n_total) return [(start, stop) for start, stop in zip(boundaries[:-1], boundaries[1:])] def all_equal(lst: List): # see https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical return not lst or [lst[0]]*len(lst) == lst class Timer: def __init__(self): self.start_time_total = None self.start_time_process = None self.acc_time_total = 0.0 self.acc_time_process = 0.0 def start(self): if self.start_time_total is None or self.start_time_process is None: self.start_time_total = timeit.default_timer() self.start_time_process = time.process_time() def pause(self): if self.start_time_total is None or self.start_time_process is None: return # has already been paused or not been started self.acc_time_total += timeit.default_timer() - self.start_time_total self.acc_time_process += time.process_time() - self.start_time_process self.start_time_total = None self.start_time_process = None def get_result_dict(self): return {'total': 
self.acc_time_total, 'process': self.acc_time_process} class TimePrinter: def __init__(self, desc: str): self.desc = desc self.timer = Timer() def __enter__(self): self.timer.start() def __exit__(self, exc_type, exc_val, exc_tb): self.timer.pause() print(f'Time for {self.desc}: {self.timer.get_result_dict()["total"]:g}s') def extract_params(config: Dict[str, Any], param_configs: List[Union[Tuple[str, Optional[Union[str, List[str]]]], Tuple[str, Optional[Union[str, List[str]]], Any]]]) -> Dict[str, Any]: """ Convert parameters in config to correct parameter names for another method and (optionally) insert default values :param config: Dictionary with values for parameters :param param_configs: Tuples specifying parameter names, e.g.: ('eta', None) specifies that result['eta'] = config['eta'] should be set if 'eta' is in config ('eta', 'lr') specifies that result['eta'] = config['lr'] should be set if 'lr' is in config ('eta', ['eta', 'lr']) specifies that either config['eta'] or config['lr'] should be used, if available A third value in the tuple specifies a default value that should be used if no value is available in config. :return: A dictionary as specified above. """ result = {} for param_config in param_configs: target_name = param_config[0] source_names = param_config[1] if source_names is None: source_names = [target_name] elif isinstance(source_names, str): source_names = [source_names] source_names_in_config = [source_name for source_name in source_names if source_name in config] if len(source_names_in_config) == 0: if len(param_config) >= 3: # default value specified result[target_name] = param_config[2] # use the default value elif len(source_names_in_config) == 1: result[target_name] = config[source_names_in_config[0]] else: raise ValueError(f'Found multiple parameter names encoding the same parameter: {source_names_in_config}') return result def reverse_argmin(x: Union[List, np.ndarray]): """ Does the same as np.argmin but in case of equality selects the last best one :param x: list or array of numbers :return: index of last minimum """ if isinstance(x, list): x = np.asarray(x) assert(len(x.shape) == 1) return len(x) - 1 - int(np.argmin(x[::-1])) def combine_seeds(seed_1: int, seed_2: int) -> int: """ Combines two random seeds to a new seed in a hopefully "typically injective" way :param seed_1: First random seed. :param seed_2: Second random seed.
:return: Another random seed """ generator = np.random.default_rng(seed=seed_1) return int(generator.integers(low=0, high=2**24) + seed_2) def numpy_to_native_rec(obj: Any): if isinstance(obj, list): return [numpy_to_native_rec(o) for o in obj] elif isinstance(obj, dict): return {key: numpy_to_native_rec(value) for key, value in obj.items()} else: # https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types # works for arrays as well as numpy scalars return getattr(obj, "tolist", lambda: obj)() class ProcessPoolMapper: def __init__(self, n_processes: int, chunksize=1): self.n_processes = n_processes self.chunksize = chunksize pass def _apply(self, f_and_args_serialized: str) -> str: import dill f, args = dill.loads(f_and_args_serialized) return dill.dumps(f(*args)) def map(self, f, args_tuples: List[Tuple]) -> Any: import dill if self.n_processes == 1: return [f(*args) for args in args_tuples] mp_ctx = mp.get_context('spawn') pool = mp_ctx.Pool(self.n_processes) serialized_args = [dill.dumps(args) for args in args_tuples] results = pool.map(self._apply, serialized_args, chunksize=self.chunksize) pool.terminate() return [dill.loads(s) for s in results] # adapted from https://github.com/yandex-research/tabular-dl-tabr/blob/75105013189c76bc4f247633c2fb856bc948e579/lib/data.py#L262 class TabrQuantileTransformer(BaseEstimator, TransformerMixin): def __init__(self, noise=1e-3, random_state=None, n_quantiles=1000, subsample=1_000_000_000, output_distribution="normal"): self.noise = noise self.random_state = random_state self.n_quantiles = n_quantiles self.subsample = subsample self.output_distribution = output_distribution def fit(self, X, y=None): # Calculate the number of quantiles based on data size n_quantiles = max(min(X.shape[0] // 30, self.n_quantiles), 10) # Initialize QuantileTransformer normalizer = QuantileTransformer( output_distribution=self.output_distribution, n_quantiles=n_quantiles, subsample=self.subsample, random_state=self.random_state ) # Add noise if required X_modified = self._add_noise(X) if self.noise > 0 else X # Fit the normalizer normalizer.fit(X_modified) # show that it's fitted self.normalizer_ = normalizer return self def transform(self, X, y=None): check_is_fitted(self) return self.normalizer_.transform(X) def _add_noise(self, X): stds = np.std(X, axis=0, keepdims=True) noise_std = self.noise / np.maximum(stds, self.noise) rng = np.random.default_rng(self.random_state) return X + noise_std * rng.standard_normal(X.shape) class FunctionRunner: def __init__(self, dill_f_args_kwargs, result_queue): self.dill_f_args_kwargs = dill_f_args_kwargs self.result_queue = result_queue def __call__(self): # print(f'DEBUG: FunctionRunner start') import dill f, args, kwargs = dill.loads(self.dill_f_args_kwargs) result = f(*args, **kwargs) self.result_queue.put(result) self.result_queue.join() class FunctionProcess: """ Helper class to run a single function in a separate process. 
""" def __init__(self, f, *args, **kwargs): import dill self.result_queue = mp.JoinableQueue() self.process = mp.Process(target=FunctionRunner(dill.dumps((f, args, kwargs)), self.result_queue)) def start(self) -> 'FunctionProcess': self.process.start() return self def is_done(self) -> bool: return not self.result_queue.empty() def get_ram_usage_gb(self) -> float: import psutil return psutil.Process(self.process.pid).memory_info().rss / 1024 ** 3 def pop_result(self) -> Any: result = self.result_queue.get() self.result_queue.task_done() time.sleep(1e-2) self.process.terminate() return result class ObjectLoadingContext: def __init__(self, obj: Any, filename: Optional[Union[str, Path]] = None): self.obj = obj self.filename = filename self.saved = False def __enter__(self) -> Any: # use pickle since it works better with torch than dill if self.saved: self.obj = deserialize(self.filename, use_pickle=True) return self.obj def __exit__(self, type, value, traceback) -> None: if self.filename is not None: serialize(self.filename, self.obj, use_pickle=True) self.saved = True del self.obj # taken from TabArena def convert_numpy_dtypes(data: dict) -> dict: """Converts NumPy dtypes in a dictionary to Python dtypes. Some hyperparameter search space's generate configs with numpy dtypes which aren't serializable to yaml. This fixes that. """ converted_data = {} for key, value in data.items(): if isinstance(value, np.generic): converted_data[key] = value.item() elif isinstance(value, dict): converted_data[key] = convert_numpy_dtypes(value) elif isinstance(value, list): converted_data[key] = [ convert_numpy_dtypes({i: v})[i] if isinstance(v, (dict, np.generic)) else v for i, v in enumerate(value) ] else: converted_data[key] = value return converted_data ================================================ FILE: scripts/analyze_hpo_best_params.py ================================================ import msgpack_numpy as m m.patch() import numbers from typing import Optional import fire import numpy as np from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection, TaskDescription from pytabkit.bench.run.results import ResultManager from pytabkit.models import utils def analyze_hpo_best(alg_name: str, coll_name: str, n_splits: int = 10, data_path: Optional[str] = None): print(f'Analyzing {coll_name}:') if data_path is not None: paths = Paths(data_path) else: paths = Paths.from_env_variables() if '/' in coll_name: task_infos = [TaskDescription(*coll_name.split('/')).load_info(paths)] else: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) best_params = [] for task_info in task_infos: for split_id in range(n_splits): results_path = paths.results_alg_task_split(task_info.task_desc, alg_name, n_cv=1, split_type=SplitType.RANDOM, split_id=split_id) result_manager = ResultManager.load(results_path, load_other=True, load_preds=False) if (not isinstance(result_manager.other_dict['cv'], dict) or 'fit_params' not in result_manager.other_dict['cv']): raise ValueError( f'Did not get a dict containing fit_params, instead got {result_manager.other_dict["cv"]=}') fit_params = result_manager.other_dict['cv']['fit_params'] print(f'{fit_params=}') # print(fit_params) best_params.append(fit_params['hyper_fit_params'] if 'hyper_fit_params' in fit_params else (fit_params['sub_fit_params'] if 'sub_fit_params' in fit_params else fit_params)) if isinstance(best_params[-1], list): best_params[-1] = best_params[-1][0] # add keys 
from sub-dicts like in scikit-learn with __ flattened_params = {} for key, value in best_params[-1].items(): if isinstance(value, dict): for sub_key, sub_value in value.items(): flattened_params[f'{key}__{sub_key}'] = sub_value best_params[-1] = utils.join_dicts(best_params[-1], flattened_params) # print(best_params[-1]) # print(result_manager.other_dict) # return param_names = sorted(list(best_params[0].keys())) for param_name in param_names: values = [config[param_name] for config in best_params] unique_values = [] # do it manually so that it only requires equality comparison and not hashing or other comparisons for v in values: if v not in unique_values: unique_values.append(v) # print(f'Processing {param_name=} with {unique_values=}') if len(unique_values) == 1: continue # a hyperparam that hasn't been tuned, most likely elif len(unique_values) <= 10: print(f'Frequencies of best values for hyperparameter {param_name}:') for value in unique_values: n_best = len([v for v in values if v == value]) print(f'{value}: {n_best}') print() elif all(isinstance(v, numbers.Number) for v in unique_values): print(f'Hyperparameter {param_name}: mean={np.mean(values):g}, quantiles:') for q in np.linspace(0.0, 1.0, 11): print(f'alpha={q:g}: {np.quantile(values, q)}') print() else: print(f'No method for printing values of hyperparameter {param_name}') print() # # for act_name in ['relu', 'selu', 'mish']: # n_best = len([config for config in best_params if config['act'] == act_name]) # print(f'Number of times that {act_name} was best: {n_best}') if __name__ == '__main__': fire.Fire(analyze_hpo_best) ================================================ FILE: scripts/analyze_tasks.py ================================================ from pathlib import Path from typing import List, Optional import fire import matplotlib.pyplot as plt import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.models import utils def print_task_analysis(coll_name: str, paths: Paths): coll = TaskCollection.from_name(coll_name, paths) # coll.save(paths) task_infos = coll.load_infos(paths) print(f'Data sets in task collection {coll_name}:') str_table = [['Data set: ', 'n ', 'k ', 'd ', 'd_one_hot ', 'd_one_hot_leq_10 ', 'd_one_hot_target ', 'largest_cat']] for task_info in task_infos: name = task_info.task_desc.task_name n = task_info.n_samples # k = number of classes k = task_info.tensor_infos['y'].get_cat_sizes()[0].item() n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() d = n_cont + len(cat_sizes) # ignore 'missing' categories d_one_hot = n_cont + sum([1 if cs==3 else cs-1 for cs in cat_sizes]) d_one_hot_leq_10 = n_cont + sum([(1 if cs==3 else cs-1) if cs <= 11 else 1 for cs in cat_sizes]) n_target = 1 if k <= 2 else k d_one_hot_target = n_cont + sum([(1 if cs==3 else min(n_target, cs-1)) for cs in cat_sizes]) largest_cat = 0 if cat_sizes is not None and len(cat_sizes) > 0: largest_cat = int(np.max(task_info.tensor_infos['x_cat'].get_cat_sizes().numpy())) str_table.append([name + ' ', str(n) + ' ', str(k) + ' ', str(d.item()) + ' ', str(d_one_hot.item()) + ' ', str(d_one_hot_leq_10.item()) + ' ', str(d_one_hot_target.item()) + ' ', str(largest_cat) + ' ']) print(utils.pretty_table_str(str_table)) print() print(f'Number of tasks with more than 1000 samples: {len([ti for ti in task_infos if ti.n_samples >= 1000])}') print(f'Number of tasks: {len(task_infos)}') print() print() def 
plot_tasks(coll_name: str, paths: Paths): coll = TaskCollection.from_name(coll_name, paths) task_infos = coll.load_infos(paths) plt.figure(figsize=(5, 4)) for task_info in task_infos: n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() d = n_cont + len(cat_sizes) n = task_info.n_samples plt.loglog(n, d, 'k.') plt.xlabel('Number of samples') plt.ylabel('Number of features') plt.tight_layout() filename = Path('../plots') / f'{coll_name}.pdf' utils.ensureDir(filename) plt.savefig(filename, bbox_inches='tight') def plot_tasks_multi(coll_names: List[str], paths: Paths): plt.figure(figsize=(7, 5)) for coll_name in coll_names: coll = TaskCollection.from_name(coll_name, paths) task_infos = coll.load_infos(paths) ds = [] ns = [] for task_info in task_infos: n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() d = n_cont + len(cat_sizes) n = task_info.n_samples ds.append(d) ns.append(n) plt.loglog(ns, ds, '.', label=coll_name) plt.legend() plt.xlabel('Number of samples') plt.ylabel('Number of features') plt.tight_layout() filename = Path('../plots') / f'data_set_characteristics.pdf' utils.ensureDir(filename) plt.savefig(filename, bbox_inches='tight') def analyze_tasks(coll_name: Optional[str] = None): paths = Paths.from_env_variables() # coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', # 'grinsztajn-class', 'grinsztajn-reg', # # 'grinsztajn-cat-class', 'grinsztajn-num-class', 'grinsztajn-cat-reg', 'grinsztajn-num-reg', # # 'grinsztajn-cat-class-15k', 'grinsztajn-num-class-15k', 'grinsztajn-cat-reg-15k', # # 'grinsztajn-num-reg-15k' # ] if coll_name is None: coll_names = [dir.stem for dir in paths.tasks().iterdir()] for coll_name in coll_names: print_task_analysis(coll_name, paths) plot_tasks(coll_name, paths) else: print_task_analysis(coll_name, paths) # plot_tasks_multi(coll_names, paths) if __name__ == '__main__': fire.Fire(analyze_tasks) # paths = Paths.from_env_variables() # # # coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', # # 'grinsztajn-class', 'grinsztajn-reg', # # # 'grinsztajn-cat-class', 'grinsztajn-num-class', 'grinsztajn-cat-reg', 'grinsztajn-num-reg', # # # 'grinsztajn-cat-class-15k', 'grinsztajn-num-class-15k', 'grinsztajn-cat-reg-15k', # # # 'grinsztajn-num-reg-15k' # # ] # # coll_names = [dir.stem for dir in paths.tasks().iterdir()] # # for coll_name in coll_names: # print_task_analysis(coll_name, paths) # plot_tasks(coll_name, paths) # # plot_tasks_multi(coll_names, paths) # print_task_analysis('cc18-bin-class', paths) # print_task_analysis('cc18-multi-class', paths) ================================================ FILE: scripts/check_missing_values.py ================================================ from typing import Optional import fire import openml from pytabkit.bench.data.import_tasks import set_openml_cache_dir, PandasTask from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection def check_missing_values(openml_cache_dir: Optional[str] = None): paths = Paths.from_env_variables() for coll_name in ['meta-test-class', 'meta-test-reg']: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) # task_infos = [task_info for task_info in task_infos if task_info.n_samples < 5000] task_infos_no_missing_numeric = [] task_infos_no_missing = [] for task_info in task_infos: openml_task_id = 
task_info.more_info_dict['openml_task_id'] with paths.new_tmp_folder() as tmp_folder: set_openml_cache_dir(openml_cache_dir or tmp_folder) task = openml.tasks.get_task(openml_task_id, download_data=False) dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False) print(f'Analyzing {dataset.name}:') pd_task = PandasTask.from_openml_task_id(openml_task_id) has_column_nan = pd_task.x_df.isna().any() has_numeric_nan = has_column_nan[pd_task.cont_indicator].any(axis=None) has_categorical_nan = has_column_nan[pd_task.cat_indicator].any(axis=None) print(f'{has_numeric_nan=}, {has_categorical_nan=}') if not has_numeric_nan: task_infos_no_missing_numeric.append(task_info) if not has_categorical_nan: task_infos_no_missing.append(task_info) # task = openml.tasks.get_task(openml_task_id, download_data=False) # dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False) # x_df, y_df, cat_indicator, names = dataset.get_data(target=task.target_name, dataset_format='dataframe') # has_column_nan = x_df.isna().any() TaskCollection(coll_name + '-no-missing-numeric', [task_info.task_desc for task_info in task_infos_no_missing_numeric]).save(paths) TaskCollection(coll_name + '-no-missing', [task_info.task_desc for task_info in task_infos_no_missing]).save(paths) if __name__ == '__main__': fire.Fire(check_missing_values) pass ================================================ FILE: scripts/copy_algs.py ================================================ import shutil from typing import List import fire from pytabkit.bench.data.paths import Paths def copy_algs_in_paths(paths_1: Paths, paths_2: Paths, alg_names: List[str]): for alg_name in alg_names: print(f'Copying alg {alg_name}') shutil.copytree(paths_1.algs() / alg_name, paths_2.algs() / alg_name) shutil.copytree(paths_1.results() / alg_name, paths_2.results() / alg_name) shutil.copytree(paths_1.result_summaries() / alg_name, paths_2.result_summaries() / alg_name) def copy_specific_algs(): paths_1 = Paths('first_path') paths_2 = Paths('second_path') alg_names = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost'] for version in ['D', 'TD-class', 'TD-reg', 'HPO']] alg_names.extend( [an + suffix for an in ['MLP-RTDL-D', 'ResNet-RTDL-D', 'TabR-S-D'] for suffix in ['-class', '-reg']]) alg_names.extend(['MLP-HPO', 'MLP-RTDL-HPO', 'RF-SKL-D', 'XGB-PBB-D']) copy_algs_in_paths(paths_1, paths_2, alg_names) def copy_algs(path_1: str, path_2: str, *alg_names): paths_1 = Paths(path_1) paths_2 = Paths(path_2) copy_algs_in_paths(paths_1, paths_2, list(alg_names)) if __name__ == '__main__': fire.Fire(copy_algs) ================================================ FILE: scripts/create_plots_and_tables.py ================================================ from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.analysis import ResultsTables from pytabkit.bench.eval.plotting import plot_schedule, plot_schedules, plot_benchmark_bars, plot_scatter, \ plot_pareto, plot_winrates, plot_stopping, plot_cumulative_ablations, plot_cdd from pytabkit.bench.eval.tables import generate_ds_table, generate_collections_table, generate_individual_results_table, \ generate_ablations_table, generate_refit_table, generate_preprocessing_table, generate_stopping_table, \ generate_architecture_table if __name__ == '__main__': paths = Paths.from_env_variables() coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', 'grinsztajn-class-filtered', 'grinsztajn-reg'] tables = 
ResultsTables(paths) arrow_alg_names = [('MLP-PLR-D', 'RealMLP-TD'), ('TabR-S-D', 'RealTabR-D'), ('XGB-D', 'XGB-TD'), ('LGBM-D', 'LGBM-TD'), ('CatBoost-D', 'CatBoost-TD'), ('MLP-PLR-HPO', 'RealMLP-HPO')] alg_names = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost', 'BestModel', 'Ensemble'] for version in ['D', 'TD', 'HPO']] alg_names.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO', 'MLP-RTDL-D', 'MLP-RTDL-HPO', 'MLP-PLR-D', 'MLP-PLR-HPO', 'RealTabR-D', 'FTT-D', 'FTT-HPO', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO', 'RF-SKL-D', 'RF-HPO', 'XGB-PBB-D', 'TabR-S-D', 'TabR-HPO']) alg_names_short = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost'] for version in ['D', 'TD', 'HPO']] alg_names_short.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO', 'MLP-RTDL-D', 'MLP-RTDL-HPO', 'MLP-PLR-D', 'MLP-PLR-HPO', 'FTT-D', 'FTT-HPO', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO', 'RF-SKL-D', 'RF-HPO', 'XGB-PBB-D', 'TabR-S-D', 'RealTabR-D', 'TabR-HPO']) alg_names_hpo_vs_tpe = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost'] for version in ['D', 'TD', 'HPO', 'HPO-TPE']] alg_names_hpo_vs_tpe.extend(['RealMLP-TD', 'RealMLP-HPO']) # extra plot for the README.md plot_pareto(paths, tables, coll_names=['meta-test-class', 'meta-test-reg'], alg_names=alg_names, use_ranks=False, use_normalized_errors=False, use_grinnorm_errors=False, use_geometric_mean=True, use_validation_errors=False, arrow_alg_names=arrow_alg_names) for use_ranks, use_normalized_errors, use_geometric_mean, use_grinnorm_errors in [[False, False, False, False], [False, False, True, False], [True, False, False, False], [False, True, False, False], [False, False, False, True]]: plot_pareto(paths, tables, coll_names=['grinsztajn-class-filtered', 'grinsztajn-reg'], alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, arrow_alg_names=arrow_alg_names) plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, arrow_alg_names=arrow_alg_names) plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, arrow_alg_names=arrow_alg_names, use_2x3=True) plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, use_validation_errors=True, arrow_alg_names=arrow_alg_names) # alg_names_rssc = alg_names + ['MLP-RTDL-D_rssc', 'ResNet-RTDL-D_rssc', 'TabR-S-D_rssc'] without_rssc = ['MLP-RTDL-D', 'ResNet-RTDL-D', 'TabR-S-D', 'FTT-D', 'MLP-PLR-D'] alg_names_rssc = without_rssc + [an + '_rssc' for an in without_rssc] + ['BestModel_' + an + '_prep' for an in without_rssc] alg_names_rssc = alg_names_rssc + ['RealMLP-TD', 'RealTabR-D'] # alg_names_rssc = alg_names_rssc + ['MLP-RTDL-HPO', 'ResNet-RTDL-HPO', 'FTT-D-HPO', 'MLP-PLR-HPO', 'TabR-HPO'] plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names_rssc, filename='pareto_rssc.pdf') # plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-train-reg'], alg_names=alg_names_rssc, # filename='pareto_rssc_meta-train.pdf') # plot_pareto(paths, tables, coll_names=['meta-test-class', 
'meta-test-reg'], alg_names=alg_names_rssc, # filename='pareto_rssc_meta-test.pdf') plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg'], alg_names=alg_names_hpo_vs_tpe, plot_pareto_frontier=False, use_ranks=False, use_normalized_errors=False, use_geometric_mean=True, filename='pareto_hpo-rs-vs-tpe.pdf') plot_pareto(paths, tables, coll_names=['meta-test-class-no-missing', 'meta-test-reg-no-missing'], alg_names=alg_names, arrow_alg_names=arrow_alg_names, filename='pareto_no-missing_geometric.pdf') alg_names_auc = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost', 'BestModel'] for version in ['D', 'TD', 'HPO_best-1-auc-ovr']] alg_names_auc.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO_best-1-auc-ovr', 'RealMLP-TD_no-ls', 'RealMLP-TD-S_no-ls', 'MLP-RTDL-D', 'MLP-RTDL-HPO_best-1-auc-ovr', 'MLP-PLR-D', 'MLP-PLR-HPO_best-1-auc-ovr', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO_best-1-auc-ovr', 'RF-SKL-D', 'RF-HPO_best-1-auc-ovr', 'XGB-PBB-D', 'TabR-S-D', 'RealTabR-D', 'RealTabR-D_no-ls', 'TabR-HPO_best-1-auc-ovr', 'BestModel-HPO']) arrow_alg_names_auc = [('MLP-PLR-D', 'RealMLP-TD_no-ls'), ('TabR-S-D', 'RealTabR-D_no-ls'), ('XGB-D', 'XGB-TD'), ('LGBM-D', 'LGBM-TD'), ('CatBoost-D', 'CatBoost-TD'), ('MLP-PLR-HPO_best-1-auc-ovr', 'RealMLP-HPO_best-1-auc-ovr')] plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class'], alg_names=alg_names_auc, arrow_alg_names=arrow_alg_names_auc, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', filename='pareto_mtrc_mtec_auc-ovr_val-acc.pdf') plot_pareto(paths, tables, coll_names=['meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_auc, arrow_alg_names=arrow_alg_names_auc, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', filename='pareto_mtec_gcf_auc-ovr_val-acc.pdf') plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_auc, arrow_alg_names=arrow_alg_names_auc, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', filename='pareto_mtrc_mtec_gcf_auc-ovr_val-acc.pdf') alg_names_ext = [an + '_val-ce' for an in alg_names] + ['RealMLP-TD_val-ce_no-ls', 'RealMLP-TD-S_val-ce_no-ls', 'RealTabR-D_val-ce_no-ls', 'BestModel-TD_val-ce', 'BestModel-D_val-ce'] arrow_alg_names_valce = [('MLP-PLR-D_val-ce', 'RealMLP-TD_val-ce_no-ls'), ('TabR-S-D_val-ce', 'RealTabR-D_val-ce_no-ls'), ('XGB-D_val-ce', 'XGB-TD_val-ce'), ('LGBM-D_val-ce', 'LGBM-TD_val-ce'), ('CatBoost-D_val-ce', 'CatBoost-TD_val-ce')] plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class'], alg_names=alg_names_ext, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', tag='paper_val_ce', arrow_alg_names=arrow_alg_names_valce, filename='pareto_mtrc_mtec_auc-ovr_val-cross-entropy.pdf') plot_pareto(paths, tables, coll_names=['meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_ext, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', tag='paper_val_ce', arrow_alg_names=arrow_alg_names_valce, filename='pareto_mtec_gcf_auc-ovr_val-cross-entropy.pdf') plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_ext, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', tag='paper_val_ce', arrow_alg_names=arrow_alg_names_valce, filename='pareto_mtrc_mtec_gcf_auc-ovr_val-cross-entropy.pdf') # ----- other plots ----- plot_cumulative_ablations(paths, tables) plot_cdd(paths, tables, coll_names=coll_names, 
alg_names=alg_names_short) plot_cdd(paths, tables, coll_names=coll_names[0:2], alg_names=alg_names_short) plot_cdd(paths, tables, coll_names=coll_names[2:4], alg_names=alg_names_short) generate_architecture_table(paths, tables) plot_stopping(paths, tables, classification=True) plot_stopping(paths, tables, classification=False) generate_preprocessing_table(paths, tables) generate_refit_table(paths, tables, 'RealMLP') generate_refit_table(paths, tables, 'LGBM') generate_ablations_table(paths, tables) generate_collections_table(paths) for coll_name in coll_names: plot_winrates(paths=paths, tables=tables, coll_name=coll_name, alg_names=alg_names) for coll_name in coll_names: for algs_name, new_alg_names in [ ('defaults', ['RealMLP-TD', 'RealTabR-D', 'TabR-S-D', 'MLP-PLR-D', 'MLP-RTDL-D', 'CatBoost-TD', 'LGBM-TD', 'XGB-TD', 'RF-SKL-D']), ('hpo', ['RealMLP-HPO', 'TabR-HPO', 'MLP-PLR-HPO', 'FTT-HPO', 'ResNet-RTDL-HPO', 'MLP-RTDL-HPO', 'CatBoost-HPO', 'LGBM-HPO', 'XGB-HPO', 'RF-HPO'])]: generate_individual_results_table(paths, tables, f'individual_results_{coll_name}_{algs_name}.tex', coll_name=coll_name, alg_names=new_alg_names) generate_ds_table(paths, TaskCollection.from_name('meta-train-class', paths), include_openml_ids=False) generate_ds_table(paths, TaskCollection.from_name('meta-train-reg', paths), include_openml_ids=False) generate_ds_table(paths, TaskCollection.from_name('meta-test-class', paths), include_openml_ids=True) generate_ds_table(paths, TaskCollection.from_name('meta-test-reg', paths), include_openml_ids=True) generate_ds_table(paths, TaskCollection.from_name('grinsztajn-class-filtered', paths), include_openml_ids=True) generate_ds_table(paths, TaskCollection.from_name('grinsztajn-reg', paths), include_openml_ids=True) plot_schedule(paths, filename='coslog4.pdf', sched_name='coslog4') plot_schedules(paths, filename='coslog4_and_flatcos.pdf', sched_names=['coslog4', 'flat_cos'], sched_labels=[r'$\mathrm{coslog}_4$', r'$\mathrm{flat\_cos}$']) for coll_name in ['meta-test-class', 'meta-test-reg']: plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_BestModel-TD_CatBoost-HPO.pdf', coll_names=[coll_name], alg_name_1='BestModel-TD', alg_name_2='CatBoost-HPO') # plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_HPO-on-BestModel-TD_MLP-TD-HPO.pdf', # coll_names=[coll_name], # alg_name_2='RealMLP-HPO', alg_name_1='HPO-on-BestModel-TD') # plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_HPO-on-BestModel-TD_BestModel-HPO.pdf', # coll_names=[coll_name], # alg_name_2='BestModel-HPO', alg_name_1='HPO-on-BestModel-TD') # plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_HPO-on-BestModel-TD_BestModel-TD.pdf', # coll_names=[coll_name], # alg_name_2='BestModel-TD', alg_name_1='HPO-on-BestModel-TD') for coll_name in coll_names: for alg_name_1, alg_name_2 in [('RealMLP-TD', 'CatBoost-TD'), ('RealMLP-TD', 'RealMLP-HPO'), ('RealMLP-HPO', 'CatBoost-HPO'), ('CatBoost-TD', 'CatBoost-HPO'), ('BestModel-TD', 'BestModel-HPO'), ('Ensemble-TD', 'BestModel-TD'), ('BestModel-TD', 'CatBoost-HPO'), ('RealMLP-TD', 'MLP-RTDL-D'), ('CatBoost-TD', 'LGBM-TD'), ('BestModel-TD', 'BestModel-D')]: plot_scatter(paths, tables=tables, filename=f'scatter_3x2_{alg_name_1}_{alg_name_2}.pdf', coll_names=coll_names, alg_name_1=alg_name_1, alg_name_2=alg_name_2) plot_scatter(paths, tables=tables, filename=f'scatter_3x2_CatBoost-TD_CatBoost-HPO_valid-errors.pdf', coll_names=coll_names, alg_name_1='CatBoost-TD', alg_name_2='CatBoost-HPO', use_validation_errors=True) 
================================================ FILE: scripts/create_probclass_plots.py ================================================ from typing import Optional, List import numpy as np import pandas as pd import torch from adjustText import adjust_text from tueplots import bundles, fonts, fontsizes, figsizes import matplotlib matplotlib.rcParams.update(bundles.icml2024()) matplotlib.rcParams.update(fonts.icml2024_tex()) matplotlib.rcParams.update(fontsizes.icml2024()) from matplotlib import pyplot as plt, ticker import matplotlib.colors as mcolors import matplotlib.patheffects import seaborn as sns from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.analysis import ResultsTables, get_benchmark_results from pytabkit.bench.run.results import ResultManager from pytabkit.models import utils def load_stopping_times(paths: Paths, alg_name: str, n_cv: int, n_tt_splits: int, val_metric_name: str, coll_name: str = 'talent-class-small') -> np.ndarray: results = [] coll = TaskCollection.from_name(coll_name, paths) for task_desc in coll.task_descs: for split_id in range(n_tt_splits): results_path = paths.results_alg_task_split(task_desc, alg_name, n_cv=n_cv, split_type=SplitType.RANDOM, split_id=split_id) rm = ResultManager.load(results_path, load_other=True, load_preds=False) fit_params = rm.other_dict['cv']['fit_params'] while True: if 'sub_fit_params' in fit_params: fit_params = fit_params['sub_fit_params'] elif isinstance(fit_params, list): assert len(fit_params) == 1 fit_params = fit_params[0] else: break result = None if 'stop_epoch' in fit_params: result = fit_params['stop_epoch'] elif 'n_estimators' in fit_params: result = fit_params['n_estimators'] else: print(f'No stopping epoch found in {fit_params=}') if isinstance(result, dict): result = result[val_metric_name] results.append(result) return np.asarray(results) def get_desired_symlog_ticks(): pos_small = np.arange(0.1, 1.0, 0.1) pos_mid = np.arange(1, 10, 1) pos_large = np.arange(10, 101, 10) pos_ticks = np.concatenate([pos_small, pos_mid, pos_large]) neg_ticks = -pos_ticks[::-1] return np.concatenate([neg_ticks, [0], pos_ticks]) def plot_barscatter_ax(ax: plt.Axes, df: pd.DataFrame, xlabel: Optional[str], ylabel: str, threshold: Optional[float] = None, use_symlog: bool = False): # hues = list(cal_methods.values()) hues = df['hue'].unique().tolist() # adapted from https://cduvallet.github.io/posts/2018/03/boxplots-in-python sns.set_style('white') # colors = ['#B25116', '#FB84D1'] # colors = ['tab:blue', 'tab:orange'] colors = [(0.6, 0.8, 1.0), (1.0, 0.8, 0.6), (0.6, 1.0, 0.8)] if len(hues) == 1: if 'XGB' in hues[0]: colors = colors[2:3] elif hues[0].startswith('MLP'): colors = colors[1:2] pal = {key: value for key, value in zip(hues, colors[:len(hues)])} # Set up another palette for the boxplots, with slightly lighter shades # light_colors = ['#E5B699', '#FFC9EC'] light_colors = colors face_pal = {key: value for key, value in zip(hues, light_colors[:len(hues)])} hue_order = hues # Make sure to remove the 'facecolor': 'w' property here, otherwise # the palette gets overridden boxprops = {'edgecolor': 'k', 'linewidth': 1} lineprops = {'color': 'k', 'linewidth': 1} boxplot_kwargs = {'boxprops': boxprops, 'medianprops': lineprops, 'whiskerprops': lineprops, 'capprops': lineprops, 'width': 0.75, 'palette': face_pal, 'whis': (10, 90), # use 10% and 90% quantiles for whiskers 'hue_order': hue_order} stripplot_kwargs = 
{'linewidth': 0.4, 'size': 2.5, 'alpha': 0.6, 'palette': pal, 'hue_order': hue_order} ax.axhline(y=0, color='#888888', linestyle='--') ax.grid(True, which='both') sns.boxplot(x='label', y='value', hue='hue', data=df, ax=ax, fliersize=0, **boxplot_kwargs) sns.stripplot(x='label', y='value', hue='hue', data=df, ax=ax, dodge=True, jitter=0.18, **stripplot_kwargs) if threshold is not None: ax.set_ylim(-threshold, threshold) if use_symlog: ax.set_yscale('symlog', linthresh=1) ax.yaxis.set_minor_formatter(matplotlib.ticker.ScalarFormatter()) ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) # ax.yaxis.set_minor_locator(matplotlib.ticker.AutoMinorLocator()) ax.ticklabel_format(style='plain', axis='y') # Get your custom minor tick positions minor_ticks = get_desired_symlog_ticks() # Exclude major ticks (1, 10, 100 and their negatives) major_ticks = np.array([-100, -10, -1, 0, 1, 10, 100]) minor_ticks = [tick for tick in minor_ticks if tick not in major_ticks] # Set the minor ticks ax.yaxis.set_minor_locator(plt.FixedLocator(minor_ticks)) # Remove minor tick labels ax.yaxis.set_minor_formatter(plt.NullFormatter()) # Disable minor grid lines ax.yaxis.grid(False, which='minor') print(f'{len(hues)=}') # Fix the legend, keep only the first len(hues) legend elements # (there would be twice as many because there are also the ones for the scatter plot if len(hues) > 1: handles, hues_ax = ax.get_legend_handles_labels() # ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), # fancybox=True, shadow=True, ncol=5) lgd = ax.legend(handles[:len(hues)], hues_ax[:len(hues)], loc='upper center', bbox_to_anchor=(0.5, -0.25 if xlabel is not None else -0.15), ncol=len(hues), # fancybox=True, shadow=True # fontsize='large', # handletextpad=0.5, ) # lgd.legend_handles[0]._sizes = [40] # lgd.legend_handles[1]._sizes = [40] else: ax.get_legend().remove() ax.set_ylabel(ylabel, fontsize='small') if xlabel is not None: ax.set_xlabel(xlabel, fontsize='small') else: ax.set_xlabel('', fontsize='small') # Draw a clean downward arrow labeled "better" under "Logloss+TS" x_labels = df['label'].unique().tolist() y_min, _ = ax.get_ylim() if 'Logloss+TS' in x_labels: x_idx = x_labels.index('Logloss+TS') # Position arrow so it is visually below all data, regardless of y scale arrow_tip_y = 8e-3 * y_min # it's a symlog scale arrow_base_y = 0.2 * y_min text_y = arrow_base_y + 0.02 * y_min ax.annotate( '', xy=(x_idx, arrow_base_y), xytext=(x_idx, arrow_tip_y), arrowprops=dict(arrowstyle='-|>', color='black', lw=1.1, shrinkA=0, shrinkB=0) ) ax.text(x_idx, text_y, 'lower=better', ha='center', va='top', fontsize='small', style='italic', color='black') def plot_results(paths: Paths, tables: ResultsTables, base_names: List[str], n_hpo_steps: int, n_tt_splits: int, coll_name: str = 'talent-class-small', metric_name: str = 'n_cross_entropy', use_mean_results: bool = False, use_percentages: bool = False, plot_stopping_times: bool = False, n_cv: int = 1, threshold: Optional[float] = 0.02, use_validation_errors: bool = False, use_small_plot: bool = False, use_medium_plot: bool = False, title: Optional[str] = None): val_metrics = {'cross_entropy': 'Logloss', '1-auroc-ovr': 'AUROC', 'brier': 'Brier', 'ref-ll-ts': 'TS-Ref.', 'ref-br-ts': 'Brier-ref.', 'class_error': 'Accuracy'} cal_methods = {'': 'No post-hoc cal.', '_ts-mix': 'Temp. scaling'} metric_display_name_dict = {'n_cross_entropy': 'normalized Logloss', 'cross_entropy': 'Logloss', 'n_brier': 'normalized Brier loss', 'brier': 'Brier loss', 'class_error': 'Class. 
err.', '1-auroc-ovr': '1-AUROC'} metric_display_name = metric_display_name_dict.get(metric_name, metric_name) # assert use_small_plot or all(len(bn) == 1 for bn in base_names) assert use_small_plot or use_medium_plot or len(base_names) == 1 with (plt.rc_context(figsizes.icml2024_half() if use_small_plot else figsizes.icml2024_full())): # fig, axs = plt.subplots(1, len(base_names)) fig, ax = plt.subplots() dfs = [] for base_name in base_names: cv_suffix = '' if n_cv == 1 else f'-cv{n_cv}' bag_suffix = f' [bag-{n_cv}]' hpo_steps_suffix = f'-{n_hpo_steps}' if 'HPO' in base_name else '' means = None if not plot_stopping_times: if use_mean_results: means_dicts = [] for tag in [f'paper_hpo_{base_name}{cv_suffix}', f'paper_hpo-calib_{base_name}{cv_suffix}']: table = tables.get(coll_name, tag=tag, n_cv=n_cv) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=False, use_validation_errors=use_validation_errors, use_geometric_mean=False) means_dicts.append(means) all_means = utils.join_dicts(*means_dicts) means = dict() print(f'Available alg names before aggregating:') for alg_name in all_means: print(alg_name) print() for val_metric_key, val_metric_label in val_metrics.items(): for cal_method_key, cal_method_name in cal_methods.items(): alg_name = f'{base_name}{cv_suffix}-{n_hpo_steps}_val-{val_metric_key}{cal_method_key}{bag_suffix}' alg_names_source = [ f'{base_name}{cv_suffix}_step-{i}_val-{val_metric_key}{cal_method_key}{bag_suffix}' for i in range(n_hpo_steps)] means[alg_name] = np.mean(np.stack([all_means[an] for an in alg_names_source], axis=1), axis=1) else: table = tables.get(coll_name, tag=f'paper_{base_name}{cv_suffix}' if n_cv == 1 else f'paper{cv_suffix}', n_cv=n_cv) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case use_validation_errors=use_validation_errors, simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=False, use_geometric_mean=False) # df should contain columns 'value', 'val_metric', 'cal_method' alg_dfs = [] if means is not None: print(f'Available alg names:') for alg_name in means: print(alg_name) print() rel_alg = f'{base_name}{cv_suffix}{hpo_steps_suffix}_val-cross_entropy_ts-mix{bag_suffix}' if use_small_plot: if plot_stopping_times: combinations = [ ('cross_entropy', '', 'Logloss'), ('brier', '', 'Brier'), ('1-auroc-ovr', '', 'AUROC'), ('ref-ll-ts', '', 'TS-Ref.'), ('ref-br-ts', '', 'Brier-Ref.'), ('class_error', '', 'Accuracy'), ] else: combinations = [ ('cross_entropy', '', 'Logloss'), ('cross_entropy', '_ts-mix', 'Logloss+TS'), ('ref-ll-ts', '_ts-mix', 'TS-Ref.+TS'), ('class_error', '_ts-mix', 'Accuracy+TS'), ] elif use_medium_plot: combinations = [ ('cross_entropy', '', 'Logloss'), ('cross_entropy', '_ts-mix', 'Logloss+TS'), ('brier', '_ts-mix', 'Brier+TS'), ('1-auroc-ovr', '_ts-mix', 'AUROC+TS'), ('ref-ll-ts', '_ts-mix', 'TS-Ref.+TS'), ('ref-br-ts', '_ts-mix', 'Brier-Ref.+TS'), ('class_error', '_ts-mix', 'Accuracy+TS'), ] if not any('-HPO' in base_name for base_name in base_names): combinations.insert(5, 
('ref-ll-ts-cv5', '_ts-mix', 'TS-Ref.-5CV+TS')) else: combinations = [(val_metric_key, cal_method_key, val_metric_label) for val_metric_key, val_metric_label in val_metrics.items() for cal_method_key in cal_methods] for val_metric_key, cal_method_key, label in combinations: alg_name = f'{base_name}{cv_suffix}{hpo_steps_suffix}_val-{val_metric_key}{cal_method_key}' print(f'Adding results for {alg_name}') if plot_stopping_times: assert not use_mean_results values = load_stopping_times(paths, alg_name=alg_name, n_cv=n_cv, n_tt_splits=n_tt_splits, val_metric_name=val_metric_key, coll_name=coll_name) else: if use_percentages: values = 100 * (means[alg_name + bag_suffix] / means[rel_alg] - 1) else: values = means[alg_name + bag_suffix] - means[rel_alg] if threshold is not None: values = np.clip(values, -threshold, threshold) if use_small_plot or use_medium_plot: hue = base_name.split('-')[0] if hue == 'XGB': hue = 'XGBoost' else: hue = cal_methods[cal_method_key] alg_dfs.append(pd.DataFrame(dict( value=values.tolist(), label=[label] * len(values), hue=[hue] * len(values), ))) df = pd.concat(alg_dfs, axis='index', ignore_index=True) dfs.append(df) df = pd.concat(dfs, axis='index', ignore_index=True) ylabel = ('Stopping iteration' if 'XGB' in base_name else f'Stopping epoch') \ if plot_stopping_times else f'{metric_display_name} diff.\\ to baseline' if use_percentages: ylabel = ylabel + r' [\%]' plot_barscatter_ax(ax=ax, df=df, xlabel=None, # 'Validation and optimization metric', ylabel=ylabel, use_symlog=use_percentages, threshold=threshold if plot_stopping_times else None) if title: ax.set_title(title) suffix = '_mean' if use_mean_results else '' suffix = suffix + ('_rel' if use_percentages else '') suffix = suffix + ('_stoptime' if plot_stopping_times else '') suffix = suffix + ('_valid' if use_validation_errors else '') suffix = suffix + ('' if coll_name == 'talent-class-small' else '_' + coll_name) suffix = suffix + ('_small' if use_small_plot else ('_medium' if use_medium_plot else '')) threshold_str = f'None' if threshold is None else f'{threshold:g}' file_path = paths.plots() / f'boxplot_{"-".join(base_names)}{cv_suffix}_{metric_name}_{threshold_str}{suffix}.pdf' plt.tight_layout() utils.ensureDir(file_path) plt.savefig(file_path) plt.close() def plot_calib_benchmark(paths: Paths, tables: ResultsTables, metric_name: str = 'cross_entropy', n_tt_splits: int = 5, use_validation_errors: bool = False, use_extra_methods: bool = False): times_df = pd.read_csv(paths.base() / 'calib_times' / 'times.csv') methods = list(times_df['calib_name'].unique()) coll_name = 'talent-class-small' table = tables.get(coll_name, tag=f'paper_calib-bench', n_cv=1) means, _ = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=True, use_validation_errors=use_validation_errors, use_geometric_mean=False) # ----- get reference score without post-hoc calibration means_nocalib, _ = get_benchmark_results(paths, tables.get(coll_name, tag=f'paper_XGB-D', n_cv=1), coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case simplify_name_fn=lambda s: s, 
return_percentages=False, use_task_mean=True, use_validation_errors=use_validation_errors, use_geometric_mean=False) orig_score = means_nocalib['XGB-D_val-class_error [bag-1]'] avg_times = dict() min_n_val = 10_000 df = times_df.loc[times_df['n_val'] >= min_n_val] for method in methods: where = df['calib_name'] == method # * 1000 for per 1K, *1000 for milliseconds avg_times[method] = np.mean(df.loc[where, 'time'] / df.loc[where, 'n_val']) * 1_000_000 print(repr(means)) print(repr(avg_times)) val_metrics = {'cross_entropy': 'Logloss', '1-auroc-ovr': 'AUROC', 'brier': 'Brier', 'ref-ll-ts': 'TS-Ref.', 'n_cross_entropy': 'norm. Logloss', 'n_brier': 'norm. Brier', 'ref-br-ts': 'Brier-ref.', 'class_error': 'Accuracy'} methods_with_labels = {'ts': r'TS (ours)', # 'ts-mix': r'Bisection + smoothing (ours)', 'ag-ts': r'TS (AutoGluon)', # 'ag-inv-ts': r'AutoGluon + inv. temp.', 'torchunc-ts': 'TS (TorchUncertainty)', 'torchcal-ts': 'TS (TorchCal)', 'guo-ts': 'TS (Guo et al., 2017)', } if use_extra_methods: methods_with_labels = utils.join_dicts(methods_with_labels, { 'ir-mix': 'Isotonic (sklearn) + LS', 'ts-mix': 'TS+LS (ours)' }) labels_list = list(methods_with_labels.values()) with plt.rc_context(figsizes.icml2024_half(height_to_width_ratio=0.5 if use_extra_methods else 0.4)): fig, ax = plt.subplots() # sns.set_theme(style="whitegrid", font_scale=2) plt.ylabel(f'Mean {val_metrics[metric_name]}') plt.xlabel(f'Mean runtime (ms) per 1K samples') colors = ['tab:green', 'tab:blue', 'tab:orange', 'tab:red', 'tab:purple', 'tab:cyan', 'tab:olive'] lines = [] lines.append( ax.axhline(y=means['XGB-D_val-class_error_calib-bench_ts [bag-1]'], color=colors[0], linestyle='--', linewidth=1.0, zorder=-50)) times_list = [avg_times[method] for method in methods_with_labels.keys()] metrics_list = [means[f'XGB-D_val-class_error_calib-bench_{method} [bag-1]'] for method in methods_with_labels.keys()] plt.scatter(times_list, metrics_list, c=colors[:len(times_list)], s=10) # Prepare to annotate the points texts = [] for i, point in enumerate(ax.collections[0].get_offsets()): model_name = labels_list[i] x, y = point if x < np.mean(times_list): # x = 0.7 * x + 0.3 * np.max(times_list) x += 0.15 * (np.max(times_list) - np.min(times_list)) else: # x = 0.7 * x + 0.3 * np.min(times_list) x -= 0.15 * (np.max(times_list) - np.min(times_list)) y = 0.8 * y + 0.2 * np.mean(metrics_list) text_color = colors[i] # Annotate the model names display_name = model_name # with plt.rc_context({'font.family': 'sans-serif', "font.sans-serif": "DejaVu Sans"}): # from matplotlib import font_manager # font_path = font_manager.findfont("DejaVu Sans") # print(f'{font_path=}') with plt.rc_context({'font.family': 'sans-serif', "text.usetex": False}): text = ax.text(x, y, display_name, color=text_color, fontsize=8, ha='center', va='center', font='Arial') # text.set_path_effects([matplotlib.patheffects.withStroke(linewidth=1.2, foreground='white')]) texts.append(text) # import matplotlib.font_manager as fm # print([f.name for f in fm.fontManager.ttflist]) lines.append(ax.axhline(y=orig_score, color='tab:gray', linestyle='--', linewidth=1.0, zorder=-50)) # with plt.rc_context({'font.family': 'sans-serif', "font.sans-serif": "DejaVu Sans"}): with plt.rc_context({'font.family': 'sans-serif', "text.usetex": False}): text = ax.text(np.mean(times_list), orig_score - 0.1 * (np.max(metrics_list) - np.min(metrics_list)), 'No post-hoc cal.', color='tab:gray', fontsize=8, ha='center', va='center', font='Arial') texts.append(text) plt.xlim(left=0) # plt.grid(True, 
which='both', zorder=-100) ax.set_axisbelow(True) print(ax.collections) # line = ax.axhline(y=means['XGB-D_val-class_error_calib-bench_ts [bag-1]']-0.01, color='white', linestyle='--', # linewidth=1.5, # zorder=-50) # Use adjust_text to repel the labels from each other and the points adjust_text(texts, # force_text=(0.01, 0.02), # objects=lines, x=times_list, y=metrics_list, # force_pull=(0.1, 0.1), # force_explode=(0.1, 0.2), avoid_self=False, expand=(1.15, 1.3), ax=ax, ) if use_extra_methods: ymin, ymax = ax.get_ylim() ymin = ymin - 0.15 * (ymax - ymin) plt.ylim(ymin, ymax) suffix = '_extra' if use_extra_methods else '' filename = f'calib_benchmark_{coll_name}_{metric_name}{suffix}' if use_validation_errors: filename = filename + '_valid' filename = filename + '.pdf' file_path = paths.plots() / filename utils.ensureDir(file_path) plt.tight_layout() plt.savefig(file_path) plt.close(fig) def plot_gap_vs_ds_size(paths: Paths, tables: ResultsTables, base_name: str, metric_name: str, n_hpo_steps: int, use_smallest_class: bool = False, use_2nd_largest_class: bool = False, use_entropy: bool = False, use_percentages: bool = False, color_by_total_loss: bool = False): table = tables.get('talent-class-small', tag=f'paper_{base_name}', n_cv=1) coll_name = 'talent-class-small' task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) means, intervals = get_benchmark_results(paths, table, coll_name='talent-class-small', use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=5, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case use_validation_errors=False, simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=False, use_geometric_mean=False) print(f'Available alg names:') for alg_name in means: print(alg_name) extended_base_name = f'{base_name}-{n_hpo_steps}' if 'HPO' in base_name else base_name alg_name_1 = f'{extended_base_name}_val-cross_entropy_ts-mix [bag-1]' alg_name_2 = f'{extended_base_name}_val-ref-ll-ts_ts-mix [bag-1]' if use_percentages: diffs = 100 * (means[alg_name_2] / means[alg_name_1] - 1) else: diffs = means[alg_name_2] - means[alg_name_1] suffix = '_rel' if use_percentages else '' if use_smallest_class: suffix = suffix + '_smallest-class' x = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy() x.append(np.min(class_frequencies)) elif use_2nd_largest_class: suffix = suffix + '_2nd-largest-class' x = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy() x.append(np.sort(class_frequencies)[-2]) elif use_entropy: suffix = suffix + '_entropy' x = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy() class_probs = class_frequencies.astype(np.float32) / task_info.n_samples x.append(-task_info.n_samples * np.dot(class_probs, np.log2(class_probs + 1e-30))) else: x = [ti.n_samples for ti in task_infos] if color_by_total_loss: cbar_label = 'Sum of losses of both versions' suffix = suffix + '_col-loss' colors = means[alg_name_1] + means[alg_name_2] else: cbar_label = 'Total Entropy of Y' colors = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy().astype( np.float32) p = class_frequencies / np.sum(class_frequencies) entropy = -np.dot(p, np.log(p)) 
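# The quantity above is the Shannon entropy (in nats) of the empirical class distribution;
# e.g. a balanced binary task gives ln(2) ~= 0.693 and p = [0.5, 0.25, 0.25] gives
# 0.5*ln(2) + 0.5*ln(4) ~= 1.04, while heavily imbalanced class distributions
# give values close to 0.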
colors.append(entropy) metric_display_names = {'cross_entropy': 'Logloss', '1-auroc-ovr': 'AUROC', 'brier': 'Brier', 'ref-ll-ts': 'TS-Ref.', 'n_cross_entropy': 'norm. Logloss', 'n_brier': 'norm. Brier', 'ref-br-ts': 'Brier-Ref.', 'class_error': 'Accuracy'} with (plt.rc_context(figsizes.icml2024_half(height_to_width_ratio=0.8))): with plt.rc_context(fontsizes.icml2024(default_smaller=0)): fig, ax = plt.subplots() norm = matplotlib.colors.LogNorm(vmin=np.min(colors), vmax=np.max(colors)) cmap = plt.cm.plasma_r # You can use other colormaps like 'plasma', 'coolwarm', etc. colors = cmap(norm(colors)) # Plot with color based on z for i in range(len(x)): ax.plot(x[i], diffs[i], '.', color=colors[i]) if use_percentages: ax.set_yscale('symlog', linthresh=1) method_display_name = base_name.replace('-HPO', ' (tuned)') method_display_name = method_display_name.replace('-TD', ' (default)') method_display_name = method_display_name.replace('-D', ' (default)') ax.set_title(r'\textbf{' + method_display_name + r'}') ax.set_xscale('log') sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) sm.set_array([]) # Set an empty array as required # Add the colorbar plt.colorbar(sm, label=cbar_label, ax=ax) # plt.semilogx(x, diffs, '.', color='tab:blue') plt.xlabel('Number of samples') plt.ylabel(f'Relative difference in {metric_display_names[metric_name]} [\\%]') plt.axhline(y=0, color='k', linestyle='--', zorder=-1) # plt.tight_layout() file_path = paths.plots() / f'gap_vs_ds_size_{base_name}_{metric_name}{suffix}.pdf' utils.ensureDir(file_path) plt.savefig(file_path) plt.close() if __name__ == '__main__': paths = Paths.from_env_variables() tables = ResultsTables(paths) # calibration methods (label positions can be improved post-hoc using Inkscape) for metric_name in ['cross_entropy', 'n_cross_entropy', 'brier', 'n_brier']: for use_extra_methods in [True, False]: plot_calib_benchmark(paths, tables, metric_name=metric_name, use_extra_methods=use_extra_methods) # results for individual datasets for base_name in ['MLP-HPO', 'XGB-HPO', 'RealMLP-HPO', 'MLP-D', 'XGB-D', 'RealMLP-TD']: plot_gap_vs_ds_size(paths, tables, base_name=base_name, metric_name='cross_entropy', n_hpo_steps=30, use_smallest_class=False, use_2nd_largest_class=False, use_entropy=False, use_percentages=True, color_by_total_loss=True) # plot main benchmark results for use_small_plot in [False, True]: for base_names in [['RealMLP-HPO', 'MLP-HPO', 'XGB-HPO'], ['RealMLP-TD', 'MLP-D', 'XGB-D']]: for coll_name in ['talent-class-small-above10k', 'talent-class-small']: for metric_name in ['cross_entropy', 'class_error', '1-auroc-ovr', 'brier']: plot_results(paths, tables, base_names, n_hpo_steps=30, n_tt_splits=5, use_percentages=True, metric_name=metric_name, coll_name=coll_name, use_validation_errors=False, use_small_plot=use_small_plot, use_medium_plot=not use_small_plot, use_mean_results=False, threshold=100, n_cv=1, title=r'\textbf{Tabular data, tuned hyperparameters}' if 'RealMLP-HPO' in base_names else r'\textbf{Tabular data, default hyperparameters}') # plot stopping times for base_names in [ # ['RealMLP-HPO'], ['MLP-HPO'], ['XGB-HPO'], ['RealMLP-TD'], ['MLP-D'], ['XGB-D']]: for coll_name in ['talent-class-small-above10k', 'talent-class-small']: plot_results(paths, tables, base_names, n_hpo_steps=30, n_tt_splits=5, use_percentages=False, metric_name='cross_entropy', coll_name=coll_name, use_validation_errors=False, use_small_plot=True, use_medium_plot=False, use_mean_results=False, n_cv=1, plot_stopping_times=True, threshold=None, 
title=r'\textbf{Tabular data, tuned hyperparameters}' if any( '-HPO' in base_name for base_name in base_names) else r'\textbf{Tabular data, default hyperparameters}') ================================================ FILE: scripts/create_xrfm_ablations_table.py ================================================ from typing import List, Optional import numpy as np from pytabkit.bench.run.results import ResultManager from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.analysis import ResultsTables, get_benchmark_results from pytabkit.bench.eval.tables import _get_table_str from pytabkit.models import utils def generate_xrfm_ablations_results_table(paths: Paths, tables: ResultsTables, filename: str, coll_name: str, test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None): table = tables.get(coll_name, tag='default') alg_display_names = { 'xRFM-HPO-paper-large_new': 'AGOP', 'xRFM-HPO-large-temptune_new': 'AGOP + TT', 'xRFM-HPO-large-temptune-pca_new': 'PCA + TT', 'xRFM-HPO-large-temptune-rf_new': 'RF + TT' } alg_names = list(alg_display_names.keys()) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=test_metric_name, val_metric_name=val_metric_name, return_percentages=False, use_task_mean=False, use_geometric_mean=False, n_splits=1) alg_names = [an for an in alg_names if an in means] table_head = [['', r'\multicolumn{4}{c}{Splitting method}'], ['Dataset'] + [alg_display_names[an] for an in alg_names]] table_body = [] enumerated_task_infos = list(enumerate(table.test_table.task_infos)) enumerated_task_infos.sort(key=lambda tup: tup[1].task_desc.task_name.lower()) print(f'{coll_name=}') print(f'{list(means.keys())=}') def get_score_strings(scores: List[float], maximize: bool = False, use_int: bool = False) -> List[str]: best_row_score = np.max(scores) if maximize else np.min(scores) is_best_list = [score == best_row_score for score in scores] row_strs = [] for is_best, row_score in zip(is_best_list, scores): cur_str = str(round(row_score)) if use_int else f'{row_score:5.4f}' if is_best: cur_str = r'\textbf{' + cur_str + r'}' row_strs.append(cur_str) return row_strs for task_idx, task_info in enumerated_task_infos: row_scores = [means[alg_name][task_idx] for alg_name in alg_names] table_body.append([task_info.task_desc.task_name] + get_score_strings(row_scores)) # escape underscores for latex table_head = [[val.replace('_', r'\_') for val in row] for row in table_head] table_body = [[val.replace('_', r'\_') for val in row] for row in table_body] # generate bottom part, first average scores # indexed by [task][alg] scores_matrix = np.asarray( [[means[alg_name][task_idx] for alg_name in alg_names] for task_idx, _ in enumerated_task_infos]) n_wins = (scores_matrix == np.min(scores_matrix, axis=1)[:, None]).astype(np.int32).sum(axis=0).tolist() table_foot = [['Number of wins:'] + get_score_strings(n_wins, maximize=True, use_int=True), ['Shifted geometric mean:'] \ + get_score_strings(np.exp(np.mean(np.log(scores_matrix + 0.01), axis=0)).tolist()), ['Arithmetic mean:'] \ + get_score_strings(np.mean(scores_matrix, axis=0).tolist()) ] # get runtimes mean_fit_times = [] mean_eval_times = [] for alg_name in alg_names: fit_times = [] eval_times = [] for task_idx, task_info in enumerated_task_infos: fit_time = 0.0 eval_time = 0.0 for hpo_step in range(30): path = paths.results_alg_task_split(task_desc=task_info.task_desc, alg_name=alg_name + 
f'_step-{hpo_step}', n_cv=1, split_type=SplitType.RANDOM, split_id=0) rm = ResultManager.load(path, load_preds=False) fit_time += rm.other_dict['cv']['fit_time_s'] eval_time += rm.other_dict['cv']['eval_time_s'] fit_times.append(fit_time) eval_times.append(eval_time) mean_fit_times.append(np.mean(fit_times)) mean_eval_times.append(np.mean(eval_times)) table_foot.append(['Average fit time [s]:'] + get_score_strings(mean_fit_times, use_int=True)) table_foot.append(['Average eval time [s]:'] + get_score_strings(mean_eval_times, use_int=True)) table_str = _get_table_str(table_head, table_body, table_foot) file_path = paths.plots() / filename utils.writeToFile(file_path, table_str) if __name__ == '__main__': paths = Paths.from_env_variables() tables = ResultsTables(paths) for coll_name in ['meta-test-large-class', 'meta-test-large-reg']: generate_xrfm_ablations_results_table(paths, tables, f'individual_results_{coll_name}.tex', coll_name=coll_name) ================================================ FILE: scripts/custom_paths.py.default ================================================ def get_base_folder(): return 'tab_bench_data' ================================================ FILE: scripts/download_data.py ================================================ from typing import Optional import fire from pytabkit.bench.data.common import TaskSource from pytabkit.bench.data.get_uci import download_all_uci from pytabkit.bench.data.import_talent_benchmark import import_talent_benchmark from pytabkit.bench.data.import_tasks import import_uci_tasks, get_openml_task_ids, import_openml, get_openml_ds_names from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection, TaskDescription, TaskInfo def run_import(openml_cache_dir: str = None, import_meta_train: bool = False, import_meta_test: bool = False, import_openml_class_bin_extra: bool = False, import_grinsztajn: bool = False, import_grinsztajn_medium: bool = False, import_tabzilla_hard: bool = False, import_automl_class_small: bool = False, import_talent_class_small: bool = False, import_talent_reg_small: bool = False, import_tabarena: bool = False, talent_folder: Optional[str] = None): paths = Paths.from_env_variables() min_n_samples = 1000 if import_meta_train: print(f'Importing meta-train') # import UCI download_all_uci(paths) import_uci_tasks(paths) # generate task collections uci_multi_class_descs = TaskCollection.from_source(TaskSource.UCI_MULTI_CLASS, paths).task_descs uci_bin_class_descs = TaskCollection.from_source(TaskSource.UCI_BIN_CLASS, paths).task_descs uci_multi_class_task_names = [td.task_name for td in uci_multi_class_descs] uci_class_descs = uci_multi_class_descs + [td for td in uci_bin_class_descs if td.task_name not in uci_multi_class_task_names] uci_class_descs = [td for td in uci_class_descs if td.load_info(paths).n_samples >= min_n_samples] TaskCollection('meta-train-class', uci_class_descs).save(paths) uci_reg_descs = TaskCollection.from_source(TaskSource.UCI_REGRESSION, paths).task_descs uci_reg_descs = [td for td in uci_reg_descs if td.load_info(paths).n_samples >= min_n_samples] TaskCollection('meta-train-reg', uci_reg_descs).save(paths) # maybe could use faster pyarrow backend for pandas if v2 is available? 
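# note (illustrative, assuming pandas >= 2.0): pyarrow-backed dtypes can also be requested
# per call rather than globally, e.g. pd.read_csv(path, dtype_backend='pyarrow'),
# which can speed up IO and reduce memory for string-heavy tables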
# pd.options.mode.dtype_backend = "pyarrow" if import_meta_test or import_openml_class_bin_extra or import_automl_class_small: # import AutoML Benchmark and CTR-23 benchmark # could also import the TabZilla suite # https://www.openml.org/search?type=study&study_type=task&id=379&sort=tasks_included # but the selection criteria for this one are based a lot on the performance of different algorithms automl_class_task_ids = get_openml_task_ids(271) automl_reg_task_ids = get_openml_task_ids(269) ctr23_reg_task_ids = get_openml_task_ids(353) sarcos_duplicated_task_id = 361254 sarcos_deduplicated_task_id = 361011 if sarcos_duplicated_task_id in ctr23_reg_task_ids: # use the version of sarcos without the duplicated test set print(f'Using a different version of the sarcos data set for the CTR-23 benchmark') ctr23_reg_task_ids.remove(sarcos_duplicated_task_id) ctr23_reg_task_ids.append(sarcos_deduplicated_task_id) all_reg_task_ids = list(set(automl_reg_task_ids + ctr23_reg_task_ids)) # todo automl_class_ds_names = get_openml_ds_names(automl_class_task_ids) automl_reg_ds_names = get_openml_ds_names(automl_reg_task_ids) ctr23_reg_ds_names = get_openml_ds_names(ctr23_reg_task_ids) def check_task(td: TaskDescription, min_n_samples: Optional[int] = None, max_one_hot_size: Optional[int] = None) -> bool: task_info = td.load_info(paths) if min_n_samples is not None and task_info.n_samples < min_n_samples: print(f'Ignoring task {str(td)} because it has too few samples') return False n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() # ignore 'missing' categories # todo: is this really the way we should handle this? d_one_hot = n_cont + sum([1 if cs == 3 else cs - 1 for cs in cat_sizes]) if max_one_hot_size is not None and d_one_hot > max_one_hot_size: print(f'Ignoring task {str(td)} because it is too high-dimensional after one-hot encoding') return False return True if import_meta_test: print(f'Importing meta-test') # treat dionis separately because we want to subsample it to 100k instead of 500k samples for speed and RAM reasons automl_class_task_ids_not_dionis = [id for id, name in zip(automl_class_task_ids, automl_class_ds_names) if name != 'dionis'] automl_class_task_ids_dionis = [id for id, name in zip(automl_class_task_ids, automl_class_ds_names) if name == 'dionis'] assert len(automl_class_task_ids_dionis) == 1 assert len(automl_class_task_ids_not_dionis) == len(automl_class_task_ids) - 1 import_openml(automl_class_task_ids_not_dionis, TaskSource.OPENML_CLASS, paths, openml_cache_dir, max_n_samples=500_000, rerun=False) import_openml(automl_class_task_ids_dionis, TaskSource.OPENML_CLASS, paths, openml_cache_dir, max_n_samples=100_000, rerun=True) import_openml(all_reg_task_ids, TaskSource.OPENML_REGRESSION, paths, openml_cache_dir, normalize_y=True, max_n_samples=500000, rerun=False) class_descs = TaskCollection.from_source(TaskSource.OPENML_CLASS, paths).task_descs # generate task collections exclude_automl_class = ['kr-vs-kp', 'wilt', 'ozone-level-8hr', 'first-order-theorem-proving', 'GesturePhaseSegmentationProcessed', 'PhishingWebsites', 'wine-quality-white', 'nomao', 'bank-marketing', 'adult'] filtered_class_descs = [td for td in class_descs if td.task_name not in exclude_automl_class and td.task_name in automl_class_ds_names and check_task(td, min_n_samples=min_n_samples, max_one_hot_size=10000)] TaskCollection('meta-test-class', filtered_class_descs).save(paths) # we exclude Brazilian_houses because there is already 
brazilian_houses in ctr-23, # and Brazilian_houses includes three features that should not be used for predicting the target, # while brazilian_houses should not contain them exclude_automl_reg = ['wine_quality', 'abalone', 'OnlineNewsPopularity', 'Brazilian_houses'] exclude_ctr23_reg = ['abalone', 'physiochemical_protein', 'naval_propulsion_plant', 'superconductivity', 'white_wine', 'red_wine', 'grid_stability'] reg_descs = TaskCollection.from_source(TaskSource.OPENML_REGRESSION, paths).task_descs filtered_reg_descs = [td for td in reg_descs if td.task_name not in exclude_automl_reg + exclude_ctr23_reg and td.task_name in automl_reg_ds_names + ctr23_reg_ds_names and check_task(td, min_n_samples=min_n_samples, max_one_hot_size=10000)] TaskCollection('meta-test-reg', filtered_reg_descs).save(paths) if import_openml_class_bin_extra: print(f'Importing openml-class-bin-extra') # also import binary version of multiclass tasks # requires that meta_test has already been imported class_descs = TaskCollection.from_source(TaskSource.OPENML_CLASS, paths).task_descs multiclass_names = [td.task_name for td in class_descs if td.load_info(paths).get_n_classes() > 2] # print(f'{multiclass_names=}') import_openml(automl_class_task_ids, TaskSource.OPENML_CLASS_BIN_EXTRA, paths, openml_cache_dir, max_n_classes=2, include_only_ds_names=multiclass_names) if import_automl_class_small: print(f'Importing automl-class-small') import_openml(automl_class_task_ids, TaskSource.AUTOML_CLASS_SMALL, paths, openml_cache_dir, ignore_above_n_classes=50, min_n_samples=1000, max_n_samples=100_000) descs = TaskCollection.from_source(TaskSource.AUTOML_CLASS_SMALL, paths).task_descs filtered_descs = [td for td in descs if check_task(td, max_one_hot_size=1000)] TaskCollection('automl-class-small-filtered', filtered_descs).save(paths) if import_grinsztajn: print(f'Importing grinsztajn benchmark') import_grinsztajn_datasets(openml_cache_dir) if import_grinsztajn_medium: print(f'Importing grinsztajn medium benchmark') import_grinsztajn_medium_datasets(openml_cache_dir) if import_tabzilla_hard: print(f'Importing TabZilla hard benchmark') import_tabzilla_hard_datasets(openml_cache_dir) if import_talent_class_small: if talent_folder is None: raise ValueError(f'Please specify talent_folder to import datasets from the TALENT benchmark') import_talent_benchmark(paths, talent_folder=talent_folder, source_name='talent-class-small', allow_regression=False, min_n_samples=1000, max_n_samples=100_000, ignore_above_n_classes=100) task_infos = TaskCollection.from_source('talent-class-small', paths).load_infos(paths) bin_task_descs = [ti.task_desc for ti in task_infos if ti.get_n_classes() == 2] multi_task_descs = [ti.task_desc for ti in task_infos if ti.get_n_classes() != 2] TaskCollection('talent-bin-class-small', bin_task_descs).save(paths) TaskCollection('talent-multi-class-small', multi_task_descs).save(paths) above10k_descs = [ti.task_desc for ti in task_infos if ti.n_samples >= 10_000] below10k_descs = [ti.task_desc for ti in task_infos if ti.n_samples < 10_000] TaskCollection('talent-class-small-above10k', above10k_descs).save(paths) TaskCollection('talent-class-small-below10k', below10k_descs).save(paths) talent_class_tabpfn_task_descs = [ti.task_desc for ti in task_infos if ti.get_n_classes() <= 10 and ti.n_samples <= 10_000 and ti.tensor_infos[ 'x_cont'].get_n_features() + ti.tensor_infos[ 'x_cat'].get_n_features() <= 500] TaskCollection('talent-class-tabpfn', talent_class_tabpfn_task_descs).save(paths) if import_talent_reg_small: if 
talent_folder is None: raise ValueError(f'Please specify talent_folder to import datasets from the TALENT benchmark') import_talent_benchmark(paths, talent_folder=talent_folder, source_name='talent-reg-small', allow_regression=True, allow_classification=False, min_n_samples=1000, max_n_samples=100_000) task_infos = TaskCollection.from_source('talent-reg-small', paths).load_infos(paths) talent_reg_tabpfn_task_descs = [ti.task_desc for ti in task_infos if ti.n_samples <= 10_000 and ti.tensor_infos[ 'x_cont'].get_n_features() + ti.tensor_infos[ 'x_cat'].get_n_features() <= 500] TaskCollection('talent-reg-tabpfn', talent_reg_tabpfn_task_descs).save(paths) if import_tabarena: all_ids = { TaskSource.TABARENA_REG: [363611, 363612, 363615, 363622, 363625, 363631, 363672, 363675, 363678, 363686, 363693, 363697, 363698, 363701, 363705, 363708, 363709], TaskSource.TABARENA_CLASS: [363613, 363614, 363616, 363617, 363618, 363619, 363620, 363621, 363623, 363624, 363626, 363627, 363628, 363629, 363630, 363632, 363671, 363673, 363674, 363676, 363677, 363679, 363680, 363681, 363682, 363683, 363684, 363685, 363687, 363688, 363689, 363691, 363692, 363694, 363695, 363696, 363699, 363700, 363702, 363703, 363704, 363706, 363707] } for task_source, ids in all_ids.items(): print(f'Importing {task_source}') class_descs = TaskCollection.from_source(TaskSource.OPENML_CLASS, paths).task_descs multiclass_names = [td.task_name for td in class_descs if td.load_info(paths).get_n_classes() > 2] # print(f'{multiclass_names=}') import_openml(ids, task_source, paths, openml_cache_dir, min_n_samples=500) TaskCollection.from_source(task_source, paths).save(paths) def import_grinsztajn_datasets(openml_cache_dir: str = None): # import data sets from the benchmark of Grinsztajn et al. paths = Paths.from_env_variables() import_openml(get_openml_task_ids(334), 'grinsztajn-cat-class', paths, openml_cache_dir, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(335), 'grinsztajn-cat-reg', paths, openml_cache_dir, normalize_y=True, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(336), 'grinsztajn-num-reg', paths, openml_cache_dir, normalize_y=True, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(337), 'grinsztajn-num-class', paths, openml_cache_dir, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(334), 'grinsztajn-cat-class-15k', paths, openml_cache_dir, max_n_samples=15_000, rerun=False) import_openml(get_openml_task_ids(335), 'grinsztajn-cat-reg-15k', paths, openml_cache_dir, normalize_y=True, max_n_samples=15_000, rerun=False) import_openml(get_openml_task_ids(336), 'grinsztajn-num-reg-15k', paths, openml_cache_dir, normalize_y=True, max_n_samples=15_000, rerun=False) import_openml(get_openml_task_ids(337), 'grinsztajn-num-class-15k', paths, openml_cache_dir, max_n_samples=15_000, rerun=False) def import_grinsztajn_medium_datasets(openml_cache_dir: str = None): paths = Paths.from_env_variables() for bench_name, bench_id_cat, bench_id_num in [('grinsztajn-class', 334, 337), ('grinsztajn-reg', 335, 336)]: task_ids_cat = get_openml_task_ids(bench_id_cat) task_ids_num = get_openml_task_ids(bench_id_num) task_ids = task_ids_cat + [task_id for task_id in task_ids_num if task_id not in task_ids_cat] import_openml(task_ids, bench_name, paths, openml_cache_dir, max_n_samples=500_000, # normalize_y=(bench_name=='grinsztajn-reg'), rerun=False) task_infos = TaskCollection.from_source(bench_name, paths).load_infos(paths) for task_info in task_infos: # use 13333 so 
the 75%-25% train-val split will use 10k training samples task_info.max_n_trainval = 13_333 task_info.save(paths) tc_orig = TaskCollection.from_source('grinsztajn-class', paths) tc_orig.save(paths) # exclude eye_movements because it has a leak according to the TabR paper tc = TaskCollection('grinsztajn-class-filtered', [task_desc for task_desc in tc_orig.task_descs if task_desc.task_name != 'eye_movements']) tc.save(paths) def import_tabzilla_hard_datasets(openml_cache_dir: str = None): # import data sets from the TabZilla hard benchmark (OpenML suite 379) paths = Paths.from_env_variables() import_openml(get_openml_task_ids(379), 'tabzilla-hard-class', paths, openml_cache_dir, rerun=False) def split_meta_test(paths: Paths): for task_type in ['class', 'reg']: coll_name = f'meta-test-{task_type}' task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) def is_ood(task_info: TaskInfo): if task_info.n_samples < 1500 or task_info.n_samples > 60000: return True n_features = (task_info.tensor_infos['x_cont'].get_n_features() + task_info.tensor_infos['x_cat'].get_n_features()) if n_features > 750: return True x_cat_info = task_info.tensor_infos['x_cat'] if x_cat_info.get_n_features() > 0 and x_cat_info.get_cat_sizes().max().item() > 50: return True return False id_task_descs = [task_info.task_desc for task_info in task_infos if not is_ood(task_info)] ood_task_descs = [task_info.task_desc for task_info in task_infos if is_ood(task_info)] TaskCollection(f'{coll_name}-indist', id_task_descs).save(paths) TaskCollection(f'{coll_name}-oodist', ood_task_descs).save(paths) print(f'{len(id_task_descs)=}, {len(ood_task_descs)=}') # could extend this for other task collections like openml-cc18, pmlb, uci121 or uci-small if __name__ == '__main__': fire.Fire(run_import) # import_grinsztajn_datasets() # paths = Paths.from_env_variables() # split_meta_test(paths) # meta_train = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) # only_bin_class = [info.task_desc for info in meta_train if info.get_n_classes() == 2] # only_multi_class = [info.task_desc for info in meta_train if info.get_n_classes() > 2] # TaskCollection('meta-train-bin-class', only_bin_class).save(paths) # TaskCollection('meta-train-multi-class', only_multi_class).save(paths) # print(get_openml_ds_names([361011])) # ctr23_reg_task_ids = get_openml_task_ids(353) # ctr23_reg_ds_names = get_openml_ds_names(ctr23_reg_task_ids) # for ds_name in ctr23_reg_ds_names: # print(ds_name) # test brazilian houses data set # import openml # import pandas as pd # task = openml.tasks.get_task(361267, download_data=False) # dataset = openml.datasets.get_dataset(task.dataset_id, download_data=True) # df: pd.DataFrame = dataset.get_data()[0] # print(df.head()) # print(dataset.dataset_id) # test sarcos dataset # import openml # task = openml.tasks.get_task(361011, download_data=False) # dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False) # print(dataset.dataset_id) ================================================ FILE: scripts/estimate_resource_params.py ================================================ import multiprocessing import time from typing import List, Dict, Any, Callable import numpy as np import sklearn import torch from sklearn.base import BaseEstimator from pytabkit.models.utils import FunctionProcess from pytabkit.models.alg_interfaces.resource_computation import UniformSampler, FeatureSpec, get_resource_features, \ process_resource_features, \ Sampler, ds_to_xy, fit_resource_factors, TimeWrapper from 
pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.models import utils from pytabkit.models.data.data import DictDataset from pytabkit.models.sklearn.sklearn_interfaces import CatBoost_TD_Classifier, XGB_TD_Classifier, LGBM_TD_Classifier def get_param_grid(grids_1d: Dict[str, List[Any]]) -> List[Dict[str, Any]]: configs = [dict()] for key, values in grids_1d.items(): configs = [utils.update_dict(c, {key: val}) for val in values for c in configs] return configs def estimate_params(paths: Paths, exp_name: str, coll_name: str, estimator: BaseEstimator, is_lgbm: bool = False, rerun: bool = False): if is_lgbm: # use num_leaves instead of max_depth learner_space = dict( n_estimators=UniformSampler(2, 2, log=True, is_int=True), n_threads=UniformSampler(4, 4, log=True, is_int=True), num_leaves=UniformSampler(10, 100, log=True, is_int=True), ) time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'log_num_leaves', 'num_leaves'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'log_num_leaves', 'num_leaves'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) else: learner_space = dict( n_estimators=UniformSampler(2, 2, log=True, is_int=True), n_threads=UniformSampler(4, 4, log=True, is_int=True), max_depth=UniformSampler(3, 10, is_int=True), ) time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'max_depth', '2_power_maxdepth'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'max_depth', '2_power_maxdepth'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) coefs = calibrate_resources(exp_name, paths=paths, learner_space=learner_space, coll_name=coll_name, time_feature_spec=time_feature_spec, ram_feature_spec=ram_feature_spec, sklearn_learner=estimator, n_combinations=300, rerun=rerun) print(f'time_params={coefs["time_s"]}') print(f'cpu_ram_params={coefs["ram_gb"]}') ram_params = coefs['ram_gb'] time_params = coefs['time_s'] print(f'Analyzing dionis:') task_info = TaskDescription('openml-class', 'dionis').load_info(paths) # task_info = TaskDescription('uci-bin-class', 'madelon').load_info(paths) ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) config = dict(n_estimators=1000, n_threads=4, max_depth=6, num_leaves=31) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1) ram_features = process_resource_features(raw_features, ram_feature_spec) ram_gb = sum([ram_features[key] * ram_params[key] for key in ram_params]) time_features = process_resource_features(raw_features, time_feature_spec) time_s = sum([time_features[key] * time_params[key] for key in time_params]) print(f'{ram_gb=}, {time_s=}') def estimate_params_new(paths: Paths, exp_name: str, coll_name: str, estimator: BaseEstimator, hparam_grid: List[Dict[str, Any]], short_name: str, is_lgbm: bool = False, rerun: bool = False): if is_lgbm: # use num_leaves instead of max_depth 
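# The resource estimates are linear models over the feature specs constructed below:
# each spec enumerates candidate features (raw dataset sizes plus products such as
# n_samples * n_features * n_estimators / n_threads), fit_resource_factors fits one
# coefficient per feature, and predictions are then formed as
# sum(features[key] * params[key] for key in params), as in the dionis sanity check
# at the end of this function.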
time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'log_num_leaves', 'num_leaves'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'log_num_leaves', 'num_leaves'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) else: time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'max_depth', '2_power_maxdepth'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'max_depth', '2_power_maxdepth'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) coefs = calibrate_resources_new_2(exp_name, paths=paths, hparam_grid=hparam_grid, coll_name=coll_name, time_feature_spec=time_feature_spec, ram_feature_spec=ram_feature_spec, sklearn_learner=estimator, rerun=rerun) print(f'{short_name}_time={coefs["time_s"]}') print(f'{short_name}_ram={coefs["ram_gb"]}') ram_params = coefs['ram_gb'] time_params = coefs['time_s'] print(f'Analyzing dionis:') task_info = TaskDescription('openml-class', 'dionis').load_info(paths) # task_info = TaskDescription('uci-bin-class', 'madelon').load_info(paths) ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) config = dict(n_estimators=1000, n_threads=4, max_depth=6, num_leaves=31) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1) ram_features = process_resource_features(raw_features, ram_feature_spec) ram_gb = sum([ram_features[key] * ram_params[key] for key in ram_params]) time_features = process_resource_features(raw_features, time_feature_spec) time_s = sum([time_features[key] * time_params[key] for key in time_params]) print(f'{ram_gb=}, {time_s=}') if __name__ == '__main__': print(get_param_grid(dict(n_estimators=[2], max_depth=[4, 6, 7, 9]))) paths = Paths.from_env_variables() # estimate_catboost_params(paths) # estimate_params(paths, 'CB-class-7', 'meta-test-class', CatBoostTDClassifier(verbosity=2)) # # estimate_params(paths, 'CB-reg-7', 'meta-test-reg', CatBoostTDClassifier(verbosity=2)) # estimate_params(paths, 'XGB-class-2', 'meta-test-class', # XGBTDClassifier(verbosity=2, subsample=1.0, colsample_bytree=1.0, colsample_bylevel=1.0)) # estimate_params(paths, 'LGBM-class-3', 'meta-test-class', # LGBMTDClassifier(subsample=1.0), is_lgbm=True) estimate_params_new(paths, 'CB-class-11', 'meta-test-class', CatBoost_TD_Classifier(subsample=1.0), hparam_grid=get_param_grid(dict(n_estimators=[2], n_threads=[4], max_depth=[4, 6, 7, 9])), short_name='cb_class') # estimate_params(paths, 'CB-reg-7', 'meta-test-reg', CatBoostTDClassifier(verbosity=2)) estimate_params_new(paths, 'XGB-class-3', 'meta-test-class', XGB_TD_Classifier(subsample=1.0, colsample_bytree=1.0, colsample_bylevel=1.0), hparam_grid=get_param_grid(dict(n_estimators=[2], n_threads=[4], max_depth=[4, 6, 8, 11])), short_name='xgb_class') estimate_params_new(paths, 'LGBM-class-4', 'meta-test-class', LGBM_TD_Classifier(subsample=1.0, colsample_bytree=1.0), hparam_grid=get_param_grid(dict(n_estimators=[2], n_threads=[4], num_leaves=[31, 
100, 300, 1000])), short_name='lgbm_class', is_lgbm=True) pass def calibrate_resources(exp_name: str, paths: Paths, learner_space: Dict[str, Sampler], coll_name: str, time_feature_spec: List[str], ram_feature_spec: List[str], sklearn_learner: BaseEstimator, n_combinations: int, rerun: bool) \ -> Dict[str, Dict[str, float]]: if multiprocessing.get_start_method() != 'spawn': multiprocessing.set_start_method('spawn', force=True) all_results = [] task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) for i in range(n_combinations): np.random.seed(i) torch.manual_seed(i) file_path = paths.resources_exp_it(exp_name, i) / 'results.yaml' learner_params = {key: value.sample() for key, value in learner_space.items()} task_idx = np.random.randint(len(task_infos)) task_info = task_infos[task_idx] print(f'Iteration {i + 1}/{n_combinations}: Evaluating {type(sklearn_learner)} with \n' f'{str(task_info.task_desc)=}\n' f'{learner_params=}', flush=True) if utils.existsFile(file_path) and not rerun: print(f'Loading saved result') all_results.append(utils.deserialize(file_path, use_yaml=True)) else: print(f'Running estimator...') # compute it learner: BaseEstimator = sklearn.base.clone(sklearn_learner) learner.set_params(**learner_params) ds = task_info.load_task(paths).ds X, y = ds_to_xy(ds) f = lambda learner_=learner, X_=X, y_=y[:, 0]: learner_.fit(X_, y_) new_results: Dict[str, Dict[str, Any]] = dict() new_results['measured'] = measure_resources(f) new_results['features'] = get_resource_features(config=learner_params, ds=ds, n_cv=1, n_refit=0, n_splits=1) # new_results['features'] = {'time_s': time_feature_map.get_features(ds), # 'ram_gb': ram_feature_map.get_features(ds)} all_results.append(new_results) utils.serialize(file_path, new_results, use_yaml=True) print(all_results[-1]['measured']) coefs = dict() coefs['time_s'] = fit_resource_factors([(process_resource_features(results['features'], time_feature_spec), results['measured']['time_s']) for results in all_results], pessimistic=False) coefs['ram_gb'] = fit_resource_factors([(process_resource_features(results['features'], ram_feature_spec), results['measured']['ram_gb']) for results in all_results], pessimistic=True) return coefs def calibrate_resources_new_2(exp_name: str, paths: Paths, hparam_grid: List[Dict[str, Any]], coll_name: str, time_feature_spec: List[str], ram_feature_spec: List[str], sklearn_learner: BaseEstimator, rerun: bool) \ -> Dict[str, Dict[str, float]]: if multiprocessing.get_start_method() != 'spawn': multiprocessing.set_start_method('spawn', force=True) all_results = [] task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) for idx_1, task_info in enumerate(task_infos): for idx_2, learner_params in enumerate(hparam_grid): i = idx_1 * len(hparam_grid) + idx_2 np.random.seed(i) torch.manual_seed(i) file_path = paths.resources_exp_it(exp_name, i) / 'results.yaml' print(f'Iteration {i + 1}/{len(task_infos)*len(hparam_grid)}: Evaluating {type(sklearn_learner)} with \n' f'{str(task_info.task_desc)=}\n' f'{learner_params=}', flush=True) if utils.existsFile(file_path) and not rerun: print(f'Loading saved result') all_results.append(utils.deserialize(file_path, use_yaml=True)) else: print(f'Running estimator...') # compute it learner: BaseEstimator = sklearn.base.clone(sklearn_learner) learner.set_params(**learner_params) ds = task_info.load_task(paths).ds X, y = ds_to_xy(ds) f = lambda learner_=learner, X_=X, y_=y[:, 0]: learner_.fit(X_, y_) new_results: Dict[str, Dict[str, Any]] = dict() 
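# measure_resources (defined below) runs the fit closure in a child process via
# FunctionProcess, polls that process's RAM usage every 10 ms while it runs, and
# returns a dict with the wall-clock fit time ('time_s') and the peak observed
# RAM ('ram_gb').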
new_results['measured'] = measure_resources(f) new_results['features'] = get_resource_features(config=learner_params, ds=ds, n_cv=1, n_refit=0, n_splits=1) # new_results['features'] = {'time_s': time_feature_map.get_features(ds), # 'ram_gb': ram_feature_map.get_features(ds)} all_results.append(new_results) utils.serialize(file_path, new_results, use_yaml=True) print(all_results[-1]['measured']) coefs = dict() coefs['time_s'] = fit_resource_factors([(process_resource_features(results['features'], time_feature_spec), results['measured']['time_s']) for results in all_results], pessimistic=True) coefs['ram_gb'] = fit_resource_factors([(process_resource_features(results['features'], ram_feature_spec), results['measured']['ram_gb']) for results in all_results], pessimistic=True, coef_factor=1.6) return coefs def measure_resources(f: Callable[[], None]) -> Dict[str, float]: # open function in one process (that measures the time), poll the RAM usages from another process process = FunctionProcess(TimeWrapper(f)) process.start() time_interval = 0.01 max_ram_usage_gb = 0.0 while not process.is_done(): max_ram_usage_gb = max(max_ram_usage_gb, process.get_ram_usage_gb()) time.sleep(time_interval) process_time = process.pop_result() return {'time_s': process_time, 'ram_gb': max_ram_usage_gb} ================================================ FILE: scripts/get_sklearn_names.py ================================================ import importlib # get the names of all sklearn interfaces, for exporting them in __all__ to import them from a higher-level module if __name__ == '__main__': # Import the module module = importlib.import_module("pytabkit.models.sklearn.sklearn_interfaces") # Get all top-level attributes of the module (like classes, functions) attrs = [attr_name for attr_name in dir(module) if not attr_name.startswith('_') and not 'Mixin' in attr_name and hasattr(getattr(module, attr_name), '__module__') and getattr(module, attr_name).__module__ == module.__name__] print(f'"' + '", "'.join(attrs) + '"') ================================================ FILE: scripts/make_plot_animation.py ================================================ from typing import List from pytabkit.bench.eval.plotting import plot_pareto from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.analysis import ResultsTables from pathlib import Path def plot_animations(coll_names: List[str]): paths = Paths.from_env_variables() tables = ResultsTables(paths) arrow_alg_names = [('MLP-PLR-D', 'RealMLP-TD'), ('TabR-S-D', 'RealTabR-D'), ('XGB-D', 'XGB-TD'), ('LGBM-D', 'LGBM-TD'), ('CatBoost-D', 'CatBoost-TD'), ('MLP-PLR-HPO', 'RealMLP-HPO')] alg_names = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost', 'BestModel', 'Ensemble'] for version in ['D', 'TD', 'HPO']] alg_names.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO', 'MLP-RTDL-D', 'MLP-RTDL-HPO', 'MLP-PLR-D', 'MLP-PLR-HPO', 'RealTabR-D', 'FTT-D', 'FTT-HPO', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO', 'RF-SKL-D', 'RF-HPO', 'XGB-PBB-D', 'TabR-S-D', 'TabR-HPO']) alg_names_to_keep = ["MLP-RTDL-D", "MLP-PLR-D", "RealMLP-TD", "MLP-HPO", "MLP-PLR-HPO", "RealMLP-HPO", "MLP-RTDL-HPO"] # #all # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # plot_pareto_frontier=False, # filename_suffix='_1', # subfolder='animations', # alg_names_to_hide=[])#alg_name for alg_name in alg_names if alg_name not in 
black_border_alg_names]) # # # show pareto frontier # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_2', # subfolder='animations', # alg_names_to_hide=[])#alg_name for alg_name in alg_names if alg_name not in black_border_alg_names]) # # # show only MLP models # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_3', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # # add NN baselines # alg_names_to_keep = ["MLP-RTDL-D", "MLP-PLR-D", "RealMLP-TD", "MLP-HPO", "MLP-PLR-HPO", "RealMLP-HPO", # "MLP-RTDL-HPO", "TabR-S-D", "TabR-HPO", "FTT-D", "FTT-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_4', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # # show that we can also improve TabR with RealTabr # alg_names_to_keep = ["MLP-RTDL-D", "MLP-PLR-D", "RealMLP-TD", "MLP-HPO", "MLP-PLR-HPO", "RealMLP-HPO", # "MLP-RTDL-HPO", "TabR-S-D", "TabR-HPO", "RealTabR-D", "FTT-D", "FTT-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_5', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # #show that we can also create TD for trees # alg_names_to_keep = ["CatBoost-D", "CatBoost-TD", "CatBoost-HPO", # "XGB-D", "XGB-TD", "XGB-HPO", # "LGBM-D", "LGBM-TD", "LGBM-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_6', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # # show that ensembles work well for td # alg_names_to_keep = ["CatBoost-TD", "CatBoost-HPO", # "XGB-TD", "XGB-HPO", # "LGBM-TD", "LGBM-HPO", # "RealMLP-TD", "RealMLP-HPO", # "Ensemble-D", "BestModel-D", # "Ensemble-TD", "Ensemble-HPO", # "BestModel-TD", "BestModel-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_7', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # alg_names_to_keep = ["CatBoost-D", "CatBoost-TD", #"CatBoost-HPO", # "XGB-D", "XGB-TD", #"XGB-HPO", # "LGBM-D", "LGBM-TD", #"LGBM-HPO", # "MLP-PLR-D", "MLP-PLR-HPO", # "RealMLP-TD", "RealMLP-HPO", # "TabR-S-D", "RealTabR-D"] # # plot_pareto(paths, 
tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_8', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) alg_names_to_keep = ["CatBoost-D", "CatBoost-TD", "CatBoost-HPO", "MLP-PLR-D", "MLP-PLR-HPO", "RealMLP-TD", "RealMLP-HPO", "TabR-S-D", "RealTabR-D", "TabR-HPO", "BestModel-D", "BestModel-TD", "BestModel-HPO"] plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=False, use_normalized_errors=False, use_grinnorm_errors=False, use_geometric_mean=True, arrow_alg_names=arrow_alg_names, pareto_frontier_width=4., filename_suffix='_9', subfolder='animations', alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # animation # everything # then bigger pareto front # then remove everything except the algorithms of interest if __name__ == '__main__': coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', 'grinsztajn-class-filtered', 'grinsztajn-reg'] plot_animations(['meta-test-class', 'meta-test-reg']) plot_animations(['grinsztajn-class-filtered', 'grinsztajn-reg']) plot_animations(['meta-train-class', 'meta-train-reg']) plot_animations(['meta-test-class', 'grinsztajn-class-filtered']) plot_animations(['meta-test-reg', 'grinsztajn-reg']) ================================================ FILE: scripts/meta_hyperopt.py ================================================ from typing import Optional, Tuple, Any, Dict import numpy as np from pytabkit.bench.alg_wrappers.interface_wrappers import LGBMInterfaceWrapper, XGBInterfaceWrapper, \ CatBoostInterfaceWrapper from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.bench.eval.evaluation import FunctionAlgFilter, MultiResultsTable, DefaultEvalModeSelector, \ MeanTableAnalyzer from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.hyper_opt.coord_opt import Hyperparameter, CoordOptimizer from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer, f_unpack_dict from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.models.nn_models.categorical import EncodingFactory, SingleOrdinalEncodingFactory from pytabkit.models.training.logging import StdoutLogger def load_score(alg_name: Optional[str] = None, coll_name: str = 'meta-train-class', n_cv: int = 1, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, split_type: str = SplitType.RANDOM, use_task_weighting: bool = True, data_path: Optional[str] = None) -> Tuple[float, Any]: paths = Paths(data_path) if data_path is not None else Paths.from_env_variables() if '/' in coll_name: # use a single task parts = coll_name.split('/') if len(parts) != 2: raise ValueError(f'Too many / in coll_name {coll_name}') task_collection = TaskCollection(coll_name, [TaskDescription(*parts)]) else: task_collection = TaskCollection.from_name(coll_name, paths) # print('load table') # table = MultiResultsTable.load_summaries(task_collection, n_cv=n_cv, paths=paths) alg_filter = FunctionAlgFilter(lambda an, tags, aw: an == alg_name) table = 
MultiResultsTable.load(task_collection, n_cv=n_cv, paths=paths, split_type=split_type, alg_filter=alg_filter) # print('process table') test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict={}, val_metric_name=val_metric_name, test_metric_name=test_metric_name) analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + 1e-2) - np.log(1e-2), use_weighting=use_task_weighting) means = analyzer.get_means(test_table) print(f'Mean scores for {alg_name}: {means}') return means[0], None class AlgConfigRunner: def __init__(self, paths: Paths, coll_name: str, create_wrapper, base_name: str, tag: Optional[str] = None, short_key_map: Dict[str, str] = None, **default_params): self.paths = paths self.coll_name = coll_name self.create_wrapper = create_wrapper self.base_name = base_name self.tag = tag or base_name self.default_params = default_params self.short_key_map = short_key_map or {} def __call__(self, config): config = f_unpack_dict(config) print(f'HPO config: {config}') # compute alg_name, potentially round config arguments alg_name_parts = [self.base_name] rounded_config = {} for key, value in config.items(): if key in self.short_key_map: short_key = self.short_key_map[key] else: short_key = key if isinstance(value, float): alg_name_parts.append(f'{short_key}-{value:g}') rounded_config[key] = float(f'{value:g}') else: alg_name_parts.append(f'{short_key}-{value}') rounded_config[key] = value alg_name = '_'.join(alg_name_parts) try: # if already computed, return the computed result return load_score(alg_name, self.coll_name) except IndexError: pass # call wrapper with alg_name, tag, default_params and config wrapper = self.create_wrapper(**utils.join_dicts(self.default_params, config)) # run on task_infos task_infos = TaskCollection.from_name(self.coll_name, self.paths).load_infos(self.paths) job_mgr = TabBenchJobManager(self.paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0) job_mgr.add_jobs(task_infos, config_10_1_0, alg_name, wrapper, tags=[self.tag]) job_mgr.run_jobs(scheduler) # load result return load_score(alg_name, self.coll_name) def test_hyperopt_seed(): from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(5e-3), np.log(3e-1)), 'num_leaves': hp.qloguniform('num_leaves', np.log(7), np.log(256), 1), 'feature_fraction': hp.uniform('feature_fraction', 0.3, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -16, 5), 'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), 'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]), } fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100, hyperopt_algo='tpe') def print_params(params): print(params) return 0.0, None opt.optimize(print_params, seed=1234, opt_desc='LGBM-tuning-1', logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class(): short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='train-class', create_wrapper=LGBMInterfaceWrapper, base_name='LGBM-tuning-1', short_key_map=short_key_map) 
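# Naming/caching note (illustrative, with hypothetical values): AlgConfigRunner turns each
# sampled config into an alg_name using short_key_map and '{:g}' rounding, so a config like
# {'learning_rate': 0.0523, 'num_leaves': 31.0} would become 'LGBM-tuning-1_lr-0.0523_nl-31'.
# Since __call__ first tries load_score(alg_name, ...), configs that were already benchmarked
# are looked up instead of being re-run.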
from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(5e-3), np.log(3e-1)), 'num_leaves': hp.qloguniform('num_leaves', np.log(7), np.log(256), 1), 'feature_fraction': hp.uniform('feature_fraction', 0.3, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -16, 5), 'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), 'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]), } fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100, hyperopt_algo='tpe') opt.optimize(acr, seed=1234, opt_desc='LGBM-tuning-1', logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_smac(): short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name='LGBM-tuning-smac-1', short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (5e-3, 3e-1), log=True), Integer('num_leaves', (7, 256), log=True), Float('feature_fraction', (0.3, 1)), Float('bagging_fraction', (0.3, 1)), Integer('min_data_in_leaf', (1, 64), log=True), Float('min_sum_hessian_in_leaf', (np.exp(-16), np.exp(5)), log=True), Float('lambda_l1', (np.exp(-16), np.exp(2)), log=True), Float('lambda_l2', (np.exp(-16), np.exp(2)), log=True), ]) fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder) opt.optimize(acr, seed=1234, opt_desc='LGBM-tuning-smac-1', logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_smac_2(use_reg: bool = False): base_name = 'LGBM-tuning-smac-2-reg' if use_reg else 'LGBM-tuning-smac-2' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('num_leaves', (16, 64), log=True, default=31), Float('feature_fraction', (0.5, 1), default=0.75), Float('bagging_fraction', (0.5, 1), default=0.75), Integer('min_data_in_leaf', (1, 64), log=True, default=5), Float('min_sum_hessian_in_leaf', (1e-7, 1e-2), log=True, default=1e-5), Float('lambda_l1', (1e-7, 1e-3), log=True, default=1e-7), Float('lambda_l2', (1e-7, 1e-3), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder) 
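# The AlgConfigRunner instance is the objective itself: it is called with a config dict and
# returns a (score, aux) tuple, so it can be passed directly to opt.optimize() below, just
# as in the hyperopt-based tuning functions above.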
opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_smac_3(use_reg: bool = False): base_name = 'LGBM-tuning-smac-3-reg' if use_reg else 'LGBM-tuning-smac-3' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('num_leaves', (16, 128), log=True, default=31), # larger max num_leaves than for smac-2 Float('feature_fraction', (0.5, 1), default=0.75), Float('bagging_fraction', (0.5, 1), default=0.75), Integer('min_data_in_leaf', (1, 64), log=True, default=5), Float('min_sum_hessian_in_leaf', (1e-7, 1e-2), log=True, default=1e-5), Float('lambda_l1', (1e-7, 1e-3), log=True, default=1e-7), Float('lambda_l2', (1e-7, 1e-3), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=200 if use_reg else 100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_coord(): base_name = 'LGBM-tuning-coord-1' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) space = { 'learning_rate': Hyperparameter(start_value=np.log(0.1), min_step_size=0.1, importance=1.0, log_scale=True), 'num_leaves': Hyperparameter(np.log(31), 0.1, 0.2, log_scale=True, only_int=True), 'feature_fraction': Hyperparameter(1.0, 0.01, 0.4, min_value=0.3, max_value=1.0), 'bagging_fraction': Hyperparameter(1.0, 0.01, 0.4, min_value=0.3, max_value=1.0), 'min_data_in_leaf': Hyperparameter(np.log(20), 0.1, 0.2, log_scale=True, only_int=True, max_value=np.log(128)), 'min_sum_hessian_in_leaf': Hyperparameter(np.log(1e-3), 0.1, 0.6, log_scale=True), 'lambda_l1': Hyperparameter(np.log(1e-5), 0.1, 0.2, log_scale=True, min_value=-16.0, max_value=2.0), 'lambda_l2': Hyperparameter(np.log(1e-5), 0.1, 0.2, log_scale=True, min_value=-16.0, max_value=2.0), } fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = CoordOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_xgb_train_class_smac(use_reg: bool = False): # XGB-tuning-smac-1 accidentally used LightGBM base_name = 'XGB-tuning-smac-2-reg' if use_reg else 'XGB-tuning-smac-2' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', 
max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') oe_perm_factory = EncodingFactory(SingleOrdinalEncodingFactory(permute_ordinal_encoding=True)) acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=lambda **kwargs: XGBInterfaceWrapper(factory=oe_perm_factory, **kwargs), base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('max_depth', (4, 8), default=6), Float('subsample', (0.5, 1), default=0.75), Float('colsample_bytree', (0.6, 1), default=1.0), Float('colsample_bylevel', (0.6, 1), default=1.0), Float('colsample_bynode', (0.6, 1), default=1.0), Float('min_child_weight', (1e-7, 1e-2), log=True, default=1e-5), Float('reg_alpha', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_lambda', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_gamma', (1e-7, 1e-2), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=200 if use_reg else 100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_xgb_train_class_smac_3(use_reg: bool = False): # XGB-tuning-smac-1 accidentally used LightGBM base_name = 'XGB-tuning-smac-3-reg' if use_reg else 'XGB-tuning-smac-3' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') oe_perm_factory = EncodingFactory(SingleOrdinalEncodingFactory(permute_ordinal_encoding=True)) acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=lambda **kwargs: XGBInterfaceWrapper(factory=oe_perm_factory, **kwargs), base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('max_depth', (4, 10), default=6), # increased upper bound to 10 Float('subsample', (0.5, 1), default=0.75), Float('colsample_bytree', (0.6, 1), default=1.0), Float('colsample_bylevel', (0.6, 1), default=1.0), Float('colsample_bynode', (0.6, 1), default=1.0), Float('min_child_weight', (1e-7, 1e-2), log=True, default=1e-5), Float('reg_alpha', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_lambda', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_gamma', (1e-7, 1e-2), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=200 if use_reg else 100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, 
logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_smac(use_reg: bool = False): base_name = 'CatBoost-tuning-smac-reg' if use_reg else 'CatBoost-tuning-smac' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('max_depth', (4, 10), default=8), # increased upper bound to 10 Float('l2_leaf_reg', (1e-7, 1e-2), log=True, default=1e-5), Float('bagging_temperature', (0.0, 1.0), default=1.0), Float('random_strength', (1e-2, 20.0), log=True, default=1.0), Integer('one_hot_max_size', (0, 25), default=10), Integer('leaf_estimation_iterations', (1, 20), default=1) ]) # todo: also try min_child_samples? # todo: try boosting_type and bootstrap_type? ("Bayesian", "Bernoulli", "MVS") # possibly subsample for other bootstrap_type? # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_hyperopt(use_reg: bool = False): base_name = 'CatBoost-tuning-hyperopt-reg' if use_reg else 'CatBoost-tuning-hyperopt' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', bootstrap_type='boot', boosting_type='boost', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(2e-2), np.log(2e-1)), 'max_depth': hp.quniform('max_depth', 4, 10, 1), # this was ignored due to an implementation error 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-6), np.log(1e-2)), 'random_strength': hp.loguniform('random_strength', np.log(1e-3), np.log(5.0)), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), 'boosting_type': 'Plain', 
#hp.choice('boosting_type', ['Ordered', 'Plain']), 'bootstrap_type': hp.choice('bootstrap_type', [ {'bootstrap_type': 'Bayesian', 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)}, {'bootstrap_type': 'Bernoulli', 'subsample': hp.uniform('subsample', 0.5, 1.0)} ]), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1.0), np.log(100.0), 1), } # todo: also try min_child_samples? # todo: try boosting_type and bootstrap_type? ("Bayesian", "Bernoulli", "MVS") # possibly subsample for other bootstrap_type? # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_hyperopt_2(use_reg: bool = False): base_name = 'CatBoost-tuning-hyperopt-2-reg' if use_reg else 'CatBoost-tuning-hyperopt-2' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', bootstrap_type='boot', boosting_type='boost', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(2e-2), np.log(2e-1)), 'max_depth': hp.quniform('max_depth', 4, 10, 1), # this was ignored due to an implementation error 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-6), np.log(1e-2)), 'random_strength': hp.loguniform('random_strength', np.log(1e-3), np.log(5.0)), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), 'boosting_type': 'Plain', #hp.choice('boosting_type', ['Ordered', 'Plain']), 'bootstrap_type': hp.choice('bootstrap_type', [ {'bootstrap_type': 'Bayesian', 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)}, {'bootstrap_type': 'Bernoulli', 'subsample': hp.uniform('subsample', 0.5, 1.0)} ]), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1.0), np.log(100.0), 1), } # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_hyperopt_3(use_reg: bool = False): base_name = 'CatBoost-tuning-hyperopt-3-reg' if use_reg else 'CatBoost-tuning-hyperopt-3' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', bootstrap_type='boot', boosting_type='boost', max_depth='md', min_child_weight='mcw', 
reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(2e-2), np.log(2e-1)), 'max_depth': hp.quniform('max_depth', 4, 10, 1), # this was ignored due to an implementation error 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-6), np.log(1e-2)), 'random_strength': hp.loguniform('random_strength', np.log(1e-3), np.log(5.0)), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), 'boosting_type': 'Plain', #hp.choice('boosting_type', ['Ordered', 'Plain']), 'bootstrap_type': hp.choice('bootstrap_type', [ {'bootstrap_type': 'Bayesian', 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)}, {'bootstrap_type': 'Bernoulli', 'subsample': hp.uniform('subsample', 0.5, 1.0)} ]), # removed min_data_in_leaf since it is not used with SymmetricTree } # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) if __name__ == '__main__': # load_score('NN-class-special-2', 'train-class') # run_lgbm_train_class() # run_lgbm_train_class_smac() # run_lgbm_train_class_smac_2() # run_lgbm_train_class_coord() # run_xgb_train_class_smac() # run_lgbm_train_class_smac_3(use_reg=True) # run_xgb_train_class_smac_3(use_reg=True) # run_catboost_train_class_smac(use_reg=True) # run_catboost_train_class_hyperopt_2(use_reg=True) # run_catboost_train_class_hyperopt_2(use_reg=False) run_catboost_train_class_hyperopt_3(use_reg=False) # test_hyperopt_seed() pass ================================================ FILE: scripts/move_algs.py ================================================ import shutil from typing import Optional import fire from pytabkit.bench.data.paths import Paths from pytabkit.models import utils def move_algs(base_path_1: str, base_path_2: str, *alg_names, startswith: Optional[str] = None, dry_run: bool = False): paths_1 = Paths(base_folder=base_path_1) paths_2 = Paths(base_folder=base_path_2) if startswith is not None: all_alg_names = [path.name for path in paths_1.algs().iterdir()] alg_names = list(alg_names) + [alg_name for alg_name in all_alg_names if alg_name.startswith(startswith)] for alg_name in alg_names: print(f'Moving alg {alg_name}') if dry_run: continue assert isinstance(alg_name, str) assert utils.existsDir(base_path_1) assert utils.existsDir(base_path_2) assert not utils.existsDir(paths_2.algs() / alg_name) assert not utils.existsDir(paths_2.results() / alg_name) assert not utils.existsDir(paths_2.result_summaries() / alg_name) if utils.existsDir(paths_1.algs() / alg_name): shutil.move(paths_1.algs() / alg_name, paths_2.algs() / alg_name) if utils.existsDir(paths_1.results() / alg_name): shutil.move(paths_1.results() 
/ alg_name, paths_2.results() / alg_name) if utils.existsDir(paths_1.result_summaries() / alg_name): shutil.move(paths_1.result_summaries() / alg_name, paths_2.result_summaries() / alg_name) def move_specific_algs(base_path_1: str, base_path_2: str): paths_1 = Paths(base_folder=base_path_1) alg_names = [] for path in paths_1.algs().iterdir(): name = path.name # if name.startswith('MLP-cumul-abl-') and not name.startswith('MLP-cumul-abl-new'): if name.startswith('MLP-RTDL-HPO') and not name.startswith('MLP-cumul-abl-new'): alg_names.append(name) # print(alg_names) move_algs(base_path_1, base_path_2, *alg_names) if __name__ == '__main__': fire.Fire(move_algs) # fire.Fire(move_specific_algs) ================================================ FILE: scripts/move_many_algs.py ================================================ from typing import Optional import fire from pytabkit.models import utils from scripts.move_algs import move_algs def move_many_algs(base_path_1: str, base_path_2: str, algs_filename: Optional[str] = None, prefixes_filename: Optional[str] = None, dry_run: bool = False): if algs_filename is None: algs = [] else: algs = [name.strip() for name in utils.readFromFile(algs_filename).split('\n') if name.strip() != ''] if prefixes_filename is None: prefixes = [] else: prefixes = [name.strip() for name in utils.readFromFile(prefixes_filename).split('\n') if name.strip() != ''] move_algs(base_path_1, base_path_2, *algs, dry_run=dry_run) for prefix in prefixes: move_algs(base_path_1, base_path_2, startswith=prefix, dry_run=dry_run) if __name__ == '__main__': fire.Fire(move_many_algs) ================================================ FILE: scripts/print_complete_results.py ================================================ import fire from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.analysis import ResultsTables from pytabkit.bench.eval.evaluation import DefaultEvalModeSelector def print_complete_results(coll_name: str, n_splits: int = 10): """ Only show alg_names for which results for all splits exist.
:param coll_name: Name of the task collection, e.g., 'meta-train-class'. :param n_splits: Number of train-test splits that have to be available for an algorithm to be shown. :return: """ paths = Paths.from_env_variables() tables = ResultsTables(paths) table = tables.get(coll_name) test_table = table.get_test_results_table(DefaultEvalModeSelector()) test_table = test_table.filter_n_splits(n_splits) alg_names = test_table.alg_names alg_names.sort(key=lambda x: x.lower()) print(f'Algorithms with {n_splits} splits available on all datasets of {coll_name}:') for alg_name in alg_names: print(alg_name) if __name__ == '__main__': fire.Fire(print_complete_results) ================================================ FILE: scripts/print_runtimes.py ================================================ from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.runtimes import get_avg_train_times, get_avg_predict_times if __name__ == '__main__': paths = Paths.from_env_variables() for coll_name in ['meta-train-class', 'meta-train-reg']: times_dict = get_avg_train_times(paths, coll_name, per_1k_samples=True) print(f'Average training times per 1K samples for {coll_name}:') for alg_name, time_s in times_dict.items(): print(f'{alg_name}: {time_s:g} s') print(times_dict) for coll_name in ['meta-train-class', 'meta-train-reg']: times_dict = get_avg_predict_times(paths, coll_name, per_1k_samples=True) print(f'Average inference times per 1K samples for {coll_name}:') for alg_name, time_s in times_dict.items(): print(f'{alg_name}: {time_s:g} s') print(times_dict) ================================================ FILE: scripts/ray_slurm_launch.py ================================================ # from https://docs.ray.io/en/latest/cluster/examples/slurm-launch.html#slurm-launch # slurm-launch.py # Usage: # python slurm-launch.py --exp-name test \ # --command "rllib train --run PPO --env CartPole-v0" import argparse # import subprocess import sys import time import os from pathlib import Path from pytabkit.models import utils template_file = Path(__file__).parent / "ray_slurm_template.sh" JOB_NAME = "${JOB_NAME}" NUM_NODES = "${NUM_NODES}" NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}" PARTITION_OPTION = "${PARTITION_OPTION}" ACCOUNT_OPTION = "${ACCOUNT_OPTION}" COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}" GIVEN_NODE = "${GIVEN_NODE}" LOAD_ENV = "${LOAD_ENV}" TIME = "${TIME}" MEM_CMD = "${MEM_CMD}" MAIL_USER = "${MAIL_USER}" LOG_FOLDER = "${LOG_FOLDER}" CONDA_ENV_NAME = "${CONDA_ENV_NAME}" if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--exp_name", type=str, required=True, help="The job name and path to logging file (exp_name.out / exp_name.err).") parser.add_argument( "--conda-env-name", type=str, required=True, help="Conda environment name") parser.add_argument( "--num_nodes", "-n", type=int, default=1, help="Number of nodes to use.") parser.add_argument( "--mem", type=str, default=None, help="Memory (int + suffix 'mb').") parser.add_argument( "--time", "-t", type=str, help="Maximum time of job") # parser.add_argument( # "--mem", # type=str, # help="Maximum memory of job") parser.add_argument( "--mail_user", "-m", type=str, default="", help="Mail address to which job updates will be sent") parser.add_argument( "--log_folder", "-l", type=str, default="", help="Folder in which to save log files" ) parser.add_argument( "--node", "-w", type=str, help="The specified nodes to use. Same format as the " "return of 'sinfo'. Default: ''.") parser.add_argument( "--num-gpus", type=int, default=0, help="Number of GPUs to use in each node.
(Default: 0)") parser.add_argument( "--queue", "-q", type=str, default=None ) parser.add_argument( "--partition", "-p", type=str, default="", ) parser.add_argument( "--account", "-a", type=str, default="", ) parser.add_argument( "--load-env", type=str, default="", help="The script to load your environment ('module load cuda/10.1')") parser.add_argument( "--command", type=str, required=True, help="The command you wish to execute. For example: " " --command 'python test.py'. " "Note that the command must be a string.") args = parser.parse_args() if args.node: # assert args.num_nodes == 1 node_info = "#SBATCH -w {}".format(args.node) else: node_info = "" job_name = "{}_{}".format(args.exp_name, time.strftime("%y%m%d-%H%M", time.localtime())) partition_option = "#SBATCH --partition={}".format( args.partition) if args.partition else "" account_option = "#SBATCH --account={}".format( args.account) if args.account else "" # ===== Modified the template script ===== with open(template_file, "r") as f: text = f.read() text = text.replace(JOB_NAME, job_name) text = text.replace(NUM_NODES, str(args.num_nodes)) text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus)) text = text.replace(PARTITION_OPTION, partition_option) text = text.replace(ACCOUNT_OPTION, account_option) text = text.replace(COMMAND_PLACEHOLDER, str(args.command)) text = text.replace(LOAD_ENV, str(args.load_env)) text = text.replace(GIVEN_NODE, node_info) text = text.replace(TIME, args.time) mem_cmd = '' if args.mem is None else f'SBATCH --mem={args.mem}' text = text.replace(MEM_CMD, mem_cmd) text = text.replace(MAIL_USER, args.mail_user) text = text.replace(LOG_FOLDER, args.log_folder) text = text.replace(CONDA_ENV_NAME, args.conda_env_name) text = text.replace( "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO " "PRODUCTION!", "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE " "RUNNABLE!") # ===== Save the script ===== script_file = "slurm_scripts/{}.sh".format(job_name) # os.makedirs("slurm_scripts") # todo: ensure this utils.ensureDir(Path('slurm_scripts') / 'test.sh') # ensure that slurm_scripts directory exists with open(script_file, "w") as f: f.write(text) # ===== Submit the job ===== print("Starting to submit job!") cmd = f"sbatch {script_file}" if args.queue is None else f"sbatch -p {args.queue} {script_file}" # subprocess.Popen(cmd) os.system(cmd) print( "Job submitted! Script file is at: <{}>. Log file is at: <{}>".format( script_file, "{}.log".format(job_name))) sys.exit(0) ================================================ FILE: scripts/ray_slurm_template.sh ================================================ #!/bin/bash # shellcheck disable=SC2206 # THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT! # THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION! 
${PARTITION_OPTION} ${ACCOUNT_OPTION} #SBATCH --job-name=${JOB_NAME} #SBATCH --output=${LOG_FOLDER}/${JOB_NAME}.out #SBATCH --error=${LOG_FOLDER}/${JOB_NAME}.err ${GIVEN_NODE} ### This script works for any number of nodes, Ray will find and manage all resources #SBATCH --nodes=${NUM_NODES} #SBATCH --time=${TIME} #SBATCH --mail-user=${MAIL_USER} #SBATCH --exclusive ### Give all resources to a single Ray task, ray can manage the resources internally #SBATCH --ntasks-per-node=1 #SBATCH --gpus-per-task=${NUM_GPUS_PER_NODE} #${MEM_CMD} # Load modules or your own conda environment here # module load pytorch/v1.4.0-gpu # conda activate ${CONDA_ENV} # ${LOAD_ENV} module load devel/miniconda conda init bash source ~/.bashrc conda deactivate conda activate ${CONDA_ENV_NAME} cd ~/git/pytabkit export RAY_DEDUP_LOGS=0 # to disable ray from trying to deduplicate log messages # ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ===== # This script is a modification to the implementation suggest by gregSchwartz18 here: # https://github.com/ray-project/ray/issues/826#issuecomment-522116599 redis_password=$(uuidgen) export redis_password nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names nodes_array=($nodes) node_1=${nodes_array[0]} ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address # if we detect a space character in the head node IP, we'll # convert it to an ipv4 address. This step is optional. if [[ "$ip" == *" "* ]]; then IFS=' ' read -ra ADDR <<< "$ip" if [[ ${#ADDR[0]} -gt 16 ]]; then ip=${ADDR[1]} else ip=${ADDR[0]} fi echo "IPV6 address detected. We split the IPV4 address as $ip" fi port=6379 ip_head=$ip:$port export ip_head echo "IP Head: $ip_head" echo "STARTING HEAD at $node_1" srun --nodes=1 --ntasks=1 -w "$node_1" \ ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block & sleep 30 worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node for ((i = 1; i <= worker_num; i++)); do node_i=${nodes_array[$i]} echo "STARTING WORKER $i at $node_i" srun --nodes=1 --ntasks=1 -w "$node_i" ray start --address "$ip_head" --redis-password="$redis_password" --block & sleep 5 done # ===== Call your code below ===== ${COMMAND_PLACEHOLDER} ================================================ FILE: scripts/rename_alg.py ================================================ import os import shutil from pathlib import Path import fire from pytabkit.bench.data.paths import Paths from pytabkit.models import utils def rename_alg(old_name: str, new_name: str, copy: bool = False, rename_prefixes: bool = False): # what to rename: # results folder # result_summaries folder # alg_name in algs/alg_name/extended_config.yaml and in the path # cannot realistically change the code in src/ # maybe change alg_name in algs/alg_name/wrapper.pkl (if it can be loaded) paths = Paths.from_env_variables() if rename_prefixes: alg_names = [path.name for path in paths.algs().iterdir()] for alg_name in alg_names: if alg_name.startswith(old_name): rename_alg(alg_name, new_name + alg_name[len(old_name):], copy=copy, rename_prefixes=False) return if utils.existsDir(paths.algs() / new_name): raise ValueError(f'Directory for new name {new_name} already exists') def rename_or_copy(src: Path, dst: Path): if copy: shutil.copytree(src, dst) else: os.rename(src, dst) rename_or_copy(paths.algs() / old_name, paths.algs() / new_name) if utils.existsDir(paths.results() / old_name): rename_or_copy(paths.results() / 
old_name, paths.results() / new_name) if utils.existsDir(paths.result_summaries() / old_name): rename_or_copy(paths.result_summaries() / old_name, paths.result_summaries() / new_name) # change alg_name in extended_config.yaml extended_config_path = paths.algs() / new_name / 'extended_config.yaml' extended_config = utils.deserialize(extended_config_path, use_yaml=True) extended_config['alg_name'] = new_name utils.serialize(extended_config_path, extended_config, use_yaml=True) # try to change alg_name in wrapper.pkl try: alg_wrapper_path = paths.algs() / new_name / 'wrapper.pkl' alg_wrapper = utils.deserialize(alg_wrapper_path) alg_wrapper.config['alg_name'] = new_name utils.serialize(alg_wrapper_path, alg_wrapper) except Exception as e: print(f'Could not modify alg_wrapper.pkl, got an exception: {e}') if __name__ == '__main__': fire.Fire(rename_alg) ================================================ FILE: scripts/rename_tag.py ================================================ import fire from pytabkit.bench.data.paths import Paths from pytabkit.models import utils def rename_tag(old_name: str, new_name: str): paths = Paths.from_env_variables() for alg_path in paths.algs().iterdir(): tags_path = alg_path / 'tags.yaml' if utils.existsFile(tags_path): tags = utils.deserialize(tags_path, use_yaml=True) tags = [tag if tag != old_name else new_name for tag in tags] utils.serialize(tags_path, tags, use_yaml=True) if __name__ == '__main__': fire.Fire(rename_tag) ================================================ FILE: scripts/run_evaluation.py ================================================ import time from typing import Optional import numpy as np import fire from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.bench.eval.analysis import get_opt_groups from pytabkit.bench.eval.evaluation import MultiResultsTable, DefaultEvalModeSelector, MeanTableAnalyzer, \ alg_results_str, \ alg_comparison_str, WinsTableAnalyzer, RankTableAnalyzer, NormalizedLossTableAnalyzer, \ GreedyAlgSelectionTableAnalyzer def show_eval(coll_name: str = 'meta-train-class', n_cv: int = 1, show_alg_groups: bool = True, val_metric_name: str = None, metric_name: str = None, split_type: str = SplitType.RANDOM, use_task_weighting: Optional[bool] = None, shift_eps: float = 0.01, data_path: Optional[str] = None, alg_name: Optional[str] = None, alg_name_2: Optional[str] = None, tag: Optional[str] = None, max_n_splits: Optional[int] = None, max_n_algs: Optional[int] = None, show_val_results: bool = False, show_train_results: bool = False, algs_prefix: Optional[str] = None, algs_suffix: Optional[str] = None, algs_contains: Optional[str] = None, exclude_datasets: Optional[str] = None): """ Prints evaluation tables on the selected datasets/algorithms. The following aggregate statistics will be printed, all of which are based on the specified metric and validation metric: - log shifted geometric mean test metric when greedily creating an algorithm portfolio based on the validation results. The algorithms are sorted by order of inclusion into the portfolio. The scores are the scores of selecting the best algorithm out of the portfolio up to this point on every dataset separately, based on the validation sets. - Win fraction: Fraction of datasets (may be weighted) on which this algorithm is the best one. 
- Arithmetic mean rank - Arithmetic mean normalized test metric: The best method is normalized to 0 and the worst one to 1. - Arithmetic mean test metric - Log shifted geometric mean test metric: mean(log(metric+shift_eps)) - Shifted geometric mean test metric: exp(mean(log(metric+shift_eps))) :param coll_name: Name of the task collection, e.g., 'meta-train-class' :param n_cv: Number of cross-validation folds. Will only print results for algorithms that have been evaluated with this number of cross-validation folds. :param show_alg_groups: Whether to show aggregate algorithms, such as the one that picks the best method on the validation set out of the displayed methods. :param val_metric_name: Name of the validation metric, used for the algorithm groups. By default, the same value as metric_name will be used. :param metric_name: Name of the metric that should be displayed (default = classification error / RMSE). :param split_type: Type of the split, normally random_split. :param use_task_weighting: Whether to weight tasks for the evaluation. If false, uniform weights are used. If True, weights based on prefixes are used. By default, weights are used only for meta-train collections. :param shift_eps: Epsilon parameter used in the shifted geometric mean. :param data_path: Path to the data folder where results are saved. By default, this function will take the path from Paths.from_env_variables(). :param alg_name: Algorithm for which results on individual datasets should be printed :param alg_name_2: Second algorithm for which results on individual datasets should be printed. :param tag: If specified, only print algorithms whose tags include the given tag. :param max_n_splits: If specified, only evaluate the given number of train-test splits. :param max_n_algs: Maximum number of methods that should be processed and displayed. This does not contain groups of methods (e.g. "all algs") that will be added on top later. :param show_val_results: Whether to show validation errors instead of test errors. :param show_train_results: Whether to show training errors instead of test errors. :param algs_prefix: If specified, only methods with this prefix will be displayed. :param algs_suffix: If specified, only methods with this suffix will be displayed. :param algs_contains: If specified, only methods containing this substring will be displayed. :param exclude_datasets: Optional comma-separated list of datasets that will be excluded from the analysis. 
:return: """ print('start show eval') paths = Paths(data_path) if data_path is not None else Paths.from_env_variables() start_time = time.time() if '/' in coll_name: # use a single task parts = coll_name.split('/') if len(parts) != 2: print(f'Too many / in coll_name {coll_name}') return task_collection = TaskCollection(coll_name, [TaskDescription(*parts)]) else: task_collection = TaskCollection.from_name(coll_name, paths) if exclude_datasets: exclude_names = exclude_datasets.split(',') task_collection = TaskCollection(task_collection.coll_name, [td for td in task_collection.task_descs if td.task_name not in exclude_names]) print('load table') # table = MultiResultsTable.load_summaries(task_collection, n_cv=n_cv, paths=paths) # commas are converted to tuples in the command line, apparently show_tags = tag.split(',') if isinstance(tag, str) else (list(tag) if tag is not None else []) alg_filter = lambda an, tags, config: ((tag is None or np.any([show_tag in tags for show_tag in show_tags])) and (algs_prefix is None or an.startswith(algs_prefix)) and (algs_suffix is None or an.endswith(algs_suffix)) and (algs_contains is None or algs_contains in an)) table = MultiResultsTable.load(task_collection, n_cv=n_cv, paths=paths, max_n_algs=max_n_algs, split_type=split_type, alg_filter=alg_filter, max_n_splits=max_n_splits) print('process table') # alg_group_dict = {'all algs': (lambda an, tags, config: True)} if show_alg_groups else None task_type_name = 'class' if 'class' in coll_name else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} if not show_alg_groups: alg_group_dict = None if alg_name is not None and alg_name_2 is not None and show_alg_groups: alg_group_dict['selected algs'] = (lambda an, tags, config, grp=[alg_name, alg_name_2]: np.any([g.startswith(an) for g in grp])) val_test_groups = {f'HPO-on-BestModel-TD-{task_type_name}': {f'{family}-TD-{task_type_name}': f'{family}-HPO' for family in ['XGB', 'LGBM', 'CatBoost', 'MLP']} for task_type_name in ['class', 'reg']} if val_metric_name is None: val_metric_name = metric_name test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=show_val_results, use_train_errors=show_train_results) val_table_single = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=dict(), test_metric_name=metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=True) test_table_single = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=dict(), test_metric_name=metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=show_val_results, use_train_errors=show_train_results) if len(test_table.alg_task_results) == 0: print(f'No results found') return subset = 'train' if show_train_results else ('val' if show_val_results else 'test') if use_task_weighting is None: use_task_weighting = coll_name.startswith('meta-train') or coll_name.startswith('uci') separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] if n_cv == 1: # fails for n_cv > 1 because proper selection on the validation set is not implemented print( f'Greedy algorithm selection 
cumulative best log shifted geometric mean (err+{shift_eps:g}) {subset} error:') analyzer = GreedyAlgSelectionTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names, f=lambda x: np.log(x + shift_eps)) analyzer.print_analysis(test_table_single, val_table_single) print() print('Win fraction:') analyzer = WinsTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print('Arithmetic mean rank:') analyzer = RankTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print(f'Arithmetic mean normalized {subset} metric:') analyzer = NormalizedLossTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print(f'Arithmetic mean {subset} metric:') analyzer = MeanTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print(f'Shifted geometric mean (err+{shift_eps:g}) {subset} metric:') analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + shift_eps), use_weighting=use_task_weighting, separate_task_names=separate_task_names, post_f=lambda x: np.exp(x)) analyzer.print_analysis(test_table) print() print(f'Log shifted geometric mean (err+{shift_eps:g}) {subset} metric:') analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + shift_eps), use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() # print('Mean modlog test error:') # todo: name modlog is suboptimal, people could associate mod with modulo # analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + 1e-3) - np.log(1e-3), use_weighting=use_task_weighting) # analyzer.print_analysis(test_table) if alg_name is not None: if alg_name_2 is None: print(f'Errors for alg {alg_name}:') print(alg_results_str(test_table, alg_name)) else: print(f'Comparison: {alg_name} vs. 
{alg_name_2}') print(alg_comparison_str(test_table, [alg_name, alg_name_2])) print(f'Time for printing: {time.time() - start_time:g} s') if __name__ == '__main__': fire.Fire(show_eval) ================================================ FILE: scripts/run_experiments.py ================================================ from typing import Optional, Dict, Any, List import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.alg_wrappers.interface_wrappers import \ LGBMInterfaceWrapper, \ XGBInterfaceWrapper, LGBMHyperoptInterfaceWrapper, XGBHyperoptInterfaceWrapper, CatBoostHyperoptInterfaceWrapper, \ CatBoostInterfaceWrapper, RFInterfaceWrapper, XGBSklearnInterfaceWrapper, LGBMSklearnInterfaceWrapper, \ CatBoostSklearnInterfaceWrapper, SklearnMLPInterfaceWrapper, NNInterfaceWrapper, CaruanaEnsembleWrapper, \ LoadResultsWrapper, RandomParamsNNInterfaceWrapper, AlgorithmSelectionWrapper, ResNetRTDLInterfaceWrapper, \ MLPRTDLInterfaceWrapper, RandomParamsRTDLMLPInterfaceWrapper, RandomParamsResnetInterfaceWrapper, \ TabRInterfaceWrapper, RandomParamsXGBInterfaceWrapper, RandomParamsLGBMInterfaceWrapper, \ RandomParamsCatBoostInterfaceWrapper, AutoGluonModelInterfaceWrapper, RandomParamsTabRInterfaceWrapper, \ RandomParamsRFInterfaceWrapper, FTTransformerInterfaceWrapper, RandomParamsFTTransformerInterfaceWrapper from pytabkit.bench.eval.analysis import get_ensemble_groups from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager, run_alg_selection from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.alg_interfaces.nn_interfaces import RealMLPParamSampler from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.models.sklearn.default_params import DefaultParams def run_gbdt_rs_configs(paths: Optional[Paths] = None, min_step_idx: int = 0, n_steps: int = 50, rerun: bool = False, with_lgbm: bool = True, with_xgb: bool = True, with_cb: bool = True, min_split_idx: int = 0, n_splits: int = 10, only_meta_train: bool = False): if paths is None: paths = Paths.from_env_variables() job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(available_cpu_ram_multiplier=0.5)) run_config = RunConfig(min_split_idx=min_split_idx, n_tt_splits=min_split_idx + n_splits, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos if only_meta_train: all_task_infos = train_task_infos else: all_task_infos = class_task_infos + reg_task_infos for step_idx in range(min_step_idx, min_step_idx + n_steps): if with_xgb: job_mgr.add_jobs(all_task_infos, run_config, f'XGB-HPO_step-{step_idx}', 
RandomParamsXGBInterfaceWrapper(model_idx=step_idx), tags=['paper_xgb_rs'], rerun=rerun) if with_lgbm: job_mgr.add_jobs(all_task_infos, run_config, f'LGBM-HPO_step-{step_idx}', RandomParamsLGBMInterfaceWrapper(model_idx=step_idx), tags=['paper_lgbm_rs'], rerun=rerun) if with_cb: job_mgr.add_jobs(all_task_infos, run_config, f'CatBoost-HPO_step-{step_idx}', RandomParamsCatBoostInterfaceWrapper(model_idx=step_idx), tags=['paper_cb_rs'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rf_rs_configs(paths: Optional[Paths] = None, min_step_idx: int = 0, n_steps: int = 50, rerun: bool = False, min_split_idx: int = 0, n_splits: int = 10): # took 18h30m on the Grinsztajn benchmark if paths is None: paths = Paths.from_env_variables() job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(available_cpu_ram_multiplier=0.5)) run_config = RunConfig(min_split_idx=min_split_idx, n_tt_splits=min_split_idx + n_splits, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos for step_idx in range(min_step_idx, min_step_idx + n_steps): job_mgr.add_jobs(grinsztajn_reg_task_infos + grinsztajn_class_task_infos, run_config, f'RF-HPO_step-{step_idx}', RandomParamsRFInterfaceWrapper(model_idx=step_idx), tags=['paper_rf-hpo'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_realmlp_tuning_configs(paths: Paths, n_steps: int = 50, tag: str = 'paper', rerun: bool = False): # 2h37m for 10 steps on meta-train-class # for 5 steps on all: 1h20m + 13h4m # 1h50m for 10 steps on grinsztajn-benchmark job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(all_task_infos, config_10_1_0, 
f'RealMLP-HPO_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rtdl_tuning_configs(paths: Paths, n_steps: int = 50, rerun: bool = False, with_mlp: bool = True, with_resnet: bool = True, with_mlp_plr: bool = True, with_ftt: bool = True, only_meta_train: bool = False, only_meta_test: bool = False, start_split=0, end_split=10): # MLP-PLR takes about 1h5m per step # takes around 4d6h for MLP-HPO and MLP-PLR-HPO together job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos all_train_task_infos = train_class_task_infos + train_reg_task_infos grinsztajn_task_infos = grinsztajn_class_task_infos + grinsztajn_reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos all_task_infos = train_class_task_infos + train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos all_task_infos = test_class_task_infos + test_reg_task_infos for step_idx in range(n_steps): if with_mlp: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'MLP-RTDL-HPO_step-{step_idx}', RandomParamsRTDLMLPInterfaceWrapper(model_idx=step_idx), tags=['paper_mlp-rtdl-hpo'], rerun=rerun) if with_resnet: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'ResNet-RTDL-HPO_step-{step_idx}', RandomParamsResnetInterfaceWrapper(model_idx=step_idx), tags=['paper_resnet-hpo'], rerun=rerun) if with_mlp_plr: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'MLP-PLR-HPO_step-{step_idx}', RandomParamsRTDLMLPInterfaceWrapper(model_idx=step_idx, num_emb_type='plr'), tags=['paper_mlp-plr-hpo'], rerun=rerun) if with_ftt: job_mgr.add_jobs(grinsztajn_task_infos, config_10_1_0, f'FTT-HPO_step-{step_idx}', RandomParamsFTTransformerInterfaceWrapper(model_idx=step_idx), tags=['paper_ftt-hpo'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_tabr_tuning_configs(paths: Paths, n_steps: int = 50, rerun: bool = False, start_split=0, end_split=10): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) 
test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos grinsztajn_task_infos = grinsztajn_class_task_infos + grinsztajn_reg_task_infos all_task_infos = class_task_infos + reg_task_infos all_train_task_infos = train_class_task_infos + train_reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(grinsztajn_task_infos, config_10_1_0, f'TabR-HPO_step-{step_idx}', RandomParamsTabRInterfaceWrapper(model_idx=step_idx), tags=['paper_tabr-hpo'], rerun=rerun) job_mgr.add_jobs(grinsztajn_task_infos, config_10_1_0, f'RealTabR-HPO_step-{step_idx}', RandomParamsTabRInterfaceWrapper(model_idx=step_idx, hpo_space_name='realtabr'), tags=['paper_realtabr-hpo'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_refit_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # refit experiments took 3 to 3.5 days job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(available_cpu_ram_multiplier=0.5)) config_10_5_5 = RunConfig(n_tt_splits=10, n_cv=5, n_refit=5, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for mean_cv, mean_refit in [(False, False), (True, True)]: extra_str = f'mean-cv-{mean_cv}_mean-refit-{mean_refit}' job_mgr.add_jobs(class_task_infos, config_10_5_5, f'RealMLP-TD-class_{extra_str}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_CLASS, use_best_mean_epoch_for_cv=mean_cv, use_best_mean_epoch_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_5_5, f'RealMLP-TD-reg_{extra_str}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG, use_best_mean_epoch_for_cv=mean_cv, use_best_mean_epoch_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_5_5, f'LGBM-TD-class_{extra_str}', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_CLASS, use_best_mean_iteration_for_cv=mean_cv, use_best_mean_iteration_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_5_5, f'LGBM-TD-reg_{extra_str}', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_REG, use_best_mean_iteration_for_cv=mean_cv, use_best_mean_iteration_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_ablations(paths: Paths, param_configs: Dict[str, Any], with_class: bool = True, with_reg: bool = True, tune_lr: bool = True, tag: str = 'paper_mlp_ablations', rerun: bool = False): job_mgr = 
TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False) # todo: it's false train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos combinations = [] if with_class: combinations.append((train_class_task_infos, DefaultParams.RealMLP_TD_CLASS, 'class')) if with_reg: combinations.append((train_reg_task_infos, DefaultParams.RealMLP_TD_REG, 'reg')) # lr_factors = [1.5**k for k in range(-3, 4)] if tune_lr else [1] # lr_factors = [0.3, 0.5, 0.7, 1.0, 1.4, 2.0, 3.0] if tune_lr else [1.0] # lr_factors = [0.3, 0.5, 0.7, 1.0, 1.4, 2.0, 3.0, 4.0, 6.0] if tune_lr else [1.0] lr_factors = [0.1, 0.15, 0.25, 0.35, 0.5, 0.7, 1.0, 1.4, 2.0, 3.0, 4.0] if tune_lr else [1.0] for task_infos, default_params, task_type_name in combinations: for param_config_name, extra_params in param_configs.items(): for lr_factor_idx, lr_factor in enumerate(lr_factors): params = utils.update_dict(default_params, extra_params) params['lr'] *= lr_factor # todo: what if the lr is a dict? alg_name = f'RealMLP-TD-{task_type_name}-ablation_{param_config_name}_lrfactor-{lr_factor}' job_mgr.add_jobs(task_infos, config_10_1_0, alg_name, NNInterfaceWrapper(**params), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_td_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_S_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealMLP-TD-reg', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG), tags=[tag], rerun=rerun) 
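# Naming note (editorial, based on pytabkit's README conventions, not part of the original script):
# '-TD' marks the meta-learned tuned-default hyperparameters, '-TD-S' a simplified and cheaper
# variant of them, '-D' the library defaults, and '-HPO_step-{i}' the i-th random-search
# configuration that the later algorithm-selection runs pick from.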
job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealMLP-TD-S-reg', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_S_REG), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'LGBM-TD-class', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-TD-class', XGBInterfaceWrapper(**DefaultParams.XGB_TD_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'CatBoost-TD-class', CatBoostInterfaceWrapper(**DefaultParams.CB_TD_CLASS), tags=[tag], rerun=rerun) # regression job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'LGBM-TD-reg', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_REG), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'XGB-TD-reg', XGBInterfaceWrapper(**DefaultParams.XGB_TD_REG), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'CatBoost-TD-reg', CatBoostInterfaceWrapper(**DefaultParams.CB_TD_REG), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_default_ce_configs(paths: Paths, tag: str = 'paper_val_ce', rerun: bool = False): # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_val-ce', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_val-ce_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict(val_metric_name='cross_entropy', use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class_val-ce', NNInterfaceWrapper(**utils.join_dicts(DefaultParams.RealMLP_TD_S_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class_val-ce_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_S_CLASS, dict(val_metric_name='cross_entropy', use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'LGBM-TD-class_val-ce', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-TD-class_val-ce', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, 
dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'CatBoost-TD-class_val-ce', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'LGBM-D-class_val-ce', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_D, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-D-class_val-ce', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_D, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'CatBoost-D-class_val-ce', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_D, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-PBB-D_val-ce', # Probst, Boulesteix, and Bischl, "Tunability: Importance of ..." XGBInterfaceWrapper(n_estimators=4168, lr=0.018, min_child_weight=2.06, max_depth=13, reg_lambda=0.982, reg_alpha=1.113, subsample=0.839, colsample_bytree=0.752, colsample_bylevel=0.585, tree_method='hist', max_n_threads=64, val_metric_name='cross_entropy', tfms=['one_hot'], max_one_hot_cat_size=20), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-RTDL-D-class_val-ce', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_RTDL_D_CLASS_TabZilla, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-PLR-D-class_val-ce', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_PLR_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'ResNet-RTDL-D-class_val-ce', ResNetRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.RESNET_RTDL_D_CLASS_TabZilla, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_val-ce', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class_val-ce', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.RealTABR_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class_val-ce_no-ls', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.RealTABR_D_CLASS, dict(ls_eps=0.0, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(grinsztajn_class_task_infos + train_class_task_infos, config_10_1_0, 'FTT-D-class_val-ce', FTTransformerInterfaceWrapper( **utils.join_dicts(DefaultParams.FTT_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_nns_no_ls(paths: Paths, tag: str = 'paper', rerun: bool = False): # this took around 48m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos =
TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_S_CLASS, dict(use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict(use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class_no-ls', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.RealTABR_D_CLASS, dict(ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_tabr_configs(paths: Paths, rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class', TabRInterfaceWrapper(**DefaultParams.RealTABR_D_CLASS), tags=['paper'], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealTabR-D-reg', TabRInterfaceWrapper(**DefaultParams.RealTABR_D_REG), tags=['paper'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_val-ce', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=['paper_val_ce'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=['paper'], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=['paper'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict())), 
tags=['paper'], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_REG, dict())), tags=['paper'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_early_stopping_configs(paths: Paths, tag: str = 'paper_early_stopping', rerun: bool = False): # around 4h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for esr in [10, 20, 50, 100, 300, 1000]: job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'LGBM-TD-class_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'XGB-TD-class_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'CatBoost-TD-class_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) # regression job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'LGBM-TD-reg_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_REG, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'XGB-TD-reg_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_REG, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'CatBoost-TD-reg_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_REG, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_brier_stopping_configs(paths: Paths, tag: str = 'paper_early_stopping', rerun: bool = False): # around 4h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for esr in [10, 20, 50, 100, 300, 1000]: # for esr in [300]: 
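# Each pass through this loop retrains the three GBDT tuned defaults with the Brier score as the
# early-stopping validation metric under patience esr; together with the default-metric sweep in
# run_early_stopping_configs above and the cross-entropy sweep in run_cross_entropy_stopping_configs
# below, this compares early-stopping metrics across patience values.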
job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'LGBM-TD-class_val-brier_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='brier'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'XGB-TD-class_val-brier_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='brier'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'CatBoost-TD-class_val-brier_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='brier'))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_cross_entropy_stopping_configs(paths: Paths, tag: str = 'paper_early_stopping', rerun: bool = False): # around 4h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for esr in [10, 20, 50, 100, 300, 1000]: # for esr in [300]: job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'LGBM-TD-class_val-ce_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'XGB-TD-class_val-ce_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'CatBoost-TD-class_val-ce_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_ensemble_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # around 20 minutes or so job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = 
train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for task_infos, task_type_name in [(class_task_infos, 'class'), (reg_task_infos, 'reg')]: for alg_group_name, alg_names in get_ensemble_groups(task_type_name).items(): job_mgr.add_jobs(task_infos, config_10_1_0, f'Ensemble{alg_group_name}', CaruanaEnsembleWrapper([LoadResultsWrapper(alg_name) for alg_name in alg_names]), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_realmlp_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=32)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos alg_names = [f'RealMLP-HPO_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(reg_task_infos, 'rmse'), (class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'RealMLP-HPO', alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, class_task_infos, f'RealMLP-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr') msd_alg_names = [f'RealMLP-HPO-moresigmadim_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(train_reg_task_infos, 'rmse'), (train_class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'RealMLP-HPO-moresigmadim', msd_alg_names, val_metric_name, tags=[tag]) def run_rtdl_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=32)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + 
grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos alg_names = [f'MLP-RTDL-HPO_step-{i}' for i in range(n_hpo_steps)] plr_alg_names = [f'MLP-PLR-HPO_step-{i}' for i in range(n_hpo_steps)] resnet_alg_names = [f'ResNet-RTDL-HPO_step-{i}' for i in range(n_hpo_steps)] ftt_alg_names = [f'FTT-HPO_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(reg_task_infos, 'rmse'), (class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'MLP-RTDL-HPO', alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, task_infos, f'MLP-PLR-HPO', plr_alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, task_infos, f'ResNet-RTDL-HPO', resnet_alg_names, val_metric_name) for task_infos, val_metric_name in [(grinsztajn_reg_task_infos, 'rmse'), (grinsztajn_class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'FTT-HPO', ftt_alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, class_task_infos, f'MLP-RTDL-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr') run_alg_selection(paths, config_10_1_0, class_task_infos, f'MLP-PLR-HPO_best-1-auc-ovr', plr_alg_names, '1-auc_ovr') run_alg_selection(paths, config_10_1_0, class_task_infos, f'ResNet-RTDL-HPO_best-1-auc-ovr', resnet_alg_names, '1-auc_ovr') run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'FTT-HPO_best-1-auc-ovr', ftt_alg_names, '1-auc_ovr') def run_tabr_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=32)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos alg_names = [f'TabR-HPO_step-{i}' for i in range(n_hpo_steps)] realtabr_alg_names = [f'RealTabR-HPO_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(grinsztajn_reg_task_infos, 'rmse'), (grinsztajn_class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'TabR-HPO', alg_names, val_metric_name, tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, task_infos, f'RealTabR-HPO', realtabr_alg_names, val_metric_name, tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'TabR-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr', tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'RealTabR-HPO_best-1-auc-ovr', 
realtabr_alg_names, '1-auc_ovr', tags=[tag], rerun=rerun) def run_gbdt_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=16)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for gbdt_name in ['XGB', 'LGBM', 'CatBoost']: alg_names = [f'{gbdt_name}-HPO_step-{i}' for i in range(n_hpo_steps)] job_mgr.add_jobs(all_task_infos, config_10_1_0, f'{gbdt_name}-HPO', AlgorithmSelectionWrapper([LoadResultsWrapper(alg_name) for alg_name in alg_names]), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, f'{gbdt_name}-HPO_best-1-auc-ovr', AlgorithmSelectionWrapper([LoadResultsWrapper(alg_name) for alg_name in alg_names], alg_sel_metric_name='1-auc_ovr'), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rf_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=16)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for task_infos, val_metric_name in [(grinsztajn_reg_task_infos, 'rmse'), (grinsztajn_class_task_infos, 'class_error')]: alg_names = [f'RF-HPO_step-{i}' for i in range(n_hpo_steps)] run_alg_selection(paths, config_10_1_0, task_infos, f'RF-HPO', alg_names, val_metric_name, tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'RF-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr', tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rtdl_default_configs(paths: Paths, tag: str = 'paper', rerun: bool =
False, with_mlp: bool = True, with_resnet: bool = True, only_meta_train: bool = False, only_meta_test: bool = False, tabzilla_defaults: bool = True, with_plr: bool = True, with_ftt: bool = True): # ca 50 min for meta-train-reg job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos if with_resnet: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'ResNet-RTDL-D-class_grinsztajn' if not tabzilla_defaults else 'ResNet-RTDL-D-class', ResNetRTDLInterfaceWrapper( **DefaultParams.RESNET_RTDL_D_CLASS_Grinsztajn if not tabzilla_defaults else DefaultParams.RESNET_RTDL_D_CLASS_TabZilla), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'ResNet-RTDL-D-reg_grinsztajn' if not tabzilla_defaults else 'ResNet-RTDL-D-reg', ResNetRTDLInterfaceWrapper( **DefaultParams.RESNET_RTDL_D_REG_Grinsztajn if not tabzilla_defaults else DefaultParams.RESNET_RTDL_D_REG_TabZilla), tags=[tag], rerun=rerun) if with_mlp: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-RTDL-D-class_grinsztajn' if not tabzilla_defaults else 'MLP-RTDL-D-class', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_RTDL_D_CLASS_Grinsztajn if not tabzilla_defaults else DefaultParams.MLP_RTDL_D_CLASS_TabZilla), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-RTDL-D-reg_grinsztajn' if not tabzilla_defaults else 'MLP-RTDL-D-reg', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_RTDL_D_REG_Grinsztajn if not tabzilla_defaults else DefaultParams.MLP_RTDL_D_REG_TabZilla), tags=[tag], rerun=rerun) if with_plr: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-PLR-D-class', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_PLR_D_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-PLR-D-reg', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_PLR_D_REG), tags=[tag], rerun=rerun) if with_ftt: job_mgr.add_jobs(grinsztajn_class_task_infos + train_class_task_infos, config_10_1_0, 'FTT-D-class', FTTransformerInterfaceWrapper( **DefaultParams.FTT_D_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(grinsztajn_reg_task_infos + train_reg_task_infos, config_10_1_0, 'FTT-D-reg', FTTransformerInterfaceWrapper( **DefaultParams.FTT_D_REG), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rtdl_rssc_default_configs(paths: Paths, tag: str = 'paper', rerun: bool = False, 
with_mlp: bool = True, with_resnet: bool = True, with_plr: bool = True, with_tabr: bool = True, with_ftt: bool = True, only_meta_train: bool = False, only_meta_test: bool = False): # ca 50 min for meta-train-reg (without TabR/FTT) # ca 8h30m for FTT (on meta-train + grinsztajn) job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos if with_resnet: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'ResNet-RTDL-D-class_rssc', ResNetRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.RESNET_RTDL_D_CLASS_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'ResNet-RTDL-D-reg_rssc', ResNetRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.RESNET_RTDL_D_REG_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if with_mlp: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-RTDL-D-class_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_RTDL_D_CLASS_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-RTDL-D-reg_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_RTDL_D_REG_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if with_plr: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-PLR-D-class_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_PLR_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-PLR-D-reg_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_PLR_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if with_tabr: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if 
with_ftt: job_mgr.add_jobs(grinsztajn_class_task_infos + train_class_task_infos, config_10_1_0, 'FTT-D-class_rssc', FTTransformerInterfaceWrapper( **utils.join_dicts(DefaultParams.FTT_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(grinsztajn_reg_task_infos + train_reg_task_infos, config_10_1_0, 'FTT-D-reg_rssc', FTTransformerInterfaceWrapper( **utils.join_dicts(DefaultParams.FTT_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_tabr_default_configs(paths: Paths, tag: str = 'paper', rerun: bool = False, only_meta_train: bool = False, only_meta_test: bool = False, start_split: int = 0, end_split: int = 10): # ca 50 min for meta-train-reg job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class', TabRInterfaceWrapper( **DefaultParams.TABR_S_D_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg', TabRInterfaceWrapper( **DefaultParams.TABR_S_D_REG), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_default_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # took 12h55s job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + 
test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(all_task_infos, config_10_1_0, 'LGBM-D', LGBMInterfaceWrapper(**DefaultParams.LGBM_D), tags=[tag], rerun=rerun) job_mgr.add_jobs(all_task_infos, config_10_1_0, 'XGB-D', XGBInterfaceWrapper(**DefaultParams.XGB_D), tags=[tag], rerun=rerun) job_mgr.add_jobs(all_task_infos, config_10_1_0, 'CatBoost-D', CatBoostInterfaceWrapper(**DefaultParams.CB_D), tags=[tag], rerun=rerun) # it was too bad to include in the plots # job_mgr.add_jobs(all_task_infos, config_10_1_0, # 'MLP-SKL-D', # SklearnMLPInterfaceWrapper(tfms=['mean_center', 'l2_normalize', 'one_hot']), # tags=[tag], rerun=rerun) job_mgr.add_jobs(all_task_infos, config_10_1_0, 'RF-SKL-D', RFInterfaceWrapper(tfms=['ordinal_encoding'], permute_ordinal_encoding=True), tags=[tag, 'paper_val_ce'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-PBB-D', # Probst, Boulesteix, and Bischl, "Tunability: Importance of ..." XGBInterfaceWrapper(n_estimators=4168, lr=0.018, min_child_weight=2.06, max_depth=13, reg_lambda=0.982, reg_alpha=1.113, subsample=0.839, colsample_bytree=0.752, colsample_bylevel=0.585, tree_method='hist', max_n_threads=64, tfms=['one_hot'], max_one_hot_cat_size=20), tags=['paper']) job_mgr.run_jobs(scheduler) def run_gbdts_hpo_tpe(paths: Paths, n_estimators: int = 1000, early_stopping_rounds: int = 300, tag: str = 'paper'): # this generates about 10GB of data # took 7h17m for n_estimators=2 # took about 6h30m for n_estimators=1 (but slightly more tasks were run for that because of the rerun=True) job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) config_5_1_0 = RunConfig(n_tt_splits=5, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for task_infos, config in [(train_task_infos, config_10_1_0), (test_task_infos, config_10_1_0)]: job_mgr.add_jobs(task_infos, config, f'XGB-HPO-TPE', XGBHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, early_stopping_rounds=early_stopping_rounds, tree_method='hist', space='grinsztajn'), tags=[tag]) job_mgr.add_jobs(task_infos, config, f'CatBoost-HPO-TPE', CatBoostHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, early_stopping_rounds=early_stopping_rounds, space='shwartz-ziv'), tags=[tag]) job_mgr.add_jobs(task_infos, config, f'LGBM-HPO-TPE', LGBMHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, early_stopping_rounds=early_stopping_rounds, space='catboost_quality_benchmarks'), tags=[tag]) job_mgr.run_jobs(scheduler) def
run_preprocessing_experiments(paths: Paths, tag: str = 'paper_preprocessing'): # this took 7h9m for just two different scikit-learn based transformation configurations! job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) for task_infos, defaults in [(train_class_task_infos, DefaultParams.RealMLP_TD_S_CLASS), (train_reg_task_infos, DefaultParams.RealMLP_TD_S_REG)]: job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-mc-rs-sc-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['median_center', 'robust_scale', 'smooth_clip', 'one_hot'] ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-mc-rs-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['median_center', 'robust_scale', 'one_hot'] ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-std-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['mean_center', 'l2_normalize', 'one_hot'], l2_normalize_eps=1e-30, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-std-sc-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['mean_center', 'l2_normalize', 'smooth_clip', 'one_hot'], l2_normalize_eps=1e-30, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-kdi1-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['kdi', 'one_hot'], kdi_alpha=1.0, max_n_vectorized=1, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-quantile-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['quantile', 'one_hot'], max_n_vectorized=1, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-quantiletabr-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['quantile_tabr', 'one_hot'], max_n_vectorized=1, ))), [tag]) job_mgr.run_jobs(scheduler) def run_all_ablations(paths: Paths, with_class: bool = True, with_reg: bool = True): run_ablations(paths, { 'default': dict(), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'lr-cos-decay': dict(lr_sched='cos'), 'lr-constant': dict(lr_sched='constant'), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'wd-0.0': dict(wd=0.0, wd_sched='constant', bias_wd_factor=0.0), 'wd-0.02': dict(wd=0.02, wd_sched='constant', bias_wd_factor=0.0), }, with_class=with_class, with_reg=with_reg) # run_ablations(paths, { # 'wd-0.01-flatcos': dict(wd=0.01, wd_sched='flat_cos', bias_wd_factor=0.0), # 'wd-0.01': dict(wd=0.01, wd_sched='constant', bias_wd_factor=0.0), # }, with_class=False, with_reg=with_reg) # run_ablations(paths, { # 'wd-0.0': dict(wd=0.0, wd_sched='constant', bias_wd_factor=0.0), # 'wd-0.01': dict(wd=0.01, wd_sched='constant', bias_wd_factor=0.0), # }, with_class=with_class, with_reg=False) run_ablations(paths, { 'pdrop-0.0': dict(p_drop=0.0, p_drop_sched='constant'), 'pdrop-0.15': dict(p_drop=0.15, p_drop_sched='constant'), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'no-front-scale': dict(first_layer_config=dict()), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'normal-init': dict(bias_init_mode='zeros', weight_init_mode='normal'), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 
'standard-param_no-wd': dict(weight_param='standard', bias_lr_factor=1 / 16, weight_lr_factor=1 / 16, wd=0.0), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'non-parametric-act': dict(use_parametric_act=False), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'act-relu': dict(act='relu'), 'act-mish': dict(act='mish') }, with_class=with_class, with_reg=False) run_ablations(paths, { 'act-relu': dict(act='relu'), 'act-selu': dict(act='selu') }, with_class=False, with_reg=with_reg) run_ablations(paths, { 'no-label-smoothing': dict(use_ls=False, ls_eps=0.0), }, with_class=with_class, with_reg=False) run_ablations(paths, { 'num-embeddings-plr': dict(plr_act_name='relu', plr_use_densenet=False, plr_use_cos_bias=False), 'num-embeddings-pl': dict(plr_act_name='linear', plr_use_densenet=False, plr_use_cos_bias=False), 'num-embeddings-none': dict(use_plr_embeddings=False) }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'beta2-0.999': dict(sq_mom=0.999), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'first-best-epoch': dict(use_last_best_epoch=False), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'no-cat-embs': dict(max_one_hot_cat_size=-1), }, with_class=with_class, with_reg=with_reg) def run_architecture_ablations(paths: Paths, tag: str = 'paper', rerun: bool = False, only_meta_train: bool = False, only_meta_test: bool = False, start_split: int = 0, end_split: int = 10): # ca 1h45m + 40m + 2h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos lr_grid_std = [1.5e-3, 7e-4, 1e-3, 4e-4, 2.5e-3, 4e-3, 7e-3, 1e-2, 1.5e-2] lr_grid_ntp = [0.04, 0.2, 0.1, 0.02, 0.07, 0.01, 0.3, 0.03, 0.4] mlp_rtdl_repr_config_class = dict( hidden_sizes=[128, 256, 128], p_drop=0.1, block_str='w-b-a-d', lr=2.5e-3, # will be changed later opt='adam', tfms=['median_center', 'robust_scale', 'smooth_clip', 'embedding'], embedding_size=8, batch_size=128, n_epochs=1000, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=20, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. 
/ np.sqrt(3.), bias_init_mode='pytorch-default', use_last_best_epoch=False, emb_init_mode='kaiming-uniform-t', ) mlp_rtdl_repr_config_reg = utils.join_dicts(mlp_rtdl_repr_config_class, dict( normalize_output=True, lr=1.5e-3)) mlp_rtdl_num_emb_repr_config_class = utils.join_dicts(mlp_rtdl_repr_config_class, dict( num_emb_type='plr', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, # todo: or pl embeddings? lr=2.5e-3, )) mlp_rtdl_num_emb_repr_config_reg = utils.join_dicts(mlp_rtdl_num_emb_repr_config_class, dict(normalize_output=True, lr=7e-4)) mlp_rtdl_pl_config_class = utils.join_dicts(mlp_rtdl_num_emb_repr_config_class, dict( num_emb_type='pl', lr=4e-3)) mlp_rtdl_pl_config_reg = utils.join_dicts(mlp_rtdl_pl_config_class, dict(normalize_output=True, lr=4e-4)) realmlp_arch_class = dict( hidden_sizes=[128, 256, 128], p_drop=0.1, block_str='w-b-a-d', opt='adam', tfms=['median_center', 'robust_scale', 'smooth_clip', 'embedding'], embedding_size=8, batch_size=128, n_epochs=1000, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=20, weight_init_mode='uniform', weight_init_gain=1. / np.sqrt(3.), bias_init_mode='pytorch-default', use_last_best_epoch=False, emb_init_mode='kaiming-uniform-t', lr=2e-2, num_emb_type='pbld', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, weight_param='ntk', bias_lr_factor=0.1, act='selu', use_parametric_act=True, act_lr_factor=0.1, add_front_scale=True, scale_lr_factor=6.0, ) realmlp_arch_reg = utils.join_dicts(realmlp_arch_class, dict(act='mish', normalize_output=True, lr=1e-2)) def add_jobs(name: str, config_class: dict, config_reg: dict, lr_grid: List[float], with_meta_test: bool = True): for task_infos, all_task_infos, task_type_name, config in [ (train_class_task_infos, class_task_infos, 'class', config_class), (train_reg_task_infos, reg_task_infos, 'reg', config_reg)]: for lr in lr_grid: job_mgr.add_jobs(task_infos, config_10_1_0, f'{name}_lr-{lr:g}', NNInterfaceWrapper(**utils.update_dict(config, dict(lr=lr))), tags=['paper_arch-lr-tuning'], rerun=rerun) if with_meta_test: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'{name}', NNInterfaceWrapper(**config), tags=['paper'], rerun=rerun) add_jobs('MLP-RTDL-reprod', mlp_rtdl_repr_config_class, mlp_rtdl_repr_config_reg, lr_grid_std) add_jobs('MLP-RTDL-reprod-plr', mlp_rtdl_num_emb_repr_config_class, mlp_rtdl_num_emb_repr_config_reg, lr_grid_std, with_meta_test=False) add_jobs('MLP-RTDL-reprod-pl', mlp_rtdl_pl_config_class, mlp_rtdl_pl_config_reg, lr_grid_std) add_jobs('MLP-RTDL-reprod-RealMLP-arch', realmlp_arch_class, realmlp_arch_reg, lr_grid_ntp) job_mgr.run_jobs(scheduler) def run_cumulative_ablations_new(paths: Paths, n_lrs: int = -1, tag: str = 'paper_cumulative_ablations_new', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False) # todo: it's false train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) # lr_grid_ntp = [0.01, 0.015, 0.025, 0.04, 0.07, 0.1, 0.2, 0.3, 0.4] # lr_grid_std = [4e-4, 7e-4, 1e-3, 1.5e-3, 2.5e-3, 4e-3, 7e-3, 1e-2, 2e-2] lr_grid_std = [1.5e-3, 7e-4, 1e-3, 4e-4, 2.5e-3, 4e-3, 7e-3, 1e-2, 1.5e-2] lr_grid_ntp = [0.04, 0.2, 0.1, 0.02, 0.07, 0.01, 0.3, 0.03, 0.4] if n_lrs > 0: lr_grid_std = lr_grid_std[:n_lrs] lr_grid_ntp = lr_grid_ntp[:n_lrs] 
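# How the cumulative ablations work: add_config (defined below) merges its add/add_class/add_reg
# dicts into the shared config_class/config_reg via the nonlocal declarations, so each named step
# inherits all changes from the preceding steps, going from the vanilla MLP towards a RealMLP-like
# configuration one modification at a time. Each step is swept over the learning-rate grids above,
# which are not sorted, so truncating them via n_lrs still covers several orders of magnitude.
# Minimal sketch of the accumulation pattern (illustrative only, not part of the experiments):
#   cfg = {}
#   for name, delta in [('a', dict(x=1)), ('b', dict(y=2))]:
#       cfg = utils.join_dicts(cfg, delta)  # step 'b' sees both x=1 and y=2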
config_class = dict() config_reg = dict() ablation_counter = 1 def add_config(name: str, lr_grid: List[float], add: Optional[Dict[str, Any]] = None, add_class: Optional[Dict[str, Any]] = None, add_reg: Optional[Dict[str, Any]] = None, run_this: bool = True): nonlocal ablation_counter nonlocal config_class nonlocal config_reg if add is not None: config_class = utils.join_dicts(config_class, add) config_reg = utils.join_dicts(config_reg, add) if add_class is not None: config_class = utils.join_dicts(config_class, add_class) if add_reg is not None: config_reg = utils.join_dicts(config_reg, add_reg) if run_this: for lr in lr_grid: for task_infos, task_type_name, config in [(train_class_task_infos, 'class', config_class), (train_reg_task_infos, 'reg', config_reg)]: job_mgr.add_jobs(task_infos, config_10_1_0, f'MLP-cumul-abl-new-{ablation_counter}-{task_type_name}_{name}_lr-{lr:g}', NNInterfaceWrapper(**utils.update_dict(config, dict(lr=lr))), tags=[tag], rerun=rerun) ablation_counter += 1 vanilla_config_class = dict( hidden_sizes=[256] * 3, p_drop=0.0, block_str='w-b-a-d', opt='adam', tfms=['quantile', 'embedding'], embedding_size=8, batch_size=256, n_epochs=256, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=40, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. / np.sqrt(3.), bias_init_mode='pytorch-default', max_n_vectorized=1, use_last_best_epoch=False, ) add_config('vanilla', lr_grid_std, add=vanilla_config_class, add_reg=dict(normalize_output=True), run_this=True) # quantile_tabr was not well-suited for vectorization, now we can vectorize add_config('robust-scale-smooth-clip', lr_grid_std, dict(tfms=['median_center', 'robust_scale', 'smooth_clip', 'embedding'], max_n_vectorized=50)) add_config('one-hot-small-cat', lr_grid_std, dict(tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], max_one_hot_cat_size=9)) add_config('no-early-stop', lr_grid_std, dict(use_early_stopping=False)) add_config('last-best-epoch', lr_grid_std, dict(use_last_best_epoch=True)) add_config('lr-multi-cycle', lr_grid_std, dict(lr_sched='coslog4')) add_config('beta2-0.95', lr_grid_std, dict(sq_mom=0.95)) add_config('label-smoothing', lr_grid_std, add_class=dict(use_ls=True, ls_eps=0.1)) add_config('output-clipping', lr_grid_std, add_reg=dict(clamp_output=True)) add_config('ntp', lr_grid_ntp, dict(weight_param='ntk', bias_lr_factor=0.1)) add_config('different-act', lr_grid_ntp, add_class=dict(act='selu'), add_reg=dict(act='mish')) add_config('param-act', lr_grid_ntp, dict(use_parametric_act=True, act_lr_factor=0.1)) add_config('front-scale', lr_grid_ntp, dict(add_front_scale=True, scale_lr_factor=6.0)) add_config('num-emb-pl', lr_grid_ntp, dict(num_emb_type='pl', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1)) add_config('num-emb-pbld', lr_grid_ntp, dict(num_emb_type='pbld')) add_config('alt-pdrop-0.15', lr_grid_ntp, dict(p_drop=0.15)) add_config('alt-pdrop-flat-cos', lr_grid_ntp, dict(p_drop_sched='flat_cos')) add_config('alt-wd-0.02', lr_grid_ntp, dict(wd=0.02, bias_wd_factor=0.0)) add_config('alt-wd-flat-cos', lr_grid_ntp, dict(wd_sched='flat_cos')) add_config('alt-bias-init-he+5', lr_grid_ntp, dict(bias_init_mode='he+5')) add_config('alt-weight-init-std', lr_grid_ntp, dict(weight_init_mode='std', weight_init_gain=1.0)) # add_config('bias-init-he+5', lr_grid_ntp, dict(bias_init_mode='he+5')) # add_config('weight-init-std', lr_grid_ntp, dict(weight_init_mode='std', 
weight_init_gain=1.0)) # add_config('pdrop-0.15', lr_grid_ntp, dict(p_drop=0.15)) # add_config('pdrop-flat-cos', lr_grid_ntp, dict(p_drop_sched='flat_cos')) # add_config('wd-0.02', lr_grid_ntp, dict(wd=0.02, bias_wd_factor=0.0)) # add_config('wd-flat-cos', lr_grid_ntp, dict(wd_sched='flat_cos')) job_mgr.run_jobs(scheduler) if __name__ == '__main__': paths = Paths.from_env_variables() run_td_configs(paths, tag='paper', rerun=False) run_default_configs(paths, tag='paper', rerun=False) run_rtdl_default_configs(paths, tag='paper', tabzilla_defaults=True) run_tabr_configs(paths) run_gbdt_rs_configs() run_rf_rs_configs() for i in range(50): if (i + 1) % 10 == 0: run_rtdl_tuning_configs(paths, n_steps=i + 1, with_resnet=True, only_meta_train=False) for i in range(50): if (i + 1) % 10 == 0: run_realmlp_tuning_configs(paths, n_steps=i + 1, tag='paper_mlp-hpo', rerun=False) for n_steps in [1, 2, 5, 10, 20, 30, 40, 50]: run_tabr_tuning_configs(paths, n_steps=n_steps) run_rtdl_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper') run_gbdt_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper') run_rf_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper') run_realmlp_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper', rerun=False) run_tabr_hpo_alg_selection(paths, n_hpo_steps=50) run_ensemble_configs(paths, tag='paper') # ----- ablations (mostly for the appendix) ----- for n_lrs in [10]: # range(1, 10): run_cumulative_ablations_new(paths, n_lrs=n_lrs) run_rtdl_rssc_default_configs(paths, tag='paper') run_default_ce_configs(paths) run_nns_no_ls(paths) run_all_ablations(paths) run_architecture_ablations(paths) run_preprocessing_experiments(paths) run_refit_configs(paths, tag='paper', rerun=False) run_early_stopping_configs(paths) run_brier_stopping_configs(paths) run_cross_entropy_stopping_configs(paths) ================================================ FILE: scripts/run_experiments_unused.py ================================================ from typing import List, Optional, Dict, Any import numpy as np from pytabkit.bench.alg_wrappers.interface_wrappers import RandomParamsNNInterfaceWrapper, NNInterfaceWrapper, \ AutoGluonModelInterfaceWrapper, CatBoostInterfaceWrapper, LGBMInterfaceWrapper, XGBInterfaceWrapper, \ XGBHyperoptInterfaceWrapper, CatBoostHyperoptInterfaceWrapper, LGBMHyperoptInterfaceWrapper from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.alg_interfaces.nn_interfaces import RealMLPParamSampler from pytabkit.models.sklearn.default_params import DefaultParams def run_extra_realmlp_tuning_configs(paths: Paths, n_steps: int = 50, tag: str = 'paper_realmlp-hpo-clr', rerun: bool = False): # 1h8m for 5 steps of clr on meta-train. 2h40m for 5 steps of ms on meta-train.
job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-clr_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='clr'), tags=['realmlp-hpo-clr'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigma_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigma'), tags=['realmlp-hpo-ms'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadim_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadim'), tags=['realmlp-hpo-msd'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadimreg_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadimreg'), tags=['realmlp-hpo-msdr'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadimsize_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadimsize'), tags=['realmlp-hpo-msds'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadimlr_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadimlr'), tags=['realmlp-hpo-msdl'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_mlp_random_configs(paths: Paths, n_steps: int = 50, tag: str = 'mlp_random', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos sampler = RealMLPParamSampler(is_classification=False) for step_idx in range(n_steps): params = sampler.sample_params(seed=step_idx) relevant_params = {key: value for key, value in params.items() if key in ['num_emb_type', 'add_front_scale', 'lr', 'p_drop', 'wd', 'plr_sigma', 'act', 'hidden_sizes', 'ls_eps']} config_str = '' for key, value in relevant_params.items(): if key == 'hidden_sizes': value = 
f'{value[0]}x{len(value)}' config_str = config_str + '_' + key.replace('_', '-') + '-' + str(value) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'RealMLP-reg' + config_str, NNInterfaceWrapper(**params), tags=[tag], rerun=rerun) sampler = RealMLPParamSampler(is_classification=True) for step_idx in range(n_steps): params = sampler.sample_params(seed=step_idx) relevant_params = {key: value for key, value in params.items() if key in ['num_emb_type', 'add_front_scale', 'lr', 'p_drop', 'wd', 'plr_sigma', 'act', 'hidden_sizes', 'ls_eps']} config_str = '' for key, value in relevant_params.items(): if key == 'hidden_sizes': value = f'{value[0]}x{len(value)}' config_str = config_str + '_' + key.replace('_', '-') + '-' + str(value) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'RealMLP-class' + config_str, NNInterfaceWrapper(**params), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_mlp_random_seed_configs(paths: Paths, n_steps: int = 50, tag: str = 'mlp_random_seeds', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'RealMLP-reg_seed-offset-{step_idx}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG, random_seed_offset=step_idx), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_additional_configs(paths: Paths, tag: str = 'paper_additional', rerun: bool = False): # not in the paper # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos # run class-on-reg and reg-on-class job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, 'RealMLP-TD-class-on-reg', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_CLASS, 
dict(use_ls=False, ls_eps=0.0, normalize_output=True, clamp_output=True))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, 'RealMLP-TD-reg-on-class', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_REG, dict(use_ls=True, ls_eps=0.1, normalize_output=False, clamp_output=False))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, 'RealMLP-TD-S-class-on-reg', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_S_CLASS, dict(use_ls=False, ls_eps=0.0, normalize_output=True))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, 'RealMLP-TD-S-reg-on-class', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_S_REG, dict(use_ls=True, ls_eps=0.1, normalize_output=False))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_only-one-hot', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_CLASS, dict(max_one_hot_cat_size=-1))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealMLP-TD-reg_only-one-hot', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_REG, dict(max_one_hot_cat_size=-1))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_seed_opt_configs(paths: Paths, random_seed_offset: int, tag: str = 'paper', rerun: bool = False): # not used in the paper # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'RealMLP-TD-class_alt-seed-{random_seed_offset}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_CLASS, random_seed_offset=random_seed_offset), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'RealMLP-TD-reg_alt-seed-{random_seed_offset}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG, random_seed_offset=random_seed_offset), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_ag_nn_configs(paths: Paths, tag: str = 'paper', rerun: bool = False, only_meta_train: bool = False, only_meta_test: bool = False, with_ftt: bool = True, start_split: int = 0, end_split: int = 10): # ca 50 min for meta-train-reg job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) 
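# Each runner in these scripts rebuilds the same unions of task infos by hand
# (meta-train/meta-test x class/reg, plus the grinsztajn collections below). A
# sketch of a hypothetical helper (not part of pytabkit) that would centralize
# this, using only TaskCollection.from_name and load_infos as above:
def _load_task_groups_sketch(paths: Paths) -> Dict[str, Any]:
    groups = {name: TaskCollection.from_name(name, paths).load_infos(paths)
              for name in ['meta-train-class', 'meta-train-reg',
                           'meta-test-class', 'meta-test-reg']}
    # derived unions that the runners re-create inline
    groups['class'] = groups['meta-train-class'] + groups['meta-test-class']
    groups['reg'] = groups['meta-train-reg'] + groups['meta-test-reg']
    groups['all'] = groups['class'] + groups['reg']
    return groups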
grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos # fastai on meta-train took 40 GPU-minutes # MLP-AGT took 1h31m on one RTX 3090 # FT-T with some RAM estimates: ca 44m + 17h + 40m job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-FAI-D-class', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='FASTAI', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-FAI-D-reg', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='FASTAI', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-AGT-D-class', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='NN_TORCH', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-AGT-D-reg', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='NN_TORCH', max_n_models_per_type=1), tags=[tag], rerun=rerun) if with_ftt: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'FT-Transformer-D-class', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default_FTT', model_types='FT_TRANSFORMER', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'FT-Transformer-D-reg', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default_FTT', model_types='FT_TRANSFORMER', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_trees_custom(paths: Paths, n_estimators: int, tag: str = 'paper', with_defaults: bool = True): # only for speed-testing # this generates about 10GB of data # took 7h17m for n_estimators=2 # took about 6h30m for n_estimators=1 (but slightly more tasks were run for that because of the rerun=True) # the large main overhead is probably mainly for evaluating the metrics job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(all_task_infos, config_10_1_0, f'XGB_hyperopt-50_grinsztajn_nest-{n_estimators}', XGBHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, tree_method='hist', space='grinsztajn'), tags=[tag], rerun=True) 
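# The alg names in this file encode hyperparameters as '_key-value' suffixes
# (e.g. f'XGB-TD-class_nest-{n_estimators}'). An illustrative round-trip for that
# convention; both helpers are hypothetical (not part of pytabkit), and the
# parser assumes keys and the base name contain no '_':
def _make_alg_name_sketch(base: str, **params) -> str:
    return base + ''.join(f'_{k}-{v:g}' if isinstance(v, float) else f'_{k}-{v}'
                          for k, v in params.items())

def _parse_alg_name_sketch(name: str):
    base, *parts = name.split('_')
    return base, dict(p.rsplit('-', 1) for p in parts)

# _make_alg_name_sketch('XGB-TD-class', nest=2) == 'XGB-TD-class_nest-2'
# _parse_alg_name_sketch('XGB-TD-class_nest-2') == ('XGB-TD-class', {'nest': '2'})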
job_mgr.add_jobs(all_task_infos, config_10_1_0, f'CatBoost_hyperopt-50_shwartz-ziv_nest-{n_estimators}', CatBoostHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, space='shwartz-ziv'), tags=[tag], rerun=True) job_mgr.add_jobs(all_task_infos, config_10_1_0, f'LGBM_hyperopt-50_cqb_nest-{n_estimators}', LGBMHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, space='catboost_quality_benchmarks'), tags=[tag], rerun=True) if with_defaults: # optimized default parameters # classification job_mgr.add_jobs(class_task_infos, config_10_1_0, f'LGBM-TD-class_nest-{n_estimators}', LGBMInterfaceWrapper(**utils.update_dict(DefaultParams.LGBM_TD_CLASS, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(class_task_infos, config_10_1_0, f'XGB-TD-class_nest-{n_estimators}', XGBInterfaceWrapper(**utils.update_dict(DefaultParams.XGB_TD_CLASS, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(class_task_infos, config_10_1_0, f'CatBoost-TD-class_nest-{n_estimators}', CatBoostInterfaceWrapper(**utils.update_dict(DefaultParams.CB_TD_CLASS, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) # regression job_mgr.add_jobs(reg_task_infos, config_10_1_0, f'LGBM-TD-reg_nest-{n_estimators}', LGBMInterfaceWrapper(**utils.update_dict(DefaultParams.LGBM_TD_REG, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(reg_task_infos, config_10_1_0, f'XGB-TD-reg_nest-{n_estimators}', XGBInterfaceWrapper(**utils.update_dict(DefaultParams.XGB_TD_REG, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(reg_task_infos, config_10_1_0, f'CatBoost-TD-reg_nest-{n_estimators}', CatBoostInterfaceWrapper(**utils.update_dict(DefaultParams.CB_TD_REG, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.run_jobs(scheduler) def run_cumulative_ablations(paths: Paths, tag: str = 'paper_cumulative_ablations', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False) # todo: it's false train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) # lr_grid_ntp = [0.01, 0.015, 0.025, 0.04, 0.07, 0.1, 0.2, 0.3, 0.4] # lr_grid_std = [4e-4, 7e-4, 1e-3, 1.5e-3, 2.5e-3, 4e-3, 7e-3, 1e-2, 2e-2] lr_grid_std = [2e-3, 4e-3] lr_grid_ntp = [0.04, 0.2] config_class = dict() config_reg = dict() ablation_counter = 1 def add_config(name: str, lr_grid: List[float], add: Optional[Dict[str, Any]] = None, add_class: Optional[Dict[str, Any]] = None, add_reg: Optional[Dict[str, Any]] = None, run_this: bool = True): nonlocal ablation_counter nonlocal config_class nonlocal config_reg if add is not None: config_class = utils.join_dicts(config_class, add) config_reg = utils.join_dicts(config_reg, add) if add_class is not None: config_class = utils.join_dicts(config_class, add_class) if add_reg is not None: config_reg = utils.join_dicts(config_reg, add_reg) if run_this: for lr in lr_grid: for task_infos, task_type_name, config in [(train_class_task_infos, 'class', config_class), (train_reg_task_infos, 'reg', config_reg)]: job_mgr.add_jobs(task_infos, config_10_1_0, f'MLP-cumul-abl-{ablation_counter}-{task_type_name}_{name}_lr-{lr:g}', NNInterfaceWrapper(**utils.update_dict(config, dict(lr=lr))), tags=[tag], rerun=rerun) ablation_counter += 1 mlp_rtdl_repr_config_class = dict(
hidden_sizes=[128, 256, 128], p_drop=0.1, block_str='w-b-a-d', lr=1e-3, # will be overridden by the lrs from the grid anyway opt='adam', tfms=['quantile_tabr', 'embedding'], embedding_size=8, batch_size=128, n_epochs=1000, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=20, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. / np.sqrt(3.), bias_init_mode='pytorch-default', max_n_vectorized=1, use_last_best_epoch=False, emb_init_mode='kaiming-uniform-t', ) # for reproducing: weight decay # initialize missing embeddings to zero # have a different early stopping tolerance threshold # MLP-RTDL also uses the two-output + cross-entropy thing # hard to reproduce: handling unknown classes with different embedding category initialized to zero # todo: include all lr factors etc. add_config('rtdl-d-reprod', [1e-3], add=mlp_rtdl_repr_config_class, add_reg=dict(normalize_output=True), run_this=True) add_config('tune-lr', lr_grid_std) add_config('max-epochs-256', lr_grid_std, dict(n_epochs=256)) add_config('batch-size-256', lr_grid_std, dict(batch_size=256)) add_config('hidden-256x3', lr_grid_std, dict(hidden_sizes=[256] * 3)) add_config('normal-emb-init', lr_grid_std, dict(emb_init_mode='normal')) add_config('one-hot-small-cat', lr_grid_std, dict(tfms=['quantile_tabr', 'one_hot', 'embedding'], max_one_hot_cat_size=9)) # quantile_tabr was not well-suited for vectorization, now we can vectorize add_config('robust-scale-smooth-clip', lr_grid_std, dict(tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], max_n_vectorized=50)) add_config('no-early-stop', lr_grid_std, dict(use_early_stopping=False)) add_config('last-best-epoch', lr_grid_std, dict(use_last_best_epoch=True)) add_config('lr-multi-cycle', lr_grid_std, dict(lr_sched='coslog4')) add_config('beta2-0.95', lr_grid_std, dict(sq_mom=0.95)) add_config('label-smoothing', lr_grid_std, add_class=dict(use_ls=True, ls_eps=0.1)) add_config('output-clipping', lr_grid_std, add_reg=dict(clamp_output=True)) add_config('ntp', lr_grid_ntp, dict(weight_param='ntk', bias_lr_factor=0.1)) add_config('weight-init-std', lr_grid_ntp, dict(weight_init_mode='std', weight_init_gain=1.0)) add_config('bias-init-he+5', lr_grid_ntp, dict(bias_init_mode='he+5')) add_config('different-act', lr_grid_ntp, add_class=dict(act='selu'), add_reg=dict(act='mish')) add_config('param-act', lr_grid_ntp, dict(use_parametric_act=True, act_lr_factor=0.1)) add_config('front-scale', lr_grid_ntp, dict(add_front_scale=True, scale_lr_factor=6.0)) add_config('num-emb-pl', lr_grid_ntp, dict(num_emb_type='pl', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1)) add_config('num-emb-pbld', lr_grid_ntp, dict(num_emb_type='pbld')) add_config('pdrop-0.15', lr_grid_ntp, dict(p_drop=0.15)) add_config('pdrop-flat-cos', lr_grid_ntp, dict(p_drop_sched='flat_cos')) add_config('wd-0.02', lr_grid_ntp, dict(wd=0.02, bias_wd_factor=0.0)) add_config('wd-flat-cos', lr_grid_ntp, dict(wd_sched='flat_cos'), run_this=True) job_mgr.run_jobs(scheduler) pass if __name__ == '__main__': pass # ----- not in the paper, only experimental ----- # for i in range(50): # if (i + 1) % 5 == 0: # run_extra_realmlp_tuning_configs(paths, n_steps=i + 1) # run_additional_configs(paths) # run_ag_nn_configs(paths, tag='paper', only_meta_train=True, with_ftt=True) # run_mlp_random_configs(paths, n_steps=50) # not in the paper # run_mlp_random_seed_configs(paths, n_steps=20) # not in the paper ## 
run_seed_opt_configs(paths, random_seed_offset=1, tag='paper_seeds') # run_cumulative_ablations(paths) ================================================ FILE: scripts/run_probclass_experiments.py ================================================ import copy import time from typing import List, Optional, Dict, Any import numpy as np import pandas as pd import sklearn import torch from pytabkit.bench.alg_wrappers.interface_wrappers import RandomParamsNNInterfaceWrapper, \ RandomParamsXGBInterfaceWrapper, LoadResultsWrapper, NNInterfaceWrapper, XGBInterfaceWrapper from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.run.results import ResultManager from pytabkit.bench.run.task_execution import TabBenchJobManager, RunConfig, run_alg_selection from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.data.data import TaskType from pytabkit.models.data.splits import SplitInfo from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.metrics import Metrics class ProbclassExperiments: def __init__(self, paths: Paths, n_tt_splits: int, n_cv: int, n_hpo_steps: int, hpo_models: List[str], default_models: List[str]): self.paths = paths self.n_tt_splits = n_tt_splits self.n_cv = n_cv self.n_hpo_steps = n_hpo_steps self.hpo_models = hpo_models self.default_models = default_models self.job_mgr = None self.scheduler = None self.config = None self.task_infos = None self.val_metric_names = None self.calib_options = None self.hpo_names = None def setup(self): # don't do this in the constructor so we have a new job_mgr etc. 
every time (to be safe) self.job_mgr = TabBenchJobManager(self.paths) self.scheduler = SimpleJobScheduler(RayJobManager()) metrics = Metrics(metric_names=[ 'cross_entropy', 'brier', 'n_cross_entropy', 'n_brier', 'logloss-clip1e-06', 'smece', 'ece-15', 'rmsce-15', 'mce-15', 'class_error', '1-mcc', '1-auroc-ovr', 'ref-ll-ts', 'ref-br-ts', 'cal-ll-ts', 'cal-br-ts', ], val_metric_name='logloss', # probably unused anyway task_type=TaskType.CLASSIFICATION) self.config = RunConfig(n_tt_splits=self.n_tt_splits, n_cv=self.n_cv, n_refit=0, save_y_pred=True, metrics=metrics, train_fraction=0.8) self.task_infos = TaskCollection.from_name('talent-class-small', self.paths).load_infos(self.paths) self.val_metric_names = ['cross_entropy', 'brier', 'class_error', '1-auroc-ovr', 'ref-ll-ts', 'ref-br-ts'] self.hpo_names = copy.copy(self.hpo_models) if self.n_cv != 1: self.hpo_names = [bn + f'-cv{self.n_cv}' for bn in self.hpo_names] self.calib_options = {'ts-mix': dict(calibration_method='temp-scaling', calibrate_with_mixture=True)} def run_hpo_configs(self, n_hpo_steps: Optional[int] = None, rerun: bool = False): # for RealMLP: # 10 steps with 2 splits and n_cv=1: 2h5m # 2 steps with 2 splits and n_cv=5: 48m # 10 steps with 2 splits and n_cv=5: 4h6m # -> run 50 steps with 10 splits and n_cv=1: a bit more than 50h # for XGB: # 10 steps with 2 splits and n_cv=5: 10h22m (but waiting long for results on volkert, otherwise more like 6h30m) # 20 steps with 2 splits and n_cv=1: 3h # -> run 50 steps with 10 splits and n_cv=1: 37h self.setup() if n_hpo_steps is None: n_hpo_steps = self.n_hpo_steps # tag = f'paper_hpo-cv{n_cv}' if n_cv != 1 else 'paper_hpo' tag = 'paper_hpo' cv_str = f'-cv{self.n_cv}' if self.n_cv != 1 else '' for step_idx in range(n_hpo_steps): for base_name in self.hpo_names: if base_name.startswith('RealMLP-HPO'): self.job_mgr.add_jobs(self.task_infos, self.config, f'RealMLP-HPO{cv_str}_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='probclass', val_metric_names=self.val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('XGB-HPO'): self.job_mgr.add_jobs(self.task_infos, self.config, f'XGB-HPO{cv_str}_step-{step_idx}', RandomParamsXGBInterfaceWrapper(model_idx=step_idx, hpo_space_name='probclass', n_estimators=1000, early_stopping_rounds=1000, val_metric_names=self.val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('MLP-HPO'): self.job_mgr.add_jobs(self.task_infos, self.config, f'MLP-HPO{cv_str}_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='probclass-mlp', val_metric_names=self.val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_hpo_alg_selection(self, rerun: bool = False): tag = 'paper' self.setup() for base_name in self.hpo_names: for val_metric_name in self.val_metric_names: alg_names = [f'{base_name}_step-{i}_val-{val_metric_name}' for i in range(self.n_hpo_steps)] run_alg_selection(self.paths, self.config, self.task_infos, f'{base_name}-{self.n_hpo_steps}_val-{val_metric_name}', alg_names, val_metric_name, tags=[tag + '_' + base_name], rerun=rerun) def run_hpo_calibration_configs(self, rerun: bool = False): tag = 'paper' self.setup() for base_name in self.hpo_names: for calib_name, calib_params in self.calib_options.items(): for val_metric_name in self.val_metric_names: alg_name = f'{base_name}-{self.n_hpo_steps}_val-{val_metric_name}' self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_{calib_name}',
LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_step_calibration_configs(self, rerun: bool = False): # took 1h10m for 20 steps and 2 tt splits of RealMLP-HPO tag = 'paper_hpo-calib' self.setup() for calib_name, calib_params in self.calib_options.items(): for val_metric_name in self.val_metric_names: for step_idx in range(self.n_hpo_steps): for base_name in self.hpo_names: alg_name = f'{base_name}_step-{step_idx}_val-{val_metric_name}' self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_{calib_name}', LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_default_configs(self, rerun: bool = False): tag = 'paper' cv_str = f'-cv{self.n_cv}' if self.n_cv != 1 else '' self.setup() val_metric_names = self.val_metric_names + ['ref-ll-ts-cv5', 'ref-ll-is'] for base_name in self.default_models: if base_name.startswith('RealMLP-TD'): self.job_mgr.add_jobs(self.task_infos, self.config, f'RealMLP-TD{cv_str}', NNInterfaceWrapper(**utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict( use_ls=False, val_metric_names=val_metric_names, ))), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('XGB-D'): self.job_mgr.add_jobs(self.task_infos, self.config, f'XGB-D{cv_str}', XGBInterfaceWrapper(**DefaultParams.XGB_D, val_metric_names=val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('MLP-D'): self.job_mgr.add_jobs(self.task_infos, self.config, f'MLP-D{cv_str}', NNInterfaceWrapper(**DefaultParams.VANILLA_MLP_CLASS, val_metric_names=val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_default_calibration_configs(self, rerun: bool = False): tag = 'paper' self.setup() val_metric_names = self.val_metric_names + ['ref-ll-ts-cv5', 'ref-ll-is'] for base_name in self.default_models: for calib_name, calib_params in self.calib_options.items(): for val_metric_name in val_metric_names: alg_name = f'{base_name}_val-{val_metric_name}' self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_{calib_name}', LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) @staticmethod def get_extended_calib_methods() -> Dict[str, Dict[str, Any]]: return { 'ts': dict(calibration_method='temp-scaling'), 'ts-mix': dict(calibration_method='temp-scaling', calibrate_with_mixture=True), 'ag-ts': dict(calibration_method='autogluon-ts'), 'ag-ts-mix': dict(calibration_method='autogluon-ts', calibrate_with_mixture=True), 'ag-inv-ts': dict(calibration_method='autogluon-inv-ts'), 'ag-inv-ts-mix': dict(calibration_method='autogluon-inv-ts', calibrate_with_mixture=True), 'torchunc-ts': dict(calibration_method='torchunc-ts'), 'torchunc-ts-mix': dict(calibration_method='torchunc-ts', calibrate_with_mixture=True), 'torchcal-ts': dict(calibration_method='torchcal-ts'), 'torchcal-ts-mix': dict(calibration_method='torchcal-ts', calibrate_with_mixture=True), 'guo-ts': dict(calibration_method='guo-ts'), 'guo-ts-mix': dict(calibration_method='guo-ts', calibrate_with_mixture=True), 'ir': dict(calibration_method='isotonic'), 'ir-mix': dict(calibration_method='isotonic', calibrate_with_mixture=True), } def run_calibration_benchmark(self, rerun: bool = False): tag = 'paper_calib-bench' self.setup() alg_name = 'XGB-D_val-class_error' calib_methods = self.get_extended_calib_methods()
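# Most entries in get_extended_calib_methods are temperature-scaling variants. A
# minimal standalone sketch of plain temperature scaling on validation logits
# (illustrative only; the benchmark itself goes through
# probmetrics.calibrators.get_calibrator instead):
def _fit_temperature_sketch(logits: torch.Tensor, labels: torch.Tensor,
                            n_steps: int = 200, lr: float = 0.01) -> float:
    log_t = torch.zeros(1, requires_grad=True)  # optimize log(T) so T stays positive
    opt = torch.optim.Adam([log_t], lr=lr)
    for _ in range(n_steps):
        opt.zero_grad()
        loss = torch.nn.functional.cross_entropy(logits / log_t.exp(), labels)
        loss.backward()
        opt.step()
    return log_t.exp().item()  # divide logits by this temperature before softmax

# usage sketch: T = _fit_temperature_sketch(val_logits, val_labels)
#               calibrated = torch.softmax(test_logits / T, dim=-1)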
for calib_name, calib_params in calib_methods.items(): self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_calib-bench_{calib_name}', LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_calibration_timing(self, rerun: bool = False): import probmetrics.calibrators from probmetrics.distributions import CategoricalLogits self.setup() results_list = [] csv_path = self.paths.base() / 'calib_times' / 'times.csv' if utils.existsFile(csv_path) and not rerun: return alg_name = 'XGB-D_val-class_error' calib_methods = self.get_extended_calib_methods() for i, task_info in enumerate(self.task_infos): print(f'Running calibration timing on {task_info.task_desc} ({i+1}/{len(self.task_infos)})') ds = task_info.load_task(self.paths).ds y_full = ds.tensors['y'].squeeze(-1) random_splits = task_info.get_random_splits(self.n_tt_splits, train_fraction=self.config.train_fraction, trainval_fraction=self.config.trainval_fraction) for split_idx in range(self.n_tt_splits): random_split: SplitInfo = random_splits[split_idx] trainval_split = random_split.splitter.split_ds(ds) trainval_idxs = trainval_split.get_sub_idxs(0) trainval_ds = trainval_split.get_sub_ds(0) sub_splits = random_split.get_sub_splits(trainval_ds, n_splits=self.n_cv, is_cv=True) path = self.paths.results_alg_task_split(task_info.task_desc, alg_name, n_cv=self.n_cv, split_type=SplitType.RANDOM, split_id=split_idx) rm = ResultManager.load(path, load_other=False, load_preds=True) y_logits_torch = torch.as_tensor(rm.y_preds_cv, dtype=torch.float32) for cv_idx in range(self.n_cv): sub_split = sub_splits[cv_idx] val_idxs = trainval_idxs[sub_split.get_sub_idxs(0)] y_val = y_full[val_idxs] y_pred_val = CategoricalLogits(y_logits_torch[cv_idx, val_idxs]) for calib_name, calib_params in calib_methods.items(): cal = probmetrics.calibrators.get_calibrator(**calib_params) if i == 0 and split_idx == 0 and cv_idx == 0: # dry run to avoid measuring import times cal_tmp = sklearn.base.clone(cal) cal_tmp.fit_torch(y_pred_val, y_val) start_time = time.time() cal.fit_torch(y_pred_val, y_val) end_time = time.time() results_list.append(dict( alg_name=alg_name, calib_name=calib_name, task=str(task_info.task_desc), n_val=len(val_idxs), tt_split_idx=split_idx, cv_split_idx=cv_idx, time=end_time - start_time)) results_df = pd.DataFrame(results_list) utils.ensureDir(csv_path) results_df.to_csv(csv_path) if __name__ == '__main__': n_hpo_steps = 30 n_tt_splits = 5 n_cv = 1 paths = Paths.from_env_variables() exp = ProbclassExperiments(paths=paths, n_tt_splits=n_tt_splits, n_cv=n_cv, n_hpo_steps=n_hpo_steps, hpo_models=['MLP-HPO', 'XGB-HPO', 'RealMLP-HPO'], default_models=['MLP-D', 'XGB-D', 'RealMLP-TD']) exp.run_default_configs() exp.run_default_calibration_configs() # took 9h for 20 steps with 5 splits for MLP-HPO # for RealMLP + XGB-HPO: 9h45m + 1h34m + ...
# 30 hpo steps with 5 splits for MLP + RealMLP + XGB: 9h + 9h45m + 1h34m + 17h52m = 20h19m + 17h52m = 38h11m exp.run_hpo_configs() exp.run_hpo_alg_selection() exp.run_hpo_calibration_configs() exp.run_calibration_timing() exp.run_calibration_benchmark() # not used in the paper # exp.run_step_calibration_configs() ================================================ FILE: scripts/run_single_task.py ================================================ import time import numpy as np import torch from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskPackage, TaskDescription from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models import utils from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.logging import StdoutLogger from pytabkit.bench.alg_wrappers.interface_wrappers import NNInterfaceWrapper, MLPRTDLInterfaceWrapper, ResNetRTDLInterfaceWrapper, \ TabRInterfaceWrapper from pytabkit.bench.alg_wrappers.extra_interface_wrappers import IterativeImportanceNNInterfaceWrapper, \ IterativeWeightNNInterfaceWrapper, IterativeReinitNNInterfaceWrapper from pytabkit.models.training.metrics import Metrics def run_example(paths: Paths): start_time = time.time() use_gpu = torch.cuda.is_available() wrapper = NNInterfaceWrapper(**utils.join_dicts(DefaultParams.RealMLP_TD_REG)) task_info = TaskDescription('uci-reg', 'parkinson_motor').load_info(paths) print('n_samples:', task_info.n_samples) print('n_cont:', task_info.tensor_infos['x_cont'].get_n_features()) print('x_cat cat sizes:', task_info.tensor_infos['x_cat'].get_cat_sizes()) print('n_classes:', task_info.tensor_infos['y'].get_cat_sizes()) if task_info.tensor_infos['y'].get_cat_sizes() > 0: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)) print(f'class frequencies: {class_frequencies.numpy()}') is_nn = (isinstance(wrapper, NNInterfaceWrapper) or isinstance(wrapper, MLPRTDLInterfaceWrapper) or isinstance(wrapper, ResNetRTDLInterfaceWrapper) or isinstance(wrapper, IterativeImportanceNNInterfaceWrapper) or isinstance(wrapper, IterativeWeightNNInterfaceWrapper) or isinstance(wrapper, IterativeReinitNNInterfaceWrapper) or isinstance(wrapper, TabRInterfaceWrapper)) use_gpu = use_gpu and is_nn print(f'Running on task {task_info.task_desc}') if is_nn: split_infos = task_info.get_random_splits(10)[0:1] task_package = TaskPackage(task_info, split_infos=split_infos, n_cv=1, n_refit=0, paths=paths, rerun=False, alg_name='test', save_y_pred=False) else: split_infos = task_info.get_random_splits(10)[1:2] task_package = TaskPackage(task_info, split_infos=split_infos, n_cv=1, n_refit=0, paths=paths, rerun=True, alg_name='test', save_y_pred=False) logger = StdoutLogger(verbosity_level=2) metric_name = Metrics.default_eval_metric_name(task_info.task_type) required_resources = wrapper.get_required_resources(task_package) print(f'Predicted time usage in s: {required_resources.time_s:g}') print(f'Predicted CPU RAM usage in GB: {required_resources.cpu_ram_gb:g}') print(f'Requested n_threads: {required_resources.n_threads:g}') # metric_name = '1-auroc' gpu_usages = np.array([1.0]) if use_gpu and is_nn else np.array([], dtype=np.float32) gpu_rams_gb = np.array([5.0]) if use_gpu and is_nn else np.array([], dtype=np.float32) tmp_folders = [paths.results_alg_task_split(task_package.task_info.task_desc, alg_name=task_package.alg_name, n_cv=task_package.n_cv, split_type=split_info.split_type, split_id=split_info.id) / 'tmp' for split_info in 
task_package.split_infos] result_managers = wrapper.run(task_package, logger, assigned_resources=NodeResources(node_id=0, n_threads=16.0, cpu_ram_gb=2.0, gpu_usages=gpu_usages, gpu_rams_gb=gpu_rams_gb, physical_core_usages=np.array([0.0])), tmp_folders=tmp_folders) for rm in result_managers: print(rm.metrics_dict) print(rm.other_dict) result_pairs = [('val', [rm.metrics_dict['cv']['val']['1']['0'][metric_name] for rm in result_managers])] for is_cv in [True, False] if task_package.n_refit > 0 else [True]: cv_str = 'cv' if is_cv else 'refit' max_n_models = task_package.n_cv if is_cv else task_package.n_refit for n_models in {1, max_n_models}: # use a set in case max_n_models == 1 try: name = 'test-' + cv_str + '-' + str(n_models) results = [rm.metrics_dict[cv_str]['test'][str(n_models)][str(start_idx)][metric_name] for rm in result_managers for start_idx in range(1 if n_models > 1 else max_n_models)] result_pairs.append((name, results)) except KeyError as e: print(e) pass # might happen if wrapper is not a randomized alg and therefore does not do ensembling for name, results in result_pairs: print(f'Mean {name} error: {np.mean(results):g} +- {np.std(results) / np.sqrt(len(results)):g}') # for rm in rms: # print('val:', rm.val_dict) # print('test:', rm.test_dict) print(f'Time: {time.time() - start_time:g} s') if __name__ == '__main__': run_example(Paths.from_env_variables()) ================================================ FILE: scripts/run_slurm.py ================================================ import functools import fire from run_experiments import run_gbdt_rs_configs from pytabkit.bench.data.paths import Paths if __name__ == '__main__': # paths = Paths.from_env_variables() # run_configs(paths) fire.Fire(run_gbdt_rs_configs) ================================================ FILE: scripts/run_time_measurement.py ================================================ import random import time import torch import numpy as np import sklearn from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskInfo, TaskCollection from pytabkit.models import utils from pytabkit.models.data.splits import RandomSplitter from pytabkit.models.sklearn.sklearn_base import AlgInterfaceEstimator from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier, CatBoost_TD_Classifier, \ LGBM_TD_Classifier, \ XGB_TD_Classifier, LGBM_D_Classifier, CatBoost_D_Classifier, XGB_D_Classifier, LGBM_HPO_Classifier, \ CatBoost_HPO_Classifier, \ XGB_HPO_Classifier, RealMLP_HPO_Classifier, XGB_PBB_D_Classifier, RF_SKL_D_Classifier, MLP_SKL_D_Classifier, \ MLP_SKL_D_Regressor, \ RF_SKL_D_Regressor, RealMLP_HPO_Regressor, XGB_HPO_Regressor, CatBoost_HPO_Regressor, LGBM_HPO_Regressor, \ XGB_D_Regressor, \ CatBoost_D_Regressor, LGBM_D_Regressor, RealMLP_TD_Regressor, RealMLP_TD_S_Regressor, RealMLP_TD_S_Classifier, \ XGB_TD_Regressor, \ CatBoost_TD_Regressor, LGBM_TD_Regressor, MLP_RTDL_D_Classifier, Resnet_RTDL_D_Classifier, MLP_RTDL_D_Regressor, \ Resnet_RTDL_D_Regressor, TabR_S_D_Classifier, TabR_S_D_Regressor, MLP_RTDL_HPO_Classifier, \ MLP_RTDL_HPO_Regressor, XGB_HPO_TPE_Regressor, LGBM_HPO_TPE_Regressor, \ CatBoost_HPO_TPE_Regressor, XGB_HPO_TPE_Classifier, LGBM_HPO_TPE_Classifier, CatBoost_HPO_TPE_Classifier, \ MLP_PLR_D_Classifier, MLP_PLR_HPO_Classifier, MLP_PLR_D_Regressor, MLP_PLR_HPO_Regressor, Resnet_RTDL_HPO_Regressor, \ Resnet_RTDL_HPO_Classifier, RealTabR_D_Classifier, RealTabR_D_Regressor, TabR_HPO_Classifier, TabR_HPO_Regressor, \ RF_HPO_Classifier, RF_HPO_Regressor, 
FTT_HPO_Classifier, FTT_D_Classifier, FTT_D_Regressor, FTT_HPO_Regressor def measure_times(paths: Paths, alg_name: str, estimator: AlgInterfaceEstimator, coll_name: str, device: str, rerun: bool = False, n_predict_reps: int = 20) -> None: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) times_list = [] for task_info in task_infos: file_path = paths.times_alg_task(alg_name=alg_name, task_desc=task_info.task_desc) / 'times.yaml' if utils.existsFile(file_path) and not rerun: times_list.append(utils.deserialize(file_path, use_yaml=True)) # print(f'Results exist already') continue print(f'Measuring time for alg {alg_name} on task {task_info.task_desc}: ', end='') estimator: AlgInterfaceEstimator = sklearn.base.clone(estimator) estimator.device = device task = task_info.load_task(paths) ds = task.ds seed = task_info.n_samples random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) trainval_test_split = RandomSplitter(seed).split_ds(ds) trainval_ds, test_ds = trainval_test_split.get_sub_ds(0), trainval_test_split.get_sub_ds(1) train_val_split = RandomSplitter(seed + 1, first_fraction=0.75).split_ds(trainval_ds) val_idxs = train_val_split.get_sub_idxs(1).numpy() x_trainval = trainval_ds.without_labels().to_df() y_trainval = trainval_ds.tensors['y'].numpy().squeeze(-1) x_test = test_ds.without_labels().to_df() start_time = time.time() estimator.fit(x_trainval, y_trainval, val_idxs=val_idxs) end_time = time.time() fit_time = end_time - start_time start_time = time.time() for i in range(n_predict_reps): estimator.predict(x_test) end_time = time.time() predict_time = (end_time - start_time) / n_predict_reps times = {'fit_time': fit_time, 'predict_time': predict_time} utils.serialize(file_path, times, use_yaml=True) times_list.append(times) print(f'{fit_time=:g}s, {predict_time=:g}s') avg_fit_time = np.mean([times['fit_time'] for times in times_list]) avg_predict_time = np.mean([times['predict_time'] for times in times_list]) print(f'Average times for {alg_name} on {coll_name}: {avg_fit_time=:g}s, {avg_predict_time=:g}s') def measure_times_cpu_class(n_threads: int, rerun: bool = False): paths = Paths.from_env_variables() estimators = { 'LGBM-TD_CPU': LGBM_TD_Classifier(n_threads=n_threads, verbosity=-1), 'CatBoost-TD_CPU': CatBoost_TD_Classifier(n_threads=n_threads), 'XGB-TD_CPU': XGB_TD_Classifier(n_threads=n_threads), 'RealMLP-TD_CPU': RealMLP_TD_Classifier(n_threads=n_threads), 'RealMLP-TD-S_CPU': RealMLP_TD_S_Classifier(n_threads=n_threads), 'LGBM-D_CPU': LGBM_D_Classifier(n_threads=n_threads, verbosity=-1), 'CatBoost-D_CPU': CatBoost_D_Classifier(n_threads=n_threads), 'XGB-D_CPU': XGB_D_Classifier(n_threads=n_threads), 'RF-SKL-D_CPU': RF_SKL_D_Classifier(n_threads=n_threads), 'MLP-SKL-D_CPU': MLP_SKL_D_Classifier(n_threads=n_threads), 'MLP-RTDL-D_CPU': MLP_RTDL_D_Classifier(n_threads=n_threads), 'MLP-PLR-D_CPU': MLP_PLR_D_Classifier(n_threads=n_threads), 'ResNet-RTDL-D_CPU': Resnet_RTDL_D_Classifier(n_threads=n_threads), 'XGB-PBB-D_CPU': XGB_PBB_D_Classifier(n_threads=n_threads), 'TabR-S-D_CPU': TabR_S_D_Classifier(n_threads=n_threads), 'RealTabR-D_CPU': RealTabR_D_Classifier(n_threads=n_threads), 'FTT-D_CPU': FTT_D_Classifier(n_threads=n_threads), 'RealMLP-HPO-2_CPU': RealMLP_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-RTDL-HPO-2_CPU': MLP_RTDL_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-PLR-HPO-2_CPU': MLP_PLR_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'ResNet-RTDL-HPO-2_CPU': 
Resnet_RTDL_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'XGB-HPO-TPE_CPU': XGB_HPO_TPE_Classifier(n_threads=n_threads), 'LGBM-HPO-TPE_CPU': LGBM_HPO_TPE_Classifier(n_threads=n_threads, verbosity=-1), 'CatBoost-HPO-TPE_CPU': CatBoost_HPO_TPE_Classifier(n_threads=n_threads), 'XGB-HPO-2_CPU': XGB_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'LGBM-HPO-2_CPU': LGBM_HPO_Classifier(n_threads=n_threads, verbosity=-1, n_hyperopt_steps=2), 'CatBoost-HPO-2_CPU': CatBoost_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'RF-HPO-2_CPU': RF_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'TabR-HPO-1_CPU': TabR_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=1), 'FTT-HPO-1_CPU': FTT_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=1), 'LGBM-D_val-ce_CPU': LGBM_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy', verbosity=-1), 'XGB-D_val-ce_CPU': XGB_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'CatBoost-D_val-ce_CPU': CatBoost_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'LGBM-TD_val-ce_CPU': LGBM_TD_Classifier(n_threads=n_threads, val_metric_name='cross_entropy', verbosity=-1), 'XGB-TD_val-ce_CPU': XGB_TD_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'CatBoost-TD_val-ce_CPU': CatBoost_TD_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'XGB-PBB-D_val-ce_CPU': XGB_PBB_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'RealMLP-TD_val-ce_no-ls_CPU': RealMLP_TD_Classifier(val_metric_name='cross_entropy', use_ls=False, n_threads=n_threads), 'RealMLP-TD-S_val-ce_no-ls_CPU': RealMLP_TD_S_Classifier(val_metric_name='cross_entropy', use_ls=False, n_threads=n_threads), 'RealMLP-TD_no-ls_CPU': RealMLP_TD_Classifier(device='cpu', use_ls=False, n_threads=n_threads), 'RealMLP-TD-S_no-ls_CPU': RealMLP_TD_S_Classifier(device='cpu', use_ls=False, n_threads=n_threads), 'RealMLP-TD_val-ce_CPU': RealMLP_TD_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'RealMLP-TD-S_val-ce_CPU': RealMLP_TD_S_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'MLP-RTDL-D_val-ce_CPU': MLP_RTDL_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'MLP-PLR-D_val-ce_CPU': MLP_PLR_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'ResNet-RTDL-D_val-ce_CPU': Resnet_RTDL_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'TabR-S-D_val-ce_CPU': TabR_S_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'RealTabR-D_val-ce_CPU': RealTabR_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'RealTabR-D_no-ls_CPU': RealTabR_D_Classifier(ls_eps=0.0, n_threads=n_threads), 'RealTabR-D_val-ce_no-ls_CPU': RealTabR_D_Classifier(ls_eps=0.0, val_metric_name='cross_entropy', n_threads=n_threads), 'FTT-D_val-ce_CPU': FTT_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'MLP-RTDL-D_rssc_CPU': MLP_RTDL_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'ResNet-RTDL-D_rssc_CPU': Resnet_RTDL_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'TabR-S-D_rssc_CPU': TabR_S_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'FTT-D_rssc_CPU': FTT_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'MLP-PLR-D_rssc_CPU': MLP_PLR_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), } for 
alg_name, estimator in estimators.items(): measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-class', device='cpu', rerun=rerun) def measure_times_cpu_reg(n_threads: int, rerun: bool = False): paths = Paths.from_env_variables() estimators = { 'LGBM-TD_CPU': LGBM_TD_Regressor(n_threads=n_threads, verbosity=-1), 'CatBoost-TD_CPU': CatBoost_TD_Regressor(n_threads=n_threads), 'XGB-TD_CPU': XGB_TD_Regressor(n_threads=n_threads), 'RealMLP-TD_CPU': RealMLP_TD_Regressor(n_threads=n_threads), 'RealMLP-TD-S_CPU': RealMLP_TD_S_Regressor(n_threads=n_threads), 'LGBM-D_CPU': LGBM_D_Regressor(n_threads=n_threads, verbosity=-1), 'CatBoost-D_CPU': CatBoost_D_Regressor(n_threads=n_threads), 'XGB-D_CPU': XGB_D_Regressor(n_threads=n_threads), 'RF-SKL-D_CPU': RF_SKL_D_Regressor(n_threads=n_threads), 'MLP-SKL-D_CPU': MLP_SKL_D_Regressor(n_threads=n_threads), 'MLP-RTDL-D_CPU': MLP_RTDL_D_Regressor(n_threads=n_threads), 'MLP-PLR-D_CPU': MLP_PLR_D_Regressor(n_threads=n_threads), 'ResNet-RTDL-D_CPU': Resnet_RTDL_D_Regressor(n_threads=n_threads), 'TabR-S-D_CPU': TabR_S_D_Regressor(n_threads=n_threads), 'RealTabR-D_CPU': RealTabR_D_Regressor(n_threads=n_threads), 'FTT-D_CPU': FTT_D_Regressor(n_threads=n_threads), 'RealMLP-HPO-2_CPU': RealMLP_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-RTDL-HPO-2_CPU': MLP_RTDL_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-PLR-HPO-2_CPU': MLP_PLR_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'ResNet-RTDL-HPO-2_CPU': Resnet_RTDL_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'XGB-HPO-2_CPU': XGB_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'LGBM-HPO-2_CPU': LGBM_HPO_Regressor(n_threads=n_threads, verbosity=-1, n_hyperopt_steps=2), 'CatBoost-HPO-2_CPU': CatBoost_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'XGB-HPO-TPE_CPU': XGB_HPO_TPE_Regressor(n_threads=n_threads), 'LGBM-HPO-TPE_CPU': LGBM_HPO_TPE_Regressor(n_threads=n_threads, verbosity=-1), 'CatBoost-HPO-TPE_CPU': CatBoost_HPO_TPE_Regressor(n_threads=n_threads), 'RF-HPO-2_CPU': RF_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'TabR-HPO-1_CPU': TabR_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=1), 'FTT-HPO-1_CPU': FTT_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=1), 'MLP-RTDL-D_rssc_CPU': MLP_RTDL_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'ResNet-RTDL-D_rssc_CPU': Resnet_RTDL_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'TabR-S-D_rssc_CPU': TabR_S_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'FTT-D_rssc_CPU': FTT_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'MLP-PLR-D_rssc_CPU': MLP_PLR_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), } for alg_name, estimator in estimators.items(): measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-reg', device='cpu', rerun=rerun) def measure_times_gpu_class(n_threads: int, rerun: bool = False): paths = Paths.from_env_variables() # todo: add XGB-GPU and CatBoost-GPU? 
def measure_times_gpu_class(n_threads: int, rerun: bool = False):
    paths = Paths.from_env_variables()
    # todo: add XGB-GPU and CatBoost-GPU?
    estimators = {
        'MLP-TD_GPU': RealMLP_TD_Classifier(device='cuda:0', n_threads=n_threads),
        'MLP-TD-S_GPU': RealMLP_TD_S_Classifier(device='cuda:0', n_threads=n_threads),
        'MLP-HPO-2_GPU': RealMLP_HPO_Classifier(device='cuda:0', n_threads=n_threads, n_hyperopt_steps=2),
    }
    import torch
    # trigger torch CUDA initialization before running the first NN
    _ = torch.zeros(1, device='cuda:0')
    for alg_name, estimator in estimators.items():
        measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-class', device='cuda:0',
                      rerun=rerun)


def measure_times_gpu_reg(n_threads: int, rerun: bool = False):
    paths = Paths.from_env_variables()
    estimators = {
        'MLP-TD_GPU': RealMLP_TD_Regressor(device='cuda:0', n_threads=n_threads),
        'MLP-TD-S_GPU': RealMLP_TD_S_Regressor(device='cuda:0', n_threads=n_threads),
        'MLP-HPO-2_GPU': RealMLP_HPO_Regressor(device='cuda:0', n_threads=n_threads, n_hyperopt_steps=2),
    }
    import torch
    # trigger torch CUDA initialization before running the first NN
    _ = torch.zeros(1, device='cuda:0')
    for alg_name, estimator in estimators.items():
        measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-reg', device='cuda:0',
                      rerun=rerun)


if __name__ == '__main__':
    # may take a day or so on a good CPU
    n_threads = 32
    measure_times_cpu_class(n_threads=n_threads, rerun=False)
    measure_times_cpu_reg(n_threads=n_threads, rerun=False)
    # measure_times_gpu_class(n_threads=n_threads, rerun=False)  # not used in the paper
    # measure_times_gpu_reg(n_threads=n_threads, rerun=False)  # not used in the paper


================================================
FILE: scripts/run_xrfm_large_ablations.py
================================================
import fire

from pytabkit.bench.alg_wrappers.interface_wrappers import RandomParamsxRFMInterfaceWrapper
from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager, run_alg_selection
from pytabkit.bench.data.paths import Paths
from pytabkit.bench.data.tasks import TaskCollection
from pytabkit.bench.scheduling.execution import RayJobManager
from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler
from pytabkit.models.data.data import TaskType


def run_xrfm_large_ablations(hpo_space_name: str, n_hpo_steps: int = 30, rerun: bool = False):
    # todo: install xrfm directly from the repo
    # todo: set env variable for the tab_bench_data_path
    # todo: measure runtime
    # todo: ensure that only one job runs per GPU, so that the time measurements are accurate
    # todo: make sure to install the version with kermac
    paths = Paths.from_env_variables()
    task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths)
    task_infos.extend(TaskCollection.from_name('meta-test-reg', paths).load_infos(paths))
    task_infos = [ti for ti in task_infos if 70_000 <= ti.n_samples]
    class_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.CLASSIFICATION]
    reg_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.REGRESSION]
    TaskCollection('meta-test-large-class', [info.task_desc for info in class_task_infos]).save(paths)
    TaskCollection('meta-test-large-reg', [info.task_desc for info in reg_task_infos]).save(paths)

    for name, infos in [('class', class_task_infos), ('reg', reg_task_infos)]:
        print(f'{name} task infos:')
        for info in infos:
            print(f'{info.task_desc}: n_samples={info.n_samples}')
        print()

    config = RunConfig(n_tt_splits=1, n_cv=1, n_refit=0, save_y_pred=False)
    job_mgr = TabBenchJobManager(paths)
    scheduler = SimpleJobScheduler(RayJobManager())
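    # Each random-search step below is registered as a separately named alg;
    # run_alg_selection afterwards picks the best step per task based on the validation metric.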
    for step_idx in range(n_hpo_steps):
        job_mgr.add_jobs(task_infos, config, f'xRFM-HPO-{hpo_space_name}_new_step-{step_idx}',
                         RandomParamsxRFMInterfaceWrapper(model_idx=step_idx, hpo_space_name=hpo_space_name,
                                                          M_batch_size=8192, max_leaf_size=40_000),
                         tags=[f'xrfm_hpo_{hpo_space_name}_new_steps'], rerun=rerun)

    job_mgr.run_jobs(scheduler)

    alg_names = [f'xRFM-HPO-{hpo_space_name}_new_step-{i}' for i in range(n_hpo_steps)]
    run_alg_selection(paths, config, class_task_infos, f'xRFM-HPO-{hpo_space_name}_new', alg_names,
                      val_metric_name='class_error', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
    run_alg_selection(paths, config, reg_task_infos, f'xRFM-HPO-{hpo_space_name}_new', alg_names,
                      val_metric_name='rmse', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)


def run_xrfm_large_ablations_old(hpo_space_name: str = 'paper-large-pca', n_hpo_steps: int = 30, rerun: bool = False):
    # todo: install xrfm directly from the repo
    # todo: set env variable for the tab_bench_data_path
    # todo: measure runtime
    # todo: ensure that only one job runs per GPU, so that the time measurements are accurate
    # todo: make sure to install the version with kermac
    paths = Paths.from_env_variables()
    task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths)
    task_infos.extend(TaskCollection.from_name('meta-test-reg', paths).load_infos(paths))
    task_infos = [ti for ti in task_infos if 70_000 <= ti.n_samples <= 200_000]
    class_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.CLASSIFICATION]
    reg_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.REGRESSION]
    TaskCollection('meta-test-medlarge-class', [info.task_desc for info in class_task_infos]).save(paths)
    TaskCollection('meta-test-medlarge-reg', [info.task_desc for info in reg_task_infos]).save(paths)

    for name, infos in [('class', class_task_infos), ('reg', reg_task_infos)]:
        print(f'{name} task infos:')
        for info in infos:
            print(f'{info.task_desc}: n_samples={info.n_samples}')
        print()

    config = RunConfig(n_tt_splits=1, n_cv=1, n_refit=0, save_y_pred=False)
    job_mgr = TabBenchJobManager(paths)
    scheduler = SimpleJobScheduler(RayJobManager())

    for step_idx in range(n_hpo_steps):
        job_mgr.add_jobs(task_infos, config, f'xRFM-HPO-{hpo_space_name}_Mbs-8192_step-{step_idx}',
                         RandomParamsxRFMInterfaceWrapper(model_idx=step_idx, hpo_space_name=hpo_space_name,
                                                          M_batch_size=8192),
                         tags=[f'xrfm_hpo_{hpo_space_name}_steps'], rerun=rerun)

    job_mgr.run_jobs(scheduler)

    alg_names = [f'xRFM-HPO-{hpo_space_name}_Mbs-8192_step-{i}' for i in range(n_hpo_steps)]
    run_alg_selection(paths, config, class_task_infos, f'xRFM-HPO-{hpo_space_name}_Mbs-8192', alg_names,
                      val_metric_name='class_error', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
    run_alg_selection(paths, config, reg_task_infos, f'xRFM-HPO-{hpo_space_name}_Mbs-8192', alg_names,
                      val_metric_name='rmse', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
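# Note: run_xrfm_large_ablations_old above differs from run_xrfm_large_ablations mainly in the task filter
# (70k-200k samples instead of >= 70k), the saved collection names, and in not setting max_leaf_size.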
def run_xrfm_small_test_ablations(n_hpo_steps: int = 50, rerun: bool = False):
    # todo: install xrfm directly from the repo
    # todo: set env variable
    # todo: measure runtime
    # todo: ensure that only one job runs per GPU, so that the time measurements are accurate
    paths = Paths.from_env_variables()
    task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths)
    task_infos.extend(TaskCollection.from_name('meta-test-reg', paths).load_infos(paths))
    task_infos = [ti for ti in task_infos if 100 <= ti.n_samples <= 2000]
    class_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.CLASSIFICATION]
    reg_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.REGRESSION]
    TaskCollection('meta-test-small-class', [info.task_desc for info in class_task_infos]).save(paths)
    TaskCollection('meta-test-small-reg', [info.task_desc for info in reg_task_infos]).save(paths)

    for name, infos in [('class', class_task_infos), ('reg', reg_task_infos)]:
        print(f'{name} task infos:')
        for info in infos:
            print(f'{info.task_desc}: n_samples={info.n_samples}')
        print()

    config = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False)
    job_mgr = TabBenchJobManager(paths)
    scheduler = SimpleJobScheduler(RayJobManager())

    hpo_space_name = 'paper-large-pca'
    for step_idx in range(n_hpo_steps):
        job_mgr.add_jobs(task_infos, config, f'xRFM-HPO-{hpo_space_name}_small_step-{step_idx}',
                         RandomParamsxRFMInterfaceWrapper(model_idx=step_idx, hpo_space_name=hpo_space_name,
                                                          max_leaf_size=200),
                         tags=[f'xrfm_hpo_{hpo_space_name}_steps'], rerun=rerun)

    job_mgr.run_jobs(scheduler)

    alg_names = [f'xRFM-HPO-{hpo_space_name}_small_step-{i}' for i in range(n_hpo_steps)]
    run_alg_selection(paths, config, class_task_infos, f'xRFM-HPO-{hpo_space_name}_small', alg_names,
                      val_metric_name='class_error', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
    run_alg_selection(paths, config, reg_task_infos, f'xRFM-HPO-{hpo_space_name}_small', alg_names,
                      val_metric_name='rmse', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)


if __name__ == '__main__':
    # run_xrfm_small_test_ablations()
    fire.Fire(run_xrfm_large_ablations)
    # run_xrfm_large_ablations(hpo_space_name='paper-large-pca')
    # run_xrfm_large_ablations(hpo_space_name='paper-large')
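# Usage sketch (python-fire turns the function arguments into CLI flags; assumes the
# data-path environment variables expected by Paths.from_env_variables() are set):
#
#   python scripts/run_xrfm_large_ablations.py --hpo_space_name=paper-large-pca --n_hpo_steps=30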
================================================
FILE: tests/__init__.py
================================================



================================================
FILE: tests/test_bench.py
================================================
from pathlib import Path

from sklearn.datasets import make_classification
import torch

from pytabkit import XGB_TD_Classifier
from pytabkit.bench.alg_wrappers.interface_wrappers import XGBInterfaceWrapper
from pytabkit.bench.data.paths import Paths
from pytabkit.bench.data.tasks import TaskDescription, TaskInfo, Task, TaskCollection
from pytabkit.bench.run.task_execution import TabBenchJobManager, RunConfig
from pytabkit.bench.scheduling.execution import RayJobManager
from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler
from pytabkit.models import utils
from pytabkit.models.data.data import TensorInfo, DictDataset
from pytabkit.models.sklearn.default_params import DefaultParams

# Running this test before the sklearn tests can cause an error in the pickling test for NNs using skorch:
# _pickle.PicklingError: Can't pickle <built-in function print>: it's not the same object as builtins.print
# The error occurs when ray.init() and FunctionProcess() are both used.
# def test_bench_simple(tmp_path: Path):
#     paths = Paths(base_folder=str(tmp_path / 'tab_bench_data'))
#
#     # ----- import dataset -----
#
#     n_samples = 1000
#
#     X, Y = make_classification(
#         n_samples=n_samples,
#         random_state=1
#     )
#     x_cont = torch.as_tensor(X, dtype=torch.float32)
#     x_cat = torch.zeros(n_samples, 0, dtype=torch.long)
#     print(f'{Y.shape=}')
#     y = torch.as_tensor(Y, dtype=torch.long)
#     tensors = dict(x_cont=x_cont, x_cat=x_cat, y=y[:, None])
#     tensor_infos = dict(x_cont=TensorInfo(feat_shape=[x_cont.shape[1]]), x_cat=TensorInfo(feat_shape=[0]),
#                         y=TensorInfo(cat_sizes=[2]))
#     ds = DictDataset(tensors, tensor_infos)
#
#     task_desc = TaskDescription('custom-class', 'ds_custom')
#     task_info = TaskInfo.from_ds(task_desc=task_desc, ds=ds)
#     task = Task(task_info=task_info, ds=ds)
#     task.save(paths)
#     TaskCollection.from_source('custom-class', paths).save(paths)
#
#     # ----- run benchmark -----
#     job_mgr = TabBenchJobManager(paths)
#     scheduler = SimpleJobScheduler(RayJobManager())
#     config_10_1_0 = RunConfig(n_tt_splits=2, n_cv=1, n_refit=0, save_y_pred=False)
#     task_infos = TaskCollection.from_name('custom-class', paths).load_infos(paths)
#
#     ds_x, ds_y = task_infos[0].load_task(paths).ds.split_xy()
#     # xgb = XGBInterfaceWrapper(**utils.join_dicts(DefaultParams.XGB_D, dict(n_estimators=2)))
#     xgb = XGB_TD_Classifier(n_estimators=2)
#     xgb.fit(ds_x.to_df(), ds_y.to_df())
#
#     job_mgr.add_jobs(task_infos, config_10_1_0,
#                      'XGB-D-class',
#                      XGBInterfaceWrapper(**utils.join_dicts(DefaultParams.XGB_D, dict(n_estimators=2))),
#                      tags=['default'], rerun=False)
#
#     job_mgr.run_jobs(scheduler)


================================================
FILE: tests/test_ensemble.py
================================================
import pytest
import sklearn.base
import numpy as np

from pytabkit import Ensemble_TD_Classifier, Ensemble_TD_Regressor
from pytabkit.models.sklearn.sklearn_interfaces import Ensemble_HPO_Classifier, Ensemble_HPO_Regressor


@pytest.mark.parametrize('model', [
    Ensemble_TD_Classifier(calibration_method='ts-mix', val_metric_name='ref-ll-ts', device='cpu'),
    Ensemble_TD_Regressor(device='cpu'),
    Ensemble_HPO_Classifier(calibration_method='ts-mix', val_metric_name='ref-ll-ts', n_hpo_steps=1, device='cpu'),
    Ensemble_HPO_Regressor(n_hpo_steps=1, device='cpu'),
])
def test_ensemble(model):
    np.random.seed(0)
    X = np.random.randn(100, 2)
    y = np.random.randn(100, 1)
    if sklearn.base.is_classifier(model):
        y = y > 0.0
    model.fit(X, y)
    model.predict(X)


================================================
FILE: tests/test_metrics.py
================================================
import numpy as np
import torch
import sklearn.metrics

from pytabkit.models.training.metrics import Metrics


def test_pinball():
    torch.manual_seed(0)
    y_pred = torch.randn(100)[:, None]
    y = torch.randn(100)[:, None]
    loss = Metrics.apply(y_pred, y, 'pinball(0.95)').item()
    sklearn_loss = sklearn.metrics.mean_pinball_loss(y.numpy(), y_pred.numpy(), alpha=0.95)
    assert np.isclose(loss, sklearn_loss)
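# For reference, the pinball loss at quantile q is mean(max(q * (y - y_pred), (q - 1) * (y - y_pred)));
# the assertion above checks the Metrics implementation against sklearn's mean_pinball_loss.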
================================================
FILE: tests/test_rtdl_nns.py
================================================
import numpy as np
import pandas as pd
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

from pytabkit.models.sklearn.sklearn_interfaces import Resnet_RTDL_D_Classifier, Resnet_RTDL_D_Regressor, \
    MLP_RTDL_D_Classifier, MLP_RTDL_D_Regressor, FTT_D_Classifier, FTT_D_Regressor
from sklearn.datasets import make_classification, make_regression
import pytest
import torch


# def test_estimator_compliance():
#     # Check if the custom estimators comply with scikit-learn's conventions
#     check_estimator(Resnet_RTDL_D_Classifier())
#     check_estimator(Resnet_RTDL_D_Regressor())


# @pytest.mark.parametrize("n_classes", [2, 3])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_numerical_data(n_classes, model_name):
#     # Generate synthetic data
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     X = pd.DataFrame(X)
#     y = pd.Series(y)
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier
#     if model_name == "resnet":
#         clf = Resnet_RTDL_D_Classifier(device="cpu")
#     elif model_name == "mlp":
#         clf = MLP_RTDL_D_Classifier(device="cpu")
#     elif model_name == "ft_transformer":
#         clf = FTT_D_Classifier(device="cpu")
#     clf.fit(X_train, y_train, cat_indicator=[False] * 20)  # Assuming no categorical features
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#
# @pytest.mark.parametrize("n_classes", [2, 3])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_categorical_data(n_classes, model_name):
#     # Generate synthetic data with a categorical feature
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     # Add a categorical feature
#     cat_col = np.random.choice([0, 1, 2], size=X.shape[0])
#     X = np.hstack((X, cat_col.reshape(-1, 1)))
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier with categorical feature
#     if model_name == "resnet":
#         clf = Resnet_RTDL_D_Classifier(device="cpu")
#     elif model_name == "mlp":
#         clf = MLP_RTDL_D_Classifier(device="cpu")
#     elif model_name == "ft_transformer":
#         clf = FTT_D_Classifier(device="cpu")
#     clf.fit(X_train, y_train, cat_indicator=[False] * 20 + [True])
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#     # Check if the classifier can handle unseen categories
#     X_test[0, -1] = -1  # Unseen category
#     predictions = clf.predict(X_test)
#     # If no error is raised, the classifier can handle unseen categories
#
#
# @pytest.mark.parametrize("transformed_target", [True, False])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_regressor_numerical_categorical(transformed_target, model_name):
#     # Generate synthetic data with a mix of numerical and categorical features
#     X, y = make_regression(n_samples=1000, n_features=3, n_informative=2, random_state=43)
#     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
#     X = np.column_stack((X, cat_feature))
#
#     X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(X.shape[1] - 1)] + ['cat'])
#     cat_features = [False] * 3 + [True]
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
#
#     if model_name == "resnet":
#         regressor = Resnet_RTDL_D_Regressor(transformed_target=transformed_target, random_state=41, device="cpu")
#     elif model_name == "mlp":
#         regressor = MLP_RTDL_D_Regressor(transformed_target=transformed_target, random_state=41, device="cpu")
#     elif model_name == "ft_transformer":
#         regressor = FTT_D_Regressor(transformed_target=transformed_target, random_state=41, device="cpu")
#     regressor.fit(X_train, y_train, cat_indicator=cat_features)
#     predictions = regressor.predict(X_test)
#
#     # Evaluate the regressor with R2 score
#     score = r2_score(y_test, predictions)
#     assert score > 0.1, f"Regressor R2 score too low with mixed features, got {score}"
#
#     # Test handling of unseen categories
#     X_test.iloc[0, -1] = 4  # Introduce a new category
#     predictions = regressor.predict(X_test)
#     # If no errors and predictions are returned, the regressor can handle unseen categories during test time
#
#
# def create_model(regression, model_name, **kwargs):
#     if model_name == "resnet":
#         model = Resnet_RTDL_D_Regressor(device="cpu", **kwargs) if regression else Resnet_RTDL_D_Classifier(device="cpu", **kwargs)
#     elif model_name == "mlp":
#         model = MLP_RTDL_D_Regressor(device="cpu", **kwargs) if regression else MLP_RTDL_D_Classifier(device="cpu", **kwargs)
#     elif model_name == "ft_transformer":
#         model = FTT_D_Regressor(device="cpu", **kwargs) if regression else FTT_D_Classifier(device="cpu", **kwargs)
#     return model
#
#
# # @pytest.mark.parametrize("regression", [True, False])
# # @pytest.mark.parametrize("resnet_or_mlp", ["resnet", "mlp"])
# # def test_determinist(regression, resnet_or_mlp):
# #     # generate toy data
# #     if regression:
# #         X, y = make_regression(n_samples=300, n_features=20, n_informative=2, random_state=42)
# #     else:
# #         X, y = make_classification(n_samples=300, n_features=20, n_informative=2, random_state=42)
# #
# #     # add categorical feature
# #     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
# #     X = np.column_stack((X, cat_feature))
# #
# #     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# #
# #     random_states = [42, 42, 43]
# #     res_list = []
# #     for random_state in random_states:
# #         model = create_model(regression, resnet_or_mlp, random_state=random_state)
# #         model.fit(X_train, y_train, cat_features=[False] * 20 + [True])
# #         predictions = model.predict(X_test)
# #         res_list.append(predictions)
# #
# #     assert np.allclose(res_list[0], res_list[1]), "Predictions should be the same with the same random_state"
# #     assert not np.allclose(res_list[0], res_list[2]), "Predictions should be different with different random_state"
#
#
# @pytest.mark.parametrize("regression", [True, False])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# @pytest.mark.parametrize("n_classes", [2, 3])
# def test_all_categorical(regression, model_name, n_classes):
#     X = np.random.randint(n_classes, size=(1000, 10))
#     if regression:
#         y = np.random.rand(1000)
#     else:
#         y = np.random.randint(n_classes, size=(1000,))
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#
#     model = create_model(regression, model_name, random_state=42)
#     model.fit(X_train, y_train, cat_indicator=[True] * 10)
#
#     model.predict(X_test)
#
#
# @pytest.mark.parametrize("seed", list(range(10)))
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_high_cardinality(seed, model_name):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#
#     x_df = pd.DataFrame({'cat_1': [270, 86, 154, 80, 56, 80, 80, 283, 199, 291]}).astype('category')
#     y = np.zeros(len(x_df))
#
#     reg = create_model(True, model_name, random_state=seed)
#     reg.fit(x_df, y, cat_indicator=[True])
#     reg.predict(x_df)


# # @pytest.mark.parametrize("resnet_or_mlp", ["resnet", "mlp"])
@pytest.mark.parametrize("transformed_target", [True, False]) # # def test_constant_predictor(resnet_or_mlp, transformed_target): # # # test that the prediction are replaced by the mean of the training set if the val loss # # # is infinite or too bad # # X, y = make_regression(n_samples=1000, n_features=20, n_informative=2, random_state=42) # # # # # first lr 3 to get bad but finite val_loss # # model = create_model(True, resnet_or_mlp, random_state=42, lr=1, max_epochs=10, transformed_target=transformed_target) # # model.fit(X, y, val_idxs=np.arange(100)) # # # check that val_loss is finite # # history = model.alg_interface_.sub_split_interfaces[0].model.history # # assert np.isfinite(history[:, 'valid_loss']).any() # # predictions = model.predict(X) # # assert np.allclose(predictions, np.mean(y[100:])), "Predictions should be the mean of the training set" # # # this should also correspond to model.alg_interface_.sub_split_interfaces[0].model.y_train_mean if transformed_target=False # # if not transformed_target: # # assert np.allclose(model.alg_interface_.sub_split_interfaces[0].model.y_train_mean, np.mean(y[100:])) # # assert model.alg_interface_.sub_split_interfaces[0].model.predict_mean == True # # # # # now lr 1000 to get bad but infinite val_loss # # model = create_model(True, resnet_or_mlp, random_state=42, lr=10000, max_epochs=10, transformed_target=transformed_target) # # model.fit(X, y, val_idxs=np.arange(100)) # # # check that val_loss is infinite # # history = model.alg_interface_.sub_split_interfaces[0].model.history # # assert ~np.isfinite(history[:, 'valid_loss']).all() # # predictions = model.predict(X) # # assert np.allclose(predictions, np.mean(y[100:])), "Predictions should be the mean of the training set" # # # this should also correspond to model.alg_interface_.sub_split_interfaces[0].model.y_train_mean if transformed_target=False # # if not transformed_target: # # assert np.allclose(model.alg_interface_.sub_split_interfaces[0].model.y_train_mean, np.mean(y[100:])) # # assert model.alg_interface_.sub_split_interfaces[0].model.predict_mean == True # # # # # now lr=1e-5 to check that the predictions are not replaced by the mean of the training set # # model = create_model(True, resnet_or_mlp, random_state=42, lr=1e-5, max_epochs=10, transformed_target=transformed_target) # # model.fit(X, y, val_idxs=np.arange(100)) # # # check that val_loss is finite # # history = model.alg_interface_.sub_split_interfaces[0].model.history # # assert np.isfinite(history[:, 'valid_loss']).any() # # predictions = model.predict(X) # # assert not np.allclose(predictions, np.mean(y[100:])), "Predictions should not be the mean of the training set" # # assert model.alg_interface_.sub_split_interfaces[0].model.predict_mean == False ================================================ FILE: tests/test_sklearn_interfaces.py ================================================ import pytest from sklearn.utils.estimator_checks import parametrize_with_checks from pytabkit import XRFM_D_Classifier, XRFM_D_Regressor from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier, RealMLP_TD_Regressor, \ RealMLP_TD_S_Regressor, LGBM_TD_Classifier, LGBM_TD_Regressor, XGB_TD_Classifier, XGB_TD_Regressor, \ CatBoost_TD_Classifier, \ CatBoost_TD_Regressor, MLP_RTDL_D_Classifier, MLP_RTDL_D_Regressor, Resnet_RTDL_D_Classifier, TabR_S_D_Classifier, \ Resnet_RTDL_D_Regressor, TabR_S_D_Regressor, TabM_D_Classifier, TabM_D_Regressor, MLP_PLR_D_Regressor, \ MLP_PLR_D_Classifier, FTT_D_Classifier, 
# Decrease min_data_in_leaf for LGBM_TD_Classifier since otherwise the check_classifiers_classes test fails,
# because LGBM only predicts a single class on the training set.
# Also increase subsample to 1.0 because otherwise LightGBM fails with n_samples=1.
@parametrize_with_checks([
    XRFM_D_Classifier(device='cpu'),
    XRFM_D_Regressor(device='cpu'),
    LGBM_TD_Classifier(min_data_in_leaf=2, subsample=1.0, calibration_method='ts-mix', val_metric_name='ref-ll-ts',
                       n_estimators=100),
    LGBM_TD_Classifier(min_data_in_leaf=2, subsample=1.0, n_estimators=100),
    LGBM_TD_Regressor(subsample=1.0, n_estimators=100),
    XGB_TD_Classifier(n_estimators=100),
    XGB_TD_Regressor(n_estimators=100),
    CatBoost_TD_Classifier(n_estimators=100),
    CatBoost_TD_Regressor(n_estimators=100),
    # use CPU to avoid macOS errors with the MPS backend
    RealMLP_TD_Classifier(n_epochs=8, device='cpu'),
    RealMLP_TD_Regressor(n_epochs=64, device='cpu'),
    TabM_D_Classifier(device='cpu', tabm_k=2, num_emb_type='pwl', arch_type='tabm-mini', num_emb_n_bins=2),
    TabM_D_Regressor(device='cpu', tabm_k=2, num_emb_type='pwl', arch_type='tabm-mini', num_emb_n_bins=2),
    MLP_RTDL_D_Classifier(device='cpu', max_epochs=50),
    # MLP_RTDL_D_Regressor(device='cpu'),
    Resnet_RTDL_D_Classifier(device='cpu'),
    Resnet_RTDL_D_Regressor(device='cpu'),
    MLP_PLR_D_Classifier(device='cpu'),
    MLP_PLR_D_Regressor(device='cpu'),
    FTT_D_Classifier(device='cpu', module_d_token=128, module_n_heads=8, max_epochs=32),
    FTT_D_Regressor(device='cpu', module_d_token=128, module_n_heads=8, max_epochs=32),
    # Tabr_D_Classifier(), Tabr_D_Regressor(),  # needs faiss, which is not in the dependencies, so don't test
])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)
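# To debug a single failing check in isolation, one can run check_estimator directly (sketch):
#
# from sklearn.utils.estimator_checks import check_estimator
# check_estimator(LGBM_TD_Classifier(min_data_in_leaf=2, subsample=1.0, n_estimators=100))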
================================================
FILE: tests/test_tabr.py
================================================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

from pytabkit.models.sklearn.sklearn_interfaces import TabR_S_D_Classifier, TabR_S_D_Regressor
from sklearn.datasets import make_classification, make_regression
import pytest
import torch


# These tests are currently not executed since TabR needs faiss, which is not available via pip,
# so they cannot run via hatch test / in CI.
# @pytest.mark.parametrize("n_classes", [2, 3])
# def test_numerical_data(n_classes):
#     # Generate synthetic data
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     X = pd.DataFrame(X)
#     y = pd.Series(y)
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier
#     clf = TabR_S_D_Classifier(n_epochs=5)
#     clf.fit(X_train, y_train, cat_features=[False] * 20)  # Assuming no categorical features
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#
# @pytest.mark.parametrize("n_classes", [2, 3])
# def test_categorical_data(n_classes):
#     # Generate synthetic data with a categorical feature
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     # Add a categorical feature
#     cat_col = np.random.choice([0, 1, 2], size=X.shape[0])
#     X = np.hstack((X, cat_col.reshape(-1, 1)))
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier with categorical feature
#     clf = TabR_S_D_Classifier(n_epochs=5)
#     clf.fit(X_train, y_train, cat_features=[False] * 20 + [True])
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#     # Check if the classifier can handle unseen categories
#     X_test[0, -1] = -1  # Unseen category
#     predictions = clf.predict(X_test)
#     # If no error is raised, the classifier can handle unseen categories
#
#
# @pytest.mark.parametrize("transformed_target", [True, False])
# def test_regressor_numerical_categorical(transformed_target):
#     # Generate synthetic data with a mix of numerical and categorical features
#     X, y = make_regression(n_samples=1000, n_features=5, n_informative=3, random_state=42)
#     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
#     X = np.column_stack((X, cat_feature))
#
#     X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(X.shape[1] - 1)] + ['cat'])
#     cat_features = [False] * 5 + [True]
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the regressor
#     regressor = TabR_S_D_Regressor(n_epochs=20, transformed_target=transformed_target)
#     regressor.fit(X_train, y_train, cat_features=cat_features)
#     predictions = regressor.predict(X_test)
#
#     # Evaluate the regressor with R2 score
#     score = r2_score(y_test, predictions)
#     assert score > 0.1, f"Regressor R2 score too low with mixed features, got {score}"
#
#     # Test handling of unseen categories
#     X_test.iloc[0, -1] = 4  # Introduce a new category
#     predictions = regressor.predict(X_test)
#     # If no errors and predictions are returned, the regressor can handle unseen categories during test time
#
#
# @pytest.mark.parametrize("regression", [True, False])
# def test_determinist(regression):
#     # generate toy data
#     if regression:
#         X, y = make_regression(n_samples=300, n_features=20, n_informative=2, random_state=42)
#     else:
#         X, y = make_classification(n_samples=300, n_features=20, n_informative=2, random_state=42)
#
#     # add categorical feature
#     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
#     X = np.column_stack((X, cat_feature))
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     random_states = [42, 42, 43]
#     res_list = []
#     for random_state in random_states:
#         if regression:
#             model = TabR_S_D_Regressor(random_state=random_state, n_epochs=5)
#         else:
#             model = TabR_S_D_Classifier(random_state=random_state, n_epochs=5)
#         model.fit(X_train, y_train, cat_features=[False] * 20 + [True])
#         predictions = model.predict(X_test)
#         res_list.append(predictions)
#
#     assert np.allclose(res_list[0], res_list[1]), "Predictions should be the same with the same random_state"
#     assert not np.allclose(res_list[0], res_list[2]), "Predictions should be different with different random_state"
#
#
# @pytest.mark.parametrize("regression", [True, False])
# @pytest.mark.parametrize("n_classes", [2, 3])
# @pytest.mark.parametrize("cat_size", [2, 5])
# def test_all_categorical(regression, n_classes, cat_size):
#     X = np.random.randint(cat_size, size=(1000, 10))
#     if regression:
#         y = np.random.rand(1000)
#     else:
#         y = np.random.randint(n_classes, size=(1000,))
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#
#     model = TabR_S_D_Regressor(n_epochs=5) if regression else TabR_S_D_Classifier(n_epochs=5)
#     model.fit(X_train, y_train, cat_features=[True] * 10)
#
#     model.predict(X_test)
#
#
# @pytest.mark.parametrize("seed", list(range(10)))
# def test_high_cardinality(seed):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#
#     x_df = pd.DataFrame({'cat_1': [270, 86, 154, 80, 56, 80, 80, 283, 199, 291]}).astype('category')
#     y = np.zeros(len(x_df))
#
#     reg = TabR_S_D_Regressor(n_epochs=5)
#     reg.fit(x_df, y, cat_features=[True])
#     reg.predict(x_df)


================================================
FILE: tests/test_variants.py
================================================
import pytest
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import ClassifierMixin
import torch

from pytabkit import TabM_D_Classifier, RealMLP_HPO_Classifier, Ensemble_HPO_Classifier, TabM_HPO_Regressor, \
    TabM_HPO_Classifier, LGBM_HPO_Classifier, CatBoost_HPO_Classifier, XGB_HPO_Classifier, Ensemble_HPO_Regressor, \
    LGBM_HPO_TPE_Regressor, RealMLP_TD_Regressor, RealMLP_HPO_Regressor, TabM_D_Regressor, XRFM_D_Classifier, \
    XRFM_D_Regressor, XRFM_HPO_Classifier, XRFM_HPO_Regressor


@pytest.mark.parametrize('estimator', [
    RealMLP_TD_Regressor(n_cv=2, n_refit=2, n_repeats=2),
    RealMLP_HPO_Regressor(n_hyperopt_steps=2, train_metric_name='multi_pinball(0.1,0.9)',
                          val_metric_name='multi_pinball(0.1,0.9)'),
    TabM_D_Classifier(val_metric_name='cross_entropy', num_emb_type='pwl', tabm_k=16, random_state=0),
    TabM_D_Regressor(val_metric_name='cross_entropy', num_emb_type='pwl', tabm_k=16, random_state=0),
    TabM_HPO_Regressor(val_metric_name='mae', n_hyperopt_steps=2, hpo_space_name='tabarena', random_state=0),
    TabM_HPO_Classifier(val_metric_name='mae', n_hyperopt_steps=2, hpo_space_name='default', random_state=0,
                        use_caruana_ensembling=True),
    XRFM_D_Classifier(val_metric_name='cross_entropy'),
    XRFM_D_Regressor(),
    XRFM_HPO_Classifier(n_hyperopt_steps=2),
    XRFM_HPO_Regressor(n_hyperopt_steps=2),
    # use CPU since the GPU might not support some features in the search space
    # (it has problems with rsm for CatBoost)
    LGBM_HPO_Classifier(use_caruana_ensembling=True, n_hyperopt_steps=2, hpo_space_name='tabarena', device='cpu'),
    XGB_HPO_Classifier(use_caruana_ensembling=True, n_hyperopt_steps=2, hpo_space_name='tabarena', device='cpu'),
    CatBoost_HPO_Classifier(use_caruana_ensembling=True, n_hyperopt_steps=2, hpo_space_name='tabarena', device='cpu'),
    RealMLP_HPO_Classifier(val_metric_name='cross_entropy', n_hyperopt_steps=3, use_caruana_ensembling=True,
                           hpo_space_name='tabarena', n_caruana_steps=10, random_state=0),
    Ensemble_HPO_Classifier(val_metric_name='brier', n_hpo_steps=2, use_full_caruana_ensembling=True,
                            use_tabarena_spaces=True),
    Ensemble_HPO_Regressor(val_metric_name='brier', n_hpo_steps=2, use_full_caruana_ensembling=True,
                           use_tabarena_spaces=True),
    LGBM_HPO_TPE_Regressor(n_cv=2, n_refit=2, n_hyperopt_steps=2),
])
def test_sklearn_not_crash(estimator):
    np.random.seed(0)
    n_train = 100
    X = pd.DataFrame({'a': np.random.randn(n_train), 'b': np.random.randint(5, size=(n_train,))})
    X['b'] = X['b'].astype('category')
    est = sklearn.base.clone(estimator)
    if not torch.cuda.is_available():
        # don't use mps even if it's available
        est.device = 'cpu'
    if isinstance(est, ClassifierMixin):
        y = np.random.randint(3, size=(n_train,))
    else:
        y = np.random.randn(n_train)
    est.fit(X, y)
    est.predict(X)
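# A minimal usage sketch outside the test, under the same assumptions (CPU device, sklearn-style API;
# predict_proba is the standard sklearn classifier method, assumed to be available here):
#
# clf = TabM_D_Classifier(device='cpu', random_state=0)
# clf.fit(X, y)                  # X as above, y integer class labels
# proba = clf.predict_proba(X)   # probability predictions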