Repository: dholzmueller/pytabkit Branch: main Commit: c126ea51187c Files: 157 Total size: 2.0 MB Directory structure: gitextract_xlrx7g0c/ ├── .github/ │ └── workflows/ │ └── testing.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE.txt ├── README.md ├── docs/ │ ├── Makefile │ ├── make.bat │ ├── requirements.txt │ └── source/ │ ├── bench/ │ │ ├── 00_installation.md │ │ ├── 01_running_the_benchmark.md │ │ ├── 02_stored_data.md │ │ ├── 03_code.md │ │ ├── adding_models.md │ │ ├── download_results.md │ │ ├── refine_then_calibrate.md │ │ └── using_the_scheduler.md │ ├── conf.py │ ├── index.rst │ └── models/ │ ├── 00_overview.md │ ├── 01_sklearn_interfaces.rst │ ├── 02_hpo.md │ ├── 03_training_implementation.md │ ├── examples.md │ ├── nn_classes.md │ └── quantile_reg.md ├── examples/ │ └── tutorial_notebook.ipynb ├── original_requirements/ │ ├── conda_env_2024_06_25.yml │ ├── conda_env_2024_10_28.yml │ ├── conda_env_2025_01_15.yml │ └── requirements_2024_06_25.txt ├── pyproject.toml ├── pytabkit/ │ ├── __about__.py │ ├── __init__.py │ ├── bench/ │ │ ├── __init__.py │ │ ├── alg_wrappers/ │ │ │ ├── __init__.py │ │ │ ├── general.py │ │ │ └── interface_wrappers.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── get_uci.py │ │ │ ├── import_talent_benchmark.py │ │ │ ├── import_tasks.py │ │ │ ├── paths.py │ │ │ ├── tasks.py │ │ │ └── uci_file_ops.py │ │ ├── eval/ │ │ │ ├── __init__.py │ │ │ ├── analysis.py │ │ │ ├── colors.py │ │ │ ├── evaluation.py │ │ │ ├── plotting.py │ │ │ ├── runtimes.py │ │ │ └── tables.py │ │ ├── run/ │ │ │ ├── __init__.py │ │ │ ├── results.py │ │ │ └── task_execution.py │ │ └── scheduling/ │ │ ├── __init__.py │ │ ├── execution.py │ │ ├── jobs.py │ │ ├── resource_manager.py │ │ ├── resources.py │ │ └── schedulers.py │ └── models/ │ ├── __init__.py │ ├── alg_interfaces/ │ │ ├── __init__.py │ │ ├── alg_interfaces.py │ │ ├── autogluon_model_interfaces.py │ │ ├── base.py │ │ ├── calibration.py │ │ ├── catboost_interfaces.py │ │ ├── ensemble_interfaces.py │ │ ├── lightgbm_interfaces.py │ │ ├── nn_interfaces.py │ │ ├── other_interfaces.py │ │ ├── resource_computation.py │ │ ├── resource_params.py │ │ ├── rtdl_interfaces.py │ │ ├── sub_split_interfaces.py │ │ ├── tabm_interface.py │ │ ├── tabr_interface.py │ │ ├── xgboost_interfaces.py │ │ └── xrfm_interfaces.py │ ├── data/ │ │ ├── __init__.py │ │ ├── conversion.py │ │ ├── data.py │ │ ├── nested_dict.py │ │ └── splits.py │ ├── hyper_opt/ │ │ ├── __init__.py │ │ ├── coord_opt.py │ │ └── hyper_optimizers.py │ ├── nn_models/ │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── base.py │ │ ├── categorical.py │ │ ├── models.py │ │ ├── nn.py │ │ ├── pipeline.py │ │ ├── rtdl_num_embeddings.py │ │ ├── rtdl_resnet.py │ │ ├── tabm.py │ │ ├── tabr.py │ │ ├── tabr_context_freeze.py │ │ └── tabr_lib.py │ ├── optim/ │ │ ├── __init__.py │ │ ├── adopt.py │ │ ├── optimizers.py │ │ └── scheduling_adam.py │ ├── sklearn/ │ │ ├── __init__.py │ │ ├── default_params.py │ │ ├── sklearn_base.py │ │ └── sklearn_interfaces.py │ ├── torch_utils.py │ ├── training/ │ │ ├── __init__.py │ │ ├── auc_mu.py │ │ ├── coord.py │ │ ├── lightning_callbacks.py │ │ ├── lightning_modules.py │ │ ├── logging.py │ │ ├── metrics.py │ │ ├── nn_creator.py │ │ └── scheduling.py │ └── utils.py ├── scripts/ │ ├── analyze_hpo_best_params.py │ ├── analyze_tasks.py │ ├── check_missing_values.py │ ├── copy_algs.py │ ├── create_plots_and_tables.py │ ├── create_probclass_plots.py │ ├── create_xrfm_ablations_table.py │ ├── custom_paths.py.default │ ├── download_data.py │ 
├── estimate_resource_params.py │ ├── get_sklearn_names.py │ ├── make_plot_animation.py │ ├── meta_hyperopt.py │ ├── move_algs.py │ ├── move_many_algs.py │ ├── print_complete_results.py │ ├── print_runtimes.py │ ├── ray_slurm_launch.py │ ├── ray_slurm_template.sh │ ├── rename_alg.py │ ├── rename_tag.py │ ├── run_evaluation.py │ ├── run_experiments.py │ ├── run_experiments_unused.py │ ├── run_probclass_experiments.py │ ├── run_single_task.py │ ├── run_slurm.py │ ├── run_time_measurement.py │ └── run_xrfm_large_ablations.py └── tests/ ├── __init__.py ├── test_bench.py ├── test_ensemble.py ├── test_metrics.py ├── test_rtdl_nns.py ├── test_sklearn_interfaces.py ├── test_tabr.py └── test_variants.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/testing.yml ================================================ name: 'test' on: push: branches: - "main" - "dev" pull_request: branches: - '*' jobs: test: strategy: fail-fast: false matrix: os: [windows-latest, ubuntu-latest, macos-latest] python-version: ['3.9', '3.10', '3.11', '3.12'] # 3.13 fails on Windows because it doesn't find a ray version runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install uv uses: astral-sh/setup-uv@v3 with: # Install a specific version of uv. version: "0.5.4" - name: Install hatch run: uv pip install --system hatch - name: Install swig run: uv pip install --system swig - name: Run tests run: hatch test # removed codecov upload in v1.7.3 ================================================ FILE: .gitignore ================================================ *.pyc *.pdf *.zip *.ckpt experiments/*/ experiments/trace.json !experiments/meta_hpo !experiments/prototypes public_export dist files plots lightning_logs docs/build docs/source/modules.rst docs/source/pytabkit.* .coverage* .idea catboost_info tab_bench_data rtdl_checkpoints examples/.ipynb_checkpoints scripts/custom_paths.py ================================================ FILE: .readthedocs.yaml ================================================ # Read the Docs configuration file for Sphinx projects # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the OS, Python version and other tools you might need build: os: ubuntu-22.04 tools: python: "3.10" # You can also specify other tool versions: # nodejs: "20" # rust: "1.70" # golang: "1.20" jobs: pre_build: - sphinx-apidoc -o docs/source/ pytabkit # Build documentation in the "docs/" directory with Sphinx sphinx: configuration: docs/source/conf.py # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs # builder: "dirhtml" builder: "html" # Fail on all warnings to avoid broken references # fail_on_warning: true # Optionally build your docs in additional formats such as PDF and ePub # formats: # - pdf # - epub # Optional but recommended, declare the Python requirements required # to build your documentation # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - requirements: docs/requirements.txt ================================================ FILE: LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS 
FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: README.md ================================================ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dholzmueller/pytabkit/blob/main/examples/tutorial_notebook.ipynb) [![](https://readthedocs.org/projects/pytabkit/badge/?version=latest&style=flat-default)](https://pytabkit.readthedocs.io/en/latest/) [![test](https://github.com/dholzmueller/pytabkit/actions/workflows/testing.yml/badge.svg)](https://github.com/dholzmueller/pytabkit/actions/workflows/testing.yml) [![Downloads](https://img.shields.io/pypi/dm/pytabkit)](https://pypistats.org/packages/pytabkit) # PyTabKit: Tabular ML models and benchmarking (NeurIPS 2024) [Paper](https://arxiv.org/abs/2407.04491) | [Documentation](https://pytabkit.readthedocs.io) | [RealMLP-TD-S standalone implementation](https://github.com/dholzmueller/realmlp-td-s_standalone) | [Grinsztajn et al. benchmark code](https://github.com/LeoGrin/tabular-benchmark/tree/better_by_default) | [Data archive](https://doi.org/10.18419/darus-4555) | |-------------------------------------------|--------------------------------------------------|---------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------| PyTabKit provides **scikit-learn interfaces for modern tabular classification and regression methods** benchmarked in our [paper](https://arxiv.org/abs/2407.04491), see below. It also contains the code we used for **benchmarking** these methods on our benchmarks. ![Meta-test benchmark results](./figures/meta-test_benchmark_results.png) ## When (not) to use pytabkit - **To get the best possible results**: - Generally we recommend AutoGluon for the best possible results, though it does not include all the models from pytabkit. AutoGluon 1.4 includes RealMLP (though not in a default configuration) and TabM (in the "extreme" preset for <= 30K samples). - To get the best possible results from `pytabkit`, we recommend using `Ensemble_HPO_Classifier(n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True, n_hpo_steps=50)` with a `val_metric_name` corresponding to your target metric (e.g., `class_error`, `cross_entropy`, `brier`, `1-auc_ovr`), or the corresponding `Regressor`. (This might take very long to fit.) - For only a single model, we recommend using `RealMLP_HPO_Classifier(n_cv=8, hpo_space_name='tabarena-new', use_caruana_ensembling=True, n_hyperopt_steps=50)`, also with `val_metric_name` as above, or the corresponding `Regressor`. - **Models**: [TabArena](https://github.com/AutoGluon/tabarena) also includes some newer models like RealMLP and TabM with more general preprocessing (missing numericals, text, etc.), as well as very good boosted tree implementations. `pytabkit` is currently still easier to use and supports vectorized cross-validation for RealMLP, which can significantly speed up the training. - **Benchmarking**: While pytabkit can be good for quick benchmarking for development, for method evaluation we recommend [TabArena](https://github.com/AutoGluon/tabarena). ## Installation (new in 1.4.0: optional model dependencies) ```bash pip install pytabkit[models] ``` - RealMLP (and TabM) can be used without the `[models]` part. 
- For xRFM on GPU, faster kernels will be used if you install `kermac[cu12]` or `kermac[cu11]` (depending on your CUDA version). - If you want to use **TabR**, you have to manually install [faiss](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md), which is only available on **conda**. - Please install torch separately if you want to control the version (CPU/GPU etc.) - Use `pytabkit[models,autogluon,extra,hpo,bench,dev]` to install additional dependencies for the other models, AutoGluon models, extra preprocessing, hyperparameter optimization methods beyond random search (hyperopt/SMAC), the benchmarking part, and testing/documentation. For the hpo part, you might need to install *swig* (e.g. via pip) if the build of *pyrfr* fails. See also the [documentation](https://pytabkit.readthedocs.io). To run the data download for the meta-train benchmark, you need one of rar, unrar, or 7-zip to be installed on the system. ## Using the ML models Most of our machine learning models are directly available via scikit-learn interfaces. For example, you can use RealMLP-TD for classification as follows: ```python from pytabkit import RealMLP_TD_Classifier model = RealMLP_TD_Classifier() # or TabR_S_D_Classifier, CatBoost_TD_Classifier, etc. model.fit(X_train, y_train) model.predict(X_test) ``` The code above will automatically select a GPU if available, try to detect categorical columns in dataframes, preprocess numerical variables and regression targets (no standardization required), and use a training-validation split for early stopping. All of this (and much more) can be configured through the constructor and the parameters of the fit() method. For example, it is possible to do bagging (ensembling of models on 5-fold cross-validation) simply by passing `n_cv=5` to the constructor. Here is an example for some of the parameters that can be set explicitly: ```python from pytabkit import RealMLP_TD_Classifier model = RealMLP_TD_Classifier(device='cpu', random_state=0, n_cv=1, n_refit=0, n_epochs=256, batch_size=256, hidden_sizes=[256] * 3, val_metric_name='cross_entropy', use_ls=False, # for metrics like AUC / log-loss lr=0.04, verbosity=2) model.fit(X_train, y_train, X_val, y_val, cat_col_names=['Education']) model.predict_proba(X_test) ``` See [this notebook](https://colab.research.google.com/github/dholzmueller/pytabkit/blob/main/examples/tutorial_notebook.ipynb) for more examples. Missing numerical values are currently *not* allowed and need to be imputed beforehand. 
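For example, missing numerical values can be imputed before fitting, e.g. with scikit-learn's `SimpleImputer` (a minimal sketch on synthetic data; median imputation is just one simple choice):

```python
import numpy as np
from sklearn.impute import SimpleImputer

from pytabkit import RealMLP_TD_Classifier

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 4))
X[rng.random(X.shape) < 0.1] = np.nan  # inject some missing values
y = (rng.standard_normal(200) > 0).astype(int)

X_imp = SimpleImputer(strategy='median').fit_transform(X)  # no NaNs remain

model = RealMLP_TD_Classifier(n_epochs=16)  # few epochs just to keep the demo fast
model.fit(X_imp, y)
print(model.predict_proba(X_imp[:5]))
```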
### Available ML models Our ML models are available in up to three variants, all with best-epoch selection: - library defaults (D) - our tuned defaults (TD) - random search hyperparameter optimization (HPO), sometimes also Tree-structured Parzen Estimator (HPO-TPE) or weighted ensembling (Ensemble) We provide the following ML models: - **RealMLP** (TD, HPO, Ensemble): Our new neural net models with tuned defaults (TD), random search hyperparameter optimization (HPO), or weighted ensembling (Ensemble) - **XGB**, **LGBM**, **CatBoost** (D, TD, HPO, HPO-TPE): Interfaces for the gradient-boosted tree libraries XGBoost, LightGBM, and CatBoost - **MLP**, **ResNet**, **FTT** (D, HPO): Models from [Revisiting Deep Learning Models for Tabular Data](https://proceedings.neurips.cc/paper_files/paper/2021/hash/9d86d83f925f2149e9edb0ac3b49229c-Abstract.html) - **MLP-PLR** (D, HPO): MLP with numerical embeddings from [On Embeddings for Numerical Features in Tabular Deep Learning](https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html) - **TabR** (D, HPO): TabR model from [TabR: Tabular Deep Learning Meets Nearest Neighbors](https://openreview.net/forum?id=rhgIgTSSxW) - **TabM** (D, HPO): TabM model from [TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling](https://arxiv.org/abs/2410.24210) - **XRFM** (D, HPO): xRFM model from [this paper](https://arxiv.org/abs/2508.10053) ([original repo](https://github.com/dmbeaglehole/xRFM)) - **RealTabR** (D): Our new TabR variant with default parameters - **Ensemble-TD**: Weighted ensemble of all TD models (RealMLP, XGB, LGBM, CatBoost) ## Post-hoc calibration and refinement stopping To use post-hoc temperature scaling and refinement stopping from our paper [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195), you can pass the following parameters to the scikit-learn interfaces: ```python from pytabkit import RealMLP_TD_Classifier clf = RealMLP_TD_Classifier( val_metric_name='ref-ll-ts', # short for 'refinement_logloss_ts-mix_all' calibration_method='ts-mix', # temperature scaling with Laplace smoothing use_ls=False # recommended for cross-entropy loss ) ``` Other calibration methods and validation metrics from [probmetrics](https://github.com/dholzmueller/probmetrics) can be used as well. For reproducing the results from this paper, we refer to the [documentation](https://pytabkit.readthedocs.io/en/latest/bench/refine_then_calibrate.html). ## Benchmarking code Our benchmarking code has functionality for - dataset download - running methods in a highly parallel fashion on single-node/multi-node/multi-GPU hardware, with automatic scheduling that tries to respect RAM constraints - analyzing/plotting results For more details, we refer to the [documentation](https://pytabkit.readthedocs.io). ## Preprocessing code While many preprocessing methods are implemented in this repository, a standalone version of our robust scaling + smooth clipping can be found [here](https://github.com/dholzmueller/realmlp-td-s_standalone/blob/main/preprocessing.py#L65C7-L65C37).
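To convey the idea (the linked file is the authoritative version), here is a rough sketch: each numerical feature is centered and rescaled based on robust statistics (median and interquartile range), and the result is passed through a smooth, bounded clipping function instead of a hard clip. Details such as the exact scale and the handling of (near-)constant features differ in the actual implementation:

```python
import numpy as np

def robust_scale_smooth_clip(x: np.ndarray) -> np.ndarray:
    """Illustrative robust scaling + smooth clipping for an (n_samples, n_features) array."""
    median = np.median(x, axis=0, keepdims=True)
    q25, q75 = np.quantile(x, [0.25, 0.75], axis=0, keepdims=True)
    scale = np.where(q75 > q25, q75 - q25, 1.0)  # fall back to 1.0 to avoid division by zero
    z = (x - median) / scale
    # smooth clipping: approximately the identity near zero, bounded for large |z|
    return z / np.sqrt(1.0 + (z / 3.0) ** 2)
```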
## Citation If you use this repository for research purposes, please cite our [paper](https://arxiv.org/abs/2407.04491): ``` @inproceedings{holzmuller2024better, title={Better by default: {S}trong pre-tuned {MLPs} and boosted trees on tabular data}, author={Holzm{\"u}ller, David and Grinsztajn, Leo and Steinwart, Ingo}, booktitle = {Neural {Information} {Processing} {Systems}}, year={2024} } ``` ## Contributors - David Holzmüller (main developer) - Léo Grinsztajn (deep learning baselines, plotting) - Ingo Steinwart (UCI dataset download) - Katharina Strecker (PyTorch-Lightning interface) - Daniel Beaglehole (part of the xRFM implementation) - Lennart Purucker (some features/fixes) - Jérôme Dockès (deployment, continuous integration) ## Acknowledgements Code from other repositories is acknowledged as well as possible in code comments. In particular, we used code from https://github.com/yandex-research/rtdl and sub-packages (Apache 2.0 license), code from https://github.com/catboost/benchmarks/ (Apache 2.0 license), and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html (Apache 2.0 license). ## Releases (see git tags) - v1.7.3: - disabled RealMLP lightning log file creation that was accidentally introduced in predict() in >=v1.7.0. - removed pynvml dependency. - v1.7.2: - Added scikit-learn 1.8 compatibility. - Removed debug print in RealMLP. - fixed device memory estimation error in the scheduler when `CUDA_VISIBLE_DEVICES` was used. - v1.7.1: - LightGBM now processes the `extra_trees`, `max_cat_to_onehot`, and `min_data_per_group` parameters used in the `'tabarena'` search space, which should improve results. - Scikit-learn interfaces for RealMLP (TD, HPO) now support moving the model to a different device (e.g., before saving). This can be achieved using, e.g., `model.to('cpu')` (which is in-place). - Fixed an xRFM bug in handling binary categorical features. - v1.7.0: - added [xRFM](https://arxiv.org/abs/2508.10053) (D, HPO) - added new `'tabarena-new'` search space for RealMLP-HPO, including per-fold ensembling (more expensive) and tuning two more categorical hyperparameters (with [better results](https://github.com/autogluon/tabarena/pull/195)) - reduced RealMLP pickle size by not storing the dataset ([#33](https://github.com/dholzmueller/pytabkit/issues/33)) - fixed gradient clipping for TabM (it did nothing previously, see [#34](https://github.com/dholzmueller/pytabkit/issues/34)). To ensure backward compatibility, it is set to None in the HPO search spaces now (it was already None in the default parameters). - removed debug print in TabM training loop - v1.6.1: - For `n_ens>1`, changed the default behavior for classification to averaging probabilities instead of logits. This can be reverted by setting `ens_av_before_softmax=True`. - Implemented time limit for HPO/ensemble methods through the `time_limit_s` parameter. - Support `torch>=2.6` and Python 3.13. - v1.6.0: - Added support for other training losses in TabM through the `train_metric_name` parameter, for example, (multi)quantile regression via `train_metric_name='multi_pinball(0.05,0.95)'`. - RealMLP-TD now adds the `n_ens` hyperparameter, which can be set to values >1 to train ensembles per train-validation split (called PackedEnsemble in the TabM paper). This is especially useful when using holdout validation instead of cross-validation ensembles, and to get more reliable validation predictions and scores for tuning/ensembling.
- fixed RealMLP TabArena search space (`hpo_space_name='tabarena'`) for classification (allow no label smoothing through `use_ls=False` instead of `use_ls="auto"`). - v1.5.2: fixed more device bugs for HPO and ensembling - v1.5.1: fixed a device bug in TabM for GPU - v1.5.0: - added `n_repeats` parameter to scikit-learn interfaces for repeated cross-validation - HPO sklearn interfaces (the ones using random search) can now do weighted ensembling instead by setting `use_caruana_ensembling=True`. Removed the `RealMLP_Ensemble_Classifier` and `RealMLP_Ensemble_Regressor` from v1.4.2 since they are made redundant by this feature. - renamed `space` parameter of GBDT HPO interface to `hpo_space_name` so now it also works with non-TPE versions. - Added new [TabArena](https://tabarena.ai) search spaces for boosted trees (not TPE), which should be almost equivalent to the ones from TabArena except for the early stopping logic. - TabM now supports `val_metric_name` for early stopping on different metrics. - fixed issues #20 and #21 regarding HPO - small updates for the ["Rethinking Early Stopping" paper](https://arxiv.org/abs/2501.19195) - v1.4.2: - fixed handling of custom `val_metric_name` HPO models and `Ensemble_TD_Regressor`. - if `tmp_folder` is specified in HPO models, save each model to disk immediately instead of holding all of them in memory. This can considerably reduce RAM/VRAM usage. In this case, pickled HPO models will still rely on the models stored in the `tmp_folder`. - We now provide `RealMLP_Ensemble_Classifier` and `RealMLP_Ensemble_Regressor`, which will use weighted ensembling and usually perform better than HPO (but have slower inference time). We recommend using the new `hpo_space_name='tabarena'` for best results. - v1.4.1: - moved dill to optional dependencies - updated TabM code to a newer version: added the option share_training_batches=False (old version: True) and excluded certain parameters from weight decay. - added [documentation](https://pytabkit.readthedocs.io/en/latest/bench/using_the_scheduler.html) for using the scheduler with custom jobs. - fixed bug in RealMLP refitting. - updated process start method for scheduler to speed up benchmarking - v1.4.0: - moved some imports to the new `models` optional dependencies to have a more lightweight RealMLP installation - Added GPU support for CatBoost with help from [Maximilian Schambach](https://github.com/MaxSchambach) in #16 (not guaranteed to produce exactly the same results). - Ensembling now saves models after training if a path is supplied, to reduce memory usage - Added more search spaces - fixed error in multiquantile output when the passed y was one-dimensional instead of having shape `(n_samples, 1)` - Added some examples to the documentation - v1.3.0: - Added multiquantile regression for RealMLP: see the [documentation](https://pytabkit.readthedocs.io/en/latest/models/quantile_reg.html) - More hyperparameters for RealMLP - Added [TabICL](https://github.com/soda-inria/tabicl) wrapper - Small fixes - v1.2.1: avoid error for older skorch versions - v1.2.0: - Included post-hoc calibration and more metrics through [probmetrics](https://github.com/dholzmueller/probmetrics). - Added benchmarking code for [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195). - Updated the format for saving predictions, allowing stopping on multiple metrics during the same training in the benchmark.
- Better categorical handling, avoiding an error for string and object columns, not ignoring boolean columns by default but treating them as categorical. - Added Ensemble_HPO_Classifier and Ensemble_HPO_Regressor. - v1.1.3: - Fixed a bug where the categorical encoding was incorrect if categories were missing in the training or validation set. The bug affected XGBoost and potentially many other models except RealMLP. - Scikit-learn interfaces now accept and auto-detect categorical datatypes (category, string, object) in dataframes. - v1.1.2: - Some compatibility improvements for scikit-learn 1.6 (but disabled 1.6 since skorch is not compatible with it). - Improved documentation for the PyTorch Lightning interface. - Other small bugfixes and improvements. - v1.1.1: - Added parameters `weight_decay`, `tfms`, and `gradient_clipping_norm` to TabM. The updated default parameters now apply the RTDL quantile transform. - v1.1.0: - Included TabM - Replaced `__` by `_` in parameter names for MLP, MLP-PLR, ResNet, and FTT, to comply with scikit-learn interface requirements. - Fixed non-determinism in NN baselines by initializing the random state of quantile (and KDI) preprocessing transforms. - The n_threads parameter is no longer ignored by NNs. - Changes by [Lennart Purucker](https://github.com/LennartPurucker): added a time limit for RealMLP, added support for `lightning` (while still allowing `pytorch-lightning`), made skorch a lazy import, and removed the msgpack\_numpy dependency. - v1.0.0: Release for the NeurIPS version and arXiv v2+v3. - More baselines (MLP-PLR, FT-Transformer, TabR-HPO, RF-HPO), also some unpolished internal interfaces for other methods, esp. the ones in AutoGluon. - Updated benchmarking code (configurations, plots) including the new version of the Grinsztajn et al. benchmark - Updated fit() parameters in scikit-learn interfaces, etc. - v0.0.1: First release for arXiv v1. Code and data are archived at [DaRUS](https://doi.org/10.18419/darus-4255). ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo.
echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/requirements.txt ================================================ adjustText>=1.0 autorank>=1.0 catboost>=1.2 dask[dataframe]>=2023 dill fire lightgbm>=4.1 matplotlib>=3.0 msgpack>=1.0 myst_parser>=3.0 numba>=0.59.0 numpy>=1.25 openml>=0.14 openpyxl>=3.0 pandas>=2.0 patool>=1.0 probmetrics>=0.0.1 psutil>=5.0 pytest-cov>=4.0 pytest>=7.0 pytorch_lightning>=2.0 pyyaml>=5.0 ray>=2.8 requests>=2.0 scikit-learn>=1.3 seaborn>=0.0.13 skorch>=0.15 sphinx>=7.0 sphinx_rtd_theme>=2.0 torch>=2.0 torchmetrics>=1.2.1 tqdm tueplots>=0.0.12 xgboost>=2.0 xlrd>=2.0 xrfm>=0.4.3 ================================================ FILE: docs/source/bench/00_installation.md ================================================ # Overview and Installation of the Benchmarking code Our benchmarking code contains several features: - Automatic dataset download - Running models (parallelized) with automatic scheduling, trying to respect RAM constraints - Evaluation and plotting ## Installation Our code has been tested with Python 3.9 and 3.10. After cloning/forking the repo, the required libraries can be installed as follows: ```commandline # in the repo folder: pip3 install -e .[extra,hpo,bench] ``` Note that the version requirements in our `pyproject.toml` are somewhat restrictive to avoid problems; they can potentially be relaxed. To more closely reproduce the installation we used for running the benchmarks, we refer to the configuration files in the `original_requirements` folder: - The pip-only requirements in `requirements_2024_06_25.txt` were used to compute many of the older NN results (not TabR). - The conda requirements in `conda_env_2024_06_25.yml` and `conda_env_2024_10_28.yml` were used to compute GBDT-HPO results and TabR results as well as a few newer NN results. They can be installed as a new conda environment using `conda env create -f conda_env_2024_10_28.yml`. Note that the older of the two conda environments was very slow for TabR on some datasets since it uses an older torchmetrics version with slow implementations. ## Using Sphinx Documentation Go to the repo root dir and run ```commandline sphinx-apidoc -o docs/source/ pytabkit sphinx-build -M html docs/source/ docs/build/ ``` then open `docs/build/html/index.html`. ================================================ FILE: docs/source/bench/01_running_the_benchmark.md ================================================ # Running the benchmark ## Configuration of data paths The paths for storing data and results are configured through the `tab_bench.data.paths.Paths` class. There are several options to configure which folders are used, which will be automatically recognized by `Paths.from_env_variables()`: - **Through environment variables**: The base folder can be configured by setting the environment variable `TAB_BENCH_DATA_BASE_FOLDER`. Optionally, some sub-folders can be set separately (e.g. for moving them to another partition). These are `TAB_BENCH_DATA_TASKS_FOLDER`, `TAB_BENCH_DATA_RESULTS_FOLDER`, `TAB_BENCH_DATA_RESULT_SUMMARIES_FOLDER`, `TAB_BENCH_DATA_UCI_DOWNLOAD_FOLDER`.
- **Through a Python file**: If `TAB_BENCH_DATA_BASE_FOLDER` is not available, the code will try to get the base folder (as a string) from `scripts.custom_paths.get_base_folder()`. This can be implemented by copying `scripts/custom_paths.py.default` to `scripts/custom_paths.py` (ignored by git) and adjusting the path therein. - If neither of the two options above is used, all data will be stored in `./tab_bench_data`. ## Download datasets To download all datasets for the meta-train and meta-test benchmarks, run (optionally passing your desired OpenML cache directory) ```commandline python3 scripts/download_data.py openml_cache_dir --import_meta_train --import_meta_test --import_grinsztajn_medium ``` To run methods on the benchmarks, there are two options: ## Run experiments with slurm Our benchmarking code contains its own scheduling code that will start subprocesses for each algorithm-dataset-split combination. Therefore, it is in principle possible to run all experiments through a single slurm job, though experiments can be divided into smaller pieces by running them separately. First, in `scripts/ray_slurm_template.sh`, adjust the line `cd ~/git/pytabkit` to match your folder location. Also, make sure that the data path is specified there if you want to set it via an environment variable. Run the following command (replacing some of the parameters with your own values) on the login node: ```commandline python3 scripts/ray_slurm_launch.py --exp_name=my_exp_name --num_nodes=num_nodes --queue="queue_name" --time=24:00:00 --mail_user="my@address.edu" --log_folder=log_folder --command="python3 -u scripts/run_slurm.py" ``` This will submit a job to the configured queue that will run `scripts/run_slurm.py` and create logfiles. Your experiments then have to be configured in `scripts/run_slurm.py`; see below. Multi-node is supported: `ray` will start instances on each node and our benchmarking code will schedule the individual experiments on the nodes. ## Run experiments without slurm Run the file with the corresponding experiments directly. For example, many of our experiment configurations can be found in `scripts/run_experiments.py`. One possible way to run the experiments detached from the shell with log-files is ````commandline systemd-run --scope --user python3 -u scripts/run_experiments.py > ./out.log 2> ./err.log & ```` ## Time measurements For time measurements, simply run `scripts/run_time_measurement.py` (with or without slurm). Results can be printed using `scripts/print_runtimes.py` (but these are averaged total times, not averaged per 1K samples as in the paper). ## Evaluating the benchmark results Aggregated algorithm results can be printed using ````commandline python3 scripts/run_evaluation.py meta-train-class ```` where `meta-train-class` can be replaced by the name of any other task collection (that is stored in the `task_collections` folder in the configured data directory), or a single dataset such as `openml-class/Higgs`. This script also has many more command-line options; see the Python file. For example, one can print only those methods with a certain tag using the `--tag` option, print results on individual datasets, for different metrics, etc. The parameters are the same as the ones of the following method: ```{eval-rst} ..
autofunction:: scripts.run_evaluation.show_eval ``` ## Creating plots and tables Plots and tables can be created using ````commandline python3 scripts/create_plots_and_tables.py ```` The plots that exclude datasets with missing values require running ```commandline python3 scripts/check_missing_values.py ``` once beforehand. ## Single-task experiments You can also run a configuration on a single dataset, without saving the results, by adjusting and running `scripts/run_single_task.py`. ## Other utilities - Use `scripts/analyze_tasks.py` to print some dataset statistics. - You can rename a method using `python3 scripts/rename_alg.py old_name new_name`. - We used some code in `scripts/meta_hyperopt.py` to optimize the default parameters for GBDTs. - The code in `scripts/estimate_resource_params.py` has been used to get more precise estimates for RAM usage etc. for running methods on the benchmark. - `scripts/print_complete_results.py` can be used to check which methods have results available on all splits for all tasks in a given collection. ================================================ FILE: docs/source/bench/02_stored_data.md ================================================ # Data format Here, we describe how the data is stored inside the main data folder configured in the `tab_bench.data.paths.Paths` object (see the documentation on running the benchmark). As file formats, we mostly use `.yaml` (for small, human-readable files), `.msgpack.gz` (for efficiently storing dicts, lists, etc.), and `.npy` (the standard format for storing numpy arrays). ## Algs folder The following files are stored in `algs/<alg_name>/`; see `tab_bench.run.task_execution.TabBenchJobManager.add_jobs()` for details on how they are stored: - `tags.yaml` contains a list of tags, which can be used to only load results for algs with certain tags. - `extended_config.yaml` contains a dictionary with the wrapper parameters, as well as the alg_name and the wrapper class name. - `wrapper.pkl`: Optionally, a pickled version (using `dill`) of the wrapper. (However, our code does not load these as pickle is an unsafe format.) - `src`: A folder containing the source files at the time of execution, as a backup. ## Tasks folder We store datasets (tasks) in folders `tasks/<source_name>/<task_name>/`, where `source_name` and `task_name` are derived from how the tasks are imported (see also the `tab_bench.data.tasks.TaskDescription` class). In each of these folders, we store the following files: - `x_cont.npy`, `x_cat.npy`, `y.npy` store the three relevant tensors for the DictDataset (see the `tab_models` documentation). - `task_info.yaml` stores the information of a `TaskInfo` object. ## Task collections folder In `task_collections/<coll_name>.yaml`, we store the list of tasks that a task collection with name `coll_name` consists of. ## Results folder We store the results of experiments in the folder `results/<alg_name>/<source_name>/<task_name>/<k>-fold/<split_type>/<split_idx>/`. Here, - `alg_name` is the name given to the method, - `source_name` and `task_name` identify a task, - `k` refers to the number of cross-validation folds (training-validation, not test), - `split_type` is either `random-split` (usually the case) or `default-split` (not used in our benchmark), - `split_idx` is the index (starting from zero) of the trainval-test split. The results are stored in files `metrics.yaml` and `other.msgpack.gz`. The former contains only the errors for different metrics; the latter contains other things like predictions (if configured to be saved), best stopping epoch, and possibly optimized hyperparameters. These files are stored by `tab_bench.run.results.ResultManager`. The involved dictionaries are generated by `tab_models.alg_interfaces.alg_interfaces.AlgInterface.eval()`.
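For illustration, the files for one split could be read as follows (a hypothetical sketch with a made-up `alg_name`; the exact contents depend on the run configuration):

```python
import gzip

import msgpack
import yaml

# hypothetical result folder for one split, following the scheme above
result_dir = 'results/MyAlg/openml-class/Higgs/5-fold/random-split/0/'

with open(result_dir + 'metrics.yaml') as f:
    metrics = yaml.safe_load(f)  # dict with the errors for different metrics

with gzip.open(result_dir + 'other.msgpack.gz', 'rb') as f:
    other = msgpack.unpack(f, raw=False)  # predictions, best epoch, etc. (if saved)
```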
## Result summaries folder Since loading the results directly can be slow, we store aggregated versions of them in a more efficient format. Specifically, `tab_bench.run.task_execution.TabBenchJobManager.run_jobs()` will call `tab_bench.run.task_execution.results.save_summaries()`, which will generate files `result_summaries/<alg_name>/<source_name>/<task_name>/<k>-fold/metrics.msgpack.gz` that contain the metrics results for all splits. ## Other folders - Plots and LaTeX tables will be saved in the `plots` folder. - Results of estimating resource prediction parameters are saved in the `resources` folder. - Results of time measurements are saved in the `times` folder. - Downloaded datasets from the UCI repository are saved in the `uci_download` folder. They can be deleted after the data import in `download_data.py` is completed. - The `tmp` folder can be used for storing temporary files. When running experiments, methods can store intermediate results in a temporary folder in their respective results folder. ================================================ FILE: docs/source/bench/03_code.md ================================================ # Code structure ## Algorithm wrappers To run methods in `tab_bench`, one needs to provide them as a subclass of `tab_bench.alg_wrappers.general.AlgWrapper`. Generally, we use models from the `tab_models` library that implement the `AlgInterface` from there, and wrap them lightly as an `AlgInterfaceWrapper` in `tab_bench/alg_wrappers/interface_wrappers.py`; see the numerous classes there for examples. As in `tab_models`, we pass parameters to these models via `**kwargs`. The scikit-learn interfaces in `tab_models` list the most important hyperparameters in their constructors. ## Datasets We represent our datasets using the `DictDataset` class from `tab_models`. These datasets can be loaded as follows: ```python from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription paths = Paths.from_env_variables() task_desc = TaskDescription('openml-reg', 'fifa') task_info = task_desc.load_info(paths) # a TaskInfo object task = task_info.load_task(paths) ds = task.ds # this is the DictDataset object ``` We can convert `ds` to a Pandas DataFrame using `ds.to_df()`. It is also possible to load a list of all TaskInfo objects for an entire task collection: ```python from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection paths = Paths.from_env_variables() task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) ``` ## Scheduling code We implement general scheduling code in `tab_bench/scheduling`. This code can take a list of jobs with certain functionalities and run them in parallel in a single-node or multi-node setup, respecting the provided resource requirements (on RAM usage, number of threads, etc.).
It can be used independently as follows: ```python from typing import List from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler jobs: List[AbstractJob] = [] # create a list of jobs here scheduler = SimpleJobScheduler(RayJobManager()) scheduler.add_jobs(jobs) scheduler.run() ``` For our tabular benchmarking code, the `AbstractJob` objects will be created by the `tab_bench.run.task_execution.TabBenchJobManager`. Numerous examples for this can be found in `scripts/run_experiments.py`. ## Resource estimation ## Evaluation and plotting ================================================ FILE: docs/source/bench/adding_models.md ================================================ # Adding your own models to the benchmark To run your own models, - implement an `AlgInterface` subclass. There are numerous examples already implemented. For models that can only run a single train-validation-test split at a time, you might want to subclass or modify `SklearnSubSplitInterface` from `pytabkit/models/alg_interfaces/sub_split_interfaces.py`. Examples can be found in `pytabkit/models/alg_interfaces/other_interfaces.py` or `pytabkit/models/alg_interfaces/rtdl_interfaces.py`. - add an `AlgInterfaceWrapper` subclass. This is often just a three-liner that specifies which `AlgInterface` subclass to instantiate. See the numerous examples in `pytabkit/bench/alg_wrappers/interface_wrappers.py`, especially the later ones. - adjust the code to run your `AlgInterfaceWrapper` on the benchmark; see `scripts/run_experiments.py` for many examples. Note that `RunConfig` has an option to save the model predictions on the whole datasets, which can significantly increase the disk usage (can be up to 2 GB per model on the meta-test-class benchmark). ================================================ FILE: docs/source/bench/download_results.md ================================================ # Downloading the benchmark results The benchmark data (as well as the code) is archived at [DaRUS](https://doi.org/10.18419/darus-4555). To download the benchmark data, - create a folder for the data (which is then linked in the environment variable `TAB_BENCH_DATA_BASE_FOLDER` or in `custom_paths.py`) - in the folder, unpack `main_no_results.tar.gz`; this should create the folders `algs`, `result_summaries`, `times`, `plots`, `task_collections`, and `tasks_only_infos` (which should be renamed to `tasks` if no `tasks` folder has been created). Since `result_summaries` stores the main metrics of the results, this is already enough for plotting/evaluating the results. - If you want the non-summarized results, download and unpack `results_small.tar.gz`, which contains the `results` folder (you might need to rename it from `results_no_gz` to `results`). However, this does not contain the additional files storing the predictions and optimal hyperparameters. - If you want the full results, download and unpack `results_main.tar.gz` (180 GB!) into the results folder (overwriting/replacing the contents of `results_small.tar.gz`) Moreover, there are additional files containing the results of the individual random search steps for the different methods, which could be used for retrospectively optimizing on a different metric etc. The file `cv_refit.tar.gz` contains the results of the cross-validation/refitting experiments, which are also somewhat large.
- If you need the datasets (in the `tasks` folder), you can normally just obtain them by running `scripts/download_data.py`. However, there is the option to request access to download `tasks.tar.gz` directly. ================================================ FILE: docs/source/bench/refine_then_calibrate.md ================================================ # Reproducing results of "Rethinking Early Stopping: Refine, Then Calibrate" Here, we document how to reproduce results from our paper [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195). For general instructions on how to set data paths and use slurm, we refer to the installation page. The following are the parts specific to this paper. ## Installation ```bash pip install probmetrics[extra] # to get smECE pip install pytabkit[bench,dev] ``` ### Original environment The original conda environment for exact reproduction is stored in `original_requirements/conda_env_2025_01_15.yml`. ## Downloading datasets Download the zipped datasets (`dataset-latest.zip`) of the TALENT benchmark from [here](https://drive.google.com/drive/folders/1j1zt3zQIo8dO6vkO-K-WE6pSrl71bf0z). Extract them into a folder. Then, use ```commandline python3 scripts/download_data.py --import_talent_class_small --talent_folder=<data_folder> ``` where `<data_folder>` should be the `data` folder inside the unzipped archive. ## Running experiments Experiments can be run using `python3 scripts/run_probclass_experiments.py`; plots can then be generated using `python3 scripts/create_probclass_plots.py`. ================================================ FILE: docs/source/bench/using_the_scheduler.md ================================================ # Using the scheduler `pytabkit` includes a flexible scheduler that can schedule jobs within Python using `ray` and `multiprocessing`. Essentially, it is a much fancier version of `multiprocessing.Pool`. Custom jobs need to provide an estimate of their required resources. The scheduler will - run as many jobs in parallel as possible on the current hardware while respecting the RAM and resource constraints - try to run the slowest jobs first, to avoid waiting for a few slow jobs in the end - measure free CPU RAM in the beginning, and add the fixed RAM that a CPU process uses to the requested RAM. For processes requesting a GPU, the fixed RAM used by a process using torch CUDA will be added to the requested RAM. - print info including remaining time estimates after each newly started job, failed jobs etc. (unless the jobs run so fast that multiple ones are started at once). The time estimates are based on the time estimates provided by the jobs, adjusted by a factor learned from the actual time taken by already finished jobs. Hence, the time estimate is only accurate after a few jobs have finished. It often underestimates the time actually needed. (This is probably also due to selection bias, since the estimated longest jobs are run first.) The scheduler also works on multi-GPU systems, and it even works on multi-node systems thanks to `ray`'s multi-node support. See [`ray_slurm_launch.py`](https://github.com/dholzmueller/pytabkit/blob/main/scripts/ray_slurm_launch.py) and [`ray_slurm_template.sh`](https://github.com/dholzmueller/pytabkit/blob/main/scripts/ray_slurm_template.sh). To use the scheduler, install `pytabkit[models,bench]`.
Here is some example code: ```python from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler class CustomJob(AbstractJob): def get_group(self): # group name, for all jobs with the same group name # one joint time multiplier will be fitted in the scheduler return 'default' def get_desc(self) -> str: return 'CustomJob' # name for displaying def __call__(self, assigned_resources: NodeResources) -> bool: # the main job, should only use the assigned resources print(f'Running job with {assigned_resources.get_n_threads()} threads', flush=True) return True # job finished successfully def get_required_resources(self) -> RequiredResources: # Return the resources requested by this job (RAM should be upper bounds, time doesn't need to be) return RequiredResources(time_s=1.0, n_threads=1, cpu_ram_gb=0.1, n_gpus=0, gpu_ram_gb=0.0, gpu_usage=1.0) sched = SimpleJobScheduler(RayJobManager(available_gpu_ram_multiplier=0.7)) sched.add_jobs([CustomJob() for _ in range(1000)]) sched.run() ``` ================================================ FILE: docs/source/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information # following https://stackoverflow.com/questions/10324393/sphinx-build-fail-autodoc-cant-import-find-module import os import sys sys.path.insert(0, os.path.abspath('../..')) from pytabkit.__about__ import __version__ project = 'pytabkit' copyright = '2024, David Holzmüller, Léo Grinsztajn, Ingo Steinwart' author = 'David Holzmüller, Léo Grinsztajn, Ingo Steinwart' release = __version__ # release = "0.0.1" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = ['myst_parser', 'sphinx.ext.autodoc'] templates_path = ['_templates'] exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output # html_theme = 'alabaster' html_theme = 'sphinx_rtd_theme' # html_theme = 'default' html_static_path = ['_static'] # Automatically extract typehints when specified and place them in # descriptions of the relevant function/method. autodoc_typehints = "description" # python_maximum_signature_line_length = 88 # Don't show class signature with the class' name. autodoc_class_signature = "separated" ================================================ FILE: docs/source/index.rst ================================================ Welcome to PyTabKit's documentation! ====================================== .. toctree:: :maxdepth: 2 :caption: Contents: Tabular ML models in pytabkit.models ========================================= .. toctree:: models/00_overview models/01_sklearn_interfaces models/02_hpo models/examples models/nn_classes models/03_training_implementation models/quantile_reg Tabular benchmarking using pytabkit.bench ============================================= ..
.. toctree::

   bench/00_installation
   bench/01_running_the_benchmark
   bench/adding_models
   bench/02_stored_data
   bench/03_code
   bench/download_results
   bench/refine_then_calibrate
   bench/using_the_scheduler

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`



================================================
FILE: docs/source/models/00_overview.md
================================================
# Overview of the `models` part

## Scikit-learn interfaces

We provide scikit-learn interfaces for various methods in `sklearn/sklearn_interfaces.py`. These use the default parameter dictionaries defined in `sklearn/default_params.py`.

## AlgInterface: more fine-grained control

We implement all our methods through subclassing `AlgInterface` in `alg_interfaces/alg_interfaces.py`. `AlgInterface` provides more functionality than scikit-learn interfaces, which is crucial for our benchmarking in `pytabkit.bench`. All our scikit-learn interfaces are wrappers around `AlgInterface` classes, using the `sklearn.sklearn_base.AlgInterfaceEstimator` base class. Compared to scikit-learn interfaces, `AlgInterface` provides the following additional features:

- Vectorized evaluation on multiple train-validation-test splits (used by RealMLP-TD and RealMLP-TD-S).
- Specification of train-validation-test splits, random seeds, temporary folder, and custom loggers.
- Inclusion of required resource estimates (CPU RAM, GPU RAM, GPU usage, n_threads, time).
- Evaluation on a list of metrics.
- Refitting with the best found parameters.

## Hyperparameter handling

Hyperparameters are explicitly defined in scikit-learn constructors. Elsewhere, we generally pass all configuration parameters as `**kwargs`; the corresponding functions then pick out the parameters that they need and pass the rest on to nested function calls. This allows for very convenient coding, but one has to watch out for typos in parameter names, which will often not be caught. For example, one could have the following structure:

```python
def fit(**kwargs):
    model = build_model(**kwargs)
    train_model(model, **kwargs)

def build_model(n_layers=4, **kwargs):
    ...

def train_model(model, lr=4e-2, batch_size=256, **kwargs):
    ...
```

We usually write `**config` instead of `**kwargs`. We also generally try to give unique names to parameters. For example, the epsilon parameter of the optimizer is called `opt_eps` and the epsilon parameter of label smoothing is called `ls_eps`.

## Internal data representation

We represent datasets internally using the `DictDataset` class. It contains a dictionary of PyTorch tensors. In our case, there are usually three tensors: `'x_cont'` for continuous features, `'x_cat'` for categorical features (`dtype=torch.long`), and `'y'` for labels. A `DictDataset` also contains a dictionary `tensor_infos`, which for each of these keys contains a `TensorInfo` object. The latter describes the number of features and, if applicable, the number of categories for each feature (for categorical variables or classification labels). We reserve the category `0` as the category for missing values (and for values that were not known to exist at train time). Missing numerical values are currently not handled by the NN code, so they need to be encoded beforehand.

## Data preprocessing (also available for other models)

Most models allow customizing the data preprocessing through the `tfms` parameter. This is done using the NN preprocessing code in `nn_models.models.PreprocessingFactory` (see the corresponding documentation page for an explanation of the Factory classes).
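As a rough sketch, overriding the preprocessing could look like this (the transform names below are assumptions for illustration, not a verified list; see `PreprocessingFactory` for the actual options):

```python
from pytabkit import RealMLP_TD_Regressor

# hypothetical transform names; check PreprocessingFactory for the real ones
reg = RealMLP_TD_Regressor(tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip'])
```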
## NN implementation

For the implementation of RealMLP, we extend and alter the typical PyTorch structure; see the documentation page on NN classes.

## Vectorization

Due to the vectorization of NN models, we use different terms for similar things:

- `n_cv` refers to the number of training-validation splits in cross-validation (bagging)
- `n_refit` refers to the number of models that are refitted on training+validation data after the CV stage
- `n_tv_splits` (or `n_models`) refers to the number of training-validation splits used in the current training (could be `n_cv` or `n_refit`)
- `n_tt_splits` (or `n_parallel`) refers to the number of trainval-test splits used (this is normally 1 when used through the scikit-learn interface, but can be larger when using RealMLP through the benchmark)



================================================
FILE: docs/source/models/01_sklearn_interfaces.rst
================================================
Scikit-learn interfaces
=======================

We provide scikit-learn interfaces for numerous methods in ``pytabkit.models.sklearn.sklearn_interfaces``. Below, we provide an overview. All of our interfaces allow specifying the validation set(s) and categorical features in the ``fit`` method:

.. autofunction:: pytabkit.models.sklearn.sklearn_base.AlgInterfaceEstimator.fit

Important: For HPO and ensemble interfaces, it is recommended to set ``tmp_folder`` to allow these methods to store fitted models instead of holding them in RAM. This means that ``tmp_folder`` should not be deleted while the associated interface still exists (even when it is pickled).

RealMLP
-------

For RealMLP, we provide TD (tuned default), HPO (hyperparameter optimization with random search), and Ensemble (weighted ensembling of random search configurations) variants:

- RealMLP_TD_Classifier
- RealMLP_TD_Regressor
- RealMLP_HPO_Classifier
- RealMLP_HPO_Regressor
- RealMLP_Ensemble_Classifier
- RealMLP_Ensemble_Regressor

While the TD variants have good defaults, they provide the option to override any hyperparameters. The classifier and regressor have the same hyperparameters; therefore, we only show the constructor of the classifier here. The first parameters, up to and including ``verbosity``, are provided for every scikit-learn interface, although ``random_state``, ``n_threads``, ``tmp_folder``, and ``verbosity`` may be ignored by some of the methods.

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.RealMLP_TD_Classifier.__init__

For the HPO and Ensemble variants, we currently only provide a few options:

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.RealMLP_HPO_Classifier.__init__

Boosted Trees
-------------

For boosted trees, we provide the same interfaces as for RealMLP (TD, D, and HPO variants), but do not wrap the full parameter space from the respective libraries. Here are some representative examples:

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.XGB_TD_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.LGBM_TD_Classifier.__init__
.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.CatBoost_TD_Classifier.__init__

Other NN baselines
------------------

We offer interfaces (D and HPO variants) for

- MLP (from the RTDL code)
- ResNet (from the RTDL code)
- FTT (FT-Transformer from the RTDL code)
- MLP-PLR (from the RTDL code)
- TabR (requires installing faiss)
- TabM

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.MLP_RTDL_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.Resnet_RTDL_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.FTT_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.MLP_PLR_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.TabR_S_D_Classifier.__init__

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.TabM_D_Classifier.__init__

xRFM
----

We offer D and HPO variants for xRFM.

.. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.XRFM_D_Classifier.__init__

Other methods
-------------

For convenience, we wrap the scikit-learn RF and MLP interfaces with our scikit-learn interfaces, although in this case the validation sets are not used. The respective classes are called ``RF_SKL_Classifier``, ``MLP_SKL_Classifier``, etc. We also provide our ``Ensemble_TD_Classifier`` and ``Ensemble_HPO_Classifier``, weighted ensembles of our TD / HPO models (and similar for regression).

.. autoclass:: pytabkit.models.sklearn.sklearn_interfaces.RealMLPConstructorMixin

.. automodule:: pytabkit.models.sklearn.sklearn_interfaces
   :members:
   :undoc-members:
   :show-inheritance:

Saving and loading
------------------

RealMLP and possibly other models (except probably TabR) can be saved using pickle-like modules. With standard pickling, a model trained on a GPU will be restored to use the same GPU, and will fail to load if the GPU is not present. (Note that dill fails to save torch models in newer torch versions, while pickle can still save them.) The following code allows loading GPU-trained models to the CPU, but fails to run predict() due to pytorch-lightning device issues.

.. code-block:: python

    import torch
    import dill  # might also work with pickle instead

    torch.save(model, 'model.pkl', pickle_module=dill, _use_new_zipfile_serialization=False)
    model = torch.load('model.pkl', map_location='cpu', pickle_module=dill)



================================================
FILE: docs/source/models/02_hpo.md
================================================
# Hyperparameter optimization

This is a guide on how to perform hyperparameter optimization (HPO) to get the best results out of RealMLP. We consider RealMLP for classification here, but most of the guide applies to regression and other baselines as well.
## Option 1: Using the HPO interface

The easiest option is to use the direct HPO interface:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_HPO_Classifier

X, y = make_classification(random_state=42, n_samples=200, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

clf = RealMLP_HPO_Classifier(n_hyperopt_steps=10, n_cv=1, verbosity=2, val_metric_name='brier')
clf.fit(X_train, y_train)
clf.predict(X_test)
```

The code above

- runs random search with 10 configurations from the HPO space in the paper (this should be increased to, say, 50 for better results)
- only uses one training-validation split (should be increased to, say, 5 for better results)
- prints validation results of each epoch and the best found parameters thanks to `verbosity=2`
- selects the best model and best epoch based on the Brier score (the default would be classification error)

While using the interface directly is convenient, it has certain drawbacks:

- It is not possible to change the search space, e.g. to reduce label smoothing for metrics other than classification error.
- It is not possible to save and resume from an intermediate state.
- It is not possible to use another HPO method than random search.
- It is not (easily) possible to access intermediate results.

Therefore, we now look at a more manual approach.

## Option 2: Performing your own HPO

The following code provides an example of how to do HPO manually.

```python
import numpy as np
import torch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold

from pytabkit.models.alg_interfaces.nn_interfaces import RealMLPParamSampler
from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier
from pytabkit.models.training.metrics import Metrics

n_hyperopt_steps = 10
n_cv = 1
is_classification = True

X, y = make_classification(random_state=42, n_samples=200, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# We compute train-validation splits here instead of letting the sklearn interface do it
# such that we can compute the validation error ourselves
if n_cv == 1:
    # we cannot do 1-fold CV, so we do an 80%-20% train-validation split
    _, val_idxs = train_test_split(np.arange(X_train.shape[0]), test_size=0.2, random_state=0)
    val_idxs = val_idxs[None, :]
else:
    skf = StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=0)
    val_idxs_list = [val_idxs for train_idxs, val_idxs in skf.split(X_train, y_train)]
    # make sure that each validation set has the same length, so we can exploit vectorization
    min_len = min([len(val_idxs) for val_idxs in val_idxs_list])
    val_idxs_list = [val_idxs[:min_len] for val_idxs in val_idxs_list]
    val_idxs = np.asarray(val_idxs_list)

best_val_loss = np.inf
best_clf = None
best_params = None

for hpo_step in range(n_hyperopt_steps):
    # sample random params according to the proposed search space,
    # but this can be replaced by a custom HPO method
    params = RealMLPParamSampler(is_classification=is_classification).sample_params(seed=hpo_step)
    # we only use one classifier that will fit n_cv sub-models, since RealMLP can vectorize the fitting,
    # but it would also be possible to use one classifier per cross-validation split.
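    # val_idxs has shape (n_cv, n_val): one row of validation indices per
    # training-validation split; fit() will train one sub-model per row.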
    clf = RealMLP_TD_Classifier(**params, n_cv=n_cv, verbosity=2, val_metric_name='brier')
    clf.fit(X_train, y_train, val_idxs=val_idxs)

    # evaluate validation loss
    # for n_cv >= 2, predict_proba() only outputs averaged predictions of the cross-validation models,
    # but we need separate predictions of each of the cross-validation members to extract the out-of-bag ones,
    # so we use predict_proba_ensemble().
    # There is also predict_ensemble(), which replaces predict().
    y_pred_prob = clf.predict_proba_ensemble(X_train)
    val_predictions = np.concatenate([y_pred_prob[i, val_idxs[i, :]] for i in range(n_cv)], axis=0)
    val_labels = np.concatenate([y_train[val_idxs[i, :]] for i in range(n_cv)], axis=0)
    val_logits = np.log(val_predictions + 1e-30)
    val_loss = Metrics.apply(torch.as_tensor(val_logits, dtype=torch.float32),
                             torch.as_tensor(val_labels), metric_name='brier').item()

    # update best model if the loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_clf = clf
        best_params = params

best_clf.predict(X_test)
print(f'best params: {best_params}')
```

Here is the equivalent search space for `hyperopt`:

```python
from hyperopt import hp
import numpy as np

space = {
    'num_emb_type': hp.choice('num_emb_type', ['none', 'pbld', 'pl', 'plr']),
    'add_front_scale': hp.pchoice('add_front_scale', [(0.6, True), (0.4, False)]),
    'lr': hp.loguniform('lr', np.log(2e-2), np.log(3e-1)),
    'p_drop': hp.pchoice('p_drop', [(0.3, 0.0), (0.5, 0.15), (0.2, 0.3)]),
    'wd': hp.choice('wd', [0.0, 2e-2]),
    'plr_sigma': hp.loguniform('plr_sigma', np.log(0.05), np.log(0.5)),
    'hidden_sizes': hp.pchoice('hidden_sizes', [(0.6, [256] * 3), (0.2, [64] * 5), (0.2, [512])]),
    'act': hp.choice('act', ['selu', 'mish', 'relu']),
    'ls_eps': hp.pchoice('ls_eps', [(0.3, 0.0), (0.7, 0.1)])
}
```
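To actually run the search with this space, here is a minimal sketch (assuming the `space` dictionary above; the simple holdout split and the Brier-score evaluation via scikit-learn are simplifications chosen for brevity, not the exact setup from the loop above):

```python
import numpy as np
from hyperopt import fmin, tpe, Trials
from sklearn.datasets import make_classification
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split

from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier

X, y = make_classification(random_state=42, n_samples=200, n_features=5)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.25, random_state=0)

def objective(params):
    # hyperopt passes one sampled configuration from `space` per call
    clf = RealMLP_TD_Classifier(**params, n_cv=1)
    clf.fit(X_tr, y_tr)  # uses an internal train-validation split for early stopping
    # binary task, so the Brier score of the positive-class probabilities suffices
    return brier_score_loss(y_val, clf.predict_proba(X_val)[:, 1])

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10,
            trials=Trials(), rstate=np.random.default_rng(0))
# note: for hp.choice/pchoice entries, fmin returns index-encoded values;
# hyperopt.space_eval(space, best) decodes them back into parameter values
print(best)
```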
================================================
FILE: docs/source/models/03_training_implementation.md
================================================
# Training directly with PyTorch Lightning

## Using PyTorch Lightning

The TabNN models are implemented using [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/). The training follows the basic principle described [here](https://lightning.ai/docs/pytorch/stable/model/train_model_basic.html), shown schematically below (not runnable as-is):

```python
# define DataLoaders (schematic)
train_loader = DataLoader(x_train, y_train)
val_loader = DataLoader(x_val, y_val)
test_loader = DataLoader(x_test, y_test)

# define model using a PyTorch LightningModule
nn_model = MyModel(hyper_param1, hyper_param2, ...)

# train model using the PyTorch Lightning Trainer
trainer = pl.Trainer()
trainer.fit(model=nn_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

# make predictions using the Trainer
pred = trainer.predict(nn_model, dataloaders=test_loader)
```

Adapted to our tabular NNs, the implementation looks like this:

``` { .python .annotate }
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources
from pytabkit.models.data.data import DictDataset, TensorInfo
from pytabkit.models.sklearn.default_params import DefaultParams
from pytabkit.models.training.lightning_modules import TabNNModule
import lightning.pytorch as pl  # or: import pytorch_lightning as pl
import numpy as np
import torch

n_epochs = 200

X, y = make_classification()
idxs = np.arange(len(X))
trainval_idxs, test_idxs = train_test_split(idxs, test_size=0.2)

n_trainval_splits = 5
train_idxs_list = []
val_idxs_list = []
for i in range(n_trainval_splits):
    train_idxs, val_idxs = train_test_split(trainval_idxs, test_size=0.2)
    train_idxs_list.append(train_idxs)
    val_idxs_list.append(val_idxs)

# define datasets
ds = DictDataset(tensors={'x_cont': torch.as_tensor(X, dtype=torch.float32),
                          # categorical tensors use dtype=torch.long (here: no categorical features)
                          'x_cat': torch.zeros(len(X), 0, dtype=torch.long),
                          'y': torch.as_tensor(y, dtype=torch.long)[:, None]},
                 tensor_infos={'x_cont': TensorInfo(feat_shape=[X.shape[1]]),
                               'x_cat': TensorInfo(cat_sizes=[]),
                               'y': TensorInfo(cat_sizes=[np.max(y) + 1])},
                 )  # (1)
train_val_splitting_idxs_list = [
    SplitIdxs(train_idxs=torch.as_tensor(np.stack(train_idxs_list, axis=0), dtype=torch.long),
              val_idxs=torch.as_tensor(np.stack(val_idxs_list, axis=0), dtype=torch.long),
              test_idxs=torch.as_tensor(test_idxs, dtype=torch.long),
              split_seed=0, sub_split_seeds=list(range(len(train_idxs_list))),
              split_id=0)]
test_ds = ds.get_sub_dataset(torch.as_tensor(test_idxs, dtype=torch.long))

# Create assigned resources
# interface_resources = InterfaceResources(n_threads=4, gpu_devices=['cuda:0'])  # (2)
interface_resources = InterfaceResources(n_threads=4, gpu_devices=[])  # (2)

# define the model using our LightningModule TabNNModule
nn_model = TabNNModule(**DefaultParams.RealMLP_TD_CLASS)
# build and 'compile' the model using the data, now it is ready to use
nn_model.compile_model(ds, train_val_splitting_idxs_list, interface_resources)

# train the model using the PyTorch Lightning Trainer
trainer = pl.Trainer(
    callbacks=nn_model.create_callbacks(),
    max_epochs=n_epochs,
    enable_checkpointing=False,
    enable_progress_bar=False,
    num_sanity_val_steps=0,
    logger=pl.loggers.logger.DummyLogger(),
)  # (3)
trainer.fit(
    model=nn_model,
    train_dataloaders=nn_model.train_dl,
    val_dataloaders=nn_model.val_dl
)

# make predictions using the Trainer
pred = trainer.predict(
    model=nn_model,
    dataloaders=nn_model.get_predict_dataloader(test_ds)
)
```

1. The NN models have special requirements for their dataloaders; therefore, we first need to use the `DictDataset` class to create a dataset for both training and validation.
2. We handle our resource management manually, not with Lightning; therefore, we need to create an `InterfaceResources` object.
3. We use the original [`Trainer`](https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api) class from Lightning. However, all of the parameters specified here are required for the `TabNNModule` to work properly.
================================================
FILE: docs/source/models/examples.md
================================================
# Examples

## Refitting RealMLP on train+val data using the best epoch from a previous run

You can refit RealMLP by simply using `n_refit=1` (or, better, larger values to ensemble multiple NNs). But in case you want more control, you can do it manually (e.g., if you only want to refit the best configuration from HPO, but you're not using the HPO within pytabkit).

```python
import numpy as np
from sklearn.model_selection import train_test_split

from pytabkit import RealMLP_TD_Regressor

np.random.seed(0)
X = np.random.randn(500, 5)
y = np.random.randn(500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

reg = RealMLP_TD_Regressor(verbosity=2, random_state=0)
reg.fit(X_train, y_train, X_val, y_val)

refit = RealMLP_TD_Regressor(verbosity=2,
                             stop_epoch=list(reg.fit_params_['stop_epoch'].values())[0],
                             val_fraction=0.0, random_state=0)
refit.fit(X, y)
```

## Fitting again after HPO on a smaller subset

Here is an example of how to run HPO on a smaller subset and then fit the best configuration again with validation. (It might be better to just use `n_refit` in the HPO classifier/regressor instead.)

```python
import numpy as np
from sklearn.model_selection import train_test_split

from pytabkit import LGBM_HPO_TPE_Regressor, LGBM_TD_Regressor

# This is an example on how to fit a HPO method on a smaller subset of the data,
# and then refit the best hyperparams on the full dataset
np.random.seed(0)
X = np.random.randn(500, 5)
y = np.random.randn(500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.9, random_state=0)
# use 90% for validation to train faster
# if there is too much validation data, evaluating on the validation data might become the bottleneck;
# then you should pass a smaller val_fraction
model = LGBM_HPO_TPE_Regressor(val_fraction=0.9, n_hyperopt_steps=5)
model.fit(X, y)

# unfortunately, params are not always called the same way, so we need to rename a few
params = model.fit_params_['hyper_fit_params']
params['subsample'] = params.pop('bagging_fraction')
params['colsample_bytree'] = params.pop('feature_fraction')
params['lr'] = params.pop('learning_rate')
# unfortunately, it is hard right now to check if this is exactly the same config,
# as this might set some default params that are not used in the HPO config
model_refit = LGBM_TD_Regressor(**params)
model_refit.fit(X, y)
```



================================================
FILE: docs/source/models/nn_classes.md
================================================
# NN implementation

While RealMLP is implemented in PyTorch, we extend the conventional `nn.Module` logic. Traditionally, one writes some PyTorch code to assemble an NN model, which is an `nn.Module` composed of building blocks that are also `nn.Module` objects (Composite design pattern). The `nn.Module` classes initialize the parameters in the constructor and are then callable objects providing the `forward()` transformation. Data preprocessing is done separately via different code/classes.

We use a different structure of classes that unifies preprocessing and NN layers, which is useful for vectorized NNs: the vectorized NNs can share a single non-preprocessed data set, loaded into GPU RAM, while having different preprocessing parameters (fitted on different training sets since different splits are used).
Individual preprocessed data sets are never fully instantiated in GPU RAM; instead, the vectorized NN models do the preprocessing on batches individually, which saves GPU RAM (we're talking about having, e.g., 50-100 NNs on the same GPU at the same time).

The class structure uses three base classes:

- `Layer` classes are similar to `nn.Module`, but they do not perform random initialization in the constructor. Instead, they simply take the already initialized parameters as input. There are some additional features: Layer objects of the same type can be combined into a vectorized Layer. The vectorized NN is not built directly; instead, NNs are first built and initialized sequentially for better reproducibility (random seed etc.) and RAM saving, and then vectorized after initialization using the `Layer.stack()` function. Additionally, Layer classes work with the `DictDataset` class, which usually contains 'x_cont' and 'x_cat' tensors for continuous and categorical variables. Moreover, during training, we also pass the labels 'y' through the Layer, which allows implementing mixup, label smoothing, and output standardization as Layer objects.
- `Fitter` classes initialize the NN based on a single forward pass on the (subsampled) training (and possibly validation) set. This is done using the `fit()` or `fit_transform()` functions, similar to scikit-learn preprocessing classes, which return a `Layer` object (and, in the case of `fit_transform()`, the transformed dataset). Initialization can be random or depend on the training set as transformed so far. Typically, parameters of preprocessing layers such as standardization depend on the training set, while NN parameters do not depend on the training set. However, we also use weight and bias initializations that depend on the training set, and the unification of NN and preprocessing makes this much more convenient.
- `FitterFactory` (could also be called ArchitectureBuilder) classes build the NN structure based on the input and output shape and type. Specifically, `FitterFactory` objects can build `Fitter` objects given the corresponding `tensor_infos` of the data set, which specifies the number of continuous variables, the number of categorical variables and the category sizes, and the same for the labels. For example, a `FitterFactory` can decide to use one-hot encoding for categorical variables with small category sizes, and Embedding layers for larger category sizes.

The `Layer`, `Fitter`, and `FitterFactory` classes are defined in `nn_models/base.py`. Other subclasses are also defined in the `nn_models` folder.

There are some more features:

- We introduce a class called `Variable` that inherits from `torch.nn.Parameter`. Variable has a parameter `trainable: bool`, and in the case `trainable==False`, the `Layer` class will register it using `register_buffer()`. One might also be able to just use `nn.Parameter(..., requires_grad=False)` for this, though we did not check whether it has the same effect (will it be saved when using `model.state_dict()`?). There is also the convenience function `Variable.stack()` used by `Layer.stack()`. Moreover, Variables can have names (to assign individual hyperparameter values to them), and they can have custom hyperparameter factors (e.g. to specify that the lr should be multiplied by a certain value for this Variable).
- The classes above can be given scope names, which are then prepended to variable names.
For example, using scope names, the weight of the first linear layer in an NN could be called 'net/first_layer/layer-0/weight', where 0 is the layer index and 'first_layer' is redundant information that can be useful when regex-matching variable names. One can assign an individual lr to this layer by passing `lr={'': global_lr, '.*first_layer.*weight': first_layer_weight_lr}` in the `**kwargs` to the `NNAlgInterface`. This works as follows: the `HyperparamManager`, which is available through a global context managed by the `TrainContext` class, stores the hyperparameter configurations obtained through `**kwargs`. Different classes can request getters for specific hyperparameters for specific variables. If multiple lr values are specified as above, the one from the last matching regex is taken. The scope names are passed on from FitterFactory to Fitter and then to Layer and Variable by a somewhat complicated context manager system, for which I didn't find a more elegant solution.
- Fitter objects can be split up into three parts using the `split_off_dynamic()` and `split_off_individual()` functions. The static part would typically be the one-hot encoding, since it does not depend on the data and is not trainable, which means that even in a vectorized context, it can be applied once to the single shared data set, since it does not depend on the train/val/test split. Then, there is the dynamic but not individual part, which can depend on the fitting data but is not trained or randomized, and can therefore be shared by models with the same trainval-test split. Finally, there is the individual (trainable/randomized) part, which is usually the NN part.
- `Fitter` classes should implement methods that allow estimating the RAM usage of the parameters and of a forward pass, which allows deciding how many NNs fit onto a GPU when running the benchmark.



================================================
FILE: docs/source/models/quantile_reg.md
================================================
# (Multi)quantile regression with RealMLP

RealMLP supports multiquantile regression, for example by using

```python
from pytabkit import RealMLP_TD_Regressor

reg = RealMLP_TD_Regressor(
    train_metric_name='multi_pinball(0.25,0.5,0.75)',
    val_metric_name='multi_pinball(0.25,0.5,0.75)'
)
```

This will adjust the training objective as well as the metric for best-epoch selection on the validation set. The quantiles can be specified in any format that Python can convert to a float. There must be no spaces around the commas, and the quantiles need to be in ascending order. The latter is relevant because RealMLP will by default sort the prediction outputs to always produce ascending quantile predictions. This can be deactivated by passing `sort_quantile_predictions=False`.
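As a minimal usage sketch (assuming that `predict` returns one column per requested quantile, which we have not verified here; check the output shape in your own setup):

```python
import numpy as np

from pytabkit import RealMLP_TD_Regressor

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 5))
y = X[:, 0] + 0.1 * rng.standard_normal(500)  # noisy target

reg = RealMLP_TD_Regressor(
    train_metric_name='multi_pinball(0.25,0.5,0.75)',
    val_metric_name='multi_pinball(0.25,0.5,0.75)'
)
reg.fit(X, y)
pred = reg.predict(X)  # assumption: one column per quantile, here shape (500, 3)
print(pred.shape)
```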
================================================ FILE: examples/tutorial_notebook.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "enZVuzCHCy1n" }, "source": [ "**To train neural networks faster, you need to enable GPUs for the notebook:**\n", "* Navigate to Edit→Notebook Settings\n", "* select GPU from the Hardware Accelerator drop-down" ] }, { "cell_type": "markdown", "metadata": { "id": "rtKFT1oSCy1p" }, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": { "id": "Sr0lfFYqCy1q" }, "source": [ "## Installation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "d-Zn1o8jCy1q" }, "outputs": [], "source": [ "!pip install pytabkit\n", "!pip install openml" ] }, { "cell_type": "markdown", "metadata": { "id": "V1Qo43ciCy1r" }, "source": [ "## Getting a dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "o-MpREHMCy1r" }, "outputs": [], "source": [ "import openml\n", "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "\n", "task = openml.tasks.get_task(361113) # covertype dataset\n", "dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)\n", "X, y, categorical_indicator, attribute_names = dataset.get_data(\n", " dataset_format='dataframe',\n", " target=task.target_name\n", ")\n", "# we restrict to 15K samples for demonstration purposes\n", "index = np.random.choice(range(len(X)), 15_000, replace=False)\n", "X = X.iloc[index]\n", "y = y.iloc[index]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" ] }, { "cell_type": "markdown", "metadata": { "id": "PeMtLz0ICy1s" }, "source": [ "# Using RealMLP" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CgSOr3l0Cy1s", "outputId": "d2b0ea97-45ac-4a9e-ff3d-291d72094615" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of RealMLP: 0.8770666666666667\n", "CPU times: user 1min 11s, sys: 192 ms, total: 1min 11s\n", "Wall time: 1min 11s\n" ] } ], "source": [ "%%time\n", "from pytabkit import RealMLP_TD_Classifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "model = RealMLP_TD_Classifier()\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of RealMLP: {acc}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "-G8Oblk5Cy1s" }, "source": [ "## With bagging\n", "It is possible to do bagging (ensembling of models on 5-fold cross-validation) simply by passing `n_cv=5` to the constructor. Note that it doesn't take 5x as long because of vectorized training." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i0NpWvjKCy1s", "outputId": "89c07496-fd0e-4f46-ea59-3457f8a35371" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of RealMLP with bagging: 0.8930666666666667\n", "CPU times: user 1min 8s, sys: 180 ms, total: 1min 9s\n", "Wall time: 1min 8s\n" ] } ], "source": [ "%%time\n", "from pytabkit import RealMLP_TD_Classifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "model = RealMLP_TD_Classifier(n_cv=5)\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of RealMLP with bagging: {acc}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "KHphiGKBCy1t" }, "source": [ "## With hyperparameter optimization\n", "It is possible to do hyperparameter optimization directly inside a sklearn interface by using the `RealMLP_HPO_Regressor` interface.\n", "This is also available for classification, and for other models, for instance `LGBM_HPO_Classifier` or `LGBM_HPO_TPE_Classifier` (to use the Tree-structured Parzen Estimator algorithm)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7e4wjdYJCy1t", "outputId": "a7ed7867-c808-4ed9-dbc2-badea992eae2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of RealMLP with 3 steps HPO: 0.8605333333333334\n", "CPU times: user 2min 27s, sys: 442 ms, total: 2min 28s\n", "Wall time: 2min 28s\n" ] } ], "source": [ "%%time\n", "from pytabkit import RealMLP_HPO_Classifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "n_hyperopt_steps = 3 # small number for demonstration purposes\n", "model = RealMLP_HPO_Classifier(n_hyperopt_steps=n_hyperopt_steps)\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of RealMLP with {n_hyperopt_steps} steps HPO: {acc}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "SB0D5MnbCy1t" }, "source": [ "# Using improved defaults for tree-based models" ] }, { "cell_type": "markdown", "metadata": { "id": "OLulH2rGCy1t" }, "source": [ "`TD` stands for *tuned defaults*, which are the improved defaults we propose. `D` stands for *defaults*, which are the libraries' defaults."
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UEZU3kaDCy1t", "outputId": "1c5bd06f-caf6-499c-8f84-5496db9d0ce6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of CatBoost_TD_Classifier: 0.8685333333333334\n", "Accuracy of CatBoost_D_Classifier: 0.8464\n", "Accuracy of LGBM_TD_Classifier: 0.8602666666666666\n", "Accuracy of LGBM_D_Classifier: 0.8344\n", "Accuracy of XGB_TD_Classifier: 0.8544\n", "Accuracy of XGB_D_Classifier: 0.8472\n", "CPU times: user 1min 55s, sys: 44.3 s, total: 2min 40s\n", "Wall time: 24 s\n" ] } ], "source": [ "%%time\n", "from pytabkit import CatBoost_TD_Classifier, CatBoost_D_Classifier, LGBM_TD_Classifier, LGBM_D_Classifier, XGB_TD_Classifier, XGB_D_Classifier\n", "\n", "for model in [CatBoost_TD_Classifier(), CatBoost_D_Classifier(), LGBM_TD_Classifier(), LGBM_D_Classifier(), XGB_TD_Classifier(), XGB_D_Classifier()]:\n", " model.fit(X_train, y_train)\n", " y_pred = model.predict(X_test)\n", " acc = accuracy_score(y_test, y_pred)\n", " print(f\"Accuracy of {model.__class__.__name__}: {acc}\")\n" ] }, { "cell_type": "markdown", "metadata": { "id": "tMzbmtJMCy1t" }, "source": [ "# Ensembling tuned defaults of tree-based methods and RealMLP: a very good baseline" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JZJH1sWfCy1t", "outputId": "8d059418-5236-4a84-b55a-6829200bb330" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Ensemble_TD_Classifier: 0.8834666666666666\n", "CPU times: user 2min 34s, sys: 38 s, total: 3min 12s\n", "Wall time: 1min 30s\n" ] } ], "source": [ "%%time\n", "from pytabkit import Ensemble_TD_Classifier\n", "\n", "model = Ensemble_TD_Classifier()\n", "model.fit(X_train, y_train)\n", "y_pred = model.predict(X_test)\n", "acc = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy of Ensemble_TD_Classifier: {acc}\")" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "undefined.undefined.undefined" } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: original_requirements/conda_env_2024_06_25.yml ================================================ name: tab_bench_venv_3 channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=5.1 - _py-xgboost-mutex=2.0 - abseil-cpp=20211102.0 - arrow-cpp=11.0.0 - atk-1.0=2.36.0 - aws-c-common=0.6.8 - aws-c-event-stream=0.1.6 - aws-checksums=0.1.11 - aws-sdk-cpp=1.8.185 - blas=1.0 - boost-cpp=1.82.0 - bottleneck=1.3.5 - brotli=1.0.9 - brotli-bin=1.0.9 - brotlipy=0.7.0 - bzip2=1.0.8 - c-ares=1.19.1 - ca-certificates=2024.3.11 - cairo=1.16.0 - catboost=1.2 - certifi=2024.6.2 - cffi=1.16.0 - charset-normalizer=2.0.4 - configparser=5.0.2 - contourpy=1.2.0 - coverage=7.2.2 - cryptography=41.0.7 - cuda-cudart=11.7.99 - cuda-cupti=11.7.101 - cuda-libraries=11.7.1 - cuda-nvrtc=11.7.99 - cuda-nvtx=11.7.91 - cuda-runtime=11.7.1 - cudatoolkit=11.4.1 - cycler=0.11.0 - cyrus-sasl=2.1.28 - cython=3.0.6 - dbus=1.13.18 - dill=0.3.7 - et_xmlfile=1.1.0 - exceptiongroup=1.2.0 - expat=2.5.0 - 
faiss-gpu=1.7.4 - filelock=3.13.1 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=2.001 - font-ttf-source-code-pro=2.030 - font-ttf-ubuntu=0.83 - fontconfig=2.14.1 - fonts-anaconda=1 - fonts-conda-ecosystem=1 - fonttools=4.25.0 - freetype=2.12.1 - fribidi=1.0.10 - fsspec=2023.10.0 - future=0.18.3 - gdk-pixbuf=2.42.10 - gflags=2.2.2 - giflib=5.2.1 - glib=2.69.1 - glog=0.5.0 - gmp=6.2.1 - gmpy2=2.1.2 - gobject-introspection=1.72.0 - graphite2=1.3.14 - graphviz=2.50.0 - grpc-cpp=1.48.2 - gst-plugins-base=1.14.1 - gstreamer=1.14.1 - gtk2=2.24.33 - gts=0.7.6 - harfbuzz=4.3.0 - icu=73.1 - idna=3.4 - iniconfig=1.1.1 - intel-openmp=2021.4.0 - jinja2=3.1.2 - joblib=1.2.0 - jpeg=9e - kiwisolver=1.4.4 - krb5=1.20.1 - lcms2=2.12 - ld_impl_linux-64=2.38 - lerc=3.0 - liac-arff=2.5.0 - libboost=1.82.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - libclang=14.0.6 - libclang13=14.0.6 - libcublas=11.10.3.66 - libcufft=10.7.2.124 - libcufile=1.8.1.2 - libcups=2.4.2 - libcurand=10.3.4.107 - libcurl=8.5.0 - libcusolver=11.4.0.1 - libcusparse=11.7.4.91 - libdeflate=1.17 - libedit=3.1.20230828 - libev=4.33 - libevent=2.1.12 - libfaiss=1.7.4 - libffi=3.4.4 - libgcc-ng=11.2.0 - libgd=2.3.3 - libgfortran-ng=11.2.0 - libgfortran5=11.2.0 - libgomp=11.2.0 - libiconv=1.16 - libllvm14=14.0.6 - libnghttp2=1.57.0 - libnpp=11.7.4.75 - libnvjpeg=11.8.0.2 - libpng=1.6.39 - libpq=12.17 - libprotobuf=3.20.3 - librsvg=2.54.4 - libssh2=1.10.0 - libstdcxx-ng=11.2.0 - libthrift=0.15.0 - libtiff=4.5.1 - libtool=2.4.6 - libuuid=1.41.5 - libwebp=1.3.2 - libwebp-base=1.3.2 - libxcb=1.15 - libxgboost=1.7.3 - libxkbcommon=1.0.1 - libxml2=2.10.4 - lightgbm=4.1.0 - lightning-utilities=0.9.0 - llvm-openmp=14.0.6 - lz4-c=1.9.4 - markupsafe=2.1.3 - matplotlib=3.8.0 - matplotlib-base=3.8.0 - minio=7.1.0 - mkl=2021.4.0 - mkl-service=2.4.0 - mkl_fft=1.3.1 - mkl_random=1.2.2 - mpc=1.1.0 - mpfr=4.0.2 - mpmath=1.3.0 - munkres=1.1.4 - mysql=5.7.24 - ncurses=6.4 - networkx=3.1 - ninja=1.10.2 - ninja-base=1.10.2 - nspr=4.35 - nss=3.89.1 - numexpr=2.8.4 - openjpeg=2.4.0 - openml=0.12.2 - openpyxl=3.0.10 - openssl=3.0.13 - orc=1.7.4 - packaging=23.1 - pandas=2.1.4 - pango=1.50.7 - pcre=8.45 - pigz=2.6 - pillow=10.0.1 - pip=23.3.1 - pixman=0.40.0 - platformdirs=3.10.0 - plotly=5.9.0 - pluggy=1.0.0 - ply=3.11 - pooch=1.7.0 - poppler=22.12.0 - poppler-data=0.4.11 - psutil=5.9.0 - py-xgboost=1.7.3 - pyarrow=11.0.0 - pycparser=2.21 - pyopenssl=23.2.0 - pyparsing=3.0.9 - pyqt=5.15.10 - pyqt5-sip=12.13.0 - pysocks=1.7.1 - pytest=7.4.0 - pytest-cov=4.1.0 - python=3.10.13 - python-dateutil=2.8.2 - python-graphviz=0.20.1 - python-tzdata=2023.3 - pytorch=2.0.1 - pytorch-cuda=11.7 - pytorch-lightning=2.0.3 - pytorch-mutex=1.0 - pytz=2023.3.post1 - pyyaml=6.0.1 - qt-main=5.15.2 - re2=2022.04.01 - readline=8.2 - requests=2.31.0 - scikit-learn=1.3.0 - scipy=1.10.1 - seaborn=0.12.2 - setuptools=68.2.2 - sip=6.7.12 - six=1.16.0 - snappy=1.1.10 - sqlite=3.41.2 - swig=4.0.2 - sympy=1.12 - tbb=2021.8.0 - tenacity=8.2.2 - threadpoolctl=2.2.0 - tk=8.6.12 - toml=0.10.2 - tomli=2.0.1 - torchmetrics=1.1.2 - torchtriton=2.0.0 - tornado=6.3.3 - tqdm=4.65.0 - typing-extensions=4.9.0 - typing_extensions=4.9.0 - tzdata=2023d - urllib3=1.26.16 - utf8proc=2.6.1 - wheel=0.41.2 - xgboost=1.7.3 - xlrd=2.0.1 - xmltodict=0.13.0 - xz=5.4.5 - yaml=0.2.5 - zlib=1.2.13 - zstd=1.5.5 - pip: - adjusttext==1.0.4 - aiosignal==1.3.1 - annotated-types==0.6.0 - attrs==23.2.0 - babel==2.14.0 - blis==0.7.11 - catalogue==2.0.10 - cir-model==0.2.0 - click==8.1.7 - 
cloudpathlib==0.16.0 - cloudpickle==3.0.0 - colorama==0.4.6 - confection==0.1.4 - configspace==0.7.1 - cramjam==2.8.1 - cymem==2.0.8 - dask==2024.1.1 - dask-jobqueue==0.8.2 - distributed==2024.1.1 - einops==0.7.0 - emcee==3.1.4 - fastparquet==2023.10.1 - fire==0.5.0 - frozenlist==1.4.1 - gensim==4.3.2 - ghp-import==2.1.0 - griffe==0.39.1 - hyperopt==0.2.7 - importlib-metadata==7.0.1 - imutils==0.5.4 - jsonschema==4.21.1 - jsonschema-specifications==2023.12.1 - kditransform==0.2.0 - langcodes==3.3.0 - llvmlite==0.41.1 - locket==1.0.0 - markdown==3.5.2 - mergedeep==1.3.4 - mkdocs==1.5.3 - mkdocs-autorefs==0.5.0 - mkdocs-material==9.5.6 - mkdocs-material-extensions==1.3.1 - mkdocstrings==0.24.0 - mkdocstrings-python==1.8.0 - more-itertools==10.2.0 - msgpack==1.0.7 - msgpack-numpy==0.4.8 - murmurhash==1.0.10 - numba==0.58.1 - numpy==1.26.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==8.9.2.26 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.18.1 - nvidia-nvjitlink-cu12==12.3.101 - nvidia-nvtx-cu12==12.1.105 - opencv-contrib-python==4.9.0.80 - paginate==0.5.6 - partd==1.4.1 - pathspec==0.12.1 - patool==2.1.1 - preshed==3.0.9 - protobuf==4.25.2 - py4j==0.10.9.7 - pydantic==2.5.3 - pydantic-core==2.14.6 - pygments==2.17.2 - pymdown-extensions==10.7 - pynisher==1.0.10 - pynvml==11.5.0 - pyrfr==0.9.0 - pytorch-widedeep==1.4.0 - pyyaml-env-tag==0.1 - ray==2.9.1 - referencing==0.32.1 - regex==2023.12.25 - rpds-py==0.17.1 - skorch==0.15.0 - smac==2.0.2 - smart-open==6.4.0 - sortedcontainers==2.4.0 - spacy==3.7.2 - spacy-legacy==3.0.12 - spacy-loggers==1.0.5 - srsly==2.4.8 - tabulate==0.9.0 - tblib==3.0.0 - termcolor==2.4.0 - thinc==8.2.2 - toolz==0.12.1 - torch==2.1.2 - torchvision==0.16.2 - triton==2.1.0 - tueplots==0.0.13 - typer==0.9.0 - venn-abers==1.4.1 - wasabi==1.1.2 - watchdog==3.0.0 - weasel==0.3.4 - wrapt==1.16.0 - zict==3.0.0 - zipp==3.17.0 ================================================ FILE: original_requirements/conda_env_2024_10_28.yml ================================================ name: tab_bench_conda channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=5.1 - _py-xgboost-mutex=2.0 - abseil-cpp=20211102.0 - arrow-cpp=11.0.0 - atk-1.0=2.36.0 - aws-c-common=0.6.8 - aws-c-event-stream=0.1.6 - aws-checksums=0.1.11 - aws-sdk-cpp=1.8.185 - blas=1.0 - boost-cpp=1.82.0 - bottleneck=1.3.5 - brotli=1.0.9 - brotli-bin=1.0.9 - brotlipy=0.7.0 - bzip2=1.0.8 - c-ares=1.19.1 - ca-certificates=2024.7.2 - cairo=1.16.0 - catboost=1.2.3 - certifi=2024.8.30 - cffi=1.16.0 - charset-normalizer=2.0.4 - configparser=5.0.2 - contourpy=1.2.0 - coverage=7.2.2 - cryptography=41.0.7 - cuda-cudart=11.7.99 - cuda-cupti=11.7.101 - cuda-libraries=11.7.1 - cuda-nvrtc=11.7.99 - cuda-nvtx=11.7.91 - cuda-runtime=11.7.1 - cudatoolkit=11.4.1 - cycler=0.11.0 - cyrus-sasl=2.1.28 - cython=3.0.6 - dbus=1.13.18 - dill=0.3.7 - et_xmlfile=1.1.0 - exceptiongroup=1.2.0 - expat=2.5.0 - faiss-gpu=1.7.4 - filelock=3.13.1 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=2.001 - font-ttf-source-code-pro=2.030 - font-ttf-ubuntu=0.83 - fontconfig=2.14.1 - fonts-anaconda=1 - fonts-conda-ecosystem=1 - fonttools=4.25.0 - freetype=2.12.1 - fribidi=1.0.10 - fsspec=2023.10.0 - future=0.18.3 - gdk-pixbuf=2.42.10 - gflags=2.2.2 - giflib=5.2.1 - glib=2.69.1 - glog=0.5.0 - gmp=6.2.1 - 
gmpy2=2.1.2 - gobject-introspection=1.72.0 - graphite2=1.3.14 - graphviz=2.50.0 - grpc-cpp=1.48.2 - gst-plugins-base=1.14.1 - gstreamer=1.14.1 - gtk2=2.24.33 - gts=0.7.6 - harfbuzz=4.3.0 - icu=73.1 - idna=3.4 - iniconfig=1.1.1 - intel-openmp=2021.4.0 - jinja2=3.1.2 - joblib=1.2.0 - jpeg=9e - kiwisolver=1.4.4 - krb5=1.20.1 - lcms2=2.12 - ld_impl_linux-64=2.38 - lerc=3.0 - liac-arff=2.5.0 - libboost=1.82.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - libclang=14.0.6 - libclang13=14.0.6 - libcublas=11.10.3.66 - libcufft=10.7.2.124 - libcufile=1.8.1.2 - libcups=2.4.2 - libcurand=10.3.4.107 - libcurl=8.5.0 - libcusolver=11.4.0.1 - libcusparse=11.7.4.91 - libdeflate=1.17 - libedit=3.1.20230828 - libev=4.33 - libevent=2.1.12 - libfaiss=1.7.4 - libffi=3.4.4 - libgcc-ng=11.2.0 - libgd=2.3.3 - libgfortran-ng=11.2.0 - libgfortran5=11.2.0 - libgomp=11.2.0 - libiconv=1.16 - libllvm14=14.0.6 - libnghttp2=1.57.0 - libnpp=11.7.4.75 - libnvjpeg=11.8.0.2 - libpng=1.6.39 - libpq=12.17 - libprotobuf=3.20.3 - librsvg=2.54.4 - libssh2=1.10.0 - libstdcxx-ng=11.2.0 - libthrift=0.15.0 - libtiff=4.5.1 - libtool=2.4.6 - libuuid=1.41.5 - libwebp=1.3.2 - libwebp-base=1.3.2 - libxcb=1.15 - libxgboost=1.7.3 - libxkbcommon=1.0.1 - libxml2=2.10.4 - lightgbm=4.1.0 - lightning-utilities=0.9.0 - llvm-openmp=14.0.6 - lz4-c=1.9.4 - markupsafe=2.1.3 - matplotlib=3.8.0 - matplotlib-base=3.8.0 - minio=7.1.0 - mkl=2021.4.0 - mkl-service=2.4.0 - mkl_fft=1.3.1 - mkl_random=1.2.2 - mpc=1.1.0 - mpfr=4.0.2 - mpmath=1.3.0 - munkres=1.1.4 - mysql=5.7.24 - ncurses=6.4 - networkx=3.1 - ninja=1.10.2 - ninja-base=1.10.2 - nspr=4.35 - nss=3.89.1 - numexpr=2.8.4 - numpy-base=1.24.3 - openjpeg=2.4.0 - openml=0.12.2 - openpyxl=3.0.10 - openssl=3.0.15 - orc=1.7.4 - packaging=23.1 - pandas=2.1.4 - pango=1.50.7 - pcre=8.45 - pigz=2.6 - pillow=10.0.1 - pip=23.3.1 - pixman=0.40.0 - platformdirs=3.10.0 - plotly=5.9.0 - pluggy=1.0.0 - ply=3.11 - pooch=1.7.0 - poppler=22.12.0 - poppler-data=0.4.11 - psutil=5.9.0 - py-xgboost=1.7.3 - pyarrow=11.0.0 - pycparser=2.21 - pyopenssl=23.2.0 - pyparsing=3.0.9 - pyqt=5.15.10 - pyqt5-sip=12.13.0 - pysocks=1.7.1 - pytest=7.4.0 - pytest-cov=4.1.0 - python=3.10.13 - python-dateutil=2.8.2 - python-graphviz=0.20.1 - python-tzdata=2023.3 - pytorch=2.0.1 - pytorch-cuda=11.7 - pytorch-lightning=2.0.3 - pytorch-mutex=1.0 - pytz=2023.3.post1 - pyyaml=6.0.1 - qt-main=5.15.2 - re2=2022.04.01 - readline=8.2 - requests=2.31.0 - scikit-learn=1.3.0 - scipy=1.10.1 - setuptools=68.2.2 - sip=6.7.12 - six=1.16.0 - snappy=1.1.10 - sqlite=3.41.2 - swig=4.0.2 - sympy=1.12 - tbb=2021.8.0 - tenacity=8.2.2 - threadpoolctl=2.2.0 - tk=8.6.12 - toml=0.10.2 - tomli=2.0.1 - torchmetrics=1.4.0.post0 - torchtriton=2.0.0 - tornado=6.3.3 - tqdm=4.65.0 - typing-extensions=4.9.0 - typing_extensions=4.9.0 - tzdata=2023d - urllib3=1.26.16 - utf8proc=2.6.1 - wheel=0.41.2 - xgboost=1.7.3 - xlrd=2.0.1 - xmltodict=0.13.0 - xz=5.4.5 - yaml=0.2.5 - zlib=1.2.13 - zstd=1.5.5 - pip: - adjusttext==1.0.4 - aiosignal==1.3.1 - annotated-types==0.6.0 - attrs==23.2.0 - autorank==1.1.3 - babel==2.14.0 - baycomp==1.0.3 - blis==0.7.11 - catalogue==2.0.10 - cir-model==0.2.0 - click==8.1.7 - cloudpathlib==0.16.0 - cloudpickle==3.0.0 - colorama==0.4.6 - confection==0.1.4 - configspace==0.7.1 - cramjam==2.8.1 - cymem==2.0.8 - dask==2024.1.1 - dask-jobqueue==0.8.2 - distributed==2024.1.1 - einops==0.7.0 - emcee==3.1.4 - fastparquet==2023.10.1 - fire==0.5.0 - frozenlist==1.4.1 - gensim==4.3.2 - ghp-import==2.1.0 - griffe==0.39.1 - hyperopt==0.2.7 - 
importlib-metadata==7.0.1 - imutils==0.5.4 - jsonschema==4.21.1 - jsonschema-specifications==2023.12.1 - kditransform==0.2.0 - langcodes==3.3.0 - llvmlite==0.41.1 - locket==1.0.0 - markdown==3.5.2 - mergedeep==1.3.4 - mkdocs==1.5.3 - mkdocs-autorefs==0.5.0 - mkdocs-material==9.5.6 - mkdocs-material-extensions==1.3.1 - mkdocstrings==0.24.0 - mkdocstrings-python==1.8.0 - more-itertools==10.2.0 - msgpack==1.0.7 - msgpack-numpy==0.4.8 - murmurhash==1.0.10 - numba==0.58.1 - numpy==1.26.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==8.9.2.26 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.18.1 - nvidia-nvjitlink-cu12==12.3.101 - nvidia-nvtx-cu12==12.1.105 - opencv-contrib-python==4.9.0.80 - paginate==0.5.6 - partd==1.4.1 - pathspec==0.12.1 - patool==2.1.1 - patsy==0.5.6 - preshed==3.0.9 - protobuf==4.25.2 - py4j==0.10.9.7 - pydantic==2.5.3 - pydantic-core==2.14.6 - pygments==2.17.2 - pymdown-extensions==10.7 - pynisher==1.0.10 - pynvml==11.5.0 - pyrfr==0.9.0 - pytorch-widedeep==1.4.0 - pyyaml-env-tag==0.1 - ray==2.9.1 - referencing==0.32.1 - regex==2023.12.25 - rpds-py==0.17.1 - rtdl-revisiting-models==0.0.2 - seaborn==0.13.2 - skorch==0.15.0 - smac==2.0.2 - smart-open==6.4.0 - sortedcontainers==2.4.0 - spacy==3.7.2 - spacy-legacy==3.0.12 - spacy-loggers==1.0.5 - srsly==2.4.8 - statsmodels==0.14.3 - tabulate==0.9.0 - tblib==3.0.0 - termcolor==2.4.0 - thinc==8.2.2 - toolz==0.12.1 - torch==2.1.2 - torchvision==0.16.2 - triton==2.1.0 - tueplots==0.0.13 - typer==0.9.0 - venn-abers==1.4.1 - wasabi==1.1.2 - watchdog==3.0.0 - weasel==0.3.4 - wrapt==1.16.0 - zict==3.0.0 - zipp==3.17.0 ================================================ FILE: original_requirements/conda_env_2025_01_15.yml ================================================ name: probclass channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=5.1 - blas=1.0 - brotli-python=1.0.9 - bzip2=1.0.8 - ca-certificates=2024.12.31 - certifi=2024.12.14 - charset-normalizer=3.3.2 - cuda-cudart=11.8.89 - cuda-cupti=11.8.87 - cuda-libraries=11.8.0 - cuda-nvrtc=11.8.89 - cuda-nvtx=11.8.86 - cuda-runtime=11.8.0 - cuda-version=12.6 - expat=2.6.4 - ffmpeg=4.3 - filelock=3.13.1 - freetype=2.12.1 - giflib=5.2.2 - gmp=6.2.1 - gnutls=3.6.15 - idna=3.7 - intel-openmp=2023.1.0 - jinja2=3.1.4 - jpeg=9e - lame=3.100 - lcms2=2.16 - ld_impl_linux-64=2.40 - lerc=4.0.0 - libcublas=11.11.3.6 - libcufft=10.9.0.58 - libcufile=1.11.1.6 - libcurand=10.3.7.77 - libcusolver=11.4.1.48 - libcusparse=11.7.5.86 - libdeflate=1.22 - libffi=3.4.4 - libgcc-ng=11.2.0 - libgomp=11.2.0 - libiconv=1.16 - libidn2=2.3.4 - libjpeg-turbo=2.0.0 - libnpp=11.8.0.86 - libnvjpeg=11.9.0.86 - libpng=1.6.39 - libstdcxx-ng=11.2.0 - libtasn1=4.19.0 - libtiff=4.5.1 - libunistring=0.9.10 - libuuid=1.41.5 - libwebp=1.3.2 - libwebp-base=1.3.2 - llvm-openmp=14.0.6 - lz4-c=1.9.4 - markupsafe=2.1.3 - mkl=2023.1.0 - mkl-service=2.4.0 - mkl_fft=1.3.11 - mkl_random=1.2.8 - mpmath=1.3.0 - ncurses=6.4 - nettle=3.7.3 - networkx=3.2.1 - openh264=2.1.1 - openjpeg=2.5.2 - openssl=3.0.15 - pillow=11.0.0 - pip=24.2 - pysocks=1.7.1 - python=3.12.8 - pytorch=2.5.1 - pytorch-cuda=11.8 - pytorch-mutex=1.0 - pyyaml=6.0.2 - readline=8.2 - requests=2.32.3 - setuptools=72.1.0 - sqlite=3.45.3 - tbb=2021.8.0 - tk=8.6.14 - torchtriton=3.1.0 - torchvision=0.20.1 - 
typing_extensions=4.12.2 - urllib3=2.2.3 - wheel=0.44.0 - xz=5.4.6 - yaml=0.2.5 - zlib=1.2.13 - zstd=1.5.6 - pip: - absl-py==2.1.0 - adjusttext==1.3.0 - aiohappyeyeballs==2.4.4 - aiohttp==3.11.11 - aiosignal==1.3.2 - alabaster==1.0.0 - argon2-cffi==23.1.0 - argon2-cffi-bindings==21.2.0 - attrs==24.3.0 - autorank==1.2.1 - babel==2.16.0 - baycomp==1.0.3 - catboost==1.2.7 - cffi==1.17.1 - cir-model==0.2.0 - click==8.1.8 - cloudpickle==3.1.0 - contourpy==1.3.1 - coverage==7.6.10 - cycler==0.12.1 - dask==2024.12.1 - dask-expr==1.1.21 - deprecation==2.1.0 - dill==0.3.9 - docutils==0.21.2 - et-xmlfile==2.0.0 - fire==0.7.0 - fonttools==4.55.3 - frozenlist==1.5.0 - fsspec==2024.12.0 - gpytorch==1.13 - grpcio==1.69.0 - imagesize==1.4.1 - iniconfig==2.0.0 - jaxtyping==0.2.19 - joblib==1.4.2 - jsonschema==4.23.0 - jsonschema-specifications==2024.10.1 - kiwisolver==1.4.8 - liac-arff==2.5.0 - lightgbm==4.5.0 - lightning-utilities==0.11.9 - linear-operator==0.5.3 - locket==1.0.0 - markdown==3.7 - markdown-it-py==3.0.0 - matplotlib==3.7.5 - mdit-py-plugins==0.4.2 - mdurl==0.1.2 - minio==7.2.14 - msgpack==1.1.0 - msgpack-numpy==0.4.8 - multidict==6.1.0 - myst-parser==4.0.0 - netcal==1.3.6 - numpy==1.26.4 - nvidia-ml-py==12.560.30 - nvidia-nccl-cu12==2.24.3 - openml==0.15.0 - openpyxl==3.1.5 - opt-einsum==3.4.0 - packaging==24.2 - pandas==2.2.3 - partd==1.4.2 - patool==3.1.0 - patsy==1.0.1 - plotly==5.24.1 - pluggy==1.5.0 - probmetrics==0.0.1 - propcache==0.2.1 - protobuf==5.29.3 - psutil==6.1.1 - pyarrow==18.1.0 - pycparser==2.22 - pycryptodome==3.21.0 - pygments==2.19.1 - pynvml==12.0.0 - pyparsing==3.2.1 - pyro-api==0.1.2 - pyro-ppl==1.9.1 - pytabkit==1.1.3 - pytest==8.3.4 - pytest-cov==6.0.0 - python-dateutil==2.9.0.post0 - python-graphviz==0.20.3 - pytorch-lightning==2.5.0.post0 - pytz==2024.2 - ray==2.40.0 - referencing==0.35.1 - relplot==1.0 - rpds-py==0.22.3 - scikit-learn==1.5.2 - scipy==1.15.1 - seaborn==0.13.2 - six==1.17.0 - skorch==1.1.0 - snowballstemmer==2.2.0 - sphinx==8.1.3 - sphinx-rtd-theme==3.0.2 - sphinxcontrib-applehelp==2.0.0 - sphinxcontrib-devhelp==2.0.0 - sphinxcontrib-htmlhelp==2.1.0 - sphinxcontrib-jquery==4.1 - sphinxcontrib-jsmath==1.0.1 - sphinxcontrib-qthelp==2.0.0 - sphinxcontrib-serializinghtml==2.0.0 - statsmodels==0.14.4 - swig==4.3.0 - sympy==1.13.1 - tabulate==0.9.0 - tenacity==9.0.0 - tensorboard==2.18.0 - tensorboard-data-server==0.7.2 - termcolor==2.5.0 - threadpoolctl==3.5.0 - tikzplotlib==0.9.8 - toolz==1.0.0 - torchmetrics==1.6.1 - tqdm==4.67.1 - tueplots==0.0.17 - typeguard==4.4.1 - tzdata==2024.2 - venn-abers==1.4.6 - werkzeug==3.1.3 - xgboost==2.1.3 - xlrd==2.0.1 - xmltodict==0.14.2 - yarl==1.18.3 ================================================ FILE: original_requirements/requirements_2024_06_25.txt ================================================ adjustText==1.0.4 aiohttp==3.9.1 aiosignal==1.3.1 annotated-types==0.6.0 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asttokens==2.4.1 async-timeout==4.0.3 attrs==23.1.0 autorank==1.1.3 Babel==2.14.0 baycomp==1.0.3 blis==0.7.11 boltons==23.0.0 brotlipy==0.7.0 catalogue==2.0.10 catboost==1.2.2 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==2.0.4 cir-model==0.2.0 click==8.1.7 cloudpathlib==0.16.0 cloudpickle==3.0.0 cmake==3.28.1 colorama==0.4.6 comm==0.2.0 confection==0.1.4 ConfigSpace==0.7.1 contourpy==1.2.0 coverage==7.3.3 cramjam==2.7.0 cryptography==41.0.3 cycler==0.12.1 cymem==2.0.8 dask==2023.12.1 dask-jobqueue==0.8.2 debugpy==1.8.0 decorator==5.1.1 dill==0.3.7 distinctipy==1.3.4 distributed==2023.12.1 
einops==0.7.0 emcee==3.1.4 et-xmlfile==1.1.0 exceptiongroup==1.1.3 executing==2.0.1 fastparquet==2023.10.1 filelock==3.13.1 fire==0.5.0 fonttools==4.46.0 frozenlist==1.4.1 fsspec==2023.12.2 future==0.18.3 gensim==4.3.2 ghp-import==2.1.0 graphviz==0.20.1 griffe==0.38.1 hyperopt==0.2.7 idna==3.4 importlib-metadata==6.8.0 importlib-resources==6.1.1 imutils==0.5.4 iniconfig==2.0.0 ipykernel==6.26.0 ipython==8.17.2 jedi==0.19.1 Jinja2==3.1.2 joblib==1.3.2 jsonpatch==1.32 jsonpointer==2.1 jsonschema==4.20.0 jsonschema-specifications==2023.11.2 jupyter_client==8.6.0 jupyter_core==5.5.0 kditransform==0.2.0 kiwisolver==1.4.5 langcodes==3.3.0 liac-arff==2.5.0 lightgbm==4.1.0 lightning-utilities==0.10.0 lit==17.0.6 llvmlite==0.41.1 locket==1.0.0 Markdown==3.5.1 MarkupSafe==2.1.3 matplotlib==3.8.2 matplotlib-inline==0.1.6 mergedeep==1.3.4 minio==7.2.0 mkdocs==1.5.3 mkdocs-autorefs==0.5.0 mkdocs-material==9.5.2 mkdocs-material-extensions==1.3.1 mkdocstrings==0.24.0 mkdocstrings-python==1.7.5 more-itertools==10.1.0 mpmath==1.3.0 msgpack==1.0.7 msgpack-numpy==0.4.8 multidict==6.0.4 murmurhash==1.0.10 nest-asyncio==1.5.8 networkx==3.2.1 numba==0.58.1 numpy==1.26.2 nvidia-cublas-cu11==11.10.3.66 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu11==11.7.101 nvidia-cuda-cupti-cu12==12.1.105 nvidia-cuda-nvrtc-cu11==11.7.99 nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu11==11.7.99 nvidia-cuda-runtime-cu12==12.1.105 nvidia-cudnn-cu11==8.5.0.96 nvidia-cudnn-cu12==8.9.2.26 nvidia-cufft-cu11==10.9.0.58 nvidia-cufft-cu12==11.0.2.54 nvidia-curand-cu11==10.2.10.91 nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu11==11.4.0.1 nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu11==11.7.4.91 nvidia-cusparse-cu12==12.1.0.106 nvidia-nccl-cu11==2.14.3 nvidia-nccl-cu12==2.18.1 nvidia-nvjitlink-cu12==12.3.101 nvidia-nvtx-cu11==11.7.91 nvidia-nvtx-cu12==12.1.105 opencv-contrib-python==4.8.1.78 openml==0.14.1 openpyxl==3.1.2 packaging==23.1 paginate==0.5.6 pandas==2.1.4 parso==0.8.3 partd==1.4.1 pathspec==0.12.1 patool==1.15.0 patsy==0.5.6 pexpect==4.8.0 Pillow==10.1.0 pkg_resources==0.0.0 platformdirs==3.11.0 plotly==5.18.0 pluggy==1.0.0 preshed==3.0.9 prompt-toolkit==3.0.39 protobuf==4.25.1 psutil==5.9.6 ptyprocess==0.7.0 pure-eval==0.2.2 py4j==0.10.9.7 pyarrow==14.0.2 pycosat==0.6.6 pycparser==2.21 pycryptodome==3.19.0 pydantic==1.10.13 pydantic_core==2.14.5 Pygments==2.16.1 pymdown-extensions==10.5 pynisher==1.0.10 pynvml==11.5.0 pyOpenSSL==23.2.0 pyparsing==3.1.1 pyrfr==0.9.0 PySocks==1.7.1 pytest==7.4.3 pytest-cov==4.1.0 python-dateutil==2.8.2 pytorch-lightning==2.1.2 pytorch-widedeep==1.4.0 pytz==2023.3.post1 PyYAML==6.0.1 pyyaml_env_tag==0.1 pyzmq==25.1.1 ray==2.8.1 referencing==0.32.0 regex==2023.10.3 requests==2.31.0 rpds-py==0.15.2 ruamel.yaml==0.17.21 ruamel.yaml.clib==0.2.6 scikit-learn==1.3.2 scipy==1.11.4 seaborn==0.13.1 six==1.16.0 skorch==0.15.0 smac==2.0.2 smart-open==6.4.0 sortedcontainers==2.4.0 spacy==3.7.2 spacy-legacy==3.0.12 spacy-loggers==1.0.5 srsly==2.4.8 stack-data==0.6.3 statsmodels==0.14.2 sympy==1.12 tabulate==0.9.0 tblib==3.0.0 tenacity==8.2.3 termcolor==2.4.0 textalloc==0.0.7 thinc==8.2.2 threadpoolctl==3.2.0 tomli==2.0.1 toolz==0.12.0 torch==2.0.0 torchmetrics==1.2.1 torchvision==0.16.2 tornado==6.3.3 tqdm==4.65.0 traitlets==5.13.0 triton==2.0.0 tueplots==0.0.12 typer==0.9.0 typing_extensions==4.8.0 tzdata==2023.3 urllib3==1.26.16 venn-abers==1.4.1 wasabi==1.1.2 watchdog==3.0.0 wcwidth==0.2.9 weasel==0.3.4 wrapt==1.16.0 xgboost==2.0.2 xlrd==2.0.1 xmltodict==0.13.0 yarl==1.9.4 
zict==3.0.0 zipp==3.17.0 zstandard==0.19.0 ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["hatchling>=1.26.1"] # https://github.com/pypa/hatch/issues/1818 build-backend = "hatchling.build" [project] name = "pytabkit" dynamic = ["version"] description = 'ML models + benchmark for tabular data classification and regression' readme = "README.md" requires-python = ">=3.9" license = "Apache-2.0" keywords = ['tabular data', 'scikit-learn', 'deep learning', 'gradient boosting', 'RealMLP'] authors = [ { name = "David Holzmüller" }, #, email = "a@b.org" }, { name = "Léo Grinsztajn" }, #, email = "a@b.org" }, { name = "Ingo Steinwart" }, #, email = "a@b.org" }, ] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "License :: OSI Approved :: Apache Software License", ] dependencies = [ "torch>=2.0", "numpy>=1.25", # hopefully we don't need <2.0 anymore? "pandas>=2.0", "scikit-learn>=1.3", # these could be made optional with lazy imports # older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics used in TabR slow: # https://github.com/Lightning-AI/torchmetrics/pull/2184 "torchmetrics>=1.2.1", # can also install the newer lightning package with more dependencies instead; it will be prioritized "pytorch_lightning>=2.0", "psutil>=5.0", # used for getting the logical CPU count in the sklearn base and for getting process RAM usage ] [project.optional-dependencies] models = [ # use <2.6 for now since it can run into pickling issues with skorch if the skorch version is too old # see https://github.com/skorch-dev/skorch/commit/be93b7769d61aa22fb928d2e89e258c629bfeaf9 "torch>=2.0", "xgboost>=2.0", "catboost>=1.2", "lightgbm>=4.1", "xrfm>=0.4.3", # lower bound is not checked extensively # for rtdl models (MLP, ResNet) but also lightly used in TabR # note that scikit-learn 1.6 needs skorch >= 1.1.0 "skorch>=0.15", "dask[dataframe]>=2023", # this is here because of a pandas warning: # "Dask dataframe query planning is disabled because dask-expr is not installed" # "packaging", # unclear why this is here? "tqdm", # for TabM with verbosity >= 1 # more classification metrics and post-hoc calibrators # not necessary unless these things are actually used "probmetrics>=0.0.1", # more powerful pickle, used for file-saving and multiprocessing. # Unfortunately it can't save certain torch objects "dill", # saving objects in yaml/msgpack # needed if used in utils.serialize() / deserialize() "pyyaml>=5.0", "msgpack>=1.0", # apparently msgpack_numpy fixed some bug in using numpy arrays in msgpack? # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions. # In theory we shouldn't be using it for numpy arrays at the moment; not sure why the need for this occurred # maybe it occurred because we tried to save hyperparameters that were numpy scalars instead of python scalars # "msgpack_numpy>=0.4", # this is needed because probmetrics uses unpinned numba, # but for some reason the github actions CI wants to install 0.53.1 # which is incompatible with Python 3.11 and 3.12.
# 0.59.0 is the lowest version that is compatible with 3.12 "numba>=0.59.0", ] autogluon = [ "autogluon.tabular[all]>=1.0", "autogluon.multimodal>=1.0", ] extra = [ "kditransform>=0.2", ] hpo = [ "ConfigSpace>=0.7", "smac>=2.0", "hyperopt>=0.2", ] bench = [ "fire", # argparse utilities "ray>=2.8", # parallelization "openml>=0.14", # OpenML data download # ----- UCI import ------ "requests>=2.0", "patool>=1.0", "openpyxl>=3.0", "xlrd>=2.0", # ----- plotting ----- "matplotlib>=3.0", "tueplots>=0.0.12", "seaborn>=0.0.13", "adjustText>=1.0", "autorank>=1.0", ] dev = [ "pytest>=7.0", "pytest-cov>=4.0", "sphinx>=7.0", "myst_parser>=3.0", "sphinx_rtd_theme>=2.0", ] [tool.hatch.version] path = "pytabkit/__about__.py" [tool.hatch.envs.default] installer = "uv" features = ["models", "bench", "autogluon", "extra", "hpo", "dev"] [tool.hatch.envs.hatch-test] installer = "uv" features = ["models", "bench", "dev", "hpo"] #features = ["models","bench","autogluon","extra","hpo","dev"] [tool.hatch.build.targets.sdist] package = ['pytabkit'] only-include = ['pytabkit'] [tool.hatch.build.targets.wheel] package = ['pytabkit'] only-include = ['pytabkit'] [project.urls] Documentation = "https://github.com/dholzmueller/pytabkit#readme" Issues = "https://github.com/dholzmueller/pytabkit/issues" Source = "https://github.com/dholzmueller/pytabkit" [tool.hatch.envs.types] extra-dependencies = [ "mypy>=1.0.0", ] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive {args:pytabkit tests}" [tool.coverage.run] source_pkgs = ["pytabkit", "tests"] branch = true parallel = true omit = [ "pytabkit/__about__.py", ] [tool.coverage.paths] models = ["pytabkit/models", "*/pytabkit/pytabkit/models"] bench = ["pytabkit/bench", "*/pytabkit/pytabkit/bench"] tests = ["tests", "*/pytabkit/tests"] [tool.coverage.report] exclude_lines = [ "no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] ================================================ FILE: pytabkit/__about__.py ================================================ # SPDX-FileCopyrightText: 2024-present David Holzmüller # # SPDX-License-Identifier: Apache-2.0 __version__ = "1.7.3" ================================================ FILE: pytabkit/__init__.py ================================================ from .models.sklearn.sklearn_interfaces import * ================================================ FILE: pytabkit/bench/__init__.py ================================================ ================================================ FILE: pytabkit/bench/alg_wrappers/__init__.py ================================================ ================================================ FILE: pytabkit/bench/alg_wrappers/general.py ================================================ from pathlib import Path from typing import List, Dict, Optional from pytabkit.bench.data.tasks import TaskPackage, TaskInfo from pytabkit.bench.run.results import ResultManager from pytabkit.models.training.logging import Logger from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.training.metrics import Metrics class AlgWrapper: """ Base class for ML methods that can be run in the benchmarking code. """ def __init__(self, **config): """ Constructor. :param config: Configuration parameters. 
""" self.config = config def run(self, task_package: TaskPackage, logger: Logger, assigned_resources: NodeResources, tmp_folders: List[Path], metrics: Optional[Metrics] = None) -> Dict[str, List[ResultManager]]: """ Run the ML method on the given task. Should be overridden in subclasses. :param task_package: Information about the task to be run. :param logger: Logger. :param assigned_resources: Assigned resources (e.g. number of threads). :param tmp_folders: Temporary folders, one for each train/test split, to save temporary data to. :return: A dictionary of lists of ResultManager objects. The dict key is the predict params name, which is used as a suffix for the alg_name, and each list contains ResultManagers for each train/test split. """ raise NotImplementedError() def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: """ Should be overridden in subclasses. :param task_package: Information about the task that should be executed. :return: Information about the estimated required resources that will be needed to run this task. """ raise NotImplementedError() def get_max_n_vectorized(self, task_info: TaskInfo) -> int: """ Returns 1 by default, should be overridden in subclasses if they benefit from vectorization. :param task_info: Information about the task that this method should run on. :return: Maximum number of train/test splits that this method can be run on at once. """ return 1 def get_pred_param_names(self, task_package: TaskPackage) -> List[str]: """ Return the possible prediction parameter names, used as suffixes for alg names :param task_package: Task package. :return: List of the possible names. """ raise NotImplementedError() # want to have: # - more general / easy ResourceComputation # - generic thread-allocation parameters for such a ResourceComputation # that allow to allocate more threads for larger workloads # - better NodeResources class that supports mps or perhaps a new class that summarizes the allocated resources # - should the resource estimation be moved to AlgInterface? # Then, we would need to instantiate an AlgInterface in the wrapper to do the estimation # - maybe a code that estimates RAM (and time) constants? With fake data sets? # better ResourceComputation: # have identical components for CPU and GPU, and maybe also for RAM and time # components: # - dataset size # - factory (model) size # - RAM for forward (and backward) pass # - generic calculation (constant, per-tree, per-class, per-sample), # for the NN we might also need to include the batch size, number of epochs, etc. # what about the number of threads etc.? # want to have one per device? # better NodeResources: # maybe just have a dict with the devices that are being referred to by the array? 
================================================ FILE: pytabkit/bench/alg_wrappers/interface_wrappers.py ================================================ import shutil from pathlib import Path from typing import Callable, List, Optional, Dict import torch from pytabkit.bench.data.paths import Paths from pytabkit.models import utils from pytabkit.models.alg_interfaces.autogluon_model_interfaces import AutoGluonModelAlgInterface from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface, CatBoostHyperoptAlgInterface, \ CatBoostSklearnSubSplitInterface, RandomParamsCatBoostAlgInterface from pytabkit.models.alg_interfaces.ensemble_interfaces import PrecomputedPredictionsAlgInterface, \ CaruanaEnsembleAlgInterface, AlgorithmSelectionAlgInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface, LGBMHyperoptAlgInterface, \ LGBMSklearnSubSplitInterface, RandomParamsLGBMAlgInterface from pytabkit.bench.alg_wrappers.general import AlgWrapper from pytabkit.bench.data.tasks import TaskPackage, TaskInfo from pytabkit.bench.run.results import ResultManager from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface, SklearnMLPSubSplitInterface, \ KANSubSplitInterface, GrandeSubSplitInterface, GBTSubSplitInterface, RandomParamsRFAlgInterface, \ TabPFN2SubSplitInterface, TabICLSubSplitInterface, RandomParamsExtraTreesAlgInterface, RandomParamsKNNAlgInterface, \ ExtraTreesSubSplitInterface, KNNSubSplitInterface, RandomParamsLinearModelAlgInterface, \ LinearModelSubSplitInterface from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, MultiSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, RequiredResources from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface, ResnetSubSplitInterface, \ FTTransformerSubSplitInterface, RandomParamsResnetAlgInterface, RandomParamsRTDLMLPAlgInterface, \ RandomParamsFTTransformerAlgInterface from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.tabm_interface import TabMSubSplitInterface from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface, \ RandomParamsTabRAlgInterface from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface, RandomParamsNNAlgInterface, \ NNHyperoptAlgInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface, XGBHyperoptAlgInterface, \ XGBSklearnSubSplitInterface, RandomParamsXGBAlgInterface from pytabkit.models.alg_interfaces.xrfm_interfaces import xRFMSubSplitInterface, RandomParamsxRFMAlgInterface from pytabkit.models.data.data import TaskType, DictDataset from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.torch_utils import TorchTimer from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics # what is the value of wrappers around AlgInterface? # - it has a create-function that can create multiple instances, # and can wrap with MultiSplitAlgInterface and SingleSplitAlgInterface # - there is some wrapping code in run(), but this could be moved to where the wrapper is used # - it provides get_max_n_vectorized() # perhaps we should generalize TreeResourceComputation to also work for NNs? # But this would require extra functionality for backprop, GPU RAM, etc. 
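# Example (illustration only): the simplest use of the AlgInterfaceWrapper class defined below is to
# pass a factory function, which create_alg_interface() then calls with the stored config, e.g.:
#     wrapper = AlgInterfaceWrapper(NNAlgInterface, lr=0.01)  # lr is a hypothetical hyperparameter
#     alg_interface = wrapper.create_alg_interface(task_package)  # -> NNAlgInterface(lr=0.01)
# This is exactly what NNInterfaceWrapper does via super().__init__(NNAlgInterface, **config).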
def get_prep_factory(**config): return config.get('factory', None) or PreprocessingFactory(**config) class AlgInterfaceWrapper(AlgWrapper): """ Base class for wrapping AlgInterface classes for benchmarking. """ def __init__(self, create_alg_interface_fn: Optional[Callable[[...], AlgInterface]], **config): """ Constructor. :param create_alg_interface_fn: Function to create an AlgInterface via create_alg_interface_fn(**config). :param config: Configuration parameters. """ super().__init__(**config) self.create_alg_interface_fn = create_alg_interface_fn # def _create_alg_interface_impl(self, n_cv: int, n_splits: int, task_type: TaskType) -> AlgInterface: def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: """ Factory method to create an AlgInterface. Should be overridden unless ``create_alg_interface_fn`` has been provided in the constructor. This method should not be used directly, instead create_alg_interface() should be used. :param task_package: Task information. :return: An AlgInterface corresponding to an ML method. """ if self.create_alg_interface_fn is not None: return self.create_alg_interface_fn(**self.config) else: raise NotImplementedError() def create_alg_interface(self, task_package: TaskPackage) -> AlgInterface: """ Method to create an AlgInterface. :param task_package: Task information. :return: An AlgInterface corresponding to an ML method. """ alg_interface = self._create_alg_interface_impl(task_package) if 'calibration_method' in self.config: try: from pytabkit.models.alg_interfaces.calibration import PostHocCalibrationAlgInterface alg_interface = PostHocCalibrationAlgInterface(alg_interface, **self.config) except ImportError: raise ValueError('Calibration methods are not implemented') if 'quantile_calib_alpha' in self.config: try: from pytabkit.models.alg_interfaces.custom_interfaces import QuantileCalibrationAlgInterface alg_interface = QuantileCalibrationAlgInterface(alg_interface, **self.config) except ImportError: raise ValueError('Quantile Calibration methods are not implemented') return alg_interface def run(self, task_package: TaskPackage, logger: Logger, assigned_resources: NodeResources, tmp_folders: List[Path], metrics: Optional[Metrics] = None) -> Dict[str, List[ResultManager]]: task = task_package.task_info.load_task(task_package.paths) task_desc = task_package.task_info.task_desc n_cv = task_package.n_cv n_refit = task_package.n_refit interface_resources = assigned_resources.get_interface_resources() old_torch_n_threads = torch.get_num_threads() old_torch_n_interop_threads = torch.get_num_interop_threads() torch.set_num_threads(interface_resources.n_threads) # don't set this because it can throw # Error: cannot set number of interop threads after parallel work has started or set_num_interop_threads called # torch.set_num_interop_threads(interface_resources.n_threads) ds = task.ds name = 'alg ' + task_package.alg_name + ' on task ' + str(task_desc) # return_preds = self.config.get(f'save_y_pred', False) return_preds = task_package.save_y_pred if metrics is None: metrics = Metrics.defaults(ds.tensor_infos['y'].cat_sizes, val_metric_name=self.config.get('val_metric_name', None)) cv_idxs_list = [] refit_idxs_list = [] n_splits = len(task_package.split_infos) if n_splits == 1: logger.log(1, f'Running on split {task_package.split_infos[0].id} of task {task_package.task_info.task_desc}') else: logger.log(1, f'Running on {n_splits} splits of task {task_package.task_info.task_desc}') for split_id, split_info in 
enumerate(task_package.split_infos): # this will usually be called with len(task_package.split_infos) == 1, but do a loop for safety test_split = split_info.splitter.split_ds(task.ds) trainval_idxs, test_idxs = test_split.idxs[0], test_split.idxs[1] trainval_ds = test_split.get_sub_ds(0) cv_sub_splits = split_info.get_sub_splits(trainval_ds, n_splits=n_cv, is_cv=True) cv_train_idxs = [] cv_val_idxs = [] for sub_idx, sub_split in enumerate(cv_sub_splits): cv_train_idxs.append(trainval_idxs[sub_split.idxs[0]]) cv_val_idxs.append(trainval_idxs[sub_split.idxs[1]]) cv_train_idxs = torch.stack(cv_train_idxs, dim=0) cv_val_idxs = torch.stack(cv_val_idxs, dim=0) cv_alg_seeds = [split_info.get_sub_seed(split_idx, is_cv=True) for split_idx in range(n_cv)] cv_idxs_list.append(SplitIdxs(cv_train_idxs, cv_val_idxs, test_idxs, split_seed=split_info.alg_seed, sub_split_seeds=cv_alg_seeds, split_id=split_id)) if n_refit > 0: refit_train_idxs = torch.stack([trainval_idxs] * n_refit, dim=0) refit_alg_seeds = [split_info.get_sub_seed(split_idx, is_cv=False) for split_idx in range(n_refit)] refit_idxs_list.append(SplitIdxs(refit_train_idxs, None, test_idxs, split_seed=split_info.alg_seed, sub_split_seeds=refit_alg_seeds, split_id=split_id)) if task_package.rerun: for tmp_folder in tmp_folders: if utils.existsDir(tmp_folder): # delete the folder such that the method doesn't load old results from the tmp folder shutil.rmtree(tmp_folder) cv_tmp_folders = [tmp_folder / 'cv' for tmp_folder in tmp_folders] refit_tmp_folders = [tmp_folder / 'refit' for tmp_folder in tmp_folders] cv_alg_interface = self.create_alg_interface(task_package) pred_param_names = list(cv_alg_interface.get_available_predict_params().keys()) if n_refit > 0 and len(pred_param_names) > 1: raise NotImplementedError('Refitting with multiple prediction parameters is currently not implemented') rms = {name: [ResultManager() for _ in task_package.split_infos] for name in pred_param_names} with TorchTimer() as cv_fit_timer: cv_alg_interface.fit(ds, cv_idxs_list, interface_resources, logger, cv_tmp_folders, name) for pred_param_name in pred_param_names: cv_alg_interface.set_current_predict_params(pred_param_name) with TorchTimer() as cv_eval_timer: cv_results_list = cv_alg_interface.eval(ds, cv_idxs_list, metrics, return_preds) for rm, cv_results in zip(rms[pred_param_name], cv_results_list): rm.add_results(is_cv=True, results_dict=cv_results.get_dict() | dict(fit_time_s=cv_fit_timer.elapsed, eval_time_s=cv_eval_timer.elapsed)) if n_refit > 0: refit_alg_interface = cv_alg_interface.get_refit_interface(n_refit) with TorchTimer() as refit_fit_timer: refit_alg_interface.fit(ds, refit_idxs_list, interface_resources, logger, refit_tmp_folders, name) with TorchTimer() as refit_eval_timer: refit_results_list = refit_alg_interface.eval(ds, refit_idxs_list, metrics, return_preds) for rm, refit_results in zip(rms[pred_param_name], refit_results_list): rm.add_results(is_cv=False, results_dict=refit_results.get_dict() | dict(fit_time_s=refit_fit_timer.elapsed, eval_time_s=refit_eval_timer.elapsed)) torch.set_num_threads(old_torch_n_threads) # torch.set_num_interop_threads(old_torch_n_interop_threads) return rms def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: ds = DictDataset(tensors=None, tensor_infos=task_package.task_info.tensor_infos, device='cpu', n_samples=task_package.task_info.n_samples) alg_interface = self.create_alg_interface(task_package) n_train, n_val = 
task_package.split_infos[0].get_train_and_val_size(n_samples=task_package.task_info.n_samples, n_splits=len(task_package.split_infos), is_cv=True) # n_train = split_info.get_sub_splits(trainval_ds, n_splits=n_cv, is_cv=True) return alg_interface.get_required_resources(ds=ds, n_cv=task_package.n_cv, n_refit=task_package.n_refit, n_splits=len(task_package.split_infos), split_seeds=[si.alg_seed for si in task_package.split_infos], n_train=n_train) def get_pred_param_names(self, task_package: TaskPackage) -> List[str]: return list(self.create_alg_interface(task_package).get_available_predict_params().keys()) class LoadResultsWrapper(AlgInterfaceWrapper): def __init__(self, alg_name: str, **config): super().__init__(create_alg_interface_fn=None, **config) self.alg_name = alg_name def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: assert len(task_package.split_infos) == 1 # only support single-split paths = self.config.get('paths', Paths.from_env_variables()) task_info = task_package.task_info split_info = task_package.split_infos[0] split_id = split_info.id results_path = paths.results_alg_task_split(task_desc=task_info.task_desc, alg_name=self.alg_name, n_cv=task_package.n_cv, split_type=split_info.split_type, split_id=split_id) rm = ResultManager.load(results_path) y_preds_cv = rm.y_preds_cv if rm.y_preds_cv is not None else rm.other_dict['cv']['y_preds'] y_preds_cv = torch.as_tensor(y_preds_cv, dtype=torch.float32) y_preds_refit = None if rm.y_preds_refit is not None: y_preds_refit = torch.as_tensor(rm.y_preds_refit, dtype=torch.float32) elif 'refit' in rm.other_dict: y_preds_refit = torch.as_tensor(rm.other_dict['refit']['y_preds'], dtype=torch.float32) fit_params_cv = rm.other_dict['cv']['fit_params'] fit_params_refit = None if 'refit' not in rm.other_dict else rm.other_dict['refit']['fit_params'] return PrecomputedPredictionsAlgInterface(y_preds_cv=y_preds_cv, y_preds_refit=y_preds_refit, fit_params_cv=fit_params_cv, fit_params_refit=fit_params_refit) def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: # do this here such that we don't have to load the results for computing the required resources return RequiredResources(time_s=1e-5 * task_package.task_info.n_samples, cpu_ram_gb=1.5, n_threads=1) class CaruanaEnsembleWrapper(AlgInterfaceWrapper): def __init__(self, sub_wrappers: List[AlgInterfaceWrapper], **config): super().__init__(create_alg_interface_fn=None, **config) self.sub_wrappers = sub_wrappers def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: single_split_alg_interfaces = [] for split_info in task_package.split_infos: single_alg_interfaces = [] for sub_wrapper in self.sub_wrappers: sub_tp = TaskPackage(task_info=task_package.task_info, split_infos=[split_info], n_cv=task_package.n_cv, n_refit=task_package.n_refit, paths=task_package.paths, rerun=task_package.rerun, alg_name=task_package.alg_name, save_y_pred=task_package.save_y_pred) single_alg_interfaces.append(sub_wrapper.create_alg_interface(sub_tp)) single_split_alg_interfaces.append(CaruanaEnsembleAlgInterface(single_alg_interfaces, **self.config)) return MultiSplitWrapperAlgInterface(single_split_alg_interfaces) def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: single_resources = [sub_wrapper.get_required_resources(task_package) for sub_wrapper in self.sub_wrappers] return RequiredResources.combine_sequential(single_resources) class AlgorithmSelectionWrapper(AlgInterfaceWrapper): def __init__(self, 
sub_wrappers: List[AlgInterfaceWrapper], **config): super().__init__(create_alg_interface_fn=None, **config) self.sub_wrappers = sub_wrappers def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: single_split_alg_interfaces = [] for split_info in task_package.split_infos: single_alg_interfaces = [] for sub_wrapper in self.sub_wrappers: sub_tp = TaskPackage(task_info=task_package.task_info, split_infos=[split_info], n_cv=task_package.n_cv, n_refit=task_package.n_refit, paths=task_package.paths, rerun=task_package.rerun, alg_name=task_package.alg_name, save_y_pred=task_package.save_y_pred) single_alg_interfaces.append(sub_wrapper.create_alg_interface(sub_tp)) single_split_alg_interfaces.append(AlgorithmSelectionAlgInterface(single_alg_interfaces, **self.config)) return MultiSplitWrapperAlgInterface(single_split_alg_interfaces) def get_required_resources(self, task_package: TaskPackage) -> RequiredResources: # too pessimistic for refit... single_resources = [sub_wrapper.get_required_resources(task_package) for sub_wrapper in self.sub_wrappers] return RequiredResources.combine_sequential(single_resources) class MultiSplitAlgInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(create_alg_interface_fn=None, **config) def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: raise NotImplementedError() def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface: n_cv = task_package.n_cv task_type = task_package.task_info.task_type n_splits = len(task_package.split_infos) return MultiSplitWrapperAlgInterface( single_split_interfaces=[self.create_single_alg_interface(n_cv, task_type) for i in range(n_splits)], **self.config) class SubSplitInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def __init__(self, create_sub_split_learner_fn: Optional[Callable[[...], AlgInterface]] = None, **config): super().__init__(**config) self.create_sub_split_learner_fn = create_sub_split_learner_fn def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: if self.create_sub_split_learner_fn is not None: return self.create_sub_split_learner_fn(**self.config) raise NotImplementedError() def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return SingleSplitWrapperAlgInterface([self.create_sub_split_interface(task_type) for i in range(n_cv)], **self.config) class NNInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(NNAlgInterface, **config) def get_max_n_vectorized(self, task_info: TaskInfo) -> int: ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) max_ram_gb = 8.0 max_n_vectorized = self.config.get('max_n_vectorized', 50) alg_interface = NNAlgInterface(**self.config) while max_n_vectorized > 1: required_resources = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=max_n_vectorized, split_seeds=[0] * max_n_vectorized, n_train=task_info.n_samples) if required_resources.gpu_ram_gb <= max_ram_gb and required_resources.cpu_ram_gb <= max_ram_gb: return max_n_vectorized max_n_vectorized -= 1 return 1 class NNHyperoptInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(NNHyperoptAlgInterface, **config) def get_max_n_vectorized(self, task_info: TaskInfo) -> int: ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) max_ram_gb = 8.0 max_n_vectorized = 
self.config.get('max_n_vectorized', 50) alg_interface = NNHyperoptAlgInterface(**self.config) while max_n_vectorized > 1: required_resources = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=max_n_vectorized, split_seeds=[0] * max_n_vectorized, n_train=task_info.n_samples) if required_resources.gpu_ram_gb <= max_ram_gb and required_resources.cpu_ram_gb <= max_ram_gb: return max_n_vectorized max_n_vectorized -= 1 return 1 class RandomParamsNNInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsNNAlgInterface, model_idx=model_idx, **config) class LGBMSklearnInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType): return LGBMSklearnSubSplitInterface(**self.config) class LGBMInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return LGBMSubSplitInterface(**self.config) class LGBMHyperoptInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return LGBMHyperoptAlgInterface(**self.config) class RandomParamsLGBMInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsLGBMAlgInterface(**self.config) class XGBSklearnInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return XGBSklearnSubSplitInterface(**self.config) class XGBInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return XGBSubSplitInterface(**self.config) class RandomParamsXGBInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsXGBAlgInterface(**self.config) class XGBHyperoptInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return XGBHyperoptAlgInterface(**self.config) class CatBoostSklearnInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return CatBoostSklearnSubSplitInterface(**self.config) class CatBoostInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return CatBoostSubSplitInterface(**self.config) class CatBoostHyperoptInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return CatBoostHyperoptAlgInterface(**self.config) class RandomParamsCatBoostInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsCatBoostAlgInterface(**self.config) class RFInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return RFSubSplitInterface(**self.config) class ExtraTreesInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return ExtraTreesSubSplitInterface(**self.config) class KNNInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return KNNSubSplitInterface(**self.config) 
class LinearModelInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return LinearModelSubSplitInterface(**self.config) class GBTInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return GBTSubSplitInterface(**self.config) class SklearnMLPInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return SklearnMLPSubSplitInterface(**self.config) class KANInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return KANSubSplitInterface(**self.config) class GrandeInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return GrandeSubSplitInterface(**self.config) class TabPFN2InterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabPFN2SubSplitInterface(**self.config) class TabICLInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabICLSubSplitInterface(**self.config) class MLPRTDLInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return RTDL_MLPSubSplitInterface(**self.config) class ResNetRTDLInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return ResnetSubSplitInterface(**self.config) class FTTransformerInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return FTTransformerSubSplitInterface(**self.config) class TabRInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabRSubSplitInterface(**self.config) class TabMInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return TabMSubSplitInterface(**self.config) class RandomParamsResnetInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsResnetAlgInterface, model_idx=model_idx, **config) class RandomParamsRTDLMLPInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsRTDLMLPAlgInterface, model_idx=model_idx, **config) class RandomParamsFTTransformerInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsFTTransformerAlgInterface, model_idx=model_idx, **config) class AutoGluonModelInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, **config): super().__init__(AutoGluonModelAlgInterface, **config) class RandomParamsTabRInterfaceWrapper(SubSplitInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsTabRAlgInterface(**self.config) class RandomParamsRFInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e.
start from zero) super().__init__(RandomParamsRFAlgInterface, model_idx=model_idx, **config) class RandomParamsExtraTreesInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsExtraTreesAlgInterface, model_idx=model_idx, **config) class RandomParamsKNNInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsKNNAlgInterface, model_idx=model_idx, **config) class RandomParamsLinearModelInterfaceWrapper(AlgInterfaceWrapper): def __init__(self, model_idx: int, **config): # model_idx should be the random search iteration (i.e. start from zero) super().__init__(RandomParamsLinearModelAlgInterface, model_idx=model_idx, **config) class xRFMInterfaceWrapper(SubSplitInterfaceWrapper): def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface: return xRFMSubSplitInterface(**self.config) class RandomParamsxRFMInterfaceWrapper(MultiSplitAlgInterfaceWrapper): def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \ -> AlgInterface: return RandomParamsxRFMAlgInterface(**self.config) ================================================ FILE: pytabkit/bench/data/__init__.py ================================================ ================================================ FILE: pytabkit/bench/data/common.py ================================================ class TaskSource: UCI_BIN_CLASS = 'uci-bin-class' UCI_MULTI_CLASS = 'uci-multi-class' UCI_REGRESSION = 'uci-reg' OPENML_CLASS = 'openml-class' OPENML_CLASS_BIN_EXTRA = 'openml-class-bin-extra' OPENML_REGRESSION = 'openml-reg' AUTOML_CLASS_SMALL = 'automl-class-small' TABARENA_CLASS = 'tabarena-class' TABARENA_REG = 'tabarena-reg' CUSTOM = 'custom' class SplitType: RANDOM = 'random-split' DEFAULT = 'default-split' ================================================ FILE: pytabkit/bench/data/get_uci.py ================================================ #!/usr/bin/python3 import os import shutil import ssl import pandas from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.uci_file_ops import prepare_new_data_set_group_id, download_and_save, replace_chars_in_file, \ load_raw_data, remove_columns, save_data_to_file, unzip_raw_data, concat_files, remove_files, UCIVars, \ move_label_in_front, remove_rows_with_label, ungz_raw_data, load_mixed_raw_data, \ auto_replace_categories_in_mixed_data, write_mixed_raw_data, replace_ordinals_in_mixed_data, \ replace_isodate_by_day_in_mixed_data, replace_circulars_in_mixed_data, get_categories_in_mixed_data, \ replace_time_by_seconds_in_mixed_data, unrar_raw_data, unarff_raw_data, un_z_raw_data, untar_raw_data, \ replace_categories_in_mixed_data, replace_bin_cats_in_mixed_data, auto_replace_missing_in_mixed_data, \ replace_manual_in_mixed_data from pytabkit.models import utils import numpy import sklearn.datasets as datasets import re as re #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- 
#--------------------------------------------------------------------------------------------------- def get_skill_craft(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00272/SkillCraft1_Dataset.csv', 'skill_craft.data') replace_chars_in_file('skill_craft.data', '"', '') data = load_raw_data('skill_craft.data', sep = ',') data = remove_columns(data, [0]) save_data_to_file(data, 'skill_craft', is_classification = True) #--------------------------------------------------------------------------------------------------- def get_cargo_2000(): prepare_new_data_set_group_id() print("Cargo 2000 data set is currently not processed since:") print(" - from the description it is completely unclear how this data set can be used") #--------------------------------------------------------------------------------------------------- def get_KDC_4007(): prepare_new_data_set_group_id() print("KDC 4007 data set is currently not processed since:") print(" - from the description it is completely unclear how this data set can be used") #--------------------------------------------------------------------------------------------------- def get_sml2010(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00274/NEW-DATA.zip', 'sml2010.zip') unzip_raw_data('sml2010.zip') concat_files(UCIVars.raw_data_folder + 'NEW-DATA*.txt', UCIVars.raw_data_folder + 'sml2010.data') remove_files(UCIVars.raw_data_folder, 'NEW-DATA*.txt') replace_chars_in_file('sml2010.data', '#', '') data = load_raw_data('sml2010.data', sep = ' ', description_columns = 2) data_dining = remove_columns(data, [1]) save_data_to_file(data_dining, 'sml2010_dining', is_classification = False) data_room = remove_columns(data, [0]) save_data_to_file(data_room, 'sml2010_room', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_wine_quality(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', 'wine_quality_red.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', 'wine_quality_white.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names', 'wine_quality.description') # The first task is to create data sets in which the quality is the label. # To this end, we add a column at the right, which indicates whether the wine is white or red. data_white = load_raw_data('wine_quality_white.data', sep = ';', header = True) data_white = move_label_in_front(data_white, 11) white_label = numpy.ones((numpy.shape(data_white)[0], 1)) data_white = numpy.concatenate((data_white, white_label), axis = 1) save_data_to_file(data_white, 'wine_quality_white', is_classification = True) data_red = load_raw_data('wine_quality_red.data', sep = ';', header = True) data_red = move_label_in_front(data_red, 11) red_label = numpy.zeros((numpy.shape(data_red)[0], 1)) data_red = numpy.concatenate((data_red, red_label), axis = 1) data_all = numpy.concatenate((data_red, data_white), axis = 0) save_data_to_file(data_all, 'wine_quality_all', is_classification = True) # The next task is to combine the white and red wine data sets and # to add a label describing the color of the wine.
We further remove # the quality of the wine, since this may give too much information # about the color. data_white = load_raw_data('wine_quality_white.data', sep = ';', header = True) data_white = remove_columns(data_white, [11]) white_label = numpy.ones((numpy.shape(data_white)[0], 1)) data_white = numpy.concatenate((white_label, data_white), axis = 1) data_red = load_raw_data('wine_quality_red.data', sep = ';', header = True) data_red = remove_columns(data_red, [11]) red_label = numpy.zeros((numpy.shape(data_red)[0], 1)) data_red = numpy.concatenate((red_label, data_red), axis = 1) data_all = numpy.concatenate((data_red, data_white), axis = 0) save_data_to_file(data_all, 'wine_quality_type', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_parkinson(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data', 'parkinson_updrs.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.names', 'parkinson_updrs.description') data = load_raw_data('parkinson_updrs.data', sep = ',', description_columns = 1) # The data has two variables that can be predicted, namely updrs_motor and updrs_total. # For both prediction tasks, the other target variable needs to be removed from the data data_motor = remove_columns(data, [4]) data_motor = move_label_in_front(data_motor, 3) save_data_to_file(data_motor, 'parkinson_motor', is_classification = False) data_total = remove_columns(data, [3]) data_total = move_label_in_front(data_total, 3) save_data_to_file(data_total, 'parkinson_total', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_insurance_benchmark(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt', 'insurance_benchmark.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticeval2000.txt', 'insurance_benchmark.test.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/tictgts2000.txt', 'insurance_benchmark.test.labels.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/TicDataDescr.txt', 'insurance_benchmark.description') train_data = load_raw_data('insurance_benchmark.train.data', sep = '\t') test_data = load_raw_data('insurance_benchmark.test.data', sep = '\t') test_label = load_raw_data('insurance_benchmark.test.labels.data', sep = '\t') test_data = numpy.concatenate((test_data, test_label), axis = 1) data = numpy.concatenate((train_data, test_data), axis = 0) data = move_label_in_front(data, 85) save_data_to_file(data, 'insurance_benchmark', is_classification = True) #--------------------------------------------------------------------------------------------------- def get_EEG_steady_state(): prepare_new_data_set_group_id() print("EEG Steady State Visual data set is currently not processed since:") print(" - the description indicates that it is time series data") #--------------------------------------------------------------------------------------------------- def get_air_quality(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip', 
'air_quality.zip') unzip_raw_data('air_quality.zip') os.rename(UCIVars.raw_data_folder + 'AirQualityUCI.csv', UCIVars.raw_data_folder + 'air_quality.data') os.remove(UCIVars.raw_data_folder + 'AirQualityUCI.xlsx') data = load_raw_data('air_quality.data', sep = ';', date_column = 0, date_sep = '/', date_order = 'dmY', time_column = 1, time_sep = '.', german_decimal = True) # The data has five variables that can be predicted, # namely those in columns 2, 4, 5, 7, and 9 (C++ like). # For these prediction tasks, the other target variables # need to be removed from the data. data_co2 = remove_columns(data, [4, 5, 7, 9]) data_co2 = move_label_in_front(data_co2, 2) data_co2 = remove_rows_with_label(data_co2, -200.0) save_data_to_file(data_co2, 'air_quality_co2', is_classification = False) # The hydrocarbon reference measurements have only been taken 914 times # For this reason, they are not included in the constructed data sets. data_bc = remove_columns(data, [2, 4, 7, 9]) data_bc = move_label_in_front(data_bc, 3) data_bc = remove_rows_with_label(data_bc, -200.0) save_data_to_file(data_bc, 'air_quality_bc', is_classification = False) data_nox = remove_columns(data, [2, 4, 5, 9]) data_nox = move_label_in_front(data_nox, 4) data_nox = remove_rows_with_label(data_nox, -200.0) save_data_to_file(data_nox, 'air_quality_nox', is_classification = False) data_no2 = remove_columns(data, [2, 4, 5, 7]) data_no2 = move_label_in_front(data_no2, 5) data_no2 = remove_rows_with_label(data_no2, -200.0) save_data_to_file(data_no2, 'air_quality_no2', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_cycle_power_plant(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip', 'cycle_power_plant.zip') unzip_raw_data('cycle_power_plant.zip') # The zip file contains some junk and in addition, the data is in EXCEL format. 
This is addressed now: excel_data = pandas.read_excel(UCIVars.raw_data_folder + 'CCPP/Folds5x2_pp.xlsx', engine = 'openpyxl') excel_data.to_csv(UCIVars.raw_data_folder + 'cycle_power_plant.data') shutil.rmtree(UCIVars.raw_data_folder + 'CCPP') # The response variable is in the last column data = load_raw_data('cycle_power_plant.data', sep = ',', description_columns = 1) data = move_label_in_front(data, 4) save_data_to_file(data, 'cycle_power_plant', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_carbon_nanotubes(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00448/carbon_nanotubes.csv', 'carbon_nanotubes.data') data = load_raw_data('carbon_nanotubes.data', sep = ';', german_decimal = True) data_u = remove_columns(data, [6, 7]) data_u = move_label_in_front(data_u, 5) save_data_to_file(data_u, 'carbon_nanotubes_u', is_classification = False) data_v = remove_columns(data, [5, 7]) data_v = move_label_in_front(data_v, 5) save_data_to_file(data_v, 'carbon_nanotubes_v', is_classification = False) data_w = remove_columns(data, [5, 6]) data_w = move_label_in_front(data_w, 5) save_data_to_file(data_w, 'carbon_nanotubes_w', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_naval_propulsion(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00316/UCI%20CBM%20Dataset.zip', 'naval_propulsion.zip') unzip_raw_data('naval_propulsion.zip') # The zip file contains quite a bit of junk, which is removed in the following shutil.copy(UCIVars.raw_data_folder + 'UCI CBM Dataset/data.txt', UCIVars.raw_data_folder + 'naval_propulsion.data') shutil.copy(UCIVars.raw_data_folder + 'UCI CBM Dataset/Features.txt', UCIVars.raw_data_folder + 'naval_propulsion.features.txt') shutil.copy(UCIVars.raw_data_folder + 'UCI CBM Dataset/README.txt', UCIVars.raw_data_folder + 'naval_propulsion.description') shutil.rmtree(UCIVars.raw_data_folder + 'UCI CBM Dataset/') shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX') data = load_raw_data('naval_propulsion.data', sep = ' ') # The data has actually three response variables, but one of those, namely the ship speed # is affine linear in the lever position, which is also recorded in the data. For this # reason, only the other two response variables are considered. 
data_comp = remove_columns(data, [17]) data_comp = move_label_in_front(data_comp, 16) save_data_to_file(data_comp, 'naval_propulsion_comp', is_classification = False) data_turb = remove_columns(data, [16]) data_turb = move_label_in_front(data_turb, 16) save_data_to_file(data_turb, 'naval_propulsion_turb', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_blood_pressure(): prepare_new_data_set_group_id() print("Cuff-Less Blood pressure Estimation is currently not processed since:") print(" - the zip file is about 3.1GB large") print(" - the description indicates that each of the three features is actually a times series") print(" - the file is in matlab format") #print('The following download may take a while, since the .zip file is about 3.1GB large.') #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00340/data.zip', 'blood_pressure.zip') #unzip_raw_data('blood_pressure.zip') #--------------------------------------------------------------------------------------------------- def get_gas_sensor_drift(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00270/driftdataset.zip', 'gas_sensor_drift.zip') unzip_raw_data('gas_sensor_drift.zip') concat_files(UCIVars.raw_data_folder + 'batch*.dat', UCIVars.raw_data_folder + 'gas_sensor_drift.data') remove_files(UCIVars.raw_data_folder, 'batch*.dat') # Next we need to replace ; by , in .data file, since otherwise the routines for libsvm-like formats won't work. # Also, the first label is multiplied by 10000 since the routine for libsvm-like formats seem to sort the # labels. By multiplying the label by 10000, we actually can guarantee that the first label is always the larger # one, so that the routine places it at the second position in the list of labels. # Then we read a libsvm like file with multiple labels and convert it from Compressed Sparse Row format to normal format replace_chars_in_file('gas_sensor_drift.data', ';', '0000,') data = datasets.load_svmlight_file(UCIVars.raw_data_folder + 'gas_sensor_drift.data', multilabel = True) x_data = data[0].toarray() all_labels = numpy.reshape(data[1], newshape = (-1, 2)) ## The data has two response variables, one indicating which chemical is measured ## and one reporting its concentration. We simply take both as being of interest ... 
class_labels = numpy.reshape(all_labels[ :, 1], newshape = (-1, 1)) / 10000.0 data_class = numpy.concatenate((class_labels, x_data), axis = 1) save_data_to_file(data_class, 'gas_sensor_drift_class', is_classification = True) conc_labels = numpy.reshape(all_labels[ :, 0], newshape = (-1, 1)) data_conc = numpy.concatenate((conc_labels, x_data), axis = 1) save_data_to_file(data_conc, 'gas_sensor_drift_conc', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_bike_sharing(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip', 'bike_sharing.zip') unzip_raw_data('bike_sharing.zip') os.remove(UCIVars.raw_data_folder + 'day.csv') os.rename(UCIVars.raw_data_folder + 'hour.csv', UCIVars.raw_data_folder + 'bike_sharing.data') os.rename(UCIVars.raw_data_folder + 'Readme.txt', UCIVars.raw_data_folder + 'bike_sharing.description') data = load_raw_data('bike_sharing.data', sep = ',', description_columns = 2) data_casual = remove_columns(data, [13, 14]) data_casual = move_label_in_front(data_casual, 12) save_data_to_file(data_casual, 'bike_sharing_casual', is_classification = False) data_total = remove_columns(data, [12, 13]) data_total = move_label_in_front(data_total, 12) save_data_to_file(data_total, 'bike_sharing_total', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_appliances_energy(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv', 'appliances_energy.data') # The data entries are saved as strings, that is as "...". In addition, date and time are not separated by commas. # The following lines cure this. 
replace_chars_in_file('appliances_energy.data', '"', '') replace_chars_in_file('appliances_energy.data', ', ', ',') replace_chars_in_file('appliances_energy.data', ', ', ',') replace_chars_in_file('appliances_energy.data', ' ', ',') data = load_raw_data('appliances_energy.data', sep = ',', date_column = 0, date_sep = '-', date_order = 'Ymd', time_column = 1, time_sep = ':') data = move_label_in_front(data, 2) save_data_to_file(data, 'appliances_energy', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_indoor_loc(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00310/UJIndoorLoc.zip', 'indoor_loc.zip') unzip_raw_data('indoor_loc.zip') os.rename(UCIVars.raw_data_folder + 'UJIndoorLoc/trainingData.csv', UCIVars.raw_data_folder + 'indoor_loc.train.csv') os.rename(UCIVars.raw_data_folder + 'UJIndoorLoc/validationData.csv', UCIVars.raw_data_folder + 'indoor_loc.val.csv') shutil.rmtree(UCIVars.raw_data_folder + 'UJIndoorLoc') concat_files(UCIVars.raw_data_folder + 'indoor*.csv', UCIVars.raw_data_folder + 'indoor_loc.data') remove_files(UCIVars.raw_data_folder, 'indoor*.csv') # --- Regression part ------ data = load_raw_data('indoor_loc.data', sep = ',') data = remove_columns(data, range(523, 529)) data_long = remove_columns(data, [521, 522]) data_long = move_label_in_front(data_long, 520) save_data_to_file(data_long, 'indoor_loc_long', is_classification = False) data_lat = remove_columns(data, [520, 522]) data_lat = move_label_in_front(data_lat, 520) save_data_to_file(data_lat, 'indoor_loc_lat', is_classification = False) data_alt = remove_columns(data, [520, 521]) data_alt = move_label_in_front(data_alt, 520) save_data_to_file(data_alt, 'indoor_loc_alt', is_classification = False) # --- Classification part ----- data = load_raw_data('indoor_loc.data', sep = ',') data = remove_columns(data, range(526, 529)) data_relative = move_label_in_front(data, 525) data_relative = remove_columns(data_relative, range(521, 526)) save_data_to_file(data_relative, 'indoor_loc_relative', is_classification = True, is_regression = False) data_building = move_label_in_front(data, 523) data_building = remove_columns(data_building, range(521, 526)) save_data_to_file(data_building, 'indoor_loc_building', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_online_news_popularity(): prepare_new_data_set_group_id() download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip', 'online_news_popularity.zip') unzip_raw_data('online_news_popularity.zip') os.rename(UCIVars.raw_data_folder + 'OnlineNewsPopularity/OnlineNewsPopularity.csv', UCIVars.raw_data_folder + 'online_news_popularity.data') os.rename(UCIVars.raw_data_folder + 'OnlineNewsPopularity/OnlineNewsPopularity.names', UCIVars.raw_data_folder + 'online_news_popularity.description') shutil.rmtree(UCIVars.raw_data_folder + 'OnlineNewsPopularity') data = load_raw_data('online_news_popularity.data', sep = ', ', description_columns = 2) data = move_label_in_front(data, 58) save_data_to_file(data, 'online_news_popularity', is_classification = False) #--------------------------------------------------------------------------------------------------- def get_facebook_comment_volume(): prepare_new_data_set_group_id() 
def get_facebook_comment_volume():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip', 'facebook_comment_volume.zip')
    unzip_raw_data('facebook_comment_volume.zip')
    os.rename(UCIVars.raw_data_folder + 'Dataset/Training/Features_Variant_1.csv', UCIVars.raw_data_folder + 'facebook_comment_volume.data')
    shutil.rmtree(UCIVars.raw_data_folder + 'Dataset')
    shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX')

    data = load_raw_data('facebook_comment_volume.data', sep = ',')
    data = move_label_in_front(data, 53)
    save_data_to_file(data, 'facebook_comment_volume', is_classification = False)


#---------------------------------------------------------------------------------------------------

def get_bejing_pm25():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv', 'bejing_pm25.data')

    replace_chars_in_file('bejing_pm25.data', 'cv', '0,0')
    replace_chars_in_file('bejing_pm25.data', 'NW', '1,2')
    replace_chars_in_file('bejing_pm25.data', 'NE', '1,1')
    replace_chars_in_file('bejing_pm25.data', 'SE', '2,1')
    replace_chars_in_file('bejing_pm25.data', 'SW', '2,2')

    data = load_raw_data('bejing_pm25.data', sep = ',', description_columns = 1)
    data = move_label_in_front(data, 4)
    save_data_to_file(data, 'bejing_pm25', is_classification = False)


#---------------------------------------------------------------------------------------------------

def get_protein_tertiary_structure():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv', 'protein_tertiary_structure.data')

    data = load_raw_data('protein_tertiary_structure.data', sep = ',')
    save_data_to_file(data, 'protein_tertiary_structure', is_classification = False)


#---------------------------------------------------------------------------------------------------

def get_tamilnadu_electricity():
    prepare_new_data_set_group_id()

    print("Tamilnadu Electricity data set is currently not processed since:")
    print("  - from the description it is completely unclear how this data set can be used")


#---------------------------------------------------------------------------------------------------
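# In get_bejing_pm25 above, the combined wind direction is expanded into two coordinate-like
# features purely by string replacement ('NW' -> '1,2', etc.). An equivalent, more explicit
# mapping (illustration only; the table is taken from the replacements above):
_WIND_TO_GRID = {'cv': (0, 0), 'NE': (1, 1), 'NW': (1, 2), 'SE': (2, 1), 'SW': (2, 2)}

def _sketch_encode_wind(directions):
    # map a list of direction strings to a (n, 2) float array
    return numpy.array([_WIND_TO_GRID[d] for d in directions], dtype = float)

#---------------------------------------------------------------------------------------------------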
def get_metro_interstate_traffic_volume():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz', 'metro_interstate_traffic_volume.zip')
    ungz_raw_data('metro_interstate_traffic_volume.zip')
    os.rename(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume.zip.data', UCIVars.raw_data_folder + 'metro_interstate_traffic_volume.data')

    data = load_mixed_raw_data('metro_interstate_traffic_volume.data', sep = ',', header = True)
    size = data.shape[0]
    data[0:size, 7] = [date_and_time.replace(' ', ',') for date_and_time in data[0:size, 7]]

    # Deal with the holidays: we put all holidays in one category, and all non-holidays in the other.
    # There are 11 holidays and 'None'. The latter receives the value 0, while all holidays receive the
    # value 1. The following code is based on string replacement and the particular form of the entries.
    data[0:size, 0] = [re.sub(r" ", '', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"None", '0', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"D", '1', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"WashingtonsBirthday", '1', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"StateFair", '1', holiday) for holiday in data[0:size, 0]]
    data[0:size, 0] = [re.sub(r"[a-zA-Z]", '', holiday) for holiday in data[0:size, 0]]

    # The weather is briefly described in column 5 and in more detail in column 6.
    # We create two data sets, one for each type of description.
    data_short = auto_replace_categories_in_mixed_data(data, 5, ',')
    data_short = remove_columns(data_short, 6)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume_short.data', data_short, sep = ",")

    data_long = auto_replace_categories_in_mixed_data(data, 6, ',')
    data_long = remove_columns(data_long, 5)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume_long.data', data_long, sep = ",")

    write_mixed_raw_data(UCIVars.raw_data_folder + 'metro_interstate_traffic_volume.data', data, sep = ",")
    replace_chars_in_file('metro_interstate_traffic_volume.data', '  ', ' ')    # collapse double spaces

    # Now we are in the position to read the data, convert the time and date, and move the labels
    data = load_raw_data('metro_interstate_traffic_volume_short.data', ',', description_columns = 0, date_column = 16, date_sep = '-', date_order = 'Ymd', time_column = 17, time_sep = ':')
    data = move_label_in_front(data, 18)
    save_data_to_file(data, 'metro_interstate_traffic_volume_short', is_classification = False, is_regression = True)

    data = load_raw_data('metro_interstate_traffic_volume_long.data', ',', description_columns = 0, date_column = 43, date_sep = '-', date_order = 'Ymd', time_column = 44, time_sep = ':')
    data = move_label_in_front(data, 45)
    save_data_to_file(data, 'metro_interstate_traffic_volume_long', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------
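# The regex chain in get_metro_interstate_traffic_volume above only has to distinguish 'None'
# from the eleven holiday names. A sketch of the same binarization done directly:
def _sketch_binarize_holiday(column):
    # column: iterable of holiday strings; 'None' -> 0.0, any holiday name -> 1.0
    return numpy.array([0.0 if h.strip() == 'None' else 1.0 for h in column])

#---------------------------------------------------------------------------------------------------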
def get_facebook_live_sellers_thailand():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00488/Live_20210128.csv', 'facebook_live_sellers_thailand.data')

    data = load_mixed_raw_data('facebook_live_sellers_thailand.data', sep = ",", header = True)

    # Columns 0 and 2 contain id and time information. These are deleted. The last 4 columns are empty,
    # and thus deleted, too.
    data = remove_columns(data, [0, 2, 12, 13, 14, 15])

    # Next we replace the status_type by some numbers
    categories = [u'link', u'photo', u'status', u'video']
    data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'facebook_live_sellers_thailand.data', data, sep = ",")

    data = load_raw_data('facebook_live_sellers_thailand.data', ',')

    # The classes 1 and 3 contain only 63 and 365 samples, respectively. We remove them for the
    # classification data set.
    data_class = remove_rows_with_label(data, 1)
    data_class = remove_rows_with_label(data_class, 3)
    save_data_to_file(data_class, 'facebook_live_sellers_thailand_status', is_classification = True, is_regression = False)

    # For the regression data set, we pick the 'shares' column as label
    data_regr = move_label_in_front(data, 3)
    save_data_to_file(data_regr, 'facebook_live_sellers_thailand_shares', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------
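# remove_rows_with_label is used just above; its assumed semantics (drop all rows whose label,
# stored in column 0, equals the given value) would roughly be:
def _sketch_remove_rows_with_label(data, label_value):
    return data[data[:, 0] != label_value]

#---------------------------------------------------------------------------------------------------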
def get_parking_birmingham():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00482/dataset.zip', 'parking_birmingham.zip')
    unzip_raw_data('parking_birmingham.zip')
    os.rename(UCIVars.raw_data_folder + 'dataset.csv', UCIVars.raw_data_folder + 'parking_birmingham.data')

    # One could also convert the name of the parking spot into a binary vector. However, this vector
    # would be of dimension 30 and would therefore dominate the remaining features. We thus use a
    # one-dimensional representation instead.
    data = load_mixed_raw_data('parking_birmingham.data', sep = ',', header = True)
    categories = ['BHMEURBRD01', 'BHMEURBRD02', 'Bull Ring', 'BHMBRCBRG02', 'BHMBRCBRG03', 'BHMBRCBRG01',
                  'Shopping', 'BHMNCPLDH01', 'BHMBCCSNH01', 'BHMNCPRAN01', 'BHMBCCPST01', 'Others-CCCPS133',
                  'BHMBRTARC01', 'Others-CCCPS98', 'NIA North', 'BHMNCPHST01', 'BHMNCPNST01', 'BHMNCPNHS01',
                  'BHMBCCTHL01', 'Others-CCCPS119a', 'Others-CCCPS8', 'Others-CCCPS105a', 'Broad Street',
                  'NIA South', 'NIA Car Parks', 'BHMBCCMKT01', 'BHMMBMMBX01', 'Others-CCCPS202',
                  'Others-CCCPS135a', 'BHMNCPPLS01']
    data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'parking_birmingham.data', data, sep = ",")

    # Next we split date-time into two features
    replace_chars_in_file('parking_birmingham.data', ' ', ',')

    # Now, we convert the date into a weekday and then into a point on the circle.
    # Furthermore, we create a second data set with rounded times for possible future time series
    # treatment.
    data = load_mixed_raw_data('parking_birmingham.data', sep = ",", header = False)
    data = replace_isodate_by_day_in_mixed_data(data, 3)
    data = replace_circulars_in_mixed_data(data, get_categories_in_mixed_data(data, 3), 3, ",")
    write_mixed_raw_data(UCIVars.raw_data_folder + 'parking_birmingham.data', data, sep = ",")

    data = replace_time_by_seconds_in_mixed_data(data, 4, sep = ':', rounded = 1800)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'parking_birmingham.rounded.data', data, sep = ",")

    # Now we compute the relative occupancy and use it as label.
    # Note that we keep both the parking spot number and its capacity.
    data = load_raw_data('parking_birmingham.data', ',', time_column = 5, time_sep = ':')
    data[:, 2] = data[:, 2] / data[:, 1]
    data = move_label_in_front(data, 2)
    save_data_to_file(data, 'parking_birmingham', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------

def get_tarvel_review_ratings():
    prepare_new_data_set_group_id()

    # Download the data and correct the misspelling of its name
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00485/google_review_ratings.csv', 'travel_review_ratings.data')

    # Remove the commas at the end of each row and clean a few messy lines
    replace_chars_in_file('travel_review_ratings.data', ',\r', '\r')
    replace_chars_in_file('travel_review_ratings.data', '"', '')
    replace_chars_in_file('travel_review_ratings.data', ',,', ',')
    replace_chars_in_file('travel_review_ratings.data', '\t', '')

    data = load_raw_data('travel_review_ratings.data', ',', description_columns = 1, header = True)

    # Determine the first column that contains the most ratings, use it as label, and remove possible
    # rows with label = 0
    ratings_counts = data.astype(bool).sum(axis=0)
    most_rated_column = numpy.argmax(ratings_counts)
    data = move_label_in_front(data, most_rated_column)
    data = remove_rows_with_label(data, 0.0)
    save_data_to_file(data, 'travel_review_ratings', is_classification = False, is_regression = True)


#---------------------------------------------------------------------------------------------------
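# get_parking_birmingham above maps the weekday onto the circle via replace_circulars_in_mixed_data.
# The usual trick, sketched here under that assumption, is a sine/cosine pair so that the last and
# first weekday end up adjacent:
def _sketch_circular_encoding(weekday_index, period = 7):
    angle = 2.0 * numpy.pi * weekday_index / period
    return numpy.sin(angle), numpy.cos(angle)

#---------------------------------------------------------------------------------------------------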
def get_superconductivity():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip', 'superconductivity.zip')
    unzip_raw_data('superconductivity.zip')
    os.rename(UCIVars.raw_data_folder + 'train.csv', UCIVars.raw_data_folder + 'superconductivity.data')
    os.remove(UCIVars.raw_data_folder + 'unique_m.csv')

    data = load_raw_data('superconductivity.data', ',', header = True)

    data_regr = move_label_in_front(data, 81)
    save_data_to_file(data_regr, 'superconductivity', is_classification = False, is_regression = True)

    # We also create a classification data set, in which we try to identify materials with critical
    # temperature above 77K. We refer to https://en.wikipedia.org/wiki/Superconductivity for the
    # importance of this threshold in view of liquid nitrogen.
    data_class = move_label_in_front(data, 81)
    temperature_above_77K = data_class[:, 0] > 77
    data_class[:, 0] = temperature_above_77K.astype(float)
    save_data_to_file(data_class, 'superconductivity_class', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_gnfuv_unmanned_surface_vehicles():
    prepare_new_data_set_group_id()

    print("GNFUV Unmanned Surface Vehicles is currently not processed since:")
    print("  - the description indicates that it is actually very complicated time series data")


#---------------------------------------------------------------------------------------------------

def get_five_cities_pm25():
    prepare_new_data_set_group_id()

    print("PM2.5 of Five Chinese Cities is used since:")
    print("  - it actually contains 5 data sets of around 20,000 samples each.")

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00394/FiveCitiePMData.rar', 'five_cities_pm25.rar')
    unrar_raw_data('five_cities_pm25.rar')

    cities = {}
    pm_locs = {}
    cities[0] = 'ShenyangPM20100101_20151231.csv'
    cities[1] = 'ChengduPM20100101_20151231.csv'
    cities[2] = 'BeijingPM20100101_20151231.csv'
    cities[3] = 'GuangzhouPM20100101_20151231.csv'
    cities[4] = 'ShanghaiPM20100101_20151231.csv'
    pm_locs[0] = (5, 6, 7)
    pm_locs[1] = (5, 6, 7)
    pm_locs[2] = (5, 6, 7, 8)
    pm_locs[3] = (5, 6, 7)
    pm_locs[4] = (5, 6, 7)

    for i in range(0, 5):
        new_city_name = 'five_cities_' + cities[i][:-23].lower() + '_pm25.data'
        os.rename(UCIVars.raw_data_folder + cities[i], UCIVars.raw_data_folder + new_city_name)
        cities[i] = new_city_name

        replace_chars_in_file(cities[i], 'cv', '0,0')
        replace_chars_in_file(cities[i], 'NW', '1,2')
        replace_chars_in_file(cities[i], 'NE', '1,1')
        replace_chars_in_file(cities[i], 'SE', '2,1')
        replace_chars_in_file(cities[i], 'SW', '2,2')

        data = load_raw_data(cities[i], sep = ',', description_columns = 1)
        number_of_rows = numpy.shape(data)[0]
        pm_concs = data[0:number_of_rows, pm_locs[i]]
        pm_concs = numpy.mean(pm_concs, axis = 1)
        pm_concs = numpy.reshape(pm_concs, newshape = (number_of_rows, 1))
        data = remove_columns(data, pm_locs[i])
        data = numpy.concatenate((pm_concs, data), axis = 1)
        save_data_to_file(data, new_city_name[:-5], is_classification = False)


#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
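# get_five_cities_pm25 above averages the available PM2.5 monitoring stations into a single
# regression target. Sketch of that step for an arbitrary set of columns:
def _sketch_mean_target(data, pm_columns):
    target = numpy.mean(data[:, pm_columns], axis = 1, keepdims = True)
    features = numpy.delete(data, pm_columns, axis = 1)
    return numpy.concatenate((target, features), axis = 1)

#---------------------------------------------------------------------------------------------------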
def get_phishing():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00327/Training%20Dataset.arff', 'phishing.arff')

    replace_chars_in_file('phishing.arff', ' -1', '-1')
    replace_chars_in_file('phishing.arff', ' 1', '1')
    replace_chars_in_file('phishing.arff', '1 ', '1')
    replace_chars_in_file('phishing.arff', '-1 ', '-1')
    replace_chars_in_file('phishing.arff', '0 ', '0')
    replace_chars_in_file('phishing.arff', ' 0', '0')
    unarff_raw_data('phishing')

    data = load_raw_data('phishing.data', sep = ',', description_columns = 0)
    data = move_label_in_front(data, 30)
    save_data_to_file(data, 'phishing', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_ozone_level():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/eighthr.data', 'ozone_level_8hr.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/eighthr.names', 'ozone_level_8hr.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.data', 'ozone_level_1hr.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.names', 'ozone_level_1hr.description')

    data = load_raw_data('ozone_level_8hr.data', sep = ',', description_columns = 1, na_string = '?')
    data = move_label_in_front(data, 72)
    save_data_to_file(data, 'ozone_level_8hr', is_classification = True, is_regression = False)

    data = load_raw_data('ozone_level_1hr.data', sep = ',', description_columns = 1, na_string = '?')
    data = move_label_in_front(data, 72)
    save_data_to_file(data, 'ozone_level_1hr', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_opportunity_activity():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00226/OpportunityUCIDataset.zip', 'opportunity_activity.zip')
    #unzip_raw_data('opportunity_activity.zip')
    print("Opportunity Activity Recognition is currently not processed since:")
    print("  - the zip file is about 292 MB in size")
    print("  - the description indicates that it is actually time series data")


#---------------------------------------------------------------------------------------------------

def get_australian_sign_language():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/auslan2-mld/tctodd.tar.gz', 'australian_sign_language.tar.gz')
    print("Australian Sign Language is currently not processed since:")
    print("  - each sign only has 27 samples")


#---------------------------------------------------------------------------------------------------

def get_seismic_bumps():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff', 'seismic_bumps.arff')
    unarff_raw_data('seismic_bumps')

    replace_chars_in_file('seismic_bumps.data', 'a', '1')
    replace_chars_in_file('seismic_bumps.data', 'b', '2')
    replace_chars_in_file('seismic_bumps.data', 'c', '3')
    replace_chars_in_file('seismic_bumps.data', 'd', '4')
    replace_chars_in_file('seismic_bumps.data', 'N', '0')
    replace_chars_in_file('seismic_bumps.data', 'W', '1')

    data = load_raw_data('seismic_bumps.data', sep = ',')
    data = move_label_in_front(data, 18)
    save_data_to_file(data, 'seismic_bumps', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_meu_mobile_ksd():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00399/MEU-Mobile%20KSD%202016.xlsx', 'meu_mobile_ksd.xlsx')
    print("MEU-Mobile KSD is currently not processed since:")
    print("  - according to the description it seems to be an anomaly detection data set")


#---------------------------------------------------------------------------------------------------
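# The single-character replacements in get_seismic_bumps above (and get_chess below) work only
# because the affected letters never occur elsewhere in those files, and longer tokens must be
# replaced before shorter ones (e.g. 'nowin' before 'won'). A safer per-token variant, sketched:
def _sketch_map_tokens(line, mapping):
    # mapping: e.g. {'a': '1', 'b': '2', 'N': '0', 'W': '1'}; unknown tokens pass through
    return ','.join(mapping.get(tok, tok) for tok in line.strip().split(','))

#---------------------------------------------------------------------------------------------------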
print(" - the file is in matlab format") #--------------------------------------------------------------------------------------------------- def get_vicon_physical_action(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00214/Vicon%20Physical%20Action%20Data%20Set.rar', 'vicon_physical_action.rar') print("Vicon Physical Action is currently not processed since:") print(" - according to the description and an follow-up inspection it seems to be a time series data set") #--------------------------------------------------------------------------------------------------- def get_simulated_falls(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00455/Tests.rar', 'simulated_falls.rar') print("Simulated Falls and Daily Living Activities is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the data set size is 1.2GB") #--------------------------------------------------------------------------------------------------- def get_chess(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data', 'chess.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.names', 'chess.description') replace_chars_in_file('chess.data', 'nowin', '-1') replace_chars_in_file('chess.data', 'won', '1') replace_chars_in_file('chess.data', 'b', '0') replace_chars_in_file('chess.data', 'f', '1') replace_chars_in_file('chess.data', 'g', '2') replace_chars_in_file('chess.data', 'l', '3') replace_chars_in_file('chess.data', 'n', '4') replace_chars_in_file('chess.data', 't', '5') replace_chars_in_file('chess.data', 'w', '6') data = load_raw_data('chess.data', sep = ',') data = move_label_in_front(data, 36) save_data_to_file(data, 'chess', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_abalone(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', 'abalone.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names', 'abalone.description') replace_chars_in_file('abalone.data', 'F', '-1') replace_chars_in_file('abalone.data', 'I', '0') replace_chars_in_file('abalone.data', 'M', '1') data = load_raw_data('abalone.data', sep = ',') save_data_to_file(data, 'abalone', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_madelon(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data', 'madelon.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels', 'madelon.train.labels.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_valid.data', 'madelon.valid.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/madelon_valid.labels', 'madelon.valid.labels.data') # I could not find the test labels, so the test data set is not included. 
def get_madelon():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data', 'madelon.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels', 'madelon.train.labels.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_valid.data', 'madelon.valid.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/madelon_valid.labels', 'madelon.valid.labels.data')

    # I could not find the test labels, so the test data set is not included. LIBSVM's version of
    # the data set does not contain the test part, either.
    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_test.data', 'madelon.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/Dataset.pdf', 'madelon.description.pdf')

    train_data = load_raw_data('madelon.train.data', sep = ' ')
    train_label = load_raw_data('madelon.train.labels.data', sep = ' ')
    train_data = numpy.concatenate((train_label, train_data), axis = 1)

    valid_data = load_raw_data('madelon.valid.data', sep = ' ')
    valid_label = load_raw_data('madelon.valid.labels.data', sep = ' ')
    valid_data = numpy.concatenate((valid_label, valid_data), axis = 1)

    data = numpy.concatenate((train_data, valid_data), axis = 0)
    save_data_to_file(data, 'madelon', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_spambase():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.zip', 'spambase.zip')
    unzip_raw_data('spambase.zip')
    os.rename(UCIVars.raw_data_folder + 'spambase.names', UCIVars.raw_data_folder + 'spambase.feature.txt')
    os.rename(UCIVars.raw_data_folder + 'spambase.DOCUMENTATION', UCIVars.raw_data_folder + 'spambase.description')

    data = load_raw_data('spambase.data', sep = ',')
    data = move_label_in_front(data, 57)
    save_data_to_file(data, 'spambase', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_wilt():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00285/wilt.zip', 'wilt.zip')
    unzip_raw_data('wilt.zip')
    os.rename(UCIVars.raw_data_folder + 'training.csv', UCIVars.raw_data_folder + 'wilt.train.data')
    os.rename(UCIVars.raw_data_folder + 'testing.csv', UCIVars.raw_data_folder + 'wilt.test.data')
    concat_files(UCIVars.raw_data_folder + 'wilt.t*.data', UCIVars.raw_data_folder + 'wilt.data')

    replace_chars_in_file('wilt.data', 'n', '-1')
    replace_chars_in_file('wilt.data', 'w', '1')

    data = load_raw_data('wilt.data', sep = ',')
    save_data_to_file(data, 'wilt', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------
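# get_madelon above (and get_smartphone_human_activity further below) follow the same pattern:
# load features and labels from separate files, put the label in front, and stack the given
# splits. The label-joining step, sketched:
def _sketch_join_split(features, labels):
    # works for labels of shape (n,) or (n, 1)
    return numpy.concatenate((labels.reshape(-1, 1), features), axis = 1)

#---------------------------------------------------------------------------------------------------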
def get_waveform():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform.data.Z', 'waveform.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform.names', 'waveform.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform-+noise.data.Z', 'waveform_noise.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform-+noise.names', 'waveform_noise.description')

    success = un_z_raw_data('waveform.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'waveform', UCIVars.raw_data_folder + 'waveform.data')
        data = load_raw_data('waveform.data', sep = ',')
        data = move_label_in_front(data, 21)
        save_data_to_file(data, 'waveform', is_classification = True, is_regression = False)
    else:
        print("The waveform data set could not be built.")

    success = un_z_raw_data('waveform_noise.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'waveform_noise', UCIVars.raw_data_folder + 'waveform_noise.data')
        data = load_raw_data('waveform_noise.data', sep = ',')
        data = move_label_in_front(data, 40)
        save_data_to_file(data, 'waveform_noise', is_classification = True, is_regression = False)
    else:
        print("The waveform_noise data set could not be built.")


#---------------------------------------------------------------------------------------------------

def get_wall_following_robot():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00194/AllData.zip', 'wall_follow_robot.zip')
    unzip_raw_data('wall_follow_robot.zip')
    os.rename(UCIVars.raw_data_folder + 'Wall-following.names', UCIVars.raw_data_folder + 'wall_follow_robot.description')
    os.rename(UCIVars.raw_data_folder + 'sensor_readings_2.data', UCIVars.raw_data_folder + 'wall_follow_robot_2.data')
    os.rename(UCIVars.raw_data_folder + 'sensor_readings_4.data', UCIVars.raw_data_folder + 'wall_follow_robot_4.data')
    os.rename(UCIVars.raw_data_folder + 'sensor_readings_24.data', UCIVars.raw_data_folder + 'wall_follow_robot_24.data')

    categories = ['Slight-Left-Turn', 'Move-Forward', 'Slight-Right-Turn', 'Sharp-Right-Turn']

    data = load_mixed_raw_data('wall_follow_robot_2.data', sep = ',', header = False)
    data = replace_ordinals_in_mixed_data(data, categories, 2, ',', unknown_string = '')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'wall_follow_robot_2.trafo.data', data, sep = ',')

    data = load_mixed_raw_data('wall_follow_robot_4.data', sep = ',', header = False)
    data = replace_ordinals_in_mixed_data(data, categories, 4, ',', unknown_string = '')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'wall_follow_robot_4.trafo.data', data, sep = ',')

    data = load_mixed_raw_data('wall_follow_robot_24.data', sep = ',', header = False)
    data = replace_ordinals_in_mixed_data(data, categories, 24, ',', unknown_string = '')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'wall_follow_robot_24.trafo.data', data, sep = ',')

    data = load_raw_data('wall_follow_robot_2.trafo.data', sep = ',')
    data = move_label_in_front(data, 2)
    save_data_to_file(data, 'wall_follow_robot_2', is_classification = True, is_regression = True)

    data = load_raw_data('wall_follow_robot_4.trafo.data', sep = ',')
    data = move_label_in_front(data, 4)
    save_data_to_file(data, 'wall_follow_robot_4', is_classification = True, is_regression = True)

    data = load_raw_data('wall_follow_robot_24.trafo.data', sep = ',')
    data = move_label_in_front(data, 24)
    save_data_to_file(data, 'wall_follow_robot_24', is_classification = True, is_regression = True)


#---------------------------------------------------------------------------------------------------
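# un_z_raw_data (used in get_waveform above and get_page_blocks and get_musk below) returns a
# success flag because the old UNIX .Z format cannot be read with the Python standard library.
# One plausible implementation (an assumption, not necessarily the one used here) shells out to
# 'uncompress' and reports whether it worked:
def _sketch_un_z(filename):
    import subprocess
    result = subprocess.run(['uncompress', UCIVars.raw_data_folder + filename])
    return result.returncode == 0

#---------------------------------------------------------------------------------------------------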
def get_page_blocks():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/page-blocks/page-blocks.data.Z', 'page_blocks.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/page-blocks/page-blocks.names', 'page_blocks.description')

    success = un_z_raw_data('page_blocks.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'page_blocks', UCIVars.raw_data_folder + 'page_blocks.data')

        # collapse runs of spaces, then turn the remaining single spaces into commas
        replace_chars_in_file('page_blocks.data', '      ', ' ')
        replace_chars_in_file('page_blocks.data', '     ', ' ')
        replace_chars_in_file('page_blocks.data', '    ', ' ')
        replace_chars_in_file('page_blocks.data', '   ', ' ')
        replace_chars_in_file('page_blocks.data', '  ', ' ')
        replace_chars_in_file('page_blocks.data', ' ', ',')

        data = load_raw_data('page_blocks.data', sep = ',', description_columns = 1)
        data = move_label_in_front(data, 10)
        save_data_to_file(data, 'page_blocks', is_classification = True, is_regression = False)
    else:
        print("The page_blocks data set could not be built.")


#---------------------------------------------------------------------------------------------------

def get_optical_recognition_handwritten_digits():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', 'optical_recognition_handwritten_digits.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra', 'optical_recognition_handwritten_digits.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.names', 'optical_recognition_handwritten_digits.description')

    # The additional 'original' data sets contain the bitmaps of the handwritten digits in a strange format.
    # For this reason, they are not further considered.
    concat_files(UCIVars.raw_data_folder + 'optical_recognition_handwritten_digits.*.data', UCIVars.raw_data_folder + 'optical_recognition_handwritten_digits.data')

    data = load_raw_data('optical_recognition_handwritten_digits.data', sep = ',')
    data = move_label_in_front(data, 64)
    save_data_to_file(data, 'optical_recognition_handwritten_digits', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_bach_chorals_harmony():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00298/jsbach_chorals_harmony.zip', 'bach_chorals_harmony.zip')
    print("Bach Chorals Harmony is currently not processed since:")
    print("  - it contains a lot of classes with only a handful of samples")


#---------------------------------------------------------------------------------------------------

def get_turkiye_student_evaluation():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00262/turkiye-student-evaluation_generic.csv', 'turkiye_student_evaluation.data')

    # Without an explicit target variable, we decided to use the instructor id as target variable
    data = load_raw_data('turkiye_student_evaluation.data', sep = ',')
    save_data_to_file(data, 'turkiye_student_evaluation', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------
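# The cascade of replace_chars_in_file calls in get_page_blocks above repeatedly collapses runs
# of spaces before turning the remaining single spaces into commas. A single regex does the same,
# sketched here:
def _sketch_spaces_to_commas(filename):
    path = UCIVars.raw_data_folder + filename
    with open(path, 'r') as f:
        lines = [re.sub(r' +', ',', line.strip()) for line in f]
    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')

#---------------------------------------------------------------------------------------------------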
def get_smartphone_human_activity():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00364/dataset_uci.zip', 'smartphone_human_activity.zip')
    unzip_raw_data('smartphone_human_activity.zip')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_X_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.train.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_X_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.test.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_y_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.train.labels.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/final_y_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.test.labels.data')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/features_info.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.features.txt')
    os.rename(UCIVars.raw_data_folder + 'dataset_uci/README.txt', UCIVars.raw_data_folder + 'smartphone_human_activity.description')
    shutil.rmtree(UCIVars.raw_data_folder + 'dataset_uci')

    train_data = load_raw_data('smartphone_human_activity.train.data', sep = ',')
    train_label = load_raw_data('smartphone_human_activity.train.labels.data', sep = ',')
    train_data = numpy.concatenate((train_label, train_data), axis = 1)

    test_data = load_raw_data('smartphone_human_activity.test.data', sep = ',')
    test_label = load_raw_data('smartphone_human_activity.test.labels.data', sep = ',')
    test_data = numpy.concatenate((test_label, test_data), axis = 1)

    data = numpy.concatenate((train_data, test_data), axis = 0)
    save_data_to_file(data, 'smartphone_human_activity', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_artificial_characters():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/artificial-characters/character.tar.Z', 'artificial_characters.tar.Z')
    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/artificial-characters/character.names', 'artificial_characters.description')
    print("Artificial Characters is currently not processed since:")
    print("  - the data comes in a rather convoluted form")


#---------------------------------------------------------------------------------------------------
def get_first_order_theorem_proving():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00249/ml-prove.tar.gz', 'first_order_theorem_proving.tar.gz')
    untar_raw_data('first_order_theorem_proving.tar.gz')
    os.rename(UCIVars.raw_data_folder + 'ml-prove/all-data-raw.csv', UCIVars.raw_data_folder + 'first_order_theorem_proving.data')
    os.rename(UCIVars.raw_data_folder + 'ml-prove/bridge-holden-paulson-details.txt', UCIVars.raw_data_folder + 'first_order_theorem_proving.description')
    shutil.rmtree(UCIVars.raw_data_folder + 'ml-prove')

    data = load_raw_data('first_order_theorem_proving.data', sep = ',')
    rows = numpy.shape(data)[0]
    columns = numpy.shape(data)[1]
    times_of_heuristics = data[0:rows, columns - 5:columns]
    data_features = data[0:rows, 0:columns - 5]

    # Create class labels, where -1 encodes the "decline" option, which occurs if none of the
    # five considered heuristics finished within 100 secs. Also, there are 13 samples in which
    # the heuristics appear to have finished instantaneously; these get a positive label.
    # One could also create regression tasks for each of the heuristics, but for now, we
    # don't do this.
    class_labels = numpy.reshape(numpy.sign(numpy.amax(times_of_heuristics, axis = 1)), newshape = (rows, 1))
    class_labels[numpy.where(class_labels[0:rows, 0] == 0)] = 1.0
    class_data = numpy.concatenate((class_labels, data_features), axis = 1)
    save_data_to_file(class_data, 'first_order_theorem_proving', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_landsat_satimage():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn', 'landsat_satimage.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst', 'landsat_satimage.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.doc', 'landsat_satimage.description')
    concat_files(UCIVars.raw_data_folder + 'landsat_satimage.*.data', UCIVars.raw_data_folder + 'landsat_satimage.data')

    data = load_raw_data('landsat_satimage.data', sep = ' ')
    data = move_label_in_front(data, 36)
    save_data_to_file(data, 'landsat_satimage', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_hiv_1_protease():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00330/newHIV-1_data.zip', 'hiv_1_protease.zip')
    print("HIV-1 protease is currently not processed since:")
    print("  - the 1D data comes in a rather convoluted form")


#---------------------------------------------------------------------------------------------------

def get_musk():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z', 'musk.Z')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.info', 'musk.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.names', 'musk.features.txt')

    success = un_z_raw_data('musk.Z')
    if success:
        os.rename(UCIVars.raw_data_folder + 'musk', UCIVars.raw_data_folder + 'musk.data')
        data = load_raw_data('musk.data', description_columns = 2, sep = ',')
        data = move_label_in_front(data, 166)
        save_data_to_file(data, 'musk', is_classification = True, is_regression = False)
    else:
        print("The musk data set could not be built.")


#---------------------------------------------------------------------------------------------------

def get_ble_rssi_indoor_location():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00435/BLE_RSSI_dataset.zip', 'ble_rssi_indoor_location.zip')
    print("BLE RSSI indoor location is currently not processed since:")
    print("  - it only has 1420 labeled samples")


#---------------------------------------------------------------------------------------------------

# NOTE: this redefines the get_australian_sign_language stub from further above; Python keeps
# only this second definition at import time.
def get_australian_sign_language():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/auslan-mld/allsigns.tar.gz', 'australian_sign_language.zip')
    print("Australian sign language is currently not processed since:")
    print("  - the 1D data comes in a rather convoluted form")
    print("  - it truly seems to be time series data")


#---------------------------------------------------------------------------------------------------
def get_anuran_calls():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00406/Anuran%20Calls%20(MFCCs).zip', 'anuran_calls.zip')
    unzip_raw_data('anuran_calls.zip')
    os.rename(UCIVars.raw_data_folder + 'Frogs_MFCCs.csv', UCIVars.raw_data_folder + 'anuran_calls.data')
    os.rename(UCIVars.raw_data_folder + 'Readme.txt', UCIVars.raw_data_folder + 'anuran_calls.description')

    data = load_mixed_raw_data('anuran_calls.data', sep = ',', header = True)

    categories = sorted(get_categories_in_mixed_data(data, 22))
    data = replace_ordinals_in_mixed_data(data, categories, 22, separator = ',', unknown_replacement_value = 0, begin_value = 1)
    categories = get_categories_in_mixed_data(data, 23)
    data = replace_ordinals_in_mixed_data(data, categories, 23, separator = ',', unknown_replacement_value = 0, begin_value = 1)
    categories = get_categories_in_mixed_data(data, 24)
    data = replace_ordinals_in_mixed_data(data, categories, 24, separator = ',', unknown_replacement_value = 0, begin_value = 1)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'anuran_calls.data', data, sep = ',')

    data = load_raw_data('anuran_calls.data', sep = ',')
    data = remove_columns(data, [25])

    # There are three different classification problems, each having a few classes with fewer
    # than 250 samples. The following lines build these three problems and remove the small classes.
    data_species = remove_columns(data, [22, 23])
    data_species = move_label_in_front(data_species, 22)
    rows = numpy.shape(data_species)[0]
    data_species = data_species[numpy.where(data_species[0:rows, 0] != 3)[0], 0:24]
    rows = numpy.shape(data_species)[0]
    data_species = data_species[numpy.where(data_species[0:rows, 0] != 6)[0], 0:24]
    rows = numpy.shape(data_species)[0]
    data_species = data_species[numpy.where(data_species[0:rows, 0] != 10)[0], 0:24]
    save_data_to_file(data_species, 'anuran_calls_species', is_classification = True, is_regression = False)

    data_genus = remove_columns(data, [22, 24])
    data_genus = move_label_in_front(data_genus, 22)
    rows = numpy.shape(data_genus)[0]
    data_genus = data_genus[numpy.where(data_genus[0:rows, 0] != 1)[0], 0:24]
    rows = numpy.shape(data_genus)[0]
    data_genus = data_genus[numpy.where(data_genus[0:rows, 0] != 4)[0], 0:24]
    rows = numpy.shape(data_genus)[0]
    data_genus = data_genus[numpy.where(data_genus[0:rows, 0] != 5)[0], 0:24]
    save_data_to_file(data_genus, 'anuran_calls_genus', is_classification = True, is_regression = False)

    data_families = remove_columns(data, [23, 24])
    data_families = move_label_in_front(data_families, 22)
    rows = numpy.shape(data_families)[0]
    data_families = data_families[numpy.where(data_families[0:rows, 0] != 1)[0], 0:24]
    save_data_to_file(data_families, 'anuran_calls_families', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------
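# replace_ordinals_in_mixed_data (used in get_anuran_calls above and elsewhere) is assumed to
# map the category strings in one column of a string/object array to consecutive integers.
# Roughly:
def _sketch_replace_ordinals(data, categories, column, begin_value = 0):
    lookup = {cat: str(i + begin_value) for i, cat in enumerate(categories)}
    data[:, column] = [lookup.get(entry, entry) for entry in data[:, column]]
    return data

#---------------------------------------------------------------------------------------------------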
def get_thyroids():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick-euthyroid.data', 'thyroid_sick_eu.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick-euthyroid.names', 'thyroid_sick_eu.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick.data', 'thyroid_sick.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick.test', 'thyroid_sick.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/sick.names', 'thyroid_sick.description')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/dis.data', 'thyroid_dis.train.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/dis.test', 'thyroid_dis.test.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/dis.names', 'thyroid_dis.description')

    # new-thyroid.data only contains 215 samples and is thus omitted
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.data', 'thyroid_hypo.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.names', 'thyroid_hypo.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-train.data', 'thyroid_ann.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-test.data', 'thyroid_ann.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-thyroid.names', 'thyroid_ann.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/ann-Readme', 'thyroid_ann.more_description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allbp.data', 'thyroid_all_bp.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allbp.test', 'thyroid_all_bp.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allbp.names', 'thyroid_all_bp.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allrep.data', 'thyroid_all_rep.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allrep.test', 'thyroid_all_rep.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allrep.names', 'thyroid_all_rep.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhypo.data', 'thyroid_all_hypo.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhypo.test', 'thyroid_all_hypo.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhypo.names', 'thyroid_all_hypo.description')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhyper.data', 'thyroid_all_hyper.train.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhyper.test', 'thyroid_all_hyper.test.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/allhyper.names', 'thyroid_all_hyper.description')

    #--------------------------------------------------

    data = load_mixed_raw_data('thyroid_sick_eu.data', sep = ',', header = False)
    categories = [u'sick-euthyroid', u'negative']
    data = replace_categories_in_mixed_data(data, categories, 0, separator = ',')
    for col in range(2, 15):
        categories = get_categories_in_mixed_data(data, col)
        data = replace_bin_cats_in_mixed_data(data, categories, col, separator = ',')
    columns = [16, 18, 20, 22, 24]
    for col in columns:
        categories = get_categories_in_mixed_data(data, col)
        data = replace_bin_cats_in_mixed_data(data, categories, col, separator = ',')

    # The last column is still in bad shape. The next two lines fix this problem by a little dirty trick.
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_sick_eu.data', data, sep = ',')
    data = load_mixed_raw_data('thyroid_sick_eu.data', sep = ',', header = False)
    data = auto_replace_missing_in_mixed_data(data, unknown_string = '?')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_sick_eu.data', data, sep = ',')

    data = load_raw_data('thyroid_sick_eu.data', sep = ',', na_string = '?')
    save_data_to_file(data, 'thyroid_sick_eu', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_sick.t*', UCIVars.raw_data_folder + 'thyroid_sick.data')
    replace_chars_in_file('thyroid_sick.data', '.|', ',')
    replace_chars_in_file('thyroid_sick.data', 'F', '0')
    replace_chars_in_file('thyroid_sick.data', 'M', '1')
    replace_chars_in_file('thyroid_sick.data', 'f', '0')
    replace_chars_in_file('thyroid_sick.data', 't', '1')
    replace_chars_in_file('thyroid_sick.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_sick.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')
    data = auto_replace_categories_in_mixed_data(data, 29, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_sick.data', data, sep = ',')

    data = load_raw_data('thyroid_sick.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_sick', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_dis.t*', UCIVars.raw_data_folder + 'thyroid_dis.data')
    replace_chars_in_file('thyroid_dis.data', '.|', ',')
    replace_chars_in_file('thyroid_dis.data', 'F', '0')
    replace_chars_in_file('thyroid_dis.data', 'M', '1')
    replace_chars_in_file('thyroid_dis.data', 'f', '0')
    replace_chars_in_file('thyroid_dis.data', 't', '1')
    replace_chars_in_file('thyroid_dis.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_dis.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')
    data = auto_replace_categories_in_mixed_data(data, 29, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_dis.data', data, sep = ',')

    data = load_raw_data('thyroid_dis.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_dis', is_classification = True, is_regression = False)

    #--------------------------------------------------

    replace_chars_in_file('thyroid_hypo.data', 'F', '0')
    replace_chars_in_file('thyroid_hypo.data', 'M', '1')
    replace_chars_in_file('thyroid_hypo.data', 'f', '0')
    replace_chars_in_file('thyroid_hypo.data', 't', '1')
    replace_chars_in_file('thyroid_hypo.data', 'n', '0')
    replace_chars_in_file('thyroid_hypo.data', 'y', '1')
    replace_chars_in_file('thyroid_hypo.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_hypo.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 0, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_hypo.data', data, sep = ',')

    data = load_raw_data('thyroid_hypo.data', sep = ',', na_string = '?')
    save_data_to_file(data, 'thyroid_hypo', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_ann.t*', UCIVars.raw_data_folder + 'thyroid_ann.data')
    data = load_raw_data('thyroid_ann.data', sep = ' ', na_string = '?')
    data = move_label_in_front(data, 21)
    save_data_to_file(data, 'thyroid_ann', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_bp.t*', UCIVars.raw_data_folder + 'thyroid_all_bp.data')
    replace_chars_in_file('thyroid_all_bp.data', '.|', ',')
    replace_chars_in_file('thyroid_all_bp.data', 'F', '0')
    replace_chars_in_file('thyroid_all_bp.data', 'M', '1')
    replace_chars_in_file('thyroid_all_bp.data', 'f', '0')
    replace_chars_in_file('thyroid_all_bp.data', 't', '1')
    replace_chars_in_file('thyroid_all_bp.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_bp.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine the 2 non-negative classes into one; they are both very small
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 1, 2), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_bp.data', data, sep = ',')

    data = load_raw_data('thyroid_all_bp.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_bp', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_rep.t*', UCIVars.raw_data_folder + 'thyroid_all_rep.data')
    replace_chars_in_file('thyroid_all_rep.data', '.|', ',')
    replace_chars_in_file('thyroid_all_rep.data', 'F', '0')
    replace_chars_in_file('thyroid_all_rep.data', 'M', '1')
    replace_chars_in_file('thyroid_all_rep.data', 'f', '0')
    replace_chars_in_file('thyroid_all_rep.data', 't', '1')
    replace_chars_in_file('thyroid_all_rep.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_rep.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine the 3 non-negative classes into one; they are all very small
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 2, 2, 2), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_rep.data', data, sep = ',')

    data = load_raw_data('thyroid_all_rep.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_rep', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_hypo.t*', UCIVars.raw_data_folder + 'thyroid_all_hypo.data')
    replace_chars_in_file('thyroid_all_hypo.data', '.|', ',')
    replace_chars_in_file('thyroid_all_hypo.data', 'F', '0')
    replace_chars_in_file('thyroid_all_hypo.data', 'M', '1')
    replace_chars_in_file('thyroid_all_hypo.data', 'f', '0')
    replace_chars_in_file('thyroid_all_hypo.data', 't', '1')
    replace_chars_in_file('thyroid_all_hypo.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_hypo.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine 'primary' and 'secondary' into a new class since 'secondary' only has 2 samples
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 2, 3, 3), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_hypo.data', data, sep = ',')

    data = load_raw_data('thyroid_all_hypo.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_hypo', is_classification = True, is_regression = False)

    #--------------------------------------------------

    concat_files(UCIVars.raw_data_folder + 'thyroid_all_hyper.t*', UCIVars.raw_data_folder + 'thyroid_all_hyper.data')
    replace_chars_in_file('thyroid_all_hyper.data', '.|', ',')
    replace_chars_in_file('thyroid_all_hyper.data', 'F', '0')
    replace_chars_in_file('thyroid_all_hyper.data', 'M', '1')
    replace_chars_in_file('thyroid_all_hyper.data', 'f', '0')
    replace_chars_in_file('thyroid_all_hyper.data', 't', '1')
    replace_chars_in_file('thyroid_all_hyper.data', ',0,?', ',0,0')

    data = load_mixed_raw_data('thyroid_all_hyper.data', sep = ',', header = False)
    data = auto_replace_categories_in_mixed_data(data, 28, ',')

    # We combine the 4 non-negative classes into one; they are all very small
    categories = sorted(get_categories_in_mixed_data(data, 29))
    data = replace_manual_in_mixed_data(data, categories, 29, (1, 1, 1, 2, 1), ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'thyroid_all_hyper.data', data, sep = ',')

    data = load_raw_data('thyroid_all_hyper.data', sep = ',', na_string = '?')
    data = remove_columns(data, 34)
    data = move_label_in_front(data, 33)
    save_data_to_file(data, 'thyroid_all_hyper', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_isolet():
    prepare_new_data_set_group_id()

    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/isolet/isolet1+2+3+4.data.Z', 'isolet.train.Z')
    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/isolet/isolet5.data.Z', 'isolet.test.Z')
    print("ISOLET is currently not processed since:")
    print("  - all classes are rather small (around 300 each)")


#---------------------------------------------------------------------------------------------------

def get_mushroom():
    prepare_new_data_set_group_id()

    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 'mushroom.data')
    download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names', 'mushroom.description')

    data = load_mixed_raw_data('mushroom.data', sep = ',', header = False)
    columns = numpy.shape(data)[1]
    for col in range(0, columns):
        data = auto_replace_categories_in_mixed_data(data, col, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'mushroom.data', data, sep = ',')

    data = load_raw_data('mushroom.data', sep = ',')
    save_data_to_file(data, 'mushroom', is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_assamese_characters():
    prepare_new_data_set_group_id()

    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00208/Online%20Handwritten%20Assamese%20Characters%20Dataset.rar', 'assamese_characters.rar')
    print("Assamese Characters is currently not processed since:")
    print("  - all classes are rather small (around 45 each)")


#---------------------------------------------------------------------------------------------------
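# The (1, 1, 2)-style tuples passed to replace_manual_in_mixed_data in get_thyroids above assign
# a new class id to each sorted category, which is how the tiny non-negative classes get merged.
# A sketch of that assumed behavior:
def _sketch_replace_manual(data, categories, column, new_ids):
    lookup = {cat: str(new_id) for cat, new_id in zip(categories, new_ids)}
    data[:, column] = [lookup.get(entry, entry) for entry in data[:, column]]
    return data

#---------------------------------------------------------------------------------------------------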
def get_arabic_digit():
    prepare_new_data_set_group_id()

    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00195/Test_Arabic_Digit.txt', 'arabic_digit.test.data')
    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00195/Train_Arabic_Digit.txt', 'arabic_digit.train.data')
    #download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00195/documentation.html', 'arabic_digit.html')
    print("Arabic Digits is currently not processed since:")
    print("  - I could not find the time to figure out the format")


#---------------------------------------------------------------------------------------------------

def get_eeg_steady_state_visual():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00457/BCI-SSVEP_Database_Aceves.zip', 'eeg_steady_state_visual.zip')
    print("EEG Steady-State Visual is currently not processed since:")
    print("  - the data comes in a rather convoluted form")
    print("  - it truly seems to be time series data")


#---------------------------------------------------------------------------------------------------

def get_gesture_phase_segmentation():
    prepare_new_data_set_group_id()

    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00302/gesture_phase_dataset.zip', 'gesture_phase_segmentation.zip')
    unzip_raw_data('gesture_phase_segmentation.zip')
    os.rename(UCIVars.raw_data_folder + 'data_description.txt', UCIVars.raw_data_folder + 'gesture_phase_segmentation.description')

    letters = ['a', 'b', 'c']
    versions = ['raw', 'va3']
    for version in versions:
        for letter in letters:
            concat_files(UCIVars.raw_data_folder + letter + '?_' + version + '.csv', UCIVars.raw_data_folder + 'gesture_phase_segmentation.' + letter + version + '.data')
            remove_files(UCIVars.raw_data_folder, letter + '?_' + version + '.csv')

        tmp_filename = 'gesture_phase_segmentation.?' + version + '.data'
        version_filename = 'gesture_phase_segmentation_' + version + '.data'
        concat_files(UCIVars.raw_data_folder + tmp_filename, UCIVars.raw_data_folder + version_filename)
        remove_files(UCIVars.raw_data_folder, tmp_filename)

        if version == 'raw':
            replace_chars_in_file(version_filename, 'Rest', '1')
            replace_chars_in_file(version_filename, 'Preparation', '2')
            replace_chars_in_file(version_filename, 'Stroke', '3')
            replace_chars_in_file(version_filename, 'Hold', '4')
            replace_chars_in_file(version_filename, 'Retraction', '5')
        else:
            replace_chars_in_file(version_filename, 'D', '1')
            replace_chars_in_file(version_filename, 'P', '2')
            replace_chars_in_file(version_filename, 'S', '3')
            replace_chars_in_file(version_filename, 'H', '4')
            replace_chars_in_file(version_filename, 'R', '5')

        data = load_raw_data(version_filename, sep = ',')
        columns = numpy.shape(data)[1]
        data = move_label_in_front(data, columns - 1)
        save_data_to_file(data, 'gesture_phase_segmentation_' + version, is_classification = True, is_regression = False)


#---------------------------------------------------------------------------------------------------

def get_emg_physical_action():
    prepare_new_data_set_group_id()

    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00213/EMG%20Physical%20Action%20Data%20Set.rar', 'emg_physical_action.rar')
    #unrar_raw_data('emg_physical_action.rar')
    print("EMG Physical Action is currently not processed since:")
    print("  - the data comes in a rather convoluted form")


#---------------------------------------------------------------------------------------------------
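# concat_files is used above with shell-style wildcards. A sketch of its assumed behavior based
# on glob (pattern and output are full paths here):
def _sketch_concat_files(pattern, out_path):
    import glob
    with open(out_path, 'w') as out:
        for path in sorted(glob.glob(pattern)):
            with open(path, 'r') as f:
                out.write(f.read())

#---------------------------------------------------------------------------------------------------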
def get_human_activity_smartphone():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip', 'human_activity_smartphone.zip')
    unzip_raw_data('human_activity_smartphone.zip')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/train/X_train.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.train.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/test/X_test.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.test.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/train/y_train.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.train.labels.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/test/y_test.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.test.labels.data')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/features_info.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.features.txt')
    os.rename(UCIVars.raw_data_folder + 'UCI HAR Dataset/README.txt', UCIVars.raw_data_folder + 'human_activity_smartphone.description')
    shutil.rmtree(UCIVars.raw_data_folder + 'UCI HAR Dataset')
    shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX')
    replace_chars_in_file('human_activity_smartphone.train.data', '  ', ' ')
    replace_chars_in_file('human_activity_smartphone.test.data', '  ', ' ')
    train_data = load_raw_data('human_activity_smartphone.train.data', sep = ' ')
    train_label = load_raw_data('human_activity_smartphone.train.labels.data', sep = ',')
    train_data = numpy.concatenate((train_label, train_data), axis = 1)
    test_data = load_raw_data('human_activity_smartphone.test.data', sep = ' ')
    test_label = load_raw_data('human_activity_smartphone.test.labels.data', sep = ',')
    test_data = numpy.concatenate((test_label, test_data), axis = 1)
    data = numpy.concatenate((train_data, test_data), axis = 0)
    save_data_to_file(data, 'human_activity_smartphone', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_polish_companies_bankruptcy():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip', 'polish_companies_bankruptcy.zip')
    unzip_raw_data('polish_companies_bankruptcy.zip')
    for i in range(1, 6):
        unarff_raw_data(str(i) + 'year')
        remove_files(UCIVars.raw_data_folder, str(i) + 'year.arff')
        os.rename(UCIVars.raw_data_folder + str(i) + 'year.data', UCIVars.raw_data_folder + 'polish_companies_bankruptcy_' + str(i) + 'year.data')
        replace_chars_in_file('polish_companies_bankruptcy_' + str(i) + 'year.data', 'nan', '?')
        data = load_mixed_raw_data('polish_companies_bankruptcy_' + str(i) + 'year.data', sep = ',')
        data = auto_replace_missing_in_mixed_data(data, unknown_string = '?')
        write_mixed_raw_data(UCIVars.raw_data_folder + 'polish_companies_bankruptcy_' + str(i) + 'year.trafo.data', data, sep = ',')
        data = load_raw_data('polish_companies_bankruptcy_' + str(i) + 'year.trafo.data', sep = ',')
        data = move_label_in_front(data, 64)
        save_data_to_file(data, 'polish_companies_bankruptcy_' + str(i) + 'year', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
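# Editor's aside: auto_replace_missing_in_mixed_data above fills the '?' entries before the data
# is re-read numerically. As an illustration only (the actual helper may apply a different rule),
# a self-contained column-mean imputation could look like this:
def _example_impute_column_means(x):
    # Replace NaN entries by the mean of the non-missing values of their column.
    x = numpy.array(x, dtype = float)
    for j in range(x.shape[1]):
        mask = numpy.isnan(x[:, j])
        if mask.any() and not mask.all():
            x[mask, j] = x[~mask, j].mean()
    return x
#---------------------------------------------------------------------------------------------------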
def get_crowd_sourced_mapping():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00400/Crowdsourced%20Mapping.zip', 'crowd_sourced_mapping.zip')
    unzip_raw_data('crowd_sourced_mapping.zip')
    os.rename(UCIVars.raw_data_folder + 'training.csv', UCIVars.raw_data_folder + 'crowd_sourced_mapping.train.data')
    os.rename(UCIVars.raw_data_folder + 'testing.csv', UCIVars.raw_data_folder + 'crowd_sourced_mapping.test.data')
    # Get rid of the headers ...
    train_data = load_mixed_raw_data('crowd_sourced_mapping.train.data', sep = ',', header = True)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'crowd_sourced_mapping.train.data', train_data, sep = ',')
    test_data = load_mixed_raw_data('crowd_sourced_mapping.test.data', sep = ',', header = True)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'crowd_sourced_mapping.test.data', test_data, sep = ',')
    concat_files(UCIVars.raw_data_folder + 'crowd_sourced_mapping.*.data', UCIVars.raw_data_folder + 'crowd_sourced_mapping.data')
    # The data set actually has the following classes: ['impervious', 'orchard', 'farm', 'water', 'forest', 'grass']
    # However, 'orchard' and 'water' only occur 100 and 250 times, respectively. Ignoring them during the
    # replacement below eventually leads to a 4-class problem with the remaining classes.
    data = load_mixed_raw_data('crowd_sourced_mapping.data', sep = ',', header = True)
    categories = ['impervious', 'farm', 'forest', 'grass']
    data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'crowd_sourced_mapping.data', data, sep = ',')
    data = load_raw_data('crowd_sourced_mapping.data', sep = ',')
    save_data_to_file(data, 'crowd_sourced_mapping', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_firm_teacher_clave():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00324/ClaveVectors_Firm-Teacher_Model.txt', 'firm_teacher_clave.data')
    replace_chars_in_file('firm_teacher_clave.data', ' ', ',')
    replace_chars_in_file('firm_teacher_clave.data', 'error,fixed', '')
    replace_chars_in_file('firm_teacher_clave.data', ', ', '')
    data = load_mixed_raw_data('firm_teacher_clave.data', sep = ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'firm_teacher_clave.data', data, sep = ',')
    data = load_raw_data('firm_teacher_clave.data', sep = ',')
    # The data set has four classes, and their labels are stored as a four-dimensional
    # 'categorical' vector. The following lines convert this format to the usual one.
rows = numpy.shape(data)[0] columns = numpy.shape(data)[1] label_vectors = data[0:rows, columns - 4:columns] data_features = data[0:rows, 0:columns - 4] labels = numpy.zeros(shape = (rows, 1)) for i in range(0, 4): labels[numpy.where(label_vectors[0:rows, i] == 1)] = i data = numpy.concatenate((labels, data_features), axis = 1) save_data_to_file(data, 'firm_teacher_clave', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_smartphone_human_activity_postural(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00341/HAPT%20Data%20Set.zip', 'smartphone_human_activity_postural.zip') unzip_raw_data('smartphone_human_activity_postural.zip') os.rename(UCIVars.raw_data_folder + 'Train/X_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.train.data') os.rename(UCIVars.raw_data_folder + 'Test/X_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.test.data') os.rename(UCIVars.raw_data_folder + 'Train/y_train.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.train.labels.data') os.rename(UCIVars.raw_data_folder + 'Test/y_test.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.test.labels.data') os.rename(UCIVars.raw_data_folder + 'features_info.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.features.txt') os.rename(UCIVars.raw_data_folder + 'README.txt', UCIVars.raw_data_folder + 'smartphone_human_activity_postural.description') shutil.rmtree(UCIVars.raw_data_folder + 'Train') shutil.rmtree(UCIVars.raw_data_folder + 'Test') shutil.rmtree(UCIVars.raw_data_folder + 'RawData') os.remove(UCIVars.raw_data_folder + 'features.txt') os.remove(UCIVars.raw_data_folder + 'activity_labels.txt') train_data = load_raw_data('smartphone_human_activity_postural.train.data', sep = ' ') train_label = load_raw_data('smartphone_human_activity_postural.train.labels.data', sep = ',') train_data = numpy.concatenate((train_label, train_data), axis = 1) test_data = load_raw_data('smartphone_human_activity_postural.test.data', sep = ' ') test_label = load_raw_data('smartphone_human_activity_postural.test.labels.data', sep = ',') test_data = numpy.concatenate((test_label, test_data), axis = 1) data = numpy.concatenate((train_data, test_data), axis = 0) # The transitional classes 7 to 12 are very small compared to the first 6 classes. Since # we are mostly interested in data sets for which no extra care is needed, we remove these # six classes. 
rows = numpy.shape(data)[0] columns = numpy.shape(data)[1] data = data[numpy.where(data[0:rows, 0] <= 6)[0], 0:columns] save_data_to_file(data, 'smartphone_human_activity_postural', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_pen_recognition_handwritten_characters(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra', 'pen_recognition_handwritten_characters.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes', 'pen_recognition_handwritten_characters.test.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.names', 'pen_recognition_handwritten_characters.description') concat_files(UCIVars.raw_data_folder + 'pen_recognition_handwritten_characters.*.data', UCIVars.raw_data_folder + 'pen_recognition_handwritten_characters.data') replace_chars_in_file('pen_recognition_handwritten_characters.data', ' ', '') replace_chars_in_file('pen_recognition_handwritten_characters.data', ' ', '') data = load_raw_data('pen_recognition_handwritten_characters.data', sep = ',') data = move_label_in_front(data, 16) save_data_to_file(data, 'pen_recognition_handwritten_characters', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_epileptic_seizure_recognition(): print("Epileptic seizure recognition is currently not processed since:") print(" - it was removed from the UCI repository") #prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00388/data.csv', 'epileptic_seizure_recognition.data') #data = load_raw_data('epileptic_seizure_recognition.data', description_columns = 1, sep = ',') #data = move_label_in_front(data, 178) #save_data_to_file(data, 'epileptic_seizure_recognition', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_nursery(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data', 'nursery.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.names', 'nursery.description') data = load_mixed_raw_data('nursery.data', ',') categories = [u'usual', u'pretentious', u'great_pret'] data = replace_ordinals_in_mixed_data(data, categories, 0, separator = ',') categories = [u'proper', u'less_proper', u'improper', u'critical', u'very_crit'] data = replace_ordinals_in_mixed_data(data, categories, 1, separator = ',') categories = [u'complete', u'completed', u'incomplete', u'foster'] data = replace_ordinals_in_mixed_data(data, categories, 2, separator = ',') categories = [u'1', u'3', u'2', u'more'] data = replace_ordinals_in_mixed_data(data, categories, 3, separator = ',') categories = [u'convenient', u'less_conv', u'critical'] data = replace_ordinals_in_mixed_data(data, categories, 4, separator = ',') categories = [u'convenient', u'inconv'] data = replace_ordinals_in_mixed_data(data, categories, 5, separator = ',') categories = [u'nonprob', u'slightly_prob', u'problematic'] data = replace_ordinals_in_mixed_data(data, categories, 6, separator = ',') categories = [u'not_recom', u'recommended', u'priority'] data = 
replace_ordinals_in_mixed_data(data, categories, 7, separator = ',') # We combine the classes 'not_recom' and 'recommend', since the latter only has two instances categories = [u'recommend'] data = replace_ordinals_in_mixed_data(data, categories, 8, separator = ',') categories = [u'not_recom', u'very_recom', u'priority', u'spec_prior'] data = replace_ordinals_in_mixed_data(data, categories, 8, separator = ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'nursery.trafo.data', data, sep = ',') data = load_raw_data('nursery.trafo.data', sep = ',') data = move_label_in_front(data, 8) save_data_to_file(data, 'nursery', is_classification = True, is_regression = True) #--------------------------------------------------------------------------------------------------- def get_indoor_user_movement_prediction(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00348/MovementAAL.zip', 'indoor_user_movement_prediction.zip') print("Indoor User Movement Prediction is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the number of time series samples is small, namely a few hundreds") #--------------------------------------------------------------------------------------------------- def get_eeg_eye_state(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff', 'eeg_eye_state.arff') unarff_raw_data('eeg_eye_state') data = load_raw_data('eeg_eye_state.data', sep = ',') data = move_label_in_front(data, 14) save_data_to_file(data, 'eeg_eye_state', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_htru2(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip', 'htru2.zip') unzip_raw_data('htru2.zip') os.rename(UCIVars.raw_data_folder + 'HTRU_2.csv', UCIVars.raw_data_folder + 'htru2.data') os.rename(UCIVars.raw_data_folder + 'Readme.txt', UCIVars.raw_data_folder + 'htru2.description') os.remove(UCIVars.raw_data_folder + 'HTRU_2.arff') # Somehow, the original htru2.data file has a strange format, so that all data is # viewed to be as a single row. Probably, the endofline characters are messed up. # In any case, the following two lines cure this. 
data = load_mixed_raw_data('htru2.data', ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'htru2.data', data, sep = ',') data = load_raw_data('htru2.data', ',') data = move_label_in_front(data, 8) save_data_to_file(data, 'htru2', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_magic_gamma_telescope(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data', 'magic_gamma_telescope.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.names', 'magic_gamma_telescope.description') replace_chars_in_file('magic_gamma_telescope.data', 'g', '1') replace_chars_in_file('magic_gamma_telescope.data', 'h', '-1') data = load_raw_data('magic_gamma_telescope.data', ',') data = move_label_in_front(data, 10) save_data_to_file(data, 'magic_gamma_telescope', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_letter_recognition(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data', 'letter_recognition.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.names', 'letter_recognition.description') data = load_mixed_raw_data('letter_recognition.data', sep = ',') categories = get_categories_in_mixed_data(data, 0) data = replace_ordinals_in_mixed_data(data, sorted(categories), 0, separator = ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'letter_recognition.data', data, sep = ',') data = load_raw_data('letter_recognition.data', ',') save_data_to_file(data, 'letter_recognition', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_occupancy_detection(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip', 'occupancy_detection.zip') unzip_raw_data('occupancy_detection.zip') os.rename(UCIVars.raw_data_folder + 'datatraining.txt', UCIVars.raw_data_folder + 'occupancy_detection.train.data') os.rename(UCIVars.raw_data_folder + 'datatest.txt', UCIVars.raw_data_folder + 'occupancy_detection.val.data') os.rename(UCIVars.raw_data_folder + 'datatest2.txt', UCIVars.raw_data_folder + 'occupancy_detection.test.data') concat_files(UCIVars.raw_data_folder + 'occupancy_detection.*.data', UCIVars.raw_data_folder + 'occupancy_detection.data') replace_chars_in_file('occupancy_detection.data', ' ', ',') replace_chars_in_file('occupancy_detection.data', '"', '') data = load_raw_data('occupancy_detection.data', ',', description_columns = 1, date_column = 1, date_sep = '-', date_order = 'Ymd', time_column = 2, time_sep = ':') data = move_label_in_front(data, 7) save_data_to_file(data, 'occupancy_detection', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_avila(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip', 'avila.zip') unzip_raw_data('avila.zip') os.rename(UCIVars.raw_data_folder + 'avila/avila-description.txt', UCIVars.raw_data_folder + 'avila.description') 
    os.rename(UCIVars.raw_data_folder + 'avila/avila-tr.txt', UCIVars.raw_data_folder + 'avila.train.data')
    os.rename(UCIVars.raw_data_folder + 'avila/avila-ts.txt', UCIVars.raw_data_folder + 'avila.test.data')
    shutil.rmtree(UCIVars.raw_data_folder + 'avila')
    concat_files(UCIVars.raw_data_folder + 'avila.*.data', UCIVars.raw_data_folder + 'avila.data')
    replace_chars_in_file('avila.data', 'A', '1')
    replace_chars_in_file('avila.data', 'B', '2')
    replace_chars_in_file('avila.data', 'C', '3')
    replace_chars_in_file('avila.data', 'D', '4')
    replace_chars_in_file('avila.data', 'E', '5')
    replace_chars_in_file('avila.data', 'F', '6')
    replace_chars_in_file('avila.data', 'G', '7')
    replace_chars_in_file('avila.data', 'H', '8')
    replace_chars_in_file('avila.data', 'I', '9')
    replace_chars_in_file('avila.data', 'W', '10')
    replace_chars_in_file('avila.data', 'X', '11')
    replace_chars_in_file('avila.data', 'Y', '12')
    data = load_raw_data('avila.data', ',')
    data = move_label_in_front(data, 10)
    save_data_to_file(data, 'avila', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_grammatical_facial_expressions():
    prepare_new_data_set_group_id()
    #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00317/grammatical_facial_expression.zip', 'grammatical_facial_expression.zip')
    print("Grammatical Facial Expressions is currently not processed since:")
    print(" - according to the description it seems to be a time series data set")
    print(" - the number of time series samples is very low, namely 36")

#---------------------------------------------------------------------------------------------------
def get_chess_krvk():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.data', 'chess_krvk.data')
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.info', 'chess_krvk.description')
    data = load_mixed_raw_data('chess_krvk.data', sep = ',')
    data = auto_replace_categories_in_mixed_data(data, 0, separator = ',')
    data = auto_replace_categories_in_mixed_data(data, 2, separator = ',')
    data = auto_replace_categories_in_mixed_data(data, 4, separator = ',')
    categories = ['draw', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen']
    data = replace_ordinals_in_mixed_data(data, categories, 6, separator = ',', begin_value = -1)
    write_mixed_raw_data(UCIVars.raw_data_folder + 'chess_krvk.trafo.data', data, sep = ',')
    data = load_raw_data('chess_krvk.trafo.data', sep = ',')
    data = move_label_in_front(data, 23)
    save_data_to_file(data, 'chess_krvk', is_classification = True, is_regression = True)

#---------------------------------------------------------------------------------------------------
def get_default_credit_card():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', 'default_credit_card.xls')
    excel_data = pandas.read_excel(UCIVars.raw_data_folder + 'default_credit_card.xls', engine = 'xlrd')
    excel_data.to_csv(UCIVars.raw_data_folder + 'default_credit_card.data')
    data = load_raw_data('default_credit_card.data', sep = ',', description_columns = 1)
    data = move_label_in_front(data, 24)
    save_data_to_file(data, 'default_credit_card', is_classification = True, is_regression =
False) #--------------------------------------------------------------------------------------------------- def get_nomao(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00227/Nomao.zip', 'nomao.zip') unzip_raw_data('nomao.zip') os.rename(UCIVars.raw_data_folder + 'Nomao/Nomao.data', UCIVars.raw_data_folder + 'nomao.data') os.rename(UCIVars.raw_data_folder + 'Nomao/Nomao.names', UCIVars.raw_data_folder + 'nomao.description') shutil.rmtree(UCIVars.raw_data_folder + 'Nomao') replace_chars_in_file('nomao.data', '#', ',') data = load_mixed_raw_data('nomao.data', sep = ',', header = False) categories = ['s', 'm', 'n'] columns = [8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 93, 97, 101, 105, 109, 113, 117] for i in range(len(columns)): data = replace_ordinals_in_mixed_data(data, categories, columns[i], ',', unknown_string = '') data = auto_replace_missing_in_mixed_data(data, unknown_string = '?') write_mixed_raw_data(UCIVars.raw_data_folder + 'nomao.trafo.data', data, sep = ',') data = load_raw_data('nomao.trafo.data', sep = ',') data = move_label_in_front(data, 120) save_data_to_file(data, 'nomao', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_indoor_loc_mag(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00343/UJIIndoorLoc-Mag-forUCI.zip', 'indoor_loc_mag.zip') print("Indoor Location Mag is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the number of time series samples is too low") #--------------------------------------------------------------------------------------------------- def get_activity_recognition(): prepare_new_data_set_group_id() #download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00366/AReM.zip', 'activity_recognition.zip') print("Activity Recognition is currently not processed since:") print(" - according to the description it seems to be a time series data set") print(" - the number of time series samples is too low") #--------------------------------------------------------------------------------------------------- def get_bank_marketing(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip', 'bank_marketing.zip') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip', 'bank_marketing_additional.zip') unzip_raw_data('bank_marketing.zip') os.rename(UCIVars.raw_data_folder + 'bank-full.csv', UCIVars.raw_data_folder + 'bank_marketing.data') os.rename(UCIVars.raw_data_folder + 'bank-names.txt', UCIVars.raw_data_folder + 'bank_marketing.description') os.remove(UCIVars.raw_data_folder + 'bank.csv') replace_chars_in_file('bank_marketing.data', '"', '') data = load_mixed_raw_data('bank_marketing.data', sep = ';', header = True) categories = ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed'] data = replace_categories_in_mixed_data(data, categories, 1, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['divorced', 'married', 'single'] data = replace_categories_in_mixed_data(data, categories, 2, ';', unknown_string = 'unknown', unknown_replacement_value = 0) 
categories = ['primary', 'secondary', 'tertiary'] data = replace_ordinals_in_mixed_data(data, categories, 3, ';', unknown_string = '') categories = ['no', 'yes'] data = replace_bin_cats_in_mixed_data(data, categories, 4, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 6, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 7, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 16, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['cellular', 'telephone'] data = replace_bin_cats_in_mixed_data(data, categories, 8, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] data = replace_circulars_in_mixed_data(data, categories, 10, ';', unknown_string = 'unknown') categories = ['failure', 'success'] data = replace_bin_cats_in_mixed_data(data, categories, 15, ';', unknown_string = 'unknown', unknown_replacement_value = 0) write_mixed_raw_data(UCIVars.raw_data_folder + 'bank_marketing.trafo.data', data, sep = ';') data = load_raw_data('bank_marketing.trafo.data', sep = ';', na_string = 'unknown') data = move_label_in_front(data, 29) save_data_to_file(data, 'bank_marketing', is_classification = True, is_regression = False) #------------------------------------------------ unzip_raw_data('bank_marketing_additional.zip') shutil.rmtree(UCIVars.raw_data_folder + '__MACOSX') os.rename(UCIVars.raw_data_folder + 'bank-additional/bank-additional-full.csv', UCIVars.raw_data_folder + 'bank_marketing_additional.data') os.rename(UCIVars.raw_data_folder + 'bank-additional/bank-additional-names.txt', UCIVars.raw_data_folder + 'bank_marketing_additional.description') shutil.rmtree(UCIVars.raw_data_folder + 'bank-additional') replace_chars_in_file('bank_marketing_additional.data', '"', '') data = load_mixed_raw_data('bank_marketing_additional.data', sep = ';', header = True) categories = ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed'] data = replace_categories_in_mixed_data(data, categories, 1, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['divorced', 'married', 'single'] data = replace_categories_in_mixed_data(data, categories, 2, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree'] data = replace_ordinals_in_mixed_data(data, categories, 3, ';', unknown_string = '') categories = ['no', 'yes'] data = replace_bin_cats_in_mixed_data(data, categories, 4, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 5, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 6, ';', unknown_string = 'unknown', unknown_replacement_value = 0) data = replace_bin_cats_in_mixed_data(data, categories, 20, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['cellular', 'telephone'] data = replace_bin_cats_in_mixed_data(data, categories, 7, ';', unknown_string = 'unknown', unknown_replacement_value = 0) categories = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 
'nov', 'dec'] data = replace_circulars_in_mixed_data(data, categories, 8, ';', unknown_string = 'unknown') categories = ['mon', 'tue', 'wed', 'thu', 'fri'] data = replace_circulars_in_mixed_data(data, categories, 9, ';', unknown_string = 'unknown') categories = ['failure', 'success'] data = replace_bin_cats_in_mixed_data(data, categories, 14, ';', unknown_string = 'nonexistent', unknown_replacement_value = 0) write_mixed_raw_data(UCIVars.raw_data_folder + 'bank_marketing_additional.trafo.data', data, sep = ';') data = load_raw_data('bank_marketing_additional.trafo.data', sep = ';') data = move_label_in_front(data, 34) save_data_to_file(data, 'bank_marketing_additional', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_census_income(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', 'adult.train.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', 'adult.test.data') download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names', 'adult.description') if os.path.exists(UCIVars.raw_data_folder + 'adult.trafo.data'): os.remove(UCIVars.raw_data_folder + 'adult.trafo.data') concat_files(UCIVars.raw_data_folder + 'adult.t*.data', UCIVars.raw_data_folder + 'adult.data') replace_chars_in_file('adult.data', '>50K.', '>50K') replace_chars_in_file('adult.data', '<=50K.', '<=50K') replace_chars_in_file('adult.data', '|1x3 Cross validator', '') replace_chars_in_file('adult.data', ', ', ',') data = load_mixed_raw_data('adult.data', sep = ',', header = False) categories = ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'] data = replace_categories_in_mixed_data(data, categories, 1, ',', unknown_string = '') categories = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Assoc-acdm', 'Assoc-voc', 'Some-college', 'Bachelors', 'Prof-school', 'Masters', 'Doctorate'] data = replace_ordinals_in_mixed_data(data, categories, 3, ',', unknown_string = '') categories = ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'] data = replace_categories_in_mixed_data(data, categories, 5, ',', unknown_string = '') categories = ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'] data = replace_categories_in_mixed_data(data, categories, 6, ',', unknown_string = '') categories = ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'] data = replace_categories_in_mixed_data(data, categories, 7, ',', unknown_string = '') categories = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'] data = replace_categories_in_mixed_data(data, categories, 8, ',', unknown_string = '') categories = ['Female', 'Male'] data = replace_bin_cats_in_mixed_data(data, categories, 9, ',', unknown_string = '') categories = ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 
'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'] data = replace_categories_in_mixed_data(data, categories, 13, ',', unknown_string = '') categories = ['<=50K', '>50K'] data = replace_bin_cats_in_mixed_data(data, categories, 14, ',', unknown_string = '') write_mixed_raw_data(UCIVars.raw_data_folder + 'adult.trafo.data', data, sep = ',') data = load_raw_data('adult.trafo.data', sep = ',') data = move_label_in_front(data, 89) save_data_to_file(data, 'adult', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_emg_for_gestures(): prepare_new_data_set_group_id() print("EMG for Gestures is currently not processed since:") print(" - according to the description it seems to be a time series data set") #--------------------------------------------------------------------------------------------------- def get_indoor_channel_measurements(): prepare_new_data_set_group_id() print("Indoor Channel Measurements is currently not processed since:") print(" - according to the description it seems to be a complicated time series data set") #--------------------------------------------------------------------------------------------------- def get_electrical_grid_stability_simulated(): prepare_new_data_set_group_id() download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv', 'electrical_grid_stability_simulated.data') data = load_mixed_raw_data('electrical_grid_stability_simulated.data', sep = ',', header = True) categories = get_categories_in_mixed_data(data, 13) data = replace_bin_cats_in_mixed_data(data, categories, 13, ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'electrical_grid_stability_simulated.data', data, sep = ',') data = load_raw_data('electrical_grid_stability_simulated.data', ',') data_class = move_label_in_front(data, 13) data_class = remove_columns(data_class, 13) save_data_to_file(data_class, 'electrical_grid_stability_simulated', is_classification = True, is_regression = False) data_regr = move_label_in_front(data, 12) data_regr = remove_columns(data_regr, 13) save_data_to_file(data_regr, 'electrical_grid_stability_simulated', is_classification = False, is_regression = True) #--------------------------------------------------------------------------------------------------- def get_online_shoppers_attention(): prepare_new_data_set_group_id() download_and_save('http://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv', 'online_shoppers_attention.data') data = load_mixed_raw_data('online_shoppers_attention.data', sep = ',', header = True) data = auto_replace_categories_in_mixed_data(data, 16, ',') data = auto_replace_categories_in_mixed_data(data, 17, ',') categories = get_categories_in_mixed_data(data, 15) data = replace_categories_in_mixed_data(data, categories, 15, ',') categories = [u'Jan', u'Feb', u'Mar', u'Apr', u'May', u'June', u'Jul', u'Aug', u'Sep', u'Oct', u'Nov', u'Dec'] data = replace_circulars_in_mixed_data(data, categories, 10, ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'online_shoppers_attention.data', data, sep = ',') data = load_raw_data('online_shoppers_attention.data', ',') data = move_label_in_front(data, 20) save_data_to_file(data, 'online_shoppers_attention', is_classification = 
True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_pmu_ud():
    prepare_new_data_set_group_id()
    print("PMU-UD is currently not processed since:")
    print(" - the data consists of .jpg images")

#---------------------------------------------------------------------------------------------------
def get_seoul_bike_data():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv', 'seoul_bike_data.data')
    # The purpose of the following two lines is to remove the header, which gives an annoying encoding error...
    data = pandas.read_csv(UCIVars.raw_data_folder + 'seoul_bike_data.data', encoding = 'unicode_escape')
    data.to_csv(UCIVars.raw_data_folder + 'seoul_bike_data.data', header = False, index = False)
    data = load_mixed_raw_data('seoul_bike_data.data', sep = ',', header = False)
    categories = ['No', 'Yes']
    data = replace_bin_cats_in_mixed_data(data, categories, column = 13, separator = ',')
    categories = ['No Holiday', 'Holiday']
    data = replace_bin_cats_in_mixed_data(data, categories, column = 12, separator = ',')
    categories = ['Winter', 'Spring', 'Summer', 'Autumn']
    data = replace_circulars_in_mixed_data(data, categories, 11, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'seoul_bike_data.data', data, sep = ',')
    data = load_raw_data('seoul_bike_data.data', sep=',', date_column=0, date_sep='/', date_order=['d','m','Y'], header=False)
    data = move_label_in_front(data, 1)
    save_data_to_file(data, 'seoul_bike_data', is_classification = False, is_regression = True)

#---------------------------------------------------------------------------------------------------
def get_south_german_credit():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00573/SouthGermanCredit.zip', 'south_german_credit.zip')
    unzip_raw_data('south_german_credit.zip')
    remove_files(UCIVars.raw_data_folder, 'read_SouthGermanCredit.R')
    remove_files(UCIVars.raw_data_folder, 'codetable.txt')
    remove_files(UCIVars.raw_data_folder, 'south_german_credit.zip')
    data = load_raw_data('SouthGermanCredit.asc', sep = ' ', header = True)
    data = move_label_in_front(data, 20)
    save_data_to_file(data, 'south_german_credit', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_shill_bidding():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00562/Shill%20Bidding%20Dataset.csv', 'shill_bidding.data')
    data = load_mixed_raw_data('shill_bidding.data', sep = ',', header = True)
    # Remove Record ID, Auction ID, Bidder ID
    data = remove_columns(data, [0, 1, 2])
    write_mixed_raw_data(UCIVars.raw_data_folder + 'shill_bidding.data', data, sep = ',')
    data = load_raw_data('shill_bidding.data', sep = ',')
    data = move_label_in_front(data, 9)
    save_data_to_file(data, 'shill_bidding', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_gas_turbine():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00551/pp_gas_emission.zip', 'gas_turbine.zip')
    unzip_raw_data('gas_turbine.zip')
    remove_files(UCIVars.raw_data_folder, 'gas_turbine.zip')
    concat_files(UCIVars.raw_data_folder + 'gt_201*.csv', UCIVars.raw_data_folder + 'gt.data')
remove_files(UCIVars.raw_data_folder, 'gt_201*.csv') data = load_raw_data('gt.data', sep = ',', header = True) # Will report 4 errors because of headers in the middle of the data data_co = remove_columns(data, [10]) data_co = move_label_in_front(data_co, 9) save_data_to_file(data_co, 'gas_turbine_co', is_classification=False, is_regression=True) data_nox = remove_columns(data, [9]) data_nox = move_label_in_front(data_nox, 9) save_data_to_file(data_nox, 'gas_turbine_nox', is_classification=False, is_regression=True) #--------------------------------------------------------------------------------------------------- def get_oral_toxicity(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00508/qsar_oral_toxicity.zip', 'oral_toxicity.zip') unzip_raw_data('oral_toxicity.zip') remove_files(UCIVars.raw_data_folder, 'oral_toxicity.zip') data = load_mixed_raw_data('qsar_oral_toxicity.csv', sep = ';', header = False) categories = ['negative', 'positive'] data = replace_bin_cats_in_mixed_data(data, categories, column = 1024, separator = ';') write_mixed_raw_data(UCIVars.raw_data_folder + 'oral_toxicity.data', data, sep = ',') remove_files(UCIVars.raw_data_folder, 'qsar_oral_toxicity.csv') data = load_raw_data('oral_toxicity.data', sep = ',', header = False) data = move_label_in_front(data, 1024) save_data_to_file(data, 'oral_toxicity', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_wave_energy(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00494/WECs_DataSet.zip', 'wave_energy.zip') unzip_raw_data('wave_energy.zip') remove_files(UCIVars.raw_data_folder, 'wave_energy.zip') # For each of the 4 data sets, the last column contains the sum of columns 32 to 47. # I assume the last column is the label and columns 32 to 47 are intermediate results # and that only the first 32 columns should be used as features. 
indices = range(32, 48) data_adelaide = load_raw_data('WECs_DataSet/Adelaide_Data.csv', sep=',') data_adelaide = remove_columns(data_adelaide, indices) data_adelaide = move_label_in_front(data_adelaide, 32) save_data_to_file(data_adelaide, 'wave_energy_adelaide', is_classification=False, is_regression=True) data_perth = load_raw_data('WECs_DataSet/Perth_Data.csv', sep=',') data_perth = remove_columns(data_perth, indices) data_perth = move_label_in_front(data_perth, 32) save_data_to_file(data_perth, 'wave_energy_perth', is_classification=False, is_regression=True) data_sydney = load_raw_data('WECs_DataSet/Sydney_Data.csv', sep=',') data_sydney = remove_columns(data_sydney, indices) data_sydney = move_label_in_front(data_sydney, 32) save_data_to_file(data_sydney, 'wave_energy_sydney', is_classification=False, is_regression=True) data_tasmania = load_raw_data('WECs_DataSet/Tasmania_Data.csv', sep=',') data_tasmania = remove_columns(data_tasmania, indices) data_tasmania = move_label_in_front(data_tasmania, 32) save_data_to_file(data_tasmania, 'wave_energy_tasmania', is_classification=False, is_regression=True) #--------------------------------------------------------------------------------------------------- def get_firewall(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00542/log2.csv', 'firewall.data') data = load_mixed_raw_data('firewall.data', sep = ',', header = True) categories = ['allow', 'drop', 'deny', 'reset-both'] data = replace_ordinals_in_mixed_data(data, categories, column = 4, separator = ',') write_mixed_raw_data(UCIVars.raw_data_folder + 'firewall.data', data, sep = ',') data = load_raw_data('firewall.data', sep = ',', header = False) data = move_label_in_front(data, 4) save_data_to_file(data, 'firewall', is_classification = True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_real_estate_value(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx', 'real_estate_value.xlsx') excel_data = pandas.read_excel(UCIVars.raw_data_folder + 'real_estate_value.xlsx', engine = 'openpyxl') excel_data.to_csv(UCIVars.raw_data_folder + 'real_estate_value.data', index = False) remove_files(UCIVars.raw_data_folder, 'real_estate_value.xlsx') data = load_raw_data('real_estate_value.data', sep = ',', header = True) data = remove_columns(data, [0]) data = move_label_in_front(data, 6) save_data_to_file(data, 'real_estate_value', is_classification = False, is_regression = True) #--------------------------------------------------------------------------------------------------- def get_crop_mapping(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00525/data.zip', 'crop_mapping.zip') unzip_raw_data('crop_mapping.zip') remove_files(UCIVars.raw_data_folder, 'crop_mapping.zip') data = load_raw_data('WinnipegDataset.txt', sep=',', header=True) save_data_to_file(data, 'crop_mapping', is_classification=True, is_regression = False) #--------------------------------------------------------------------------------------------------- def get_bitcoin_heist(): prepare_new_data_set_group_id() download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip', 'bitcoin_heist.zip') unzip_raw_data('bitcoin_heist.zip') remove_files(UCIVars.raw_data_folder, 'bitcoin_heist.zip') 
    data = load_mixed_raw_data('BitcoinHeistData.csv', sep = ',', header = True)
    data = remove_columns(data, [0])
    # The labels consist of 28 ransomware types and the label 'white' for non-ransomware.
    # We merge every ransomware type into one class. The resulting data set still only has 1.4% positive labels.
    categories = sorted(get_categories_in_mixed_data(data, 8))
    new_cats = [1]*(len(categories)-1) + [2]
    data = replace_manual_in_mixed_data(data, categories, 8, new_cats, ',')
    write_mixed_raw_data(UCIVars.raw_data_folder + 'bitcoin_heist.data', data, sep = ',')
    remove_files(UCIVars.raw_data_folder, 'BitcoinHeistData.csv')
    data = load_raw_data('bitcoin_heist.data', sep = ',', header = False)
    data = move_label_in_front(data, 8)
    save_data_to_file(data, 'bitcoin_heist', is_classification = True, is_regression = False)

#---------------------------------------------------------------------------------------------------
def get_query_analytics():
    prepare_new_data_set_group_id()
    download_and_save('https://archive.ics.uci.edu/ml/machine-learning-databases/00493/datasets.zip', 'query_analytics.zip')
    unzip_raw_data('query_analytics.zip')
    remove_files(UCIVars.raw_data_folder, 'query_analytics.zip')
    remove_files(UCIVars.raw_data_folder + 'Datasets/', 'Radius-Queries.csv')
    data_radius = load_raw_data('Datasets/Radius-Queries-Count.csv', sep = ',', header = False)
    data_radius = move_label_in_front(data_radius, 3)
    save_data_to_file(data_radius, 'radius_query', is_classification = False, is_regression = True)
    data_range = load_raw_data('Datasets/Range-Queries-Aggregates.csv', sep = ',', header = True)
    data_range = remove_columns(data_range, [0])
    data_range_incidents = remove_columns(data_range, [5, 6])
    data_range_incidents = move_label_in_front(data_range_incidents, 4)
    save_data_to_file(data_range_incidents, 'range_query_incidents', is_classification = False, is_regression = True)
    data_range_arrests = remove_columns(data_range, [4, 6])
    data_range_arrests = move_label_in_front(data_range_arrests, 4)
    save_data_to_file(data_range_arrests, 'range_query_arrests', is_classification = False, is_regression = True)
    data_range_beat = remove_columns(data_range, [4, 5])
    data_range_beat = move_label_in_front(data_range_beat, 4)
    save_data_to_file(data_range_beat, 'range_query_beat', is_classification = False, is_regression = True)

#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
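# Editor's aside: download_all_uci below wires up the UCIVars.* output folders and then runs the
# get_* functions defined above. A minimal sketch of running one getter stand-alone (the base
# path here is hypothetical, and this bypasses the Paths object used below):
def _example_run_single_getter():
    base_folder = '/tmp/uci_example'
    UCIVars.data_folder = base_folder + '/data/'
    UCIVars.raw_data_folder = base_folder + '/raw_data/'
    UCIVars.regression_data_folder = base_folder + '/regression-data/'
    UCIVars.binary_classification_data_folder = base_folder + '/bin-class-data/'
    UCIVars.multiclass_classification_data_folder = base_folder + '/multi-class-data/'
    UCIVars.statistics_filename = base_folder + '/data_statistics.csv'
    for folder in [UCIVars.data_folder, UCIVars.raw_data_folder, UCIVars.regression_data_folder,
                   UCIVars.binary_classification_data_folder, UCIVars.multiclass_classification_data_folder]:
        utils.ensureDir(folder)
    get_htru2()  # downloads and converts a single, small data set
#---------------------------------------------------------------------------------------------------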
def download_all_uci(paths: Paths):
    # preparation
    # code was written with global variables, so we set the global variable values here for the paths
    base_folder = str(paths.uci_download())
    #global data_folder
    #global UCIVars.raw_data_folder
    #global regression_data_folder
    #global binary_classification_data_folder
    #global multiclass_classification_data_folder
    #global statistics_filename
    UCIVars.data_folder = base_folder + '/data/'
    UCIVars.raw_data_folder = base_folder + '/raw_data/'
    UCIVars.regression_data_folder = base_folder + '/regression-data/'
    UCIVars.binary_classification_data_folder = base_folder + '/bin-class-data/'
    UCIVars.multiclass_classification_data_folder = base_folder + '/multi-class-data/'
    UCIVars.statistics_filename = base_folder + '/data_statistics.csv'
    utils.ensureDir(UCIVars.data_folder)
    utils.ensureDir(UCIVars.raw_data_folder)
    utils.ensureDir(UCIVars.regression_data_folder)
    utils.ensureDir(UCIVars.binary_classification_data_folder)
    utils.ensureDir(UCIVars.multiclass_classification_data_folder)
    # this was also a global statement
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    #if os.path.exists(statistics_filename):
    #os.remove(statistics_filename)

    #---------------------------------------------------------------------------------------------------
    # Data sets that are (primarily) of regression type
    #---------------------------------------------------------------------------------------------------
    get_skill_craft()
    get_cargo_2000()
    get_KDC_4007()
    get_sml2010()
    get_wine_quality()
    get_parkinson()
    get_insurance_benchmark()
    get_air_quality()
    get_EEG_steady_state()
    get_cycle_power_plant()
    get_carbon_nanotubes()
    get_naval_propulsion()
    get_blood_pressure()
    get_gas_sensor_drift()
    get_bike_sharing()
    get_appliances_energy()
    get_indoor_loc()
    get_online_news_popularity()
    get_facebook_comment_volume()
    get_bejing_pm25()
    get_protein_tertiary_structure()
    get_five_cities_pm25()
    get_tamilnadu_electricity()
    # Additional data sets added after mid 2018
    get_metro_interstate_traffic_volume()
    get_facebook_live_sellers_thailand()
    get_parking_birmingham()
    get_tarvel_review_ratings()
    get_superconductivity()
    get_gnfuv_unmanned_surface_vehicles()
    # Additional data sets added February 2021
    #get_seoul_bike_data()
    #get_gas_turbine()
    #get_wave_energy()
    #get_real_estate_value()
    #get_query_analytics()

    #---------------------------------------------------------------------------------------------------
    # Data sets that are (primarily) of classification type
    #---------------------------------------------------------------------------------------------------
    get_phishing()
    get_ozone_level()
    get_opportunity_activity()
    get_australian_sign_language()
    get_seismic_bumps()
    get_meu_mobile_ksd()
    get_character_trajectories()
    get_vicon_physical_action()
    get_simulated_falls()
    get_chess()
    get_abalone()
    get_madelon()
    get_spambase()
    get_wilt()
    get_waveform()
    get_wall_following_robot()
    get_page_blocks()
    get_optical_recognition_handwritten_digits()
    get_bach_chorals_harmony()
    get_smartphone_human_activity()
    get_turkiye_student_evaluation()
    get_artificial_characters()
    get_first_order_theorem_proving()
    get_landsat_satimage()
    get_hiv_1_protease()
    get_musk()
    get_ble_rssi_indoor_location()
    get_anuran_calls()
    get_thyroids()
    get_isolet()
    get_mushroom()
    get_assamese_characters()
    get_arabic_digit()
    get_eeg_steady_state_visual()
    get_gesture_phase_segmentation()
    get_emg_physical_action()
    get_human_activity_smartphone()
    get_polish_companies_bankruptcy()
    get_crowd_sourced_mapping()
    get_firm_teacher_clave()
    get_smartphone_human_activity_postural()
    get_pen_recognition_handwritten_characters()
    get_epileptic_seizure_recognition()
    get_nursery()
    get_indoor_user_movement_prediction()
    get_eeg_eye_state()
    get_htru2()
    get_magic_gamma_telescope()
    get_letter_recognition()
    get_occupancy_detection()
    get_avila()
    get_grammatical_facial_expressions()
    get_chess_krvk()
    get_default_credit_card()
    get_nomao()
    get_indoor_loc_mag()
    get_activity_recognition()
    get_bank_marketing()
    get_census_income()
    ## Additional data sets added after mid 2018
    get_emg_for_gestures()
    get_indoor_channel_measurements()
    get_electrical_grid_stability_simulated()
    get_online_shoppers_attention()
    get_pmu_ud()
    # Additional data sets added February 2021
    #get_south_german_credit()
    #get_shill_bidding()
    #get_oral_toxicity()
    #get_firewall()
    #get_crop_mapping()
#get_bitcoin_heist() ================================================ FILE: pytabkit/bench/data/import_talent_benchmark.py ================================================ from pathlib import Path from typing import Optional import numpy as np import pandas as pd from pytabkit.bench.data.import_tasks import PandasTask from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.models import utils from pytabkit.models.data.data import TaskType def import_talent_benchmark(paths: Paths, talent_folder: str, source_name: str, allow_regression: bool = True, allow_classification: bool = True, normalize_y: bool = False, min_n_samples: int = 1, max_n_classes: int = 100000, min_n_classes: int = 0, remove_missing_cont: bool = True, remove_duplicates: bool = False, max_n_samples: Optional[int] = None, ignore_above_n_classes: int = 100000, dry_run: bool = False): talent_folder = Path(talent_folder) dataset_folders = [dataset_folder for dataset_folder in talent_folder.iterdir()] for i, dataset_folder in enumerate(dataset_folders): dataset_name = dataset_folder.name info = utils.deserialize(dataset_folder / 'info.json', use_json=True) if dry_run: train_size = info.get("train_size", None) n_samples = info['train_size'] + info['val_size'] + info['test_size'] if train_size >= 100_000: print(f'{dataset_name}: {train_size=}') if n_samples >= 100_000: print(f'{dataset_name}: {n_samples=}') continue print(f'Importing dataset {dataset_name} [{i + 1}/{len(dataset_folders)}]') # can be 'regression', 'multiclass', 'binclass' task_type = info['task_type'] print(f'{task_type=}') assert task_type in ['regression', 'multiclass', 'binclass'] if task_type == 'regression' and not allow_regression: print(f'Skipping regression datasets') continue elif task_type != 'regression' and not allow_classification: print(f'Skipping classification datasets') continue # can be 1 for regression n_classes = info.get('n_classes', info.get('num_classes', None)) print(f'{n_classes=}') y = np.concatenate( [np.load(dataset_folder / f'y_{part}.npy', allow_pickle=True) for part in ['train', 'val', 'test']], axis=0) n_samples = y.shape[0] # print(f'{y[:5]=}') # print(f'{y.shape=}, {y.dtype=}') if len(y.shape) == 2 and y.shape[1] == 1: y = y[:, 0] y_df = pd.Series(y) if task_type == 'regression': y_df = y_df.astype(np.float32) else: y_df = y_df.astype('category') if np.any(y_df.isnull()): raise ValueError(f'Missing values in class labels not allowed') x_dfs = [] if utils.existsFile(dataset_folder / 'N_train.npy'): N = np.concatenate( [np.load(dataset_folder / f'N_{part}.npy', allow_pickle=True) for part in ['train', 'val', 'test']], axis=0) # print(f'{N.shape=}, {N.dtype=}') df = pd.DataFrame(N, columns=[f'cont_{i}' for i in range(N.shape[1])]).astype(np.float32) # print(df.head()) # print(f'{df.columns=}') x_dfs.append(df) # print(N.flatten()[0]) # if np.any(np.isnan(N)): if np.any(df.isnull()): print(f'Contains missing numerical values! ##########################################') else: N = np.zeros(shape=(n_samples, 0), dtype=np.float32) if utils.existsFile(dataset_folder / 'C_train.npy'): C = np.concatenate( [np.load(dataset_folder / f'C_{part}.npy', allow_pickle=True) for part in ['train', 'val', 'test']], axis=0) # print(f'{C.shape=}, {C.dtype=}') df = pd.DataFrame(C, columns=[f'cat_{i}' for i in range(C.shape[1])]).astype('category') # print(f'{df.columns=}') x_dfs.append(df) if np.any(df.isnull()): print(f'Contains missing categorical values! 
##########################################') else: C = np.zeros(shape=(n_samples, 0), dtype=np.int32) if len(x_dfs) == 1: x_df = x_dfs[0] elif len(x_dfs) == 2: x_df = pd.concat(x_dfs, axis='columns') else: raise ValueError(f'Expected len(x_dfs) in [1, 2], but got {len(x_dfs)=}') cat_columns = x_df.select_dtypes(include='category').columns.tolist() cat_indicator = [column in cat_columns for column in x_df.columns] task_type = TaskType.REGRESSION if task_type == 'regression' else TaskType.CLASSIFICATION # task_source_name = 'talent-reg' if task_type == TaskType.REGRESSION else 'talent-class' task_desc = TaskDescription(source_name, dataset_name) pd_task = PandasTask(x_df, y_df, cat_indicator, task_type, more_info=info) if remove_missing_cont: pd_task.remove_missing_cont() if remove_duplicates: pd_task.deduplicate() if max_n_samples is not None: pd_task.subsample(max_n_samples) if normalize_y: pd_task.normalize_regression_y() if pd_task.get_n_classes() > ignore_above_n_classes: print(f'Ignoring task with {pd_task.get_n_classes()} > {ignore_above_n_classes} classes') continue if pd_task.get_n_classes() > max_n_classes: print(f'Only keeping the most frequent {max_n_classes} out of {pd_task.get_n_classes()} classes') pd_task.limit_n_classes(max_n_classes) if pd_task.get_n_samples() < min_n_samples: print(f'Too few samples ({pd_task.get_n_samples()} < {min_n_samples}), ignoring task') continue if pd_task.get_n_classes() < min_n_classes: print(f'Too few classes, ignoring task') continue pd_task.get_task(task_desc).save(paths) if not dry_run: TaskCollection.from_source(source_name, paths).save(paths) ================================================ FILE: pytabkit/bench/data/import_tasks.py ================================================ from typing import Union, Optional, List, Dict import sklearn.model_selection import torch from pathlib import Path import numpy as np import pandas as pd from pytabkit.bench.data.common import TaskSource from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskInfo, Task, TaskCollection from pytabkit.models import utils from pytabkit.models.data.data import TaskType, DictDataset, TensorInfo def download_if_not_exists(url: str, dest: str): import requests """ Simple function for downloading a file from an url if no file at the destination path exists. :param url: URL of the file to download. :param dest: Path where to save the downloaded file. 
""" # following https://dzone.com/articles/simple-examples-of-downloading-files-using-python utils.ensureDir(dest) if not utils.existsFile(dest): print('Downloading ' + url, flush=True) # file = requests.get(url) # open(dest, 'wb').write(file.content) r = requests.get(url, stream=True) with open(dest, 'wb') as f: print('Progress (dot = 1 MB): ', end='', flush=True) for ch in r.iter_content(chunk_size=1024**2): print('.', end='', flush=True) f.write(ch) print(flush=True) def extract_categories(X): n_cols = X.shape[1] n_samples = X.shape[0] is_categorical = np.asarray([np.allclose(np.abs(X[:, i]), 1.0) for i in range(n_cols)]) cat_idx_groups = [] i = 0 while i < n_cols: if not is_categorical[i]: i += 1 continue compat_signs = [] while i < n_cols: signs = X[:, i] > 0 if np.any([np.any(np.logical_and(signs, cs)) for cs in compat_signs]): break compat_signs.append(signs) i += 1 cat_idx_groups.append(list(np.arange(i - len(compat_signs), i))) cont_idxs = list(np.argwhere(~is_categorical)[:, 0]) X_conts = X[:, cont_idxs] if len(cont_idxs) > 0 else np.zeros(shape=(n_samples, 0), dtype=np.float32) signs = X > 0 # for binary categorical variables, shift by 1 since the category 0 is reserved for missing values X_cats = [np.sum(signs[:, g] * np.arange(1, len(g) + 1), axis=1) + (1 if len(g) == 1 else 0) for g in cat_idx_groups] X_cats = np.stack(X_cats, axis=1).astype(np.int32) if len(X_cats) > 0 else np.zeros(shape=(n_samples, 0), dtype=np.int32) # binary categorical variables need to be shifted one more since here # "-1" is not already the missing variable category cat_sizes = [len(group) + 1 + (1 if len(group) == 1 else 0) for group in cat_idx_groups] return X_conts, X_cats, cat_sizes def check_zero_hot(uci_base_path): uci_base = Path(uci_base_path) uci_paths = [uci_base / 'bin-class-data', uci_base / 'multi-class-data', uci_base / 'regression-data'] for path in uci_paths: ds_names = [file.stem for file in path.iterdir() if file.is_file()] ds_names.sort() for ds_name in ds_names: print('Processing dataset', ds_name) ds_path = path / (ds_name + '.csv') data = np.genfromtxt(ds_path, delimiter=',') X = data[:, 1:] X_cont, X_cat, cat_sizes = extract_categories(X) if np.any(np.logical_and(np.min(X_cat, axis=0) == 0, np.max(X_cat, axis=0) >= 2)): print('This dataset has a zero-hot encoding') def convert_to_class_numbers(y): y = np.rint(y) y_target = np.zeros(y.shape, dtype=np.int32) classes = np.unique(y) n_classes = len(classes) for i, c in enumerate(classes): y_target[y == c] = i return y_target, n_classes def import_from_csv(ds_path: Union[Path, str], task_type: TaskType, task_desc: TaskDescription, paths: Paths, default_split_idx: Optional[int] = None, remove_duplicates: bool = False): data = np.genfromtxt(ds_path, delimiter=',') X = data[:, 1:] y = data[:, 0] x_cont, x_cat, cat_sizes = extract_categories(X) n_classes = 0 if remove_duplicates: # check for duplicates df_cont = pd.DataFrame(x_cont) df_cat = pd.DataFrame(x_cat) df_combined = pd.concat([df_cont, df_cat], axis=1) # Concatenate the two DataFrames along the column axis is_duplicated = df_combined.duplicated() if is_duplicated.any(): print(f'Warning: Data set contains {is_duplicated.sum()} duplicate values! 


def check_zero_hot(uci_base_path):
    uci_base = Path(uci_base_path)
    uci_paths = [uci_base / 'bin-class-data', uci_base / 'multi-class-data', uci_base / 'regression-data']
    for path in uci_paths:
        ds_names = [file.stem for file in path.iterdir() if file.is_file()]
        ds_names.sort()
        for ds_name in ds_names:
            print('Processing dataset', ds_name)
            ds_path = path / (ds_name + '.csv')
            data = np.genfromtxt(ds_path, delimiter=',')
            X = data[:, 1:]
            X_cont, X_cat, cat_sizes = extract_categories(X)
            if np.any(np.logical_and(np.min(X_cat, axis=0) == 0, np.max(X_cat, axis=0) >= 2)):
                print('This dataset has a zero-hot encoding')


def convert_to_class_numbers(y):
    y = np.rint(y)
    y_target = np.zeros(y.shape, dtype=np.int32)
    classes = np.unique(y)
    n_classes = len(classes)
    for i, c in enumerate(classes):
        y_target[y == c] = i
    return y_target, n_classes


def import_from_csv(ds_path: Union[Path, str], task_type: TaskType, task_desc: TaskDescription, paths: Paths,
                    default_split_idx: Optional[int] = None, remove_duplicates: bool = False):
    data = np.genfromtxt(ds_path, delimiter=',')
    X = data[:, 1:]
    y = data[:, 0]
    x_cont, x_cat, cat_sizes = extract_categories(X)
    n_classes = 0

    if remove_duplicates:
        # check for duplicates
        df_cont = pd.DataFrame(x_cont)
        df_cat = pd.DataFrame(x_cat)
        df_combined = pd.concat([df_cont, df_cat], axis=1)  # Concatenate the two DataFrames along the column axis
        is_duplicated = df_combined.duplicated()
        if is_duplicated.any():
            print(f'Warning: Data set contains {is_duplicated.sum()} duplicate values! Removing duplicates...')
            not_duplicated_np = (~is_duplicated).values
            x_cont = x_cont[not_duplicated_np]
            x_cat = x_cat[not_duplicated_np]
            y = y[not_duplicated_np]

    # preprocess y
    if task_type == TaskType.CLASSIFICATION:
        y, n_classes = convert_to_class_numbers(y)
    elif task_type == TaskType.REGRESSION:
        # normalize y
        y = (y - np.mean(y, axis=-1)) / (np.std(y, axis=-1) + 1e-30)

    ds = DictDataset({'x_cont': torch.as_tensor(x_cont, dtype=torch.float32),
                      'x_cat': torch.as_tensor(x_cat, dtype=torch.long),
                      'y': torch.as_tensor(y[:, None])},
                     {'x_cont': TensorInfo(feat_shape=[x_cont.shape[-1]]),
                      'x_cat': TensorInfo(cat_sizes=cat_sizes),
                      'y': TensorInfo(cat_sizes=[n_classes])})
    task_info = TaskInfo.from_ds(task_desc, ds, default_split_idx=default_split_idx)
    task = Task(task_info, ds)
    task.save(paths)


def import_uci_tasks(paths: Paths, remove_duplicates: bool = False, rerun=False):
    uci_base = Path(paths.uci_download())
    uci_matches = [(TaskSource.UCI_BIN_CLASS, uci_base / 'bin-class-data'),
                   (TaskSource.UCI_MULTI_CLASS, uci_base / 'multi-class-data'),
                   (TaskSource.UCI_REGRESSION, uci_base / 'regression-data')]
    for src, path in uci_matches:
        print('Processing task source', src)
        ds_names = [file.stem for file in path.iterdir() if file.is_file()]
        ds_names.sort()
        task_type = TaskType.CLASSIFICATION if 'class' in src else TaskType.REGRESSION
        for ds_name in ds_names:
            task_desc = TaskDescription(task_source=src, task_name=ds_name)
            if (not rerun) and task_desc.exists_task(paths):
                continue
            print('Processing dataset', ds_name)
            ds_path = path / (ds_name + '.csv')
            import_from_csv(ds_path=ds_path, task_type=task_type, task_desc=task_desc, paths=paths,
                            remove_duplicates=remove_duplicates)
        TaskCollection.from_source(src, paths).save(paths)
        print()


def get_openml_task_ids(suite_id: Union[str, int]) -> List[int]:
    import openml
    suite = openml.study.get_suite(suite_id)
    return suite.tasks
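

# Illustrative sketch (not part of the original file). get_openml_task_ids needs the
# optional 'openml' dependency and network access, e.g.:
#   task_ids = get_openml_task_ids(271)   # suite id 271, as used in the __main__ block below
#   print(len(task_ids))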


class PandasTask:
    def __init__(self, x_df: pd.DataFrame, y_df: pd.Series, cat_indicator: List[bool], task_type: str,
                 more_info: Dict):
        if len(x_df.columns) != len(cat_indicator):
            raise ValueError('x.shape[1] != len(category_indicator)')
        self.x_df = x_df  # should be (sparse) pd.DataFrame
        # should be (sparse) pd.Series (i.e. a single column of a DataFrame)
        self.y_df = y_df if task_type == TaskType.REGRESSION else y_df.astype('category')
        # if pd.api.types.is_sparse(self.y_df):
        if isinstance(self.y_df.dtype, pd.SparseDtype):
            self.y_df = self.y_df.sparse.to_dense()
        # this is a fix because category_indicator[0] was False for the dataset MIP-2016-regression
        # despite the column being categorical (dtype=object)
        self.cat_indicator = [v or not pd.api.types.is_numeric_dtype(x_df[x_df.columns[i]])
                              for i, v in enumerate(cat_indicator)]
        self.cont_indicator = [not b for b in self.cat_indicator]
        self.task_type = task_type
        self.more_info_dict = more_info  # could be passed along to TaskInfo

    def get_n_classes(self):
        if self.task_type == TaskType.REGRESSION:
            return 0
        else:
            self.y_df = self.y_df.cat.remove_unused_categories()
            return len(self.y_df.cat.categories)

    def get_n_samples(self):
        return len(self.x_df)

    def deduplicate(self):
        is_duplicated = self.x_df.duplicated()
        if is_duplicated.any():
            print(f'Warning: Data set contains {is_duplicated.sum()} duplicate values! Removing duplicates...')
            self.x_df = self.x_df.loc[~is_duplicated]
            self.y_df = self.y_df[~is_duplicated]

    def limit_n_classes(self, max_n_classes: int):
        n_classes = self.get_n_classes()
        if n_classes <= max_n_classes:
            return
        vc = self.y_df.value_counts()
        # use mergesort to make it more deterministic
        perm = np.argsort(vc, kind='mergesort')
        cats = vc.axes[0]
        largest_classes = [cats[i] for i in perm[-max_n_classes:]]
        other_classes = [cats[i] for i in perm[:-max_n_classes]]
        to_keep = self.y_df.isin(largest_classes)
        self.x_df = self.x_df.loc[to_keep, :]
        self.y_df = self.y_df[to_keep]
        self.y_df = self.y_df.cat.remove_categories(other_classes)

    def subsample(self, max_size: int):
        if self.x_df.shape[0] > max_size:
            gen = np.random.default_rng(seed=0)
            perm = gen.permutation(self.x_df.shape[0])
            idxs = perm[:max_size]
            self.x_df = self.x_df.iloc[idxs]
            self.y_df = self.y_df.iloc[idxs]

    def remove_missing_cont(self):
        if not np.any(self.cont_indicator):
            return  # no continuous columns
        not_nan_rows = self.x_df.loc[:, self.cont_indicator].notna().all(axis=1)
        self.x_df = self.x_df.loc[not_nan_rows, :]
        self.y_df = self.y_df[not_nan_rows]

    def normalize_regression_y(self):
        if self.task_type == TaskType.REGRESSION and len(self.y_df) >= 2:
            y_np = np.asarray(self.y_df)
            self.y_df.loc[:] = (y_np - np.mean(y_np)) / (np.std(y_np) + 1e-30)

    def get_task(self, task_desc: TaskDescription) -> Task:
        x_cont = np.array(self.x_df.loc[:, self.cont_indicator], dtype=np.float32)
        x_cat_columns = []
        cat_sizes = []
        for i, is_cat in enumerate(self.cat_indicator):
            if is_cat:
                # this fails if column names are also row names,
                # but this is maybe a good check because this might otherwise cause problems in other places...
                col = self.x_df[self.x_df.columns[i]].astype('category')
                # print(f'{type(self.x_df.iloc[:, i])=}')
                # print(f'{type(col)=}')
                col = col.cat.remove_unused_categories()
                # detect missing values
                col = col.cat.remove_categories([s for s in ['', '?'] if s in col.cat.categories])
                # don't use asarray to make sure that the array is not read-only
                col = np.array(col.cat.codes, dtype=np.int32)
                col += 1  # category 0 is used for missing value
                x_cat_columns.append(col)
                cat_sizes.append(1 + np.max(col))
        if len(x_cat_columns) > 0:
            x_cat = np.stack(x_cat_columns, axis=1)
        else:
            x_cat = np.zeros(shape=(len(self.x_df), 0), dtype=np.int32)

        if self.task_type == TaskType.CLASSIFICATION:
            self.y_df = self.y_df.cat.remove_unused_categories()
            y = np.array(self.y_df.cat.codes, dtype=np.int32)
            # y, n_classes = convert_to_class_numbers(y)
        else:
            y = np.array(self.y_df, dtype=np.float32)

        ds = DictDataset({'x_cont': torch.as_tensor(x_cont),
                          'x_cat': torch.as_tensor(x_cat),
                          'y': torch.as_tensor(y[:, None])},
                         {'x_cont': TensorInfo(feat_shape=[x_cont.shape[-1]]),
                          'x_cat': TensorInfo(cat_sizes=cat_sizes),
                          'y': TensorInfo(cat_sizes=[self.get_n_classes()])})
        task_info = TaskInfo.from_ds(task_desc, ds, more_info_dict=self.more_info_dict)
        return Task(task_info, ds)
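
    # Illustrative sketch (not part of the original file): constructing a PandasTask by
    # hand for a toy frame with one numeric and one categorical column (names are made up):
    #   x_df = pd.DataFrame({'num': [1.0, 2.0, 3.0],
    #                        'cat': pd.Series(['a', 'b', 'a'], dtype='category')})
    #   pt = PandasTask(x_df, pd.Series([0, 1, 0]), cat_indicator=[False, True],
    #                   task_type=TaskType.CLASSIFICATION, more_info={})
    #   task = pt.get_task(TaskDescription('demo', 'toy'))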

    @staticmethod
    def from_openml_task_id(task_id: int):
        import openml
        task = openml.tasks.get_task(task_id, download_data=False)
        dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)
        x_df, y_df, cat_indicator, names = dataset.get_data(target=task.target_name, dataset_format='dataframe')
        if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
            task_type = TaskType.CLASSIFICATION
        elif task.task_type_id == openml.tasks.TaskType.SUPERVISED_REGRESSION:
            task_type = TaskType.REGRESSION
        else:
            raise RuntimeError(f'Unknown OpenML Task Type: {task.task_type}')
        more_info_dict = dict(openml_task_id=task_id, openml_dataset_id=task.dataset_id)
        return PandasTask(x_df, y_df, cat_indicator, task_type, more_info=more_info_dict)


def set_openml_cache_dir(dir_name: Union[str, Path]):
    import openml
    if 'set_root_cache_directory' in dir(openml.config):
        # newer openml versions
        openml.config.set_root_cache_directory(str(dir_name))
    elif 'set_cache_directory' in dir(openml.config):
        # older openml versions
        openml.config.set_cache_directory(str(dir_name))


def get_openml_ds_names(task_ids: List[int]):
    import openml
    names = []
    for i, task_id in enumerate(task_ids):
        task = openml.tasks.get_task(task_id, download_data=False)
        dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)
        names.append(dataset.name)
    return names


def import_openml(task_ids: List[int], task_source_name: str, paths: Paths, cache_dir: Union[str, Path] = None,
                  normalize_y: bool = False,
                  min_n_samples: int = 1, max_n_classes: int = 100000, min_n_classes: int = 0,
                  remove_missing_cont: bool = True, remove_duplicates: bool = False,
                  exclude_ds_names: Optional[List[str]] = None, max_n_samples: Optional[int] = None,
                  include_only_ds_names: Optional[List[str]] = None, rerun: bool = False,
                  ignore_above_n_classes: int = 100000):
    print(f'Processing task source {task_source_name}')
    import openml
    for i, task_id in enumerate(task_ids):
        with paths.new_tmp_folder() as tmp_folder:
            set_openml_cache_dir(cache_dir or tmp_folder)
            task = openml.tasks.get_task(task_id, download_data=False)
            dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False)
            print(f'Processing task {dataset.name} for OpenML task source {task_source_name} [{i+1}/{len(task_ids)}]')
            if dataset.name in (exclude_ds_names or []) or \
                    (include_only_ds_names is not None and dataset.name not in include_only_ds_names):
                print('Task was manually excluded')
                continue
            task_desc = TaskDescription(task_source_name, dataset.name)
            if (not rerun) and task_desc.exists_task(paths):
                continue
            pd_task = PandasTask.from_openml_task_id(task_id)
            if remove_missing_cont:
                pd_task.remove_missing_cont()
            if remove_duplicates:
                pd_task.deduplicate()
            if max_n_samples is not None:
                pd_task.subsample(max_n_samples)
            if normalize_y:
                pd_task.normalize_regression_y()
            if pd_task.get_n_classes() > ignore_above_n_classes:
                print(f'Ignoring task with {pd_task.get_n_classes()} > {ignore_above_n_classes} classes')
                continue
            if pd_task.get_n_classes() > max_n_classes:
                print(f'Only keeping the most frequent {max_n_classes} out of {pd_task.get_n_classes()} classes')
                pd_task.limit_n_classes(max_n_classes)
            if pd_task.get_n_samples() < min_n_samples:
                print(f'Too few samples ({pd_task.get_n_samples()} < {min_n_samples}), ignoring task')
                continue
            if pd_task.get_n_classes() < min_n_classes:
                print(f'Too few classes, ignoring task')
                continue
            pd_task.get_task(task_desc).save(paths)

    TaskCollection.from_source(task_source_name, paths).save(paths)
    print(f'Finished importing OpenML tasks {task_source_name}')
    print()


if __name__ == '__main__':
    # import time
    # paths = Paths.from_env_variables()
    # start_time = time.time()
    # with paths.new_tmp_folder() as tmp_folder:
    #     pass
    # print(f'Time: {time.time() - start_time:g} s')
    task_ids = get_openml_task_ids(271)
    import_openml(task_ids[1:2], 'test', Paths('test'))
    pass
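

# Illustrative sketch (not part of the original file): a more realistic call than the
# __main__ block above, with a persistent cache directory and size filters. The source
# name 'openml-demo' and the cache path are made-up placeholders:
#   import_openml(get_openml_task_ids(271), 'openml-demo', Paths.from_env_variables(),
#                 cache_dir='./openml_cache', max_n_samples=500_000, max_n_classes=10)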


================================================
FILE: pytabkit/bench/data/paths.py
================================================
import os
import uuid
from pathlib import Path
from typing import Optional

from pytabkit.models import utils
import shutil


class TmpPathContextManager:
    """
    Helper class: Context manager for creating temporary paths.
    """
    def __init__(self, path: Path):
        self.path = path

    def __enter__(self) -> Path:
        if utils.existsDir(self.path):
            raise RuntimeError('Temporary path already exists:', self.path)
        utils.create_dir(self.path)
        return self.path

    def __exit__(self, type, value, traceback):
        shutil.rmtree(self.path)


class Paths:
    """
    This class provides paths where data can be stored. Its base path can be configured.
    It requires one base folder, which will have several subfolders:
    algs, tasks, task_collections, results, result_summaries, eval, plots, tmp, ...
    by subclassing this class, specific folders can be re-located (e.g. put data on SSD)
    """
    def __init__(self, base_folder: str, tasks_folder: Optional[str] = None, results_folder: Optional[str] = None,
                 result_summaries_folder: Optional[str] = None, uci_download_folder: Optional[str] = None):
        self.base_path = Path(base_folder)
        self.tasks_path = Path(tasks_folder) if tasks_folder is not None else self.base_path / 'tasks'
        self.results_path = Path(results_folder) if results_folder is not None else self.base_path / 'results'
        self.result_summaries_path = Path(
            result_summaries_folder) if result_summaries_folder is not None else self.base_path / 'result_summaries'
        self.uci_download_path = Path(
            uci_download_folder) if uci_download_folder is not None else self.base_path / 'uci_download'

    @staticmethod
    def from_env_variables() -> 'Paths':
        """
        Construct a Paths object that is constructed from environment variables if they are set.
        Otherwise, the base folder will either be taken from custom_paths.py, if available,
        or set to './tab_bench_data'.
        :return: Paths object.
        """
        base_folder = os.environ.get('TAB_BENCH_DATA_BASE_FOLDER', None)
        if base_folder is None:
            try:
                from scripts import custom_paths
                base_folder = custom_paths.get_base_folder()
            except:
                base_folder = './tab_bench_data'
        tasks_folder = os.environ.get('TAB_BENCH_DATA_TASKS_FOLDER', None)
        results_folder = os.environ.get('TAB_BENCH_DATA_RESULTS_FOLDER', None)
        result_summaries_folder = os.environ.get('TAB_BENCH_DATA_RESULT_SUMMARIES_FOLDER', None)
        uci_download_folder = os.environ.get('TAB_BENCH_DATA_UCI_DOWNLOAD_FOLDER', None)
        return Paths(base_folder=base_folder, tasks_folder=tasks_folder, results_folder=results_folder,
                     result_summaries_folder=result_summaries_folder, uci_download_folder=uci_download_folder)

    def base(self) -> Path:
        return self.base_path

    def algs(self) -> Path:
        return self.base() / 'algs'

    def tasks(self) -> Path:
        return self.tasks_path

    def task_collections(self) -> Path:
        return self.base() / 'task_collections'

    def results(self) -> Path:
        return self.results_path

    def result_summaries(self) -> Path:
        return self.result_summaries_path

    def eval(self) -> Path:
        return self.base() / 'eval'

    def plots(self) -> Path:
        return self.base() / 'plots'

    def tmp(self) -> Path:
        return self.base() / 'tmp'

    def uci_download(self) -> Path:
        return self.uci_download_path

    def resources(self):
        return self.base() / 'resources'

    def times(self) -> Path:
        return self.base() / 'times'

    def new_tmp_folder(self) -> TmpPathContextManager:
        # https://stackoverflow.com/questions/2759644/python-multiprocessing-doesnt-play-nicely-with-uuid-uuid4
        return TmpPathContextManager(self.tmp() / str(uuid.UUID(bytes=os.urandom(16), version=4)))

    def results_alg_task(self, task_desc: 'TaskDescription', alg_name: str, n_cv: int) -> Path:
        return self.results() / alg_name / task_desc.task_source / task_desc.task_name / f'{n_cv}-fold'
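
    # Illustrative sketch (not part of the original file): typical usage is to rely on
    # the environment variables / custom_paths.py via from_env_variables(), or to pass a
    # base folder directly:
    #   paths = Paths('./tab_bench_data')
    #   print(paths.tasks(), paths.results())
    #   with paths.new_tmp_folder() as tmp_path:
    #       ...  # the folder is deleted again on exit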

    def summary_alg_task(self, task_desc: 'TaskDescription', alg_name: str, n_cv: int) -> Path:
        return self.result_summaries() / alg_name / task_desc.task_source / task_desc.task_name \
               / f'{n_cv}-fold'

    def results_alg_task_split(self, task_desc: 'TaskDescription', alg_name: str, n_cv: int,
                               split_type: str, split_id: int) -> Path:
        return self.results_alg_task(task_desc, alg_name, n_cv) / split_type / str(split_id)

    def tasks_task(self, task_desc: 'TaskDescription') -> Path:
        return self.tasks() / task_desc.task_source / task_desc.task_name

    def results_task(self, task_desc: 'TaskDescription') -> Path:
        return self.results() / task_desc.task_source / task_desc.task_name

    def resources_exp_it(self, exp_name: str, iteration: int) -> Path:
        return self.resources() / exp_name / str(iteration)

    def task_source(self, task_source_name: str) -> Path:
        return self.tasks() / task_source_name

    def times_alg_task(self, alg_name: str, task_desc: 'TaskDescription'):
        return self.times() / alg_name / task_desc.task_source / task_desc.task_name


================================================
FILE: pytabkit/bench/data/tasks.py
================================================
from typing import Dict, List, Optional

from pytabkit.bench.data.common import SplitType
from pytabkit.bench.data.paths import Paths
from pytabkit.models import utils
import numpy as np
import torch

from pytabkit.models.data.data import TensorInfo, TaskType, DictDataset
from pytabkit.models.data.splits import SplitInfo, RandomSplitter, IndexSplitter


# Should a Task/TaskInfo allow to configure the sizes of train/val/test?
# Disadvantages:
# - Might want to compare different train sizes on the same test set
# - How do we distinguish them in a TaskDescription?
# current solution is instead to set this in RunConfig
# alternatively, could consider encoding this in the split type,
# but this would only concern the fraction of test samples
# make default split simply an int so it can be serialized more easily?
# Do we ever need something other than an IndexSplitter?


class TaskDescription:
    """
    The minimal necessary information to identify a task, consisting of a task source and a task name.
    A task is a dataset with a specific target variable.
    """
    def __init__(self, task_source: str, task_name: str):
        """
        :param task_source: Name of the source where the task was retrieved from (see ``data.common.TaskSource``)
        :param task_name: Name of the task (dataset).
        """
        self.task_source = task_source
        self.task_name = task_name

    def load_info(self, paths: Paths) -> 'TaskInfo':
        """
        Load the associated TaskInfo object.
        :param paths: Path configuration.
        :return: Task info object.
        """
        return TaskInfo.load(paths, self)

    def load_task(self, paths: Paths):
        """
        Load the associated Task object.
        :param paths: Path configuration.
        :return: Task object.
        """
        return self.load_info(paths).load_task(paths)

    def exists_task(self, paths: Paths):
        """
        Check if the task for this description is stored on disk.
        :param paths: Path configuration.
        :return: True iff it exists.
        """
        return utils.existsFile(paths.tasks_task(self) / 'info.yaml')

    def __str__(self):
        """
        :return: Description as a string ``f'{self.task_source}/{self.task_name}'``
        """
        return f'{self.task_source}/{self.task_name}'

    def to_dict(self) -> Dict:
        """
        Convert to a dictionary for saving.
        :return: Dictionary with 'task_source' and 'task_name' entries.
        """
        return {'task_source': self.task_source, 'task_name': self.task_name}
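
    # Illustrative sketch (not part of the original file). The source/name pair is the
    # identifier used throughout the benchmark, e.g.:
    #   td = TaskDescription('openml-class', 'adult')   # made-up source/name
    #   str(td)                                         # 'openml-class/adult'
    #   TaskDescription.from_dict(td.to_dict()) == td   # round-trips via __eq__ below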
""" return TaskDescription(task_source=data['task_source'], task_name=data['task_name']) def __hash__(self): return hash(str(self)) def __eq__(self, other): if not isinstance(other, TaskDescription): return False return self.task_source == other.task_source and self.task_name == other.task_name class TaskCollection: """ Collection (list) of TaskDescription objects with its own name (can be the name of the task source). """ # there should be a TaskCollection for every TaskSource with the same name # but there can be other collections with other names def __init__(self, coll_name: str, task_descs: List[TaskDescription]): """ :param coll_name: Name of the task collection. :param task_descs: Task descriptions. """ self.coll_name = coll_name self.task_descs = task_descs def save(self, paths: Paths): file = paths.task_collections() / f'{self.coll_name}.yaml' data = {'coll_name': self.coll_name, 'task_descs': [td.to_dict() for td in self.task_descs]} utils.serialize(file, data, use_yaml=True) def load_infos(self, paths: Paths) -> List['TaskInfo']: return [desc.load_info(paths) for desc in self.task_descs] @staticmethod def from_name(coll_name: str, paths: Paths) -> 'TaskCollection': file = paths.task_collections() / f'{coll_name}.yaml' data = utils.deserialize(file, use_yaml=True) task_descs = [TaskDescription.from_dict(d) for d in data['task_descs']] return TaskCollection(data['coll_name'], task_descs) @staticmethod def from_source(task_source: str, paths: Paths) -> 'TaskCollection': """ Create a task collection with all tasks from a given task source (that have been imported/saved with this task source name). The task collection will have the same name as the source. :param task_source: Name of the task source. :param paths: Path configuration. :return: TaskCollection object. """ path = paths.task_source(task_source) if not utils.existsDir(path): return TaskCollection(task_source, []) task_descs = [TaskDescription(task_source, p.name) for p in path.iterdir()] task_descs.sort(key=lambda task_desc: str(task_desc).lower()) # sort by name return TaskCollection(task_source, task_descs) class TaskInfo: """ Information about a task (without containing the dataset itself). """ def __init__(self, task_desc: TaskDescription, n_samples: int, tensor_infos: Dict[str, TensorInfo], default_split_idx: Optional[int], more_info_dict: Optional[Dict], max_n_trainval: Optional[int] = None): """ :param task_desc: Task description. :param n_samples: Number of samples. :param tensor_infos: Information about the tensors (x_cat, x_cont, y). :param default_split_idx: If the dataset has a default split, this is the index of the first test sample. We assume that in this case, the training part is stored before the test part. :param more_info_dict: Dictionary with more information that can be stored, for example about the original OpenML dataset id. :param max_n_trainval: maximum number of samples used for training+validation in random splits. If None (default value), no maximum is imposed. """ self.task_desc = task_desc self.n_samples = n_samples self.tensor_infos = tensor_infos self.task_type = TaskType.REGRESSION if tensor_infos['y'].is_cont() else TaskType.CLASSIFICATION self.default_split_idx = default_split_idx self.more_info_dict = more_info_dict or dict() self.max_n_trainval = max_n_trainval def get_n_classes(self) -> int: """ :return: Number of classes for classification, or 0 for regression. 
""" return self.tensor_infos['y'].get_cat_size_product() # we take the product, but it should only be 1 element def load_task(self, paths: Paths) -> 'Task': """ Load the associated task. :param paths: Path configuration. :return: Task object. """ path = paths.tasks_task(self.task_desc) tensors = {} tensors['x_cont'] = torch.as_tensor(np.load(str(path / 'x_cont.npy'))).type(torch.float32) tensors['x_cat'] = torch.as_tensor(np.load(str(path / 'x_cat.npy'))).type(torch.long) tensors['y'] = torch.as_tensor(np.load(str(path / 'y.npy'))).type( torch.long if self.task_type == TaskType.CLASSIFICATION else torch.float32) ds = DictDataset(tensors=tensors, tensor_infos=self.tensor_infos) return Task(task_info=self, ds=ds) def get_ds_size_gb(self) -> float: """ :return: Dataset size in gigabyte, when stored in torch Tensors (8 byte for categorical variables, 4 byte for continuous variables). """ # need 8 byte for categorical variables (torch.long) but only 4 for continuous (torch.float32) return self.n_samples * sum([ti.get_n_features() * (8 if ti.is_cat() else 4) for ti in self.tensor_infos.values()]) / (1024**3) def save(self, paths: Paths): path = paths.tasks_task(self.task_desc) info_dict = {'task_desc': self.task_desc.to_dict(), 'n_samples': self.n_samples, 'tensor_infos': {key: value.to_dict() for key, value in self.tensor_infos.items()}, 'default_split_idx': None if self.default_split_idx is None else int(self.default_split_idx), 'more_info_dict': self.more_info_dict, 'max_n_trainval': self.max_n_trainval} utils.serialize(path / 'info.yaml', info_dict, use_yaml=True) @staticmethod def load(paths: Paths, task_desc: TaskDescription): info_dict = utils.deserialize(paths.tasks_task(task_desc) / 'info.yaml', use_yaml=True) return TaskInfo(task_desc=TaskDescription.from_dict(info_dict['task_desc']), n_samples=info_dict['n_samples'], tensor_infos={key: TensorInfo.from_dict(value) for key, value in info_dict['tensor_infos'].items()}, default_split_idx=info_dict['default_split_idx'], more_info_dict=info_dict.get('more_info_dict', dict()), max_n_trainval=info_dict.get('max_n_trainval', None)) @staticmethod def from_ds(task_desc: TaskDescription, ds: DictDataset, default_split_idx: Optional[int] = None, more_info_dict: Optional[Dict] = None) -> 'TaskInfo': return TaskInfo(task_desc=task_desc, n_samples=ds.n_samples, tensor_infos=ds.tensor_infos, default_split_idx=default_split_idx, more_info_dict=more_info_dict) def get_random_splits(self, n_splits: int, trainval_fraction: float = 0.8, train_fraction: float = 0.75) -> List[SplitInfo]: # use n_samples to generate alg_seed # in order to have the randomness also depend on the data set and not only on the split index return [SplitInfo(RandomSplitter(seed=i, first_fraction=trainval_fraction, max_n_first=self.max_n_trainval), SplitType.RANDOM, id=i, alg_seed=utils.combine_seeds(self.n_samples, i), train_fraction=train_fraction) for i in range(n_splits)] def get_default_splits(self, n_splits) -> List[SplitInfo]: if self.default_split_idx is None: return [] else: return [SplitInfo(IndexSplitter(self.default_split_idx), SplitType.DEFAULT, id=i, alg_seed=utils.combine_seeds(self.n_samples, i)) for i in range(n_splits)] class Task: """ Task (dataset with defined target variable), consisting of a task info and a dataset. 
""" def __init__(self, task_info: TaskInfo, ds: DictDataset): self.task_info = task_info self.ds = ds # data is on CPU here def save(self, paths: Paths): path = paths.tasks_task(self.task_info.task_desc) utils.ensureDir(path / 'x_cont.npy') np.save(str(path / 'x_cont.npy'), self.ds.tensors['x_cont'].type(torch.float32).numpy()) np.save(str(path / 'x_cat.npy'), self.ds.tensors['x_cat'].type(torch.int32).numpy()) np.save(str(path / 'y.npy'), self.ds.tensors['y'].type( torch.int32 if self.task_info.task_type == TaskType.CLASSIFICATION else torch.float32).numpy()) self.task_info.save(paths) class TaskPackage: """ Combines information about how to run a task on a benchmark. """ def __init__(self, task_info: TaskInfo, split_infos: List[SplitInfo], n_cv: int, n_refit: int, paths: Paths, rerun: bool, alg_name: str, save_y_pred: bool): self.task_info = task_info self.split_infos = split_infos self.n_cv = n_cv self.n_refit = n_refit self.paths = paths self.rerun = rerun self.alg_name = alg_name self.save_y_pred = save_y_pred ================================================ FILE: pytabkit/bench/data/uci_file_ops.py ================================================ import os as os import re as re import csv as csv import math as math from pathlib import Path import pandas as pandas import numpy as numpy import os.path as path import glob as glob import shutil as shutil import zipfile as zipfile from scipy.io import arff import patoolib as patoolib import sklearn.preprocessing as preprocessing import sklearn.datasets as datasets import urllib.request as urllib2 import time import datetime import codecs import platform import tarfile import gzip import ssl from collections import Counter class UCIVars: # formerly global variables, will be re-set by get_uci.download_all_uci() data_folder = '../data/' raw_data_folder = '../raw-data/' regression_data_folder = '../regression-data/' binary_classification_data_folder = '../bin-class-data/' multiclass_classification_data_folder = '../multi-class-data/' statistics_filename = "../data_statistics.csv" data_group_id = 0 # if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)): # ssl._create_default_https_context = ssl._create_unverified_context #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------- def prepare_new_data_set_group_id(): print("\n==================================================================") UCIVars.data_group_id = UCIVars.data_group_id + 1 #--------------------------------------------------------------------------------------------------- def make_folder(folder): if (os.path.exists(folder) == False): os.mkdir(folder) #--------------------------------------------------------------------------------------------------- def download_and_save(url, filename): data_link = urllib2.urlopen(url) print('Downloading: ' + filename) if os.path.exists(UCIVars.raw_data_folder + filename): os.remove(UCIVars.raw_data_folder + filename) with open(UCIVars.raw_data_folder + filename, 'wb') as output: output.write(data_link.read()) #--------------------------------------------------------------------------------------------------- def unzip_raw_data(filename): zip_ref = 
zipfile.ZipFile(UCIVars.raw_data_folder + filename, 'r') zip_ref.extractall(UCIVars.raw_data_folder) zip_ref.close() #--------------------------------------------------------------------------------------------------- def unrar_raw_data(filename): full_filename = UCIVars.raw_data_folder + filename patoolib.extract_archive(full_filename, outdir = UCIVars.raw_data_folder) #--------------------------------------------------------------------------------------------------- def my_decode(x): if isinstance(x, bytes): return x.decode('utf-8') else: return str(x) #--------------------------------------------------------------------------------------------------- def unarff_raw_data(filename): data = arff.loadarff(UCIVars.raw_data_folder + filename + '.arff')[0] target_filename = UCIVars.raw_data_folder + filename + '.data' data_cleaned = [] for row in data: data_cleaned.append([my_decode(entry) for entry in row]) with open(target_filename, "w") as target_file: writer = csv.writer(target_file, lineterminator = '\n') writer.writerows(data_cleaned) #--------------------------------------------------------------------------------------------------- def un_z_raw_data(filename): if platform.system() == "Linux": os.system('uncompress -f ' + UCIVars.raw_data_folder + filename) return True else: print("Could not decompress .Z file, since this requires Linux.") return False #--------------------------------------------------------------------------------------------------- def untar_raw_data(filename): full_filename = UCIVars.raw_data_folder + filename tar = tarfile.open(full_filename) tar.extractall(UCIVars.raw_data_folder) tar.close() #--------------------------------------------------------------------------------------------------- def ungz_raw_data(filename): full_filename = UCIVars.raw_data_folder + filename target_filename = UCIVars.raw_data_folder + filename + '.data' target_file = open(target_filename, "w") with gzip.open(full_filename, 'rt') as source_file: data = source_file.read() target_file.write(data) target_file.close() #--------------------------------------------------------------------------------------------------- def replace_chars_in_file(filename, old_char, new_char): fr = codecs.open(UCIVars.raw_data_folder + filename, encoding = 'utf-8') content = fr.read() fr.close() newcontent = content.replace(old_char, new_char) fw = codecs.open(UCIVars.raw_data_folder + filename, 'w', encoding = 'utf-8') fw.write(newcontent) fw.close() #--------------------------------------------------------------------------------------------------- def get_category_replace_string(category_size, position, separator): string = '' for i in range(position): string = string + '0' + separator string = string + '1' + separator for i in range(position + 1, category_size): string = string + '0' + separator string = string[0:len(string) - len(separator)] return string #--------------------------------------------------------------------------------------------------- def replace_categories_in_file(filename, categories, separator): for i in range(len(categories)): replace_chars_in_file(filename, categories[i], get_category_replace_string(len(categories), i, separator)) #--------------------------------------------------------------------------------------------------- def convert_replace_string_to_vector(string, separator): string_vector = string.split(separator) return list(numpy.float_(string_vector)) #--------------------------------------------------------------------------------------------------- def 
get_categories_in_mixed_data(data, column): rows = numpy.shape(data)[0] categories = list(set(data[0:rows, column])) return categories #--------------------------------------------------------------------------------------------------- def auto_replace_categories_in_mixed_data(data, column, separator, unknown_string = '', unknown_replacement_value = 0): categories = get_categories_in_mixed_data(data, column) if numpy.shape(categories)[0] == 2: new_data = replace_bin_cats_in_mixed_data(data, categories, column, separator, unknown_string = unknown_string, unknown_replacement_value = unknown_replacement_value) else: new_data = replace_categories_in_mixed_data(data, categories, column, separator, unknown_string = unknown_string, unknown_replacement_value = unknown_replacement_value) return new_data #--------------------------------------------------------------------------------------------------- def auto_replace_missing_in_mixed_data(data, unknown_string = '?'): rows = numpy.shape(data)[0] dim = numpy.shape(data)[1] columns = range(dim) for i in range(len(columns)): count_entries = Counter(data[0:rows, columns[i]]) weighted_sum = 0.0 entries_sum = 0.0 for key in count_entries: if key != unknown_string: weighted_sum = weighted_sum + float(key) * count_entries[key] entries_sum = entries_sum + count_entries[key] average = weighted_sum / float(entries_sum) data = replace_categories_in_mixed_data(data, [], columns[i], ',', unknown_string = '?', unknown_replacement_value = average) return data #--------------------------------------------------------------------------------------------------- def replace_categories_in_mixed_data(data, categories, column, separator, unknown_string = '', unknown_replacement_value = 0): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = len(categories) * max(1, len(str(unknown_replacement_value))) + (len(categories) - 1) * len(separator) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): replacement = get_category_replace_string(len(categories), i, separator) new_column = [replacement if word == categories[i] else word for word in new_column] if unknown_string != '': replacement = str(unknown_replacement_value) for i in range(len(categories) - 1): replacement = replacement + separator + str(unknown_replacement_value) new_column = [replacement if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_bin_cats_in_mixed_data(data, categories, column, separator, unknown_string = '', unknown_replacement_value = 0): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = max(2, len(str(unknown_replacement_value))) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] if unknown_string != '': replacement = str(unknown_replacement_value) new_column = [replacement if word == unknown_string else word for word in new_column] for i in range(len(categories)): replacement = str(2 * i - 1) new_column = [replacement if word == categories[i] else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], 
new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_ordinals_in_mixed_data(data, categories, column, separator, unknown_string = '', unknown_replacement_value = 0, begin_value = 1): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = max(len(str(unknown_replacement_value)), len(str(len(categories) + 1))) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): replacement = str(i + begin_value) new_column = [replacement if word == categories[i] else word for word in new_column] if unknown_string != '': replacement = str(unknown_replacement_value) new_column = [replacement if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_manual_in_mixed_data(data, categories, column, replacement, separator, unknown_string = '', unknown_replacement_value = 0): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] empty_string_length = max(len(str(unknown_replacement_value)), len(str(len(categories) + 1))) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): new_column = [str(replacement[i]) if word == categories[i] else word for word in new_column] if unknown_string != '': replacement_tmp = str(unknown_replacement_value) new_column = [replacement_tmp if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_circulars_in_mixed_data(data, categories, column, separator, unknown_string = ''): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] decimals = 5 empty_string_length = 2 * (decimals + 3) + len(separator) empty_string = ' ' * empty_string_length new_column = [empty_string] * rows new_column = data[0:rows, column] for i in range(len(categories)): radians = float(i) * 2.0 * math.pi / float(len(categories)) replacement = str(round(math.cos(radians), decimals)) + separator + str(round(math.sin(radians), decimals)) new_column = [replacement if word == categories[i] else word for word in new_column] if unknown_string != '': replacement = str(0.0) + separator + str(0.0) new_column = [replacement if word == unknown_string else word for word in new_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_isodate_by_day_in_mixed_data(data, column): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] old_column = [numpy.datetime64(date) for date in data[0:rows, column]] new_column = [str(date.astype(datetime.datetime).isoweekday()) for date in old_column] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = 
numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def replace_time_by_seconds_in_mixed_data(data, column, sep, rounded = 1): rows = numpy.shape(data)[0] cols = numpy.shape(data)[1] new_column = [str(int(round(float(convert_time_to_seconds(time, sep)) / float(rounded))) * rounded) for time in data[0:rows, column]] new_column = numpy.reshape(new_column, newshape = (rows, 1)) new_data = numpy.concatenate((data[0:rows, 0:column], new_column, data[0:rows, column + 1:cols]), axis = 1) return new_data #--------------------------------------------------------------------------------------------------- def remove_files(folder, filename_pattern): filenames = glob.glob(folder + filename_pattern) for name in filenames: os.remove(name) #--------------------------------------------------------------------------------------------------- def concat_files(source_filename_pattern, target_filename): filenames = glob.glob(source_filename_pattern) if os.path.exists(target_filename): os.remove(target_filename) with open(target_filename,'wb') as target_file: for name in filenames: with open(name,'rb') as source_file: shutil.copyfileobj(source_file, target_file, 1024*1024*10) #--------------------------------------------------------------------------------------------------- def load_mixed_raw_data(filename, sep, header = False): # Some Python versions issue a warning if 'encoding' is not set, while other versions do not know 'encoding' # Pick the one you prefer ... #data = numpy.genfromtxt(UCIVars.raw_data_folder + filename, dtype = None, delimiter = sep) data = numpy.genfromtxt(UCIVars.raw_data_folder + filename, dtype = str, delimiter = sep, encoding = None) if (header == True): data = numpy.delete(data, 0, 0) if len(numpy.shape(data)) == 1: dim = len(data[0]) rows = numpy.shape(data)[0] new_data = [None] * (dim * rows) new_data = numpy.reshape(new_data, newshape = (rows, dim)) for i in range(rows): new_data[i] = list(map(str, data[i])) data = new_data return data #--------------------------------------------------------------------------------------------------- def write_mixed_raw_data(filename, data, sep): with open(filename, mode = 'w') as write_file: writer = csv.writer(write_file, delimiter = sep, quotechar = '', quoting = csv.QUOTE_NONE, escapechar = ' ') writer.writerows(data) # replace_chars_in_file will add the raw_data_path, so we have to remove it from the filename replace_chars_in_file(Path(filename).name, ' ' + sep, sep) #--------------------------------------------------------------------------------------------------- def load_raw_data(filename, sep, description_columns = 0, date_column = -1, date_sep = '', date_order = '', time_column = -1, time_sep = '', german_decimal = False, na_string = '---', show_intermediate = False, header = False): fp = open(UCIVars.raw_data_folder + filename, 'r') number_of_rows = 0 number_of_lines = 0 max_number_of_columns = 0 rows_with_na_string = 0 rows_with_incorrect_date = 0 rows_with_incorrect_time = 0 rows_with_incorrect_number_of_columns = 0 rows_with_odd_error = 0 is_first_line = True for row in fp: if (is_first_line == True) and (header == True): is_first_line = False else: row = row.strip() raw_row = row.split(sep) number_of_columns = numpy.shape(raw_row)[0] max_number_of_columns = max(number_of_columns, max_number_of_columns) number_of_data_columns = number_of_columns - description_columns 
current_row = numpy.zeros(shape = (1, number_of_data_columns)) number_of_lines = number_of_lines + 1 if ((number_of_lines % 1000 == 0) and (show_intermediate == True)): print("Read %d lines" %number_of_lines) correct_row = True for c in range(description_columns, number_of_columns): if (raw_row[c] == na_string): correct_row = False rows_with_na_string = rows_with_na_string + 1 elif (c == date_column): date = raw_row[c].split(date_sep) if (len(date) != 3): correct_row = False rows_with_incorrect_date = rows_with_incorrect_date + 1 else: date_string = date[0] + '-' + date[1] + '-' + date[2] date_fmt = '%' + date_order[0] + '-%' + date_order[1] + '-%' + date_order[2] date_result = datetime.datetime.strptime(date_string, date_fmt) date_tuple = date_result.timetuple() current_row[0, c - description_columns] = float(date_tuple.tm_yday) elif (c == time_column): time = raw_row[c].split(time_sep) if (len(time) != 3): correct_row = False rows_with_incorrect_time = rows_with_incorrect_time + 1 else: current_row[0, c - description_columns] = 3600.0 * float(time[0]) + 60.0 * float(time[1]) + float(time[2]) elif (is_number(raw_row[c], german_decimal) == True): if (german_decimal == False): current_row[0, c - description_columns] = float(raw_row[c]) else: current_row[0, c - description_columns] = float(raw_row[c].replace(',', '.', 1)) elif (raw_row[c] == ''): current_row[0, c - description_columns] = 0.0 else: correct_row = False rows_with_odd_error = rows_with_odd_error + 1 if (number_of_columns != max_number_of_columns): correct_row = False rows_with_incorrect_number_of_columns = rows_with_incorrect_number_of_columns + 1 if (correct_row == False): break if (correct_row == True): number_of_rows = number_of_rows + 1 if (number_of_rows == 1): data = numpy.zeros(shape = (0, number_of_data_columns)) data_block = current_row else: data_block = numpy.concatenate((data_block, current_row), axis = 0) if (number_of_rows == 1000): data = data_block data_block = numpy.zeros(shape = (0, number_of_data_columns)) elif (number_of_rows % 1000 == 0): data = numpy.concatenate((data, data_block), axis = 0) data_block = numpy.zeros(shape = (0, number_of_data_columns)) # Make sure the last block is added if this has not just happened if (number_of_rows % 1000 != 0): data = numpy.concatenate((data, data_block), axis = 0) fp.close() if (number_of_lines - number_of_rows > 0): if (number_of_rows > 0): print("File %s has %d data columns and %d rows with complete data and %d rows with corrupted data" % (filename, numpy.shape(data)[1], number_of_rows, number_of_lines - number_of_rows)) print("Rows with na string: %d" % rows_with_na_string) print("Rows with incorrect date: %d" % rows_with_incorrect_date) print("Rows with incorrect time: %d" % rows_with_incorrect_time) print("Rows with incorrect number of columns: %d" % rows_with_incorrect_number_of_columns) print("Rows with odd error: %d" % rows_with_odd_error) else: print("Could not read a single row!!!\n") quit() else: print("File %s has %d data columns and %d rows" % (filename, numpy.shape(data)[1], number_of_rows)) return data #--------------------------------------------------------------------------------------------------- def remove_rows_with_label(data, label): bad_rows = numpy.where(data[:, 0] == label)[0] if (len(bad_rows) > 0): data = numpy.delete(data, bad_rows, axis = 0) print('Removing %d rows with label %1.3f' % (len(bad_rows), label)) return data #--------------------------------------------------------------------------------------------------- def 
remove_empty_columns(data): min_values = numpy.min(data, axis = 0) max_values = numpy.max(data, axis = 0) value_range = max_values - min_values empty_columns = numpy.where(value_range == 0.0)[0] if (len(empty_columns) > 0): print('Removing %d empty columns' % len(empty_columns)) data = remove_columns(data, empty_columns) return data #--------------------------------------------------------------------------------------------------- def save_data_to_file(data, filename, is_classification, is_regression = True, min_scale = -1.0, max_scale = 1.0): data_stats = {} data_stats['filename'] = filename data = remove_empty_columns(data) number_of_rows = numpy.shape(data)[0] number_of_columns = numpy.shape(data)[1] data_stats['rows'] = number_of_rows data_stats['columns'] = number_of_columns - 1 data_stats['binary columns'] = count_bin_columns(data) print("Writing file %s with dim = %d and %d rows" % (filename, number_of_columns - 1, number_of_rows)) numpy.savetxt(UCIVars.data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') min_values = numpy.min(data, axis = 0) max_values = numpy.max(data, axis = 0) value_range = max_values - min_values for c in range(1, number_of_columns): m = (max_scale - min_scale) / value_range[c] b = min_scale - m * min_values[c] data[:, c] = m * data[:, c] + b if (is_classification == False): min_scale = -1.0 max_scale = 1.0 m = (max_scale - min_scale) / value_range[0] b = min_scale - m * min_values[0] data[:, 0] = m * data[:, 0] + b if (is_regression == True): numpy.savetxt(UCIVars.regression_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') data_stats['classes'] = 0 data_stats['naive'] = numpy.var(data[:, 0]) save_data_stats(data_stats) if (is_classification == True): all_labels = data[:, 0].astype(int) labels, label_counts = numpy.unique(all_labels, return_counts = True) data_stats['classes'] = len(labels) highest_frequency = numpy.max(label_counts) data_stats['naive'] = float(number_of_rows - highest_frequency) / float(number_of_rows) if (len(labels) == 2): m = 2.0 / (labels[1] - labels[0]) b = - (labels[1] + labels[0]) / (labels[1] - labels[0]) data[:, 0] = numpy.floor(m * data[:, 0] + b + 0.5) numpy.savetxt(UCIVars.binary_classification_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') save_data_stats(data_stats) else: numpy.savetxt(UCIVars.multiclass_classification_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter = ',', newline = '\n', header = '', footer = '') save_data_stats(data_stats) second_highest_frequency = numpy.sort(label_counts)[len(labels) - 2] if (highest_frequency != second_highest_frequency): label_1 = labels[numpy.nonzero(label_counts == highest_frequency)[0][0]] label_2 = labels[numpy.nonzero(label_counts == second_highest_frequency)[0][0]] else: label_1 = labels[numpy.nonzero(label_counts == highest_frequency)[0][0]] label_2 = labels[numpy.nonzero(label_counts == highest_frequency)[0][1]] data_1 = data[data[:, 0] == label_1] data_1[:, 0] = -1.0 data_2 = data[data[:, 0] == label_2] data_2[:, 0] = 1.0 data = numpy.concatenate((data_1, data_2), axis = 0) data_stats['classes'] = 2 data_stats['rows'] = highest_frequency + second_highest_frequency data_stats['naive'] = float(second_highest_frequency) / float(data_stats['rows']) if (data_stats['rows'] >= 2500): numpy.savetxt(UCIVars.binary_classification_data_folder + filename + '.csv', data, fmt = '%.8e', delimiter 
= ',', newline = '\n', header = '', footer = '') save_data_stats(data_stats) #--------------------------------------------------------------------------------------------------- def save_data_stats(data_stats): if os.path.exists(UCIVars.statistics_filename): string = '' else: string = 'Name, Rows, Columns, Binary Columns, Classes, Naive Error, Relative Weight\n' string = string + data_stats['filename'] + ', ' + str(data_stats['rows']) + ', ' + str(data_stats['columns']) + ', ' + str(data_stats['binary columns']) + ', ' + str(data_stats['classes']) + ', ' + str(data_stats['naive']) + ', ' + str(UCIVars.data_group_id) + '\n' with open(UCIVars.statistics_filename, "a") as fp: fp.write(string) #--------------------------------------------------------------------------------------------------- def is_number(string, german_decimal): # Idea of this code is taken from # https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float if (german_decimal == False): string = string.replace('.', '', 1) else: string = string.replace(',', '', 1) string = string.replace('e-', '', 1) string = string.replace('e+', '', 1) string = string.replace('E-', '', 1) string = string.replace('E+', '', 1) string = string.replace('-', '', 2) string = string.replace('+', '', 1) return string.isdigit() #--------------------------------------------------------------------------------------------------- def remove_columns(data, columns): return numpy.delete(data, columns, axis = 1) #--------------------------------------------------------------------------------------------------- def move_label_in_front(data, label_column): number_of_rows = numpy.shape(data)[0] labels = numpy.reshape(data[:, label_column], newshape = (number_of_rows, 1)) unlabeled_data = remove_columns(data, [label_column]) data = numpy.concatenate((labels, unlabeled_data), axis = 1) return data #--------------------------------------------------------------------------------------------------- def count_bin_columns(data): cols = numpy.shape(data)[1] count = 0 for i in range(1, cols): if len(set(data[:, i])) == 2: count = count + 1 return count #--------------------------------------------------------------------------------------------------- def convert_time_to_seconds(time, sep): time_tmp = time.split(sep) seconds = 3600 * int(time_tmp[0]) + 60 * int(time_tmp[1]) + int(time_tmp[2]) return seconds ================================================ FILE: pytabkit/bench/eval/__init__.py ================================================ ================================================ FILE: pytabkit/bench/eval/analysis.py ================================================ from typing import Optional, Callable, Tuple, Dict, List, Union import numpy as np import scipy from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.evaluation import FunctionAlgFilter, MultiResultsTable, DefaultEvalModeSelector, TaskWeighting, \ get_ranks from pytabkit.models import utils from pytabkit.models.data.nested_dict import NestedDict class ResultsTables: def __init__(self, paths: Paths): self.paths = paths self.tables = NestedDict() def get(self, coll_name: str, n_cv: int = 1, tag: str = 'paper') -> MultiResultsTable: idxs = (coll_name, n_cv, tag) if idxs in self.tables: return self.tables[idxs] else: # load table from disk task_collection = TaskCollection.from_name(coll_name, self.paths) alg_filter = FunctionAlgFilter(lambda an, tags, config, my_tag=tag: my_tag in tags) table = 
MultiResultsTable.load(task_collection, n_cv=n_cv, paths=self.paths, alg_filter=alg_filter) self.tables[idxs] = table return table def _get_t_mean_confidence_interval_single(values: np.ndarray) -> Tuple[float, float]: # following https://www.geeksforgeeks.org/how-to-calculate-confidence-intervals-in-python/ # see also https://stats.stackexchange.com/questions/358408/confidence-interval-for-the-mean-normal-distribution-or-students-t-distributi # and http://stla.github.io/stlapblog/posts/ModelReduction.html sem = scipy.stats.sem(values) if sem == 0.0: mean = np.mean(values) return mean, mean else: interval = scipy.stats.t.interval(confidence=0.95, df=len(values) - 1, loc=np.mean(values), scale=sem) return interval[0], interval[1] def get_t_mean_confidence_interval(values: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: # takes the confidence intervals across the last dimension, # the other dimensions are considered to be batch dimensions if len(values.shape) == 1: lower, upper = _get_t_mean_confidence_interval_single(values) return np.asarray(lower), np.asarray(upper) pairs = [get_t_mean_confidence_interval(values[i]) for i in range(values.shape[0])] lower = np.asarray([pair[0] for pair in pairs]) upper = np.asarray([pair[1] for pair in pairs]) return lower, upper def get_benchmark_results(paths: Paths, table: MultiResultsTable, coll_name: str, use_relative_score: bool = True, return_percentages: bool = True, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, rel_alg_name: str = 'BestModel', use_ranks: bool = False, use_normalized_errors: bool = False, use_grinnorm_errors: bool = False, use_task_mean: bool = True, use_geometric_mean: bool = True, shift_eps: float = 1e-2, filter_alg_names_list: Optional[List[str]] = None, simplify_name_fn: Optional[Callable[[str], str]] = None, n_splits: int = 10, use_validation_errors: bool = False) -> \ Tuple[ Dict[str, Union[float, np.ndarray]], Dict[str, Tuple[Union[float, np.ndarray], Union[float, np.ndarray]]]]: # returns means and confidence intervals for each alg_name (converted using get_display_name()) # relative confidence intervals for arithmetic mean are a bit wrong # because the uncertainty in the divisor is not incorporated f = (lambda x: np.log(x + shift_eps)) if use_geometric_mean else (lambda x: x) post_f = (lambda x: np.exp(x)) if use_geometric_mean else (lambda x: x) if simplify_name_fn is None: simplify_name_fn = get_simplified_name task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name, use_validation_errors=use_validation_errors) test_table = test_table.rename_algs(simplify_name_fn) # print(f'{test_table.alg_names=}') # print(f'{filter_alg_names_list=}') if filter_alg_names_list is not None: test_table = test_table.filter_algs(filter_alg_names_list) # new code test_table = test_table.filter_n_splits(n_splits) # shape: [n_algs, n_tasks, n_splits] errors = test_table.to_array() if len(errors) == 0: return dict(), dict() # print(f'{errors.shape=}, {errors=}') 
if use_ranks: errors = get_ranks(errors) elif use_normalized_errors: min_arr = np.min(errors, axis=0, keepdims=True) max_arr = np.max(errors, axis=0, keepdims=True) errors = (errors - min_arr) / (max_arr - min_arr + 1e-30) errors = np.clip(errors, 0.0, 1.0) elif use_grinnorm_errors: assert task_type_name in ['class', 'reg'] min_arr = np.min(errors, axis=0, keepdims=True) max_arr = np.quantile(errors, 1.0 if task_type_name == 'class' else 0.9, axis=0, keepdims=True) errors = (errors - min_arr) / (max_arr - min_arr + 1e-30) if task_type_name == 'reg': errors = np.clip(errors, 0.0, 1.0) else: errors = np.clip(errors, 0.0, np.inf) idx_best = test_table.alg_names.index(rel_alg_name) if use_relative_score else 0 use_task_weighting = coll_name.startswith('meta-train') or coll_name.startswith('uci') if use_task_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(test_table.task_infos, separate_task_names).get_task_weights() else: n_tasks = len(test_table.task_infos) task_weights = np.ones(n_tasks) / n_tasks f_errors = f(errors) mean_f_errors = np.mean(f_errors, axis=-1) # print(f'{f_errors.shape=}, {f_errors=}') if use_task_mean: mean_f_errors = mean_f_errors @ task_weights mean_scores = post_f(mean_f_errors) if not use_task_mean: assert not use_relative_score if return_percentages: assert use_relative_score base_f_errors = f_errors[idx_best, None] if use_relative_score else np.zeros_like(f_errors) rel_f_errors = f_errors - base_f_errors # print(f'{rel_f_errors.shape=}, {rel_f_errors=}') mean_rel_f_errors = np.mean(rel_f_errors, axis=-1) if use_task_mean: mean_rel_f_errors = mean_rel_f_errors @ task_weights if use_task_mean: # take the weighted mean over tasks first, then compute the t-based confidence interval across splits rel_f_errors = np.einsum('ats,t->as', rel_f_errors, task_weights) lower_rel_mean_f_errors, upper_rel_mean_f_errors = get_t_mean_confidence_interval(rel_f_errors) lower_rel_mean_scores = post_f(lower_rel_mean_f_errors) upper_rel_mean_scores = post_f(upper_rel_mean_f_errors) rel_mean_scores = post_f(mean_rel_f_errors)
def transform(scores: np.ndarray) -> np.ndarray: if use_relative_score and not use_geometric_mean: # we computed the arithmetic mean of the difference, so normalize and add 1 scores = scores / mean_scores[idx_best, None] + 1.0 if return_percentages: scores = 100 * (scores - 1.0) return scores # print(f'{rel_mean_scores=}') scores = transform(rel_mean_scores) lower_scores = transform(lower_rel_mean_scores) upper_scores = transform(upper_rel_mean_scores) # print(f'{scores=}') scores_dict = {alg_name: score for alg_name, score in zip(test_table.alg_names, scores)} intervals_dict = {alg_name: (lower, upper) for alg_name, lower, upper in zip(test_table.alg_names, lower_scores, upper_scores)} return scores_dict, intervals_dict
def get_opt_groups(task_type_name: str) -> Dict[str, List[str]]: """ Generates groups of methods that should be evaluated. :param task_type_name: 'class' or 'reg' :return: A dict of lists {alg_group_name: [alg_name_1, alg_name_2, ...]} """ opt_groups = utils.join_dicts(get_ensemble_groups(task_type_name), { '_LGBM-HPO+TD': ['LGBM-HPO', f'LGBM-TD-{task_type_name}'], '_XGB-HPO+TD': ['XGB-HPO', f'XGB-TD-{task_type_name}'], '_CatBoost-HPO+TD': ['CatBoost-HPO', f'CatBoost-TD-{task_type_name}'], '_RealMLP-HPO+TD': ['RealMLP-HPO', f'RealMLP-TD-{task_type_name}'], '_MLP-HPO+TD': ['MLP-HPO', f'MLP-TD-{task_type_name}'], '-TD_val-ce': [f'RealMLP-TD-{task_type_name}_val-ce_no-ls', f'XGB-TD-{task_type_name}_val-ce', f'LGBM-TD-{task_type_name}_val-ce', f'CatBoost-TD-{task_type_name}_val-ce'], '-D_val-ce': [f'MLP-PLR-D-{task_type_name}_val-ce', f'XGB-D-{task_type_name}_val-ce', f'LGBM-D-{task_type_name}_val-ce', f'CatBoost-D-{task_type_name}_val-ce'], }) for method in ['MLP-RTDL-D', 'ResNet-RTDL-D', 'MLP-PLR-D', 'FTT-D', 'TabR-S-D']: opt_groups[f'_{method}_prep'] = [f'{method}-{task_type_name}', f'{method}-{task_type_name}_rssc'] return opt_groups
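# Illustrative sketch (not exhaustive): for task_type_name='class', get_opt_groups returns entries like
#   {'_GBDTs-TD': ['XGB-TD-class', 'LGBM-TD-class', 'CatBoost-TD-class'],  # from get_ensemble_groups below
#    '_LGBM-HPO+TD': ['LGBM-HPO', 'LGBM-TD-class'],
#    '_TabR-S-D_prep': ['TabR-S-D-class', 'TabR-S-D-class_rssc'], ...}
# Each group is later turned into a 'BestModel<group_name>' entry in get_benchmark_results that picks,
# per task and split, the group member with the best validation error.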
def get_ensemble_groups(task_type_name: str) -> Dict[str, List[str]]: """ Generates groups of methods that should be evaluated. :param task_type_name: 'class' or 'reg' :return: A dict of lists {alg_group_name: [alg_name_1, alg_name_2, ...]} """ return { '_GBDTs-TD': [f'XGB-TD-{task_type_name}', f'LGBM-TD-{task_type_name}', f'CatBoost-TD-{task_type_name}'], '-TD': [f'XGB-TD-{task_type_name}', f'LGBM-TD-{task_type_name}', f'CatBoost-TD-{task_type_name}', f'RealMLP-TD-{task_type_name}'], '_GBDTs-HPO': ['XGB-HPO', 'LGBM-HPO', 'CatBoost-HPO'], # 'GBDTs-HPO_MLP-HPO': ['XGB-HPO', 'LGBM-HPO', 'CatBoost-HPO', 'MLP-HPO'], # todo: duplicate '-HPO': ['XGB-HPO', 'LGBM-HPO', 'CatBoost-HPO', 'RealMLP-HPO'], '_MLP-TD_MLP-TD-S': [f'RealMLP-TD-{task_type_name}', f'RealMLP-TD-S-{task_type_name}'], '-D': ['XGB-D', 'LGBM-D', 'CatBoost-D', f'MLP-PLR-D-{task_type_name}'], }
def get_simplified_name(alg_name: str): alg_name = alg_name.replace(' [bag-1]', '') alg_name = alg_name.replace('-class', '').replace('-reg', '') # the remaining replacements are left to get_display_name, which is applied after merging these names with the names from the runtime measurements # alg_name = alg_name.replace('RF-SKL', 'RF') # alg_name = alg_name.replace('-RTDL', '') # alg_name = alg_name.replace('_val-ce', '') # if alg_name == 'XGBoost-HPO': # return 'XGB-HPO' # elif alg_name == 'Ensemble_GBDTs-TD_MLP-TD': # return 'Ensemble_TD' # elif alg_name == 'Ensemble_GBDTs-HPO_MLP-HPO': # return 'Ensemble_HPO' return alg_name
def get_display_name(alg_name: str) -> str: alg_name = alg_name.replace('BestModel', 'Best') # alg_name = alg_name.replace('_rssc', '') alg_name = alg_name.replace('_rssc', ' (RS+SC)') alg_name = alg_name.replace('_no-ls', ' (no LS)') alg_name = alg_name.replace('_val-ce', '') alg_name = alg_name.replace('RF-SKL', 'RF') alg_name = alg_name.replace('-RTDL', '') alg_name = alg_name.replace('_best-1-auc-ovr', '') if alg_name.endswith('_prep') and alg_name.startswith('Best_'): alg_name = alg_name[len('Best_'):-len('_prep')] alg_name = alg_name + ' (best of both)' return alg_name
================================================ FILE: pytabkit/bench/eval/colors.py ================================================
from typing import List, Tuple, Callable
def bilin_int(x: float, values: List[Tuple[float, float]]) -> float: # integrates a piecewise-linear interpolation of the values from the first x-value up to x sum_of_integrals = 0.0 x0, y0 = values[0] for x1, y1 in values[1:]: if x <= x0: return sum_of_integrals if x <= x1: y1 = y0 + (x-x0)/(x1-x0)*(y1-y0) x1 = x sum_of_integrals += (x1-x0) * (y1+y0) / 2 x0, y0 = x1, y1 return sum_of_integrals
def bisection_find(f: Callable[[float], float], y: float, xmin: float, xmax: float, n=50) -> float: # finds x with f(x) = y via bisection, assuming f is increasing a = xmin b = xmax c = (a+b)/2 # middle fa = f(a) fb = f(b) fc = f(c) if fa >= y: return a if fb <= y: return b for _ in range(n): if fc >= y: b, fb = c, fc else: a, fa = c, fc c = (a+b)/2 fc = f(c) return c
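# Illustrative example of the two helpers above:
#   f = lambda x: x ** 3                           # increasing on [0, 2]
#   bisection_find(f, y=7.0, xmin=0.0, xmax=2.0)   # ~1.913, since 1.913**3 ~ 7
# more_percep_uniform_hue below uses bisection_find to invert the cumulative integral
# (computed by bilin_int) of an eye-balled perceptual rate-of-change curve, so that
# equal steps in x give roughly equal perceptual steps in hue.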
""" # eye-balled perceptual "rate of change" scores at different hues hue_percep_deriv = [(0, 0.3), (30, 0.6), (60, 1.0), (90, 0.3), (150, 0.3), (180, 0.8), (220, 0.4), (260, 0.4), (280, 0.8), (300, 0.6), (360, 0.3)] f = lambda val: bilin_int(val, hue_percep_deriv) fmax = f(360) return bisection_find(f, x*fmax, 0, 360)/360 ================================================ FILE: pytabkit/bench/eval/evaluation.py ================================================ import distutils.command.build_ext from typing import List, Dict, Any, Tuple, Optional, Callable, Union import numpy as np from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection, TaskInfo from pytabkit.models import utils from pytabkit.models.training.metrics import Metrics class AlgFilter: def __call__(self, alg_name: str, tags: List[str], alg_config: Dict[str, Any]) -> bool: raise NotImplementedError() class FunctionAlgFilter(AlgFilter): def __init__(self, f): self.f = f def __call__(self, alg_name: str, tags: List[str], alg_config: Dict[str, Any]) -> bool: return self.f(alg_name, tags, alg_config) class EvalModeSelector: # base class def select_eval_modes(self, eval_modes: List[Tuple[str, str, str]]) -> List[Tuple[str, Tuple[str, str, str]]]: # gets a list of (cv_type, n_models, start_idx) tuples, returns a sublist of them # but with a suffix-str in for each element raise NotImplementedError() def select(self, alg_name: str, task_results: List) -> Tuple[List[str], List[List]]: # task results should be indexed by [task_idx][split_idx]['cv'/'refit'][str(n_models)][str(start_idx)] # returns a list of alg names and a list new_alg_task_results indexed by [task_idx][split_idx] # determine all combinations that occur in all task results sets = [set((cv_type, n_models, start_idx) for cv_type, d1 in split_dict.items() for n_models, d2 in d1.items() for start_idx, d3 in d2.items()) for task_result in task_results for split_dict in task_result] eval_modes = list(set.intersection(*sets)) # select using function overridden in subclass selected = self.select_eval_modes(eval_modes) # select elements for selected eval modes new_alg_names = [] new_alg_task_results = [] for suffix, (cv_type, n_models, start_idx) in selected: new_alg_names.append(alg_name + suffix) new_alg_task_results.append([[split[cv_type][n_models][start_idx] for split in task_result] for task_result in task_results]) return new_alg_names, new_alg_task_results class DefaultEvalModeSelector(EvalModeSelector): def select_eval_modes(self, eval_modes: List[Tuple[str, str, str]]) -> List[Tuple[str, Tuple[str, str, str]]]: # out of different numbers of ensemble members, # select only the largest ensemble/bagging combinations and single ensemble member result = [] # if ('refit', '1', '0') in eval_modes: # # refit with 1 model, standard # result.append(('', ('refit', '1', '0'))) for name, val in [('bag', 'cv'), ('ens', 'refit')]: modes = [mode for mode in eval_modes if mode[0] == val] if len(modes) > 0: # maximize n_models bag_sizes = [int(mode[1]) for mode in modes] max_cv = np.max(bag_sizes) min_cv = np.min(bag_sizes) bag_sizes = list({max_cv, min_cv}) # only have one element if they're equal for bag_size in bag_sizes: # make sure to always select model '0' to avoid non-determinism result.append((f' [{name}-{bag_size}]', (val, str(bag_size), '0'))) # idx = np.argmax([int(mode[1]) for mode in modes]) # idx_min = np.argmin([int(mode[1]) for mode in modes]) # mode = modes[idx] # result.append((f' 
[{name}-{mode[1]}]', mode)) # if idx_min != idx: # result.append((f' [{name}-{modes[idx_min][1]}]', modes[idx_min])) return result class AlgTaskTable: def __init__(self, alg_names: List[str], task_infos: List[TaskInfo], alg_task_results: List[List[Any]]): self.alg_names = alg_names self.task_infos = task_infos self.alg_task_results = alg_task_results def map(self, f): return AlgTaskTable(self.alg_names, self.task_infos, [[[f(r) for r in splits] for splits in task_results] for task_results in self.alg_task_results]) def filter_n_splits(self, n_splits: int) -> 'AlgTaskTable': """ Limits the number of split results to n_splits and removes all algs where there exists a task with less than n_splits split results. :param n_splits: :return: """ alg_valid = [all(len(split_results) >= n_splits for split_results in task_results) for task_results in self.alg_task_results] alg_names = [alg_name for is_valid, alg_name in zip(alg_valid, self.alg_names) if is_valid] alg_task_results = [[split_results[:n_splits] for split_results in task_results] for is_valid, task_results in zip(alg_valid, self.alg_task_results) if is_valid] return AlgTaskTable(alg_names, self.task_infos, alg_task_results) def to_array(self) -> np.ndarray: return np.asarray(self.alg_task_results) def rename_algs(self, f: Callable[[str], str]) -> 'AlgTaskTable': return AlgTaskTable(alg_names=[f(an) for an in self.alg_names], task_infos=self.task_infos, alg_task_results=self.alg_task_results) def filter_algs(self, alg_names: List[str]) -> 'AlgTaskTable': return AlgTaskTable(alg_names=[an for an in self.alg_names if an in alg_names], task_infos=self.task_infos, alg_task_results=[tr for tr, an in zip(self.alg_task_results, self.alg_names) if an in alg_names]) class MultiResultsTable: def __init__(self, train_table: AlgTaskTable, val_table: AlgTaskTable, test_table: AlgTaskTable, alg_tags: List[List[str]], alg_configs: List[Dict[str, Any]]): # val_table.alg_task_table and test_table.alg_task_table are indexed by # [alg_idx][task_idx][split_idx]['cv'/'refit'][str(n_models)][str(start_idx)][metric_name] self.train_table = train_table self.val_table = val_table self.test_table = test_table self.alg_tags = alg_tags self.alg_configs = alg_configs def get_test_results_table(self, eval_mode_selector: EvalModeSelector, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, alg_group_dict: Optional[Dict[str, AlgFilter]] = None, val_test_groups: Optional[Dict[str, Dict[str, str]]] = None, use_validation_errors: bool = False, use_train_errors: bool = False) \ -> AlgTaskTable: """ :param eval_mode_selector: Decides how to select results from the different available ensembled/bagged results and how to name them :param val_metric_name: Name of the validation metric (used for optimizing over multiple algorithms) :param test_metric_name: Name of the test metric :param alg_group_dict: Optional dictionary of name: alg_filter. For each such pair, an additional algorithm with the given name will be added to the resulting table. Its results are computed as follows: On each split of each task, out of all the algorithms where the alg_filter returns True, the one with the best validation error is chosen, and then its test error is used. :param val_test_groups: Similar to alg_group_dict, but allows to use a different alg for the test score associated with the one with the best validation error. 
Specifically, for name: pairs in val_test_groups.items(), the best validation error among the keys of pairs will be determined, and then the test score of the value associated to this best key will be returned. :param use_validation_errors: If True, use validation errors instead of test errors. :param use_train_errors: If True, use train errors instead of test errors. :return: """ # the selector assigns new alg names (e.g. with [ens-5] for an ensemble) # but the alg_group selects based on configs and new names assert not (use_train_errors and use_validation_errors) # extract only default metric values from self.val_table val_metric_name = val_metric_name or Metrics.default_eval_metric_name(self.val_table.task_infos[0].task_type) test_metric_name = test_metric_name or Metrics.default_eval_metric_name(self.val_table.task_infos[0].task_type) if '1-r2' in [val_metric_name, test_metric_name]: for table in [self.val_table, self.test_table, self.train_table]: table.alg_task_results = utils.map_nested(table.alg_task_results, lambda metrics_dict: utils.join_dicts(metrics_dict, {'1-r2': metrics_dict['nrmse']**2}), dim=6) # tables indexed by [alg_idx][task_idx][split_idx]['cv'/'refit'][str(n_models)][str(start_idx)][metric_name] val_results = utils.select_nested(self.val_table.alg_task_results, val_metric_name, dim=6) if use_validation_errors: test_results = val_results elif use_train_errors: test_results = utils.select_nested(self.train_table.alg_task_results, val_metric_name, dim=6) else: test_results = utils.select_nested(self.test_table.alg_task_results, test_metric_name, dim=6) # take mean over all single model validation scores in cross-validation # now indexed by [alg_idx][task_idx][split_idx] # print(np.asarray(val_results[0][0][0]['cv']['1'].values())) val_results = utils.map_nested(val_results, lambda dct: np.mean(np.asarray(list(dct['cv']['1'].values()))), dim=3) # create new test table by selecting for eval modes (multiple eval modes can be selected for an alg_name) # hence the table can get longer new_alg_names = [] new_alg_task_results = [] # Meaning: new_alg_names[new_alg_idxs[i]] is first algorithm corresponding to self.val_table.alg_names[i] new_alg_idxs = [] for alg_name, task_results in zip(self.test_table.alg_names, test_results): # generates a list of alg names and of alg_task_results an, atr = eval_mode_selector.select(alg_name, task_results) if len(an) == 0: raise RuntimeError(f'No eval mode selected from alg {alg_name}') new_alg_idxs.append(len(new_alg_names)) new_alg_names.extend(an) new_alg_task_results.extend(atr) # test_results_table.alg_task_results is indexed by [alg_idx][task_idx][split_idx] test_results_table = AlgTaskTable(new_alg_names, self.test_table.task_infos, new_alg_task_results) if val_test_groups is None: val_test_groups = dict() if alg_group_dict is not None: more_val_test_groups = {key: {alg_name: alg_name for alg_name, alg_tags, alg_config in zip(self.val_table.alg_names, self.alg_tags, self.alg_configs) if filter(alg_name, alg_tags, alg_config)} for key, filter in alg_group_dict.items()} val_test_groups = utils.join_dicts(val_test_groups, more_val_test_groups) # add algorithms optimized over a group, selecting the one with the best validation score # (or one associated to the best one) group_names = [] group_task_results = [] for group_name, val_test_dict in val_test_groups.items(): if len(val_test_dict) == 0: continue # could happen if the alg_filter does not apply to anything all_alg_names = self.val_table.alg_names val_alg_names = 
list(val_test_dict.keys()) val_alg_idxs = [all_alg_names.index(alg_name) if alg_name in all_alg_names else None for alg_name in val_alg_names] test_alg_idxs = [all_alg_names.index(val_test_dict[alg_name]) if val_test_dict[alg_name] in all_alg_names else None for alg_name in val_alg_names] # print(f'{group_name=}, {val_alg_idxs=}, {test_alg_idxs=}') if None in (val_alg_idxs + test_alg_idxs): continue # not all algs found max_n_splits = np.min([len(splits) for i in (val_alg_idxs + test_alg_idxs) for splits in val_results[i]]) # shape: n_algs x n_tasks x max_n_splits cut_splits = [[splits[:max_n_splits] for splits in val_results[i]] for i in val_alg_idxs] # shape: n_tasks x max_n_splits best_idxs = np.argmin(np.asarray(cut_splits), axis=0) test_atr = test_results_table.alg_task_results group_names.append(group_name) group_task_results.append( [[test_atr[new_alg_idxs[test_alg_idxs[best_idxs[task_idx, split_idx]]]][task_idx][split_idx] for split_idx in range(best_idxs.shape[1])] for task_idx in range(best_idxs.shape[0])]) test_results_table = AlgTaskTable(test_results_table.alg_names + group_names, test_results_table.task_infos, test_results_table.alg_task_results + group_task_results) # # add alg groups - on each task, alg groups take the alg from the group with the best val error # # (val error is always minimized here, not maximized) # if alg_group_dict is not None: # group_names = [] # group_task_results = [] # for key, alg_filter in alg_group_dict.items(): # alg_idxs = [i for i in range(len(self.val_table.alg_names)) # if alg_filter(self.val_table.alg_names[i], self.alg_tags[i], self.alg_configs[i])] # if len(alg_idxs) == 0: # continue # max_n_splits = np.min([len(splits) # for i in alg_idxs # for splits in val_results[i]]) # # shape: n_algs x n_tasks x max_n_splits # cut_splits = [[splits[:max_n_splits] for splits in val_results[i]] # for i in alg_idxs] # # shape: n_tasks x max_n_splits # best_idxs = np.argmin(np.asarray(cut_splits), axis=0) # test_atr = test_results_table.alg_task_results # # group_names.append(key) # group_task_results.append( # [[test_atr[new_alg_idxs[alg_idxs[best_idxs[task_idx, split_idx]]]][task_idx][split_idx] # for split_idx in range(best_idxs.shape[1])] # for task_idx in range(best_idxs.shape[0])]) # test_results_table = AlgTaskTable(test_results_table.alg_names + group_names, test_results_table.task_infos, # test_results_table.alg_task_results + group_task_results) return test_results_table @staticmethod def load(task_collection: TaskCollection, n_cv: int, paths: Paths, alg_filter: Optional[AlgFilter] = None, split_type=SplitType.RANDOM, max_n_splits: Optional[int] = None, max_n_algs: Optional[int] = None): # load only summaries (faster) alg_names = [alg_path.name for alg_path in paths.result_summaries().iterdir()] # now only keep algs where all tasks from task_collection have been evaluated alg_names = [an for an in alg_names if np.all([utils.existsDir(paths.summary_alg_task(task_desc, an, n_cv)) for task_desc in task_collection.task_descs])] print('computed alg names') alg_tags = [utils.deserialize(paths.algs() / alg_name / 'tags.yaml', use_yaml=True) for alg_name in alg_names] alg_configs = [utils.deserialize(paths.algs() / alg_name / 'extended_config.yaml', use_yaml=True) for alg_name in alg_names] if alg_filter is None: alg_filter = lambda an, tags, aw: True alg_dict = {an: (tags, config) for an, tags, config in zip(alg_names, alg_tags, alg_configs) if alg_filter(an, tags, config)} if max_n_algs is not None and max_n_algs >= 0: alg_dict = {key: value for i, 
(key, value) in enumerate(alg_dict.items()) if i < max_n_algs} alg_names = list(alg_dict.keys()) alg_tags = [alg_dict[an][0] for an in alg_names] alg_configs = [alg_dict[an][1] for an in alg_names] task_infos = task_collection.load_infos(paths) # val_metric_name = Metrics.default_metric_name(task_infos[0].task_type) # indexed by # [alg_idx][task_idx]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name][split_idx] alg_task_results = [[utils.deserialize(paths.summary_alg_task(task_desc, alg_name, n_cv) / f'metrics.msgpack.gz', use_msgpack=True, compressed=True)[split_type] for task_desc in task_collection.task_descs] for alg_name in alg_names] # swap split_idx dimension to after task_idx, now indexed by # [alg_idx][task_idx][split_idx]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name] alg_task_results = utils.shift_dim_nested(alg_task_results, 7, 2) if max_n_splits is not None and max_n_splits >= 1: alg_task_results = utils.map_nested(alg_task_results, lambda lst: lst[:max_n_splits] if len(lst) > max_n_splits else lst, 2) def select_valtest(dct: Dict, name: str): # helper function because for the 'refit' results, # we have to take the validation results from the 'cv' part # because 'refit' did not have a validation set if name != 'val': return {key: value[name] for key, value in dct.items()} else: return {key: dct['cv']['val'] for key in dct} tables = {name: AlgTaskTable(alg_names=alg_names, task_infos=task_infos, alg_task_results=utils.map_nested(alg_task_results, lambda dct: select_valtest(dct, name), dim=3)) for name in ['train', 'val', 'test']} # does not work since 'refit' does not have 'val' # tables = [AlgTaskTable(alg_names=alg_names, task_infos=task_infos, # alg_task_results=utils.select_nested(alg_task_results, name, dim=4)) # for name in ['val', 'test']] return MultiResultsTable(train_table=tables['train'], val_table=tables['val'], test_table=tables['test'], alg_tags=alg_tags, alg_configs=alg_configs) class TableAnalyzer: def __init__(self, post_f: Optional[Callable[[float], float]] = None): self.post_f = post_f or (lambda x: x) def _print_table(self, alg_names: List[str], means, stds=None, is_higher_better: bool = False, perm: Optional[np.ndarray] = None): means = np.asarray(means) if perm is None: perm = np.argsort(means) if is_higher_better: perm = perm[::-1] means = means[perm] alg_names = [alg_names[i] for i in perm] if stds is None: str_table = [[an + ': ', f'{self.post_f(m):6.4f}'] for an, m in zip(alg_names, means)] else: stds = np.asarray(stds)[perm] str_table = [[an + ': ', f'{self.post_f(m):6.4f} ', f'[{self.post_f(m - 2 * s):6.4f}, {self.post_f(m + 2 * s):6.4f}]'] for an, m, s in zip(alg_names, means, stds)] print(utils.pretty_table_str(str_table)) def print_analysis(self, alg_task_table: AlgTaskTable): raise NotImplementedError() class TaskWeighting: def __init__(self, task_infos: List[TaskInfo], separate_task_names: Optional[List[str]]): """ Computes a weighting of tasks, downweighting tasks that have similar tasks. :param task_infos: Task infos. 
:param separate_task_names: Names of tasks that should not be grouped together with other tasks. """ self.task_infos = task_infos separate_task_names = separate_task_names or [] # keep the full task names here so that the separate_task_names check below can match them task_names = [task_info.task_desc.task_name for task_info in task_infos] task_prefixes = [task_name if task_name in separate_task_names else task_name.split('_')[0] for task_name in task_names] self.prefix_counts = {} for prefix in task_prefixes: if prefix in self.prefix_counts: self.prefix_counts[prefix] += 1 else: self.prefix_counts[prefix] = 1 self.task_weights = np.asarray([1.0 / self.prefix_counts[prefix] for prefix in task_prefixes]) self.task_weights /= np.sum(self.task_weights) def get_n_groups(self) -> int: return len(self.prefix_counts) def get_task_weights(self) -> np.ndarray: return self.task_weights
class MeanTableAnalyzer(TableAnalyzer): def __init__(self, f=None, use_weighting=False, separate_task_names: Optional[List[str]] = None, post_f=None): super().__init__(post_f=post_f) self.f = f self.use_weighting = use_weighting self.separate_task_names = separate_task_names def print_analysis(self, alg_task_table: AlgTaskTable) -> None: if self.use_weighting: task_weights = TaskWeighting(alg_task_table.task_infos, self.separate_task_names).get_task_weights() # task_weights = get_task_weights(alg_task_table.task_infos) else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) alg_task_results = alg_task_table.alg_task_results # if self.f is not None: # alg_task_results = [[[self.f(x) for x in c] for c in b] for b in alg_task_results] means = [np.dot(task_weights, [np.mean(splits) for splits in task_results]) for task_results in alg_task_results] stds = [np.sqrt(np.dot(task_weights ** 2, [np.std(splits) ** 2 / len(splits) for splits in task_results])) for task_results in alg_task_results] self._print_table(alg_task_table.alg_names, means, stds) def get_means(self, alg_task_table: AlgTaskTable) -> List[float]: if self.use_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(alg_task_table.task_infos, separate_task_names).get_task_weights() else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) alg_task_results = alg_task_table.alg_task_results return [self.post_f(np.dot(task_weights, [np.mean(splits) for splits in task_results])) for task_results in alg_task_results] def get_intervals(self, alg_task_table: AlgTaskTable, std_factor: float = 2.0) -> List[Tuple[float, float]]: # e.g. 
if std_factor=2, then the +-2 sigma interval will be used if self.use_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(alg_task_table.task_infos, separate_task_names).get_task_weights() else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) alg_task_results = alg_task_table.alg_task_results means = [np.dot(task_weights, [np.mean(splits) for splits in task_results]) for task_results in alg_task_results] stds = [np.sqrt(np.dot(task_weights ** 2, [np.std(splits) ** 2 / len(splits) for splits in task_results])) for task_results in alg_task_results] post_intervals = [(self.post_f(mean - std_factor * std), self.post_f(mean + std_factor * std)) for mean, std in zip(means, stds)] return post_intervals class ArrayTableAnalyzer(TableAnalyzer): """ Intermediate class that analyzes using the same number of splits for each method """ def __init__(self, f=None, use_weighting=False, separate_task_names: Optional[List[str]] = None, post_f=None): super().__init__(post_f=post_f) self.f = f self.use_weighting = use_weighting self.separate_task_names = separate_task_names def _is_higher_better(self) -> bool: # can be overridden if necessary return False def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # optional second tuple can be the permutation of configurations that should be used for displaying them raise NotImplementedError() def print_analysis(self, alg_task_table: AlgTaskTable, val_table: Optional[AlgTaskTable] = None) -> None: if self.use_weighting: task_weights = TaskWeighting(alg_task_table.task_infos, self.separate_task_names).get_task_weights() # task_weights = get_task_weights(alg_task_table.task_infos) else: n = len(alg_task_table.task_infos) task_weights = np.ones(n) / n if self.f is not None: alg_task_table = alg_task_table.map(self.f) if val_table is not None: val_table = val_table.map(self.f) alg_task_results = alg_task_table.alg_task_results # if self.f is not None: # alg_task_results = [[[self.f(x) for x in c] for c in b] for b in alg_task_results] min_n_splits = np.min([len(splits) for task_results in alg_task_results for splits in task_results]) loss_arr = np.asarray([[splits[:min_n_splits] for splits in task_results] for task_results in alg_task_results]) val_loss_arr = None if val_table is not None: val_loss_arr = np.asarray( [[splits[:min_n_splits] for splits in task_results] for task_results in val_table.alg_task_results]) results_arr = self._process_losses(loss_arr, val_loss_arr) perm = None if isinstance(results_arr, Tuple): results_arr, perm = results_arr means = np.mean(results_arr, axis=-1) @ task_weights # todo: could implement better confidence intervals from plotting code stds = np.sqrt((np.std(results_arr, axis=-1) ** 2 / results_arr.shape[-1]) @ (task_weights ** 2)) self._print_table(alg_task_table.alg_names, means, stds, is_higher_better=self._is_higher_better(), perm=perm) class WinsTableAnalyzer(ArrayTableAnalyzer): def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: return (loss_arr == np.min(loss_arr, axis=0, keepdims=True)).astype(np.float32) def _is_higher_better(self) -> bool: return True def get_ranks(values: np.ndarray) -> np.ndarray: # computes ranks across the first axis return np.sum(values[:, None] > values[None, :], 
axis=1) + 1 # ranks_per_method = [] # for i in range(values.shape[0]): # ranks_per_method.append(np.sum((values[i, None] > values).astype(np.int32), axis=0) + 1) # return np.stack(ranks_per_method, axis=0) class RankTableAnalyzer(ArrayTableAnalyzer): def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: return get_ranks(loss_arr) class NormalizedLossTableAnalyzer(ArrayTableAnalyzer): def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: min_arr = np.min(loss_arr, axis=0, keepdims=True) max_arr = np.max(loss_arr, axis=0, keepdims=True) return (loss_arr - min_arr) / (max_arr - min_arr + 1e-30) class GreedyAlgSelectionTableAnalyzer(ArrayTableAnalyzer): """ Greedy selection of a portfolio of methods such that the addition improves the best performance in the portfolio the most """ def _process_losses(self, loss_arr: np.ndarray, val_loss_arr: Optional[np.ndarray]) \ -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # val_loss_arr = loss_arr # todo assert val_loss_arr is not None n_algs = loss_arr.shape[0] non_selected_algs = np.arange(n_algs) # alg_selected = np.zeros(loss_arr.shape[0], dtype=np.bool_) perm = [] for i in range(loss_arr.shape[0]): # losses are updated, tracking the loss of the alg after optimizing over best models and the given one # find best model best_non_selected_idx = np.argmin(np.mean(val_loss_arr, axis=(1, 2))[non_selected_algs]) best_idx = non_selected_algs[best_non_selected_idx] perm.append(best_idx) non_selected_algs = np.concatenate( [non_selected_algs[:best_non_selected_idx], non_selected_algs[best_non_selected_idx + 1:]], axis=0) for alg_idx in non_selected_algs: is_better = val_loss_arr[best_idx] <= val_loss_arr[alg_idx] val_loss_arr[alg_idx] = np.where(is_better, val_loss_arr[best_idx], val_loss_arr[alg_idx]) loss_arr[alg_idx] = np.where(is_better, loss_arr[best_idx], loss_arr[alg_idx]) return loss_arr, np.asarray(perm, dtype=np.int32) def alg_results_str(alg_task_table: AlgTaskTable, alg_name: str): alg_task_results = alg_task_table.alg_task_results if alg_name not in alg_task_table.alg_names: alg_name = alg_name + ' [bag-1]' # todo: could throw an exception alg_idx = alg_task_table.alg_names.index(alg_name) task_results = alg_task_results[alg_idx] means = [np.mean(splits) for splits in task_results] stds = [np.std(splits) / np.sqrt(len(splits)) for splits in task_results] task_names = [str(task_info.task_desc) for task_info in alg_task_table.task_infos] str_table = [[f'Task ', 'Error', 'Interval']] for name, mean, std in zip(task_names, means, stds): str_table.append([f'{name}: ', f'{mean:6.4f} ', f'[{mean - 2 * std:6.4f}, {mean + 2 * std:6.4f}]']) return utils.pretty_table_str(str_table) def alg_comparison_str(alg_task_table: AlgTaskTable, alg_names: List[str]): alg_task_results = alg_task_table.alg_task_results alg_names = [an if an in alg_task_table.alg_names else an + ' [bag-1]' for an in alg_names] # todo: could throw an exception alg_idxs = [alg_task_table.alg_names.index(alg_name) for alg_name in alg_names] means = [[np.mean(splits) for splits in alg_task_results[alg_idx]] for alg_idx in alg_idxs] task_names = [str(task_info.task_desc) for task_info in alg_task_table.task_infos] str_table = [[f'Task '] + [f'Alg {i + 1} ' for i in range(len(alg_names))]] for i, name in enumerate(task_names): str_table.append([f'{name}: '] + [f'{alg_means[i]:6.4f} ' for alg_means in means]) 
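# add an empty separator row, then count wins per algorithm: an algorithm wins a task
# if it attains the minimal mean error there; ties count as a win for every tied algorithm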
str_table.append([''] * 3) min_means = [np.min([alg_means[i] for alg_means in means]) for i in range(len(task_names))] n_wins_list = [sum([int(alg_means[i] == min_means[i]) for i in range(len(task_names))]) for alg_means in means] str_table.append(['Wins:'] + [str(n_wins) for n_wins in n_wins_list]) return utils.pretty_table_str(str_table) # CLI: # task collection # n_cv (default=1?) # preference regarding is_cv and ensembling? # optionally whether default splits should be used or not? # tags (connect by and or or?) ================================================ FILE: pytabkit/bench/eval/plotting.py ================================================ import copy from pathlib import Path from typing import List, Dict, Optional, Tuple, Callable import matplotlib import numpy as np import pandas as pd from matplotlib.pyplot import arrow from pytabkit.bench.eval.analysis import get_opt_groups, get_simplified_name, ResultsTables, \ get_benchmark_results, get_display_name from pytabkit.bench.eval.colors import more_percep_uniform_hue matplotlib.use('agg') # matplotlib.use('pdf') matplotlib.rcParams.update({ "pgf.texsystem": "pdflatex", 'font.family': 'serif', 'font.size': 10.95, 'text.usetex': True, 'pgf.rcfonts': False, # 'legend.framealpha': 0.5, 'text.latex.preamble': r'\usepackage{times} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{xcolor}' }) from tueplots import bundles, fonts, fontsizes, figsizes matplotlib.rcParams.update(bundles.icml2022()) matplotlib.rcParams.update(fonts.icml2022_tex()) matplotlib.rcParams.update(fontsizes.icml2022()) matplotlib.rcParams['text.latex.preamble'] = matplotlib.rcParams['text.latex.preamble'] + r'\usepackage{xcolor}' import matplotlib.pyplot as plt import matplotlib.colors as mcolors from matplotlib import patches as mpatches import seaborn as sns from adjustText import adjust_text import matplotlib.patheffects as PathEffects from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.evaluation import MultiResultsTable, DefaultEvalModeSelector, FunctionAlgFilter, TaskWeighting from pytabkit.bench.eval.runtimes import get_avg_train_times from pytabkit.models import utils from pytabkit.models.training.scheduling import get_schedule # import distinctipy # class CustomPalette: # default = distinctipy.get_colors(n_colors=14, # exclude_colors=[(a, b, c) for a in [1, 0.8] for b in [1, 0.8] for c in [1, 0.8]], # pastel_factor=0.5, rng=0) def get_plot_color_idx(alg_name: str): parts = ['BestModel', 'Ensemble', 'MLP-RTDL', 'MLP-PLR', 'RealMLP', 'ResNet', 'FTT', ['TabR', 'RealTabR'], # 'SAINT', 'XGB', 'LGBM', 'CatBoost', # 'GBT', 'RF'] # don't use prefixes and reverse to get better colors for BestModel_FTT-D_prep etc. 
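# matching in reverse order lets the more specific entries later in the list take precedence:
# e.g. 'BestModel_FTT-D_prep' contains both 'BestModel' (index 0) and 'FTT' (index 6),
# and the reversed loop assigns it the 'FTT' color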
for i, part_or_list in reversed(list(enumerate(parts))): lst = part_or_list if isinstance(part_or_list, list) else [part_or_list] for part in lst: if part in alg_name: return i raise ValueError(f'Unknown method: {alg_name}') def gg_color_hue(n: int, saturation: float = 1.0, value: float = 0.65): # hues = np.linspace(13, 375, num=n + 1)[:-1] # exclude the last element to avoid a duplicate of the first color # return [tuple(matplotlib.colors.hsv_to_rgb((h / 360.0, saturation, value)).tolist()) for h in hues] hues = np.linspace(0.0, 1.0, n + 1)[:-1] hues = [more_percep_uniform_hue(hue) for hue in hues] return [tuple(matplotlib.colors.hsv_to_rgb((h, saturation, value)).tolist()) for h in hues] def get_plot_color(alg_name: str): idx = get_plot_color_idx(alg_name) special = ('rssc' in alg_name or 'TPE' in alg_name or 'no-ls' in alg_name) half_special = '_prep' in alg_name colors = gg_color_hue(12, saturation=0.6 if special else (0.8 if half_special else 1.0), value=0.9 if special else (0.775 if half_special else 0.65)) return colors[idx] def coll_name_to_title(coll_name: str) -> str: if coll_name == 'meta-train-class': title = r'Meta-train classification benchmark' elif coll_name == 'meta-train-reg': title = r'Meta-train regression benchmark' elif coll_name == 'meta-test-class': title = r'Meta-test classification benchmark' elif coll_name == 'meta-test-reg': title = r'Meta-test regression benchmark' elif coll_name == 'meta-test-class-no-missing': title = r'$\mathcal{B}^{\mathrm{test}}_{\mathrm{class}}$ without missing value datasets' elif coll_name == 'meta-test-reg-no-missing': title = r'$\mathcal{B}^{\mathrm{test}}_{\mathrm{reg}}$ without missing value datasets' elif coll_name == 'grinsztajn-class-filtered': title = r'Grinsztajn et al.\ (2022) classification benchmark' elif coll_name == 'grinsztajn-reg': title = r'Grinsztajn et al.\ (2022) regression benchmark' else: title = coll_name title = r'\textbf{' + title + r'}' return title def plot_schedule(paths: Paths, filename: str, sched_name: str) -> None: with plt.rc_context(figsizes.icml2022_half()): plt.figure() ts = np.linspace(0.0, 1.0, 400) sched = get_schedule(sched_name) sched_values = [sched.call_time_(t) for t in ts] plt.plot(ts, sched_values, 'tab:blue') plt.xlabel('$t$') plt.ylabel('$f(t)$') # plt.tight_layout() plot_name = paths.plots() / filename utils.ensureDir(plot_name) plt.savefig(plot_name) plt.close() def plot_schedules(paths: Paths, filename: str, sched_names: List[str], sched_labels: List[str]) -> None: with plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=0.4)): plt.figure() ts = np.linspace(0.0, 1.0, 400) for sched_name, sched_label in zip(sched_names, sched_labels): sched = get_schedule(sched_name) sched_values = [sched.call_time_(t) for t in ts] plt.plot(ts, sched_values, label=sched_label) plt.legend(loc='best') plt.xlabel('$t$') plt.ylabel('$f(t)$') # plt.tight_layout() plot_name = paths.plots() / filename utils.ensureDir(plot_name) plt.savefig(plot_name) plt.close() def _create_benchmark_result_plot(file_path: Path, benchmark_results: Dict[str, Dict[str, float]], alg_names: List[str], colors: List): # generated mostly using ChatGPT df = pd.DataFrame(benchmark_results) # Reorder DataFrame based on alg_names df = df.reindex(alg_names) # Plotting # todo: use ICML compatible size fig, axs = plt.subplots(nrows=1, ncols=len(df.columns), figsize=(10, 7), sharey=True) for i, col in enumerate(df.columns): ax = axs[i] values = df[col].values bar_height = 1.0 bar_positions = np.arange(len(df), dtype=np.float64)[::-1] 
* bar_height # Handle empty strings in alg_names to create gaps between bars mask = df.index != '' non_empty_indices = np.where(mask)[0] ax.xaxis.grid(True) # Plot only if the method name is not an empty string non_empty_values = values[mask] non_empty_bar_positions = bar_positions[non_empty_indices] # ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=colors[:len(non_empty_bar_positions)], alpha=0.8, height=bar_height) ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=[colors[j] for j in non_empty_indices], alpha=0.8, height=bar_height) # Add method names on the y-axis # ax.invert_yaxis() # Invert y-axis to have Method A on top ax.tick_params(left=False) ax.set_yticks(bar_positions + 0.5 * bar_height) ax.set_yticklabels(df.index) ax.set_xlabel(r'Error increase in \% vs best') ax.set_title(col) # Remove frame around plot ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Set x-axis ticks and gridlines ax.xaxis.set_ticks_position('bottom') # Highlight x=0 tick and corresponding gridline ax.axvline(x=0, color='black', linewidth=1.5) # Set common labels and adjust layout # fig.text(0.5, 0.04, 'Performance', ha='center') # fig.suptitle('Method Performance Comparison', y=1.05) plt.tight_layout() utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def _create_benchmark_result_plot_with_intervals(file_path: Path, benchmark_results: Dict[str, Dict[str, float]], benchmark_intervals: Dict[str, Dict[str, Tuple[float, float]]], alg_names: List[str], colors: List): n_benchmarks = len(benchmark_results) with plt.rc_context(figsizes.icml2022_full(height_to_width_ratio=1.3)): # Plotting fig, axs = plt.subplots(nrows=1, ncols=n_benchmarks, sharey=True) # for i, col in enumerate(df.columns): for i, (col, results) in enumerate(benchmark_results.items()): ax = axs[i] # values = df[col].values bar_height = 1.0 bar_positions = np.arange(len(alg_names), dtype=np.float64)[::-1] * bar_height # Handle empty strings in alg_names to create gaps between bars # mask = df.index != '' mask = [alg_name != '' for alg_name in alg_names] non_empty_indices = np.where(mask)[0] non_empty_alg_names = [alg_name for alg_name in alg_names if alg_name != ''] values = [results[alg_name] if alg_name in results else 0.0 for alg_name in non_empty_alg_names] ax.xaxis.grid(True) # Plot only if the method name is not an empty string non_empty_values = values non_empty_bar_positions = bar_positions[non_empty_indices] intervals = np.array([benchmark_intervals[col][alg_name] if alg_name in results else (0.0, 0.0) for alg_name in non_empty_alg_names]).transpose() rel_intervals = intervals - non_empty_values errors = np.array([-rel_intervals[0], rel_intervals[1]]) # turn them into (absolute) errors # ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=colors[:len(non_empty_bar_positions)], alpha=0.8, height=bar_height) ax.barh(non_empty_bar_positions, non_empty_values, align='edge', color=[colors[j] for j in non_empty_indices], alpha=0.8, height=bar_height) ax.errorbar(non_empty_values, non_empty_bar_positions + 0.5 * bar_height, xerr=errors, fmt='none', color='black') # Add method names on the y-axis # ax.invert_yaxis() # Invert y-axis to have Method A on top ax.tick_params(left=False) ax.set_yticks(bar_positions + 0.5 * bar_height) ax.set_yticklabels(alg_names) # ax.set_xlabel(r'Error increase in \% vs best ($\downarrow$)') ax.set_title(col) # Remove frame around plot 
ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Set x-axis ticks and gridlines ax.xaxis.set_ticks_position('bottom') # Highlight x=0 tick and corresponding gridline ax.axvline(x=0, color='black', linewidth=1.5) # Set common labels and adjust layout fig.text(0.6, -0.02, r'Error increase in \% vs best ($\downarrow$)', ha='center') # fig.suptitle('Method Performance Comparison', y=1.05) plt.tight_layout() utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def get_equidistant_colors(n: int): cmap = plt.get_cmap('viridis') norm = matplotlib.colors.Normalize(vmin=0, vmax=n - 1) colors = [cmap(norm(i)) for i in range(n)] return colors def plot_benchmark_bars(paths: Paths, tables: ResultsTables, filename: str = None, coll_names: Optional[List[str]] = None, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, alg_names: Optional[List[str]] = None, simplify_name_fn: Optional[Callable[[str], str]] = None, use_geometric_mean: bool = True, shift_eps: float = 1e-2): benchmark_results = {} benchmark_intervals = {} if coll_names is None: coll_names = ['meta-train-class', 'meta-test-class', 'meta-train-reg', 'meta-test-reg'] for coll_name in coll_names: table = tables.get(coll_name) rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps, simplify_name_fn=simplify_name_fn) benchmark_results[coll_name] = rel_means_dict benchmark_intervals[coll_name] = rel_intervals_dict # ens_group_names = ['GBDTs-TD_MLP-TD', 'MLP-TD_MLP-TD-S', 'GBDTs-HPO', 'GBDTs-TD'] ens_group_names = ['-HPO', '-TD'] ens_alg_names = sum([[f'Ensemble{gn}', f'BestModel{gn}', ''] for gn in ens_group_names], []) # ens_alg_names = ['BestModel_GBDTs-HPO_MLP-HPO', ''] + ens_alg_names # todo # ens_alg_names = ['HPO', ''] + ens_alg_names # todo # single_alg_names = [ # # 'MLP-TD', 'MLP-TD-S', 'MLP-SKLD', '', # 'BestModel_MLP-HPO+TD', 'MLP-HPO', 'MLP-TD', 'MLP-TD-S', '', # 'BestModel_CatBoost-HPO+TD', 'CatBoost-HPO', 'CatBoost-TD', 'CatBoost-D', '', # 'BestModel_LGBM-HPO+TD', 'LGBM-HPO', 'LGBM-TD', 'LGBM-D', '', # 'BestModel_XGB-HPO+TD', 'XGB-HPO', 'XGB-TD', 'XGB-D', '', # 'RF-SKLD', # ] single_alg_names = [ # 'MLP-TD', 'MLP-TD-S', 'MLP-SKLD', '', 'MLP-HPO', 'MLP-TD', 'MLP-TD-S', '', 'CatBoost-HPO', 'CatBoost-TD', 'CatBoost-D', '', 'LGBM-HPO', 'LGBM-TD', 'LGBM-D', '', 'XGB-HPO', 'XGB-TD', 'XGB-D', 'XGB-PBB-D', '', 'RF-SKL-D', ] if alg_names is None: alg_names = ens_alg_names + single_alg_names mean_name = f'geometric_eps-{shift_eps:g}' if use_geometric_mean else 'arithmetic' if filename is None: filename = f'benchmarks_bars_{mean_name}.pdf' file_path = paths.plots() / filename # todo # colors = ['b'] * len(alg_names) colors = get_equidistant_colors(len(alg_names)) _create_benchmark_result_plot_with_intervals(file_path=file_path, benchmark_results=benchmark_results, benchmark_intervals=benchmark_intervals, alg_names=alg_names, colors=colors) # _create_benchmark_result_plot(file_path=file_path, benchmark_results=benchmark_results, alg_names=alg_names, # colors=colors) def plot_scatter_ax(paths: Paths, tables: ResultsTables, ax: matplotlib.axes.Axes, coll_name: str, alg_name_1: str, alg_name_2: str, test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, use_validation_errors: bool = False): task_collection = 
TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' table = tables.get(coll_name=coll_name, n_cv=1, tag='paper') opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: True), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} val_test_groups = {'HPO-on-BestModel-TD': {f'{family}-TD-{task_type_name}': f'{family}-HPO' for family in ['XGB', 'LGBM', 'CatBoost', 'MLP']}} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=use_validation_errors) test_table = test_table.filter_n_splits(n_splits=10) test_table.alg_names = [get_simplified_name(alg_name) for alg_name in test_table.alg_names] test_arr = test_table.to_array() mean_results = np.mean(test_arr, axis=-1) alg_1_results = mean_results[test_table.alg_names.index(alg_name_1)] alg_2_results = mean_results[test_table.alg_names.index(alg_name_2)] with plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=1)): max_err = max(np.max(alg_1_results), np.max(alg_2_results)) lim_err = max_err * 1.02 ax.set_xlim(0.0, lim_err) ax.set_ylim(0.0, lim_err) # ax.set_xscale('symlog') # ax.set_yscale('symlog') ax.plot([0.0, lim_err], [0.0, lim_err], 'k-') ax.scatter(alg_1_results, alg_2_results, color='tab:blue', s=8.0, zorder=3) display_name_1 = get_display_name(alg_name_1) display_name_2 = get_display_name(alg_name_2) if test_metric_name is not None: raise NotImplementedError(f'Correct label for custom test metric name is not implemented') metric = 'Classification error' if task_type_name == 'class' else 'nRMSE' ax.set_xlabel(f'{metric} for {display_name_1}' + r' ($\downarrow$)') ax.set_ylabel(f'{metric} for {display_name_2}' + r' ($\downarrow$)') ax.set_title(coll_name_to_title(coll_name)) # diagonal text version # eps = 0.3 # # upper left text # ax.text(eps*lim_err, (1-eps)*lim_err, f'{alg_name_1} better', # ha="center", va="center", rotation=45, size=11, zorder=-2) # # bottom right text # ax.text((1-eps) * lim_err, eps * lim_err, f'{alg_name_2} better', # ha="center", va="center", rotation=45, size=11, zorder=-2) eps = 0.05 # upper left text ax.text(eps * lim_err, (1 - eps) * lim_err, f'{display_name_1} better', ha="left", va="top", rotation=0, size=11, zorder=-2) # bottom right text ax.text((1 - eps) * lim_err, eps * lim_err, f'{display_name_2} better', ha="right", va="bottom", rotation=0, size=11, zorder=-2) def plot_scatter(paths: Paths, filename: str, tables: ResultsTables, coll_names: List[str], alg_name_1: str, alg_name_2: str, test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, use_validation_errors: bool = False): print(f'Creating scatterplot: {filename}') context_mgr = plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=1)) if len(coll_names) == 1 \ else plt.rc_context(figsizes.icml2022_full( height_to_width_ratio=3 if len(coll_names) == 6 else (2 if len(coll_names) == 4 else 0.5))) with context_mgr: if len(coll_names) == 1: fig, ax = plt.subplots(1, 1) axs_list = [ax] elif len(coll_names) == 2: fig, axs = plt.subplots(1, 2) axs_list = [axs[0], axs[1]] elif len(coll_names) == 4: fig, axs = plt.subplots(2, 2) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] elif len(coll_names) == 6: fig, axs 
= plt.subplots(3, 2) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 0], axs[2, 1]] else: raise ValueError(f'{len(coll_names)=} is not in [1, 2, 4, 6]') for coll_name, ax in zip(coll_names, axs_list): plot_scatter_ax(ax=ax, paths=paths, tables=tables, coll_name=coll_name, alg_name_1=alg_name_1, alg_name_2=alg_name_2, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_validation_errors=use_validation_errors) file_path = paths.plots() / filename utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def _plot_scatter_with_labels(x_dict: Dict[str, float], y_dict: Dict[str, float], y_intervals: Optional[Dict[str, Tuple[float, float]]], ax: matplotlib.axes.Axes, xlabel: str, ylabel: str, title: Optional[str] = None, name_tfm_func: Optional[Callable[[str], str]] = None, plot_pareto_frontier: bool = True, arrow_alg_names: Optional[List[Tuple[str, str]]] = None, pareto_frontier_width: float = 2., alg_names_to_hide: Optional[List[str]] = None): if alg_names_to_hide is None: alg_names_to_hide = [] # First, convert dictionaries to a format suitable for seaborn # take shared models models = list(set(x_dict.keys()).intersection(set(y_dict.keys()))) models.sort() print(f'{models=}') # show models not in both # print("Models not in both x and y dicts") # print(set(x_dict.keys()).symmetric_difference(set(y_dict.keys()))) x_vals = [x_dict[model] for model in models] y_vals = [y_dict[model] for model in models] # Now, create a DataFrame from the dictionaries for easy plotting import pandas as pd df = pd.DataFrame({'model': models, 'x_value': x_vals, 'y_value': y_vals}) # split model into model_name and model_type # replace underscores with - # df['model'] = df['model'].str.replace('_', '-') # df['model_name'] = df['model'].str.split('-', expand=True)[0] def get_model_type(alg_name: str) -> str: if '-HPO' in alg_name: return 'HPO' elif '-TD' in alg_name: return 'TD' # elif '-PBB-D' in alg_name: # return 'PBB-D' elif '-D' in alg_name: return 'D' else: return 'unknown' # df['model_type'] = df['model'].str.split('-', expand=True)[1].str.split('(', expand=True)[0] df['model_type'] = [get_model_type(alg_name) for alg_name in df['model']] df['color'] = [get_plot_color(alg_name) for alg_name in models] df['alpha'] = [1.0 if alg_name not in alg_names_to_hide else 0.0 for alg_name in models] # Set up the figure size and style # fig = plt.figure(figsize=(10, 10)) # fig, ax = plt.subplots(1, 1, figsize=(10, 10)) # sns.set_theme(style="whitegrid", font_scale=2) print(f'{df=}') color_mapping = {color: color for color in df['color'].unique()} # Create the scatter plot ax = sns.scatterplot( x="x_value", y="y_value", hue="color", style="model_type", data=df, s=400, # size of the points palette=color_mapping, markers={'D': 'o', 'TD': 's', 'HPO': 'X', 'PBB-D': 'P'}, # palette='tab10', # palette can be changed as needed legend=False, # No need to draw legend at this point ax=ax, alpha=df['alpha'], ) ax.set_xscale('log') # ax.set_yscale('log') # Get the color of each point to set the color of the text point_colors = ax.collections[0].get_facecolor() if y_intervals is not None: y_intervals_arr = np.array([y_intervals[model] for model in models]) y_errors_arr = np.stack([np.array(y_vals) - y_intervals_arr[:, 0], y_intervals_arr[:, 1] - np.array(y_vals)], axis=1) for x, y, errors, color in zip(x_vals, y_vals, y_errors_arr, point_colors): ax.errorbar(x, y, elinewidth=4, yerr=errors[:, None], fmt='none', color=color) # Prepare to annotate the points texts = [] for i, point in 
enumerate(ax.collections[0].get_offsets()): model_name = df.iloc[i]['model'] if model_name in alg_names_to_hide: continue x, y = point text_color = point_colors[i] # Annotate the model names display_name = model_name if name_tfm_func is not None: display_name = name_tfm_func(display_name) # bold if it's an arrow end is_arrow_end = False if arrow_alg_names is None else any( model_name == end_name for _, end_name in arrow_alg_names) if is_arrow_end: display_name = rf'\textbf{{{display_name}}}' text = ax.text(x, y, display_name, color=text_color, fontsize=20, ha='center', va='center') text.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='white')]) texts.append(text) # Use adjust_text to repel the labels # Use adjust_text to repel the labels from each other and the points adjust_text(texts, x=df['x_value'].values, y=df['y_value'].values, avoid_self=False, expand=(1.15, 1.3), ax=ax, ) x_min, x_max = ax.get_xlim() y_min, y_max = ax.get_ylim() eps = 0.12 text_x = x_min ** (1 - eps) * x_max ** eps text_y = y_min + eps * (y_max - y_min) ax.set_axisbelow(True) # scatter.annotate('lower is better', xy=(text_x, text_y), rotation=) # ax.text(text_x, text_y, "lower is better", # ha="center", va="center", rotation=-45, size=30) ax.text(text_x, text_y, "better", ha="center", va="center", rotation=45, size=30, bbox=dict(boxstyle="larrow,pad=0.5", fc="lightgreen", ec="forestgreen", lw=4), zorder=50) # Set arrow coordinates based on the plot limits # arrow_x = x_min ** 0.1 * x_max ** 0.9 # Adjust 0.1 as needed # arrow_y = y_min + 0.1 * (y_max - y_min) # Adjust 0.1 as needed # # # Set the corrected arrow properties # arrow_props = dict(facecolor='red', edgecolor='red', shrink=0.05, width=2, headwidth=10) # # # Add the arrow to the plot # ax.annotate('', xy=(arrow_x, arrow_y), xytext=(x_min, y_min), # arrowprops=arrow_props, annotation_clip=False) if plot_pareto_frontier: xs = np.array(x_vals) ys = np.array(y_vals) perm = np.argsort(xs) xs = xs[perm] ys = ys[perm] xs_pareto = [xs[0], xs[0]] ys_pareto = [ax.get_ylim()[1], ys[0]] for i in range(1, len(xs)): if ys[i] < ys_pareto[-1]: xs_pareto.append(xs[i]) ys_pareto.append(ys_pareto[-1]) xs_pareto.append(xs[i]) ys_pareto.append(ys[i]) xs_pareto.append(ax.get_xlim()[1]) ys_pareto.append(ys_pareto[-1]) ax.plot(xs_pareto, ys_pareto, '--', color='k', linewidth=pareto_frontier_width, zorder=0.8) if arrow_alg_names is not None: # arrow_head_length = for first, second in arrow_alg_names: if first in alg_names_to_hide or second in alg_names_to_hide: continue x1 = x_dict[first] y1 = y_dict[first] x2 = x_dict[second] y2 = y_dict[second] # plt.arrow(x1, y1, x2-x1, y2-y1, length_includes_head=True, # head_width=0.08, head_length=0.00002) color = get_plot_color(second) color = tuple(list(color) + [0.5]) # add alpha channel # color = tuple(0.5 + 0.5*v for v in color) ax.annotate("", xy=(x2, y2), xytext=(x1, y1), zorder=5, # arrowprops=dict(arrowstyle="->"), arrowprops=dict( # facecolor='#444444', facecolor=color, # width=3.0, headwidth=10.0, headlength=8.0, shrink=0.01, edgecolor='none')) # Set the axis labels ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) if title is not None: ax.set_title(title) # sns.reset_orig() def extend_runtimes(times: Dict[str, float], task_type_name: str, keep_gpu: bool = True) -> Dict[str, float]: times = copy.copy(times) opt_groups = get_opt_groups(task_type_name) # for device in ['CPU', 'GPU']: for device in ['CPU']: # compute HPO times for method_name in ['RealMLP', 'MLP-RTDL', 'MLP-PLR', 'ResNet-RTDL', 'XGB', 'LGBM', 'CatBoost', 
'TabR', 'RF', 'FTT']: if f'{method_name}-HPO-2_{device}' in times: times[f'{method_name}-HPO_{device}'] = (50. / 2.) * times[f'{method_name}-HPO-2_{device}'] elif f'{method_name}-HPO-1_{device}' in times: times[f'{method_name}-HPO_{device}'] = 50. * times[f'{method_name}-HPO-1_{device}'] elif f'{method_name}-TD_{device}' in times: # simple surrogate time print(f'Warning: Guessing HPO time for {method_name} on device {device} from TD time') times[f'{method_name}-HPO_{device}'] = 50 * times[f'{method_name}-TD_{device}'] elif f'{method_name}-S-D_{device}' in times: # simple surrogate time print(f'Warning: Guessing HPO time for {method_name} on device {device} from S-D time') times[f'{method_name}-HPO_{device}'] = 50 * times[f'{method_name}-S-D_{device}'] elif f'{method_name}-D_{device}' in times: # simple surrogate time print(f'Warning: Guessing HPO time for {method_name} on device {device} from D time') times[f'{method_name}-HPO_{device}'] = 50 * times[f'{method_name}-D_{device}'] if f'{method_name}-HPO_{device}' in times: times[f'{method_name}-HPO_best-1-auc-ovr_{device}'] = times[f'{method_name}-HPO_{device}'] # print(f'Warning: Guessing no-ls time for RealMLP on device {device} from ls time') # times[f'RealMLP-TD_no-ls_{device}'] = times[f'RealMLP-TD_{device}'] # times[f'RealMLP-TD-S_no-ls_{device}'] = times[f'RealMLP-TD-S_{device}'] for model in ['XGB', 'LGBM', 'CatBoost']: # simple surrogate times if f'{model}-HPO_{device}' not in times and f'{model}-TD_{device}' in times: print(f'Warning: Guessing HPO time for {model} on device {device} from TD time') times[f'{model}-HPO_{device}'] = 50 * times[f'{model}-TD_{device}'] # raw_names = list(set('_'.join(name.split('_')[:-1]) for name in times)) # print(f'Warning: Guessing additional times') # for name in raw_names: # for new_suffix in ['_no-ls', '_val-ce', '_val-ce_no-ls', '_rssc']: # old_name = f'{name}_CPU' # new_name = f'{name}{new_suffix}_CPU' # if new_name not in times and old_name in times: # times[new_name] = times[old_name] for group_name, alg_names in opt_groups.items(): if group_name not in ['-D', '-TD', '-HPO', '-D_val-ce', '-TD_val-ce'] and not group_name.endswith('_prep'): continue # exclude the other ones for now alg_names = [ alg_name.replace('-class', '').replace('-reg', '') for alg_name in alg_names] alg_device_names = [f'{alg_name}_{device}' for alg_name in alg_names] if all(alg_device_name in times for alg_device_name in alg_device_names): sum_time = sum([times[alg_device_name] for alg_device_name in alg_device_names]) times[f'BestModel{group_name}_{device}'] = sum_time times[f'Ensemble{group_name}_{device}'] = sum_time if not keep_gpu: times = {key: value for key, value in times.items() if not 'GPU' in key} times = {get_simplified_name(key): value for key, value in times.items()} return times def plot_pareto_ax(ax: matplotlib.axes.Axes, paths: Paths, tables: ResultsTables, coll_name: str, alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, use_ranks: bool = False, use_normalized_errors: bool = False, tag: Optional[str] = None, use_geometric_mean: bool = True, use_grinnorm_errors: bool = False, shift_eps: float = 1e-2, use_validation_errors: bool = False, arrow_alg_names: Optional[List[Tuple[str, str]]] = None, plot_pareto_frontier: bool = True, alg_names_to_hide: Optional[List[str]] = None, pareto_frontier_width: float = 2.): print(f'Creating plot for {coll_name}') is_reg = TaskCollection.from_name(coll_name, paths).load_infos(paths)[0].tensor_infos[ 
'y'].get_cat_size_product() == 0 default_metric_name = ('1-r2' if use_grinnorm_errors else 'nrmse') if is_reg else 'class_error' if val_metric_name is None: val_metric_name = default_metric_name if test_metric_name is None: test_metric_name = default_metric_name table = tables.get(coll_name, n_cv=1, tag=tag or 'paper') rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, use_relative_score=False, return_percentages=False, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, filter_alg_names_list=alg_names, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps, use_validation_errors=use_validation_errors) task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' time_coll_name = f'meta-train-{task_type_name}' # get runtimes avg_train_times = get_avg_train_times(paths, time_coll_name, per_1k_samples=True) # print(f'{avg_train_times=}') avg_train_times = extend_runtimes(avg_train_times, task_type_name=task_type_name, keep_gpu=False) # print(f'After extending: {avg_train_times=}') # def tfm_key(key: str) -> str: # return key.replace('_CPU', ' (CPU)').replace('_GPU', ' (GPU)') def tfm_key(key: str) -> str: return key.replace('_CPU', '') avg_train_times = {tfm_key(key): value for key, value in avg_train_times.items()} # remove sklearn MLP if 'MLP-SKL-D' in avg_train_times: del avg_train_times['MLP-SKL-D'] # if 'ResNet-RTDL-D' in avg_train_times: # del avg_train_times['ResNet-RTDL-D'] # convert MLP-HPO runtime # get simplified associated names (without the CPU/GPU thing) # generate ensemble/BestModel runtimes? 
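# Illustrative sketch with made-up numbers (not executed): extend_runtimes above fills in
# missing HPO runtimes from cheaper surrogates. For a hypothetical input {'XGB-TD_CPU': 2.0},
# it would print a warning and add 'XGB-HPO_CPU' = 50 * 2.0 = 100.0 (guessing 50 HPO steps
# from a single TD fit); keys are then simplified via get_simplified_name inside
# extend_runtimes, and tfm_key above strips the remaining '_CPU' suffixes.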
# add CPU/GPU to rel_means_dict keys extended_means_dict = rel_means_dict extended_intervals_dict = rel_intervals_dict print(f'{list(avg_train_times.keys())=}') print(f'{list(extended_means_dict.keys())=}') common_keys = set(avg_train_times.keys()).intersection(set(extended_means_dict.keys())) # avg_train_times = {tfm_alg_name(key): value for key, value in avg_train_times.items() if key in common_keys} # extended_means_dict = {tfm_alg_name(key): value for key, value in extended_means_dict.items() if key in common_keys} # extended_intervals_dict = {tfm_alg_name(key): value for key, value in extended_intervals_dict.items() if # key in common_keys} # extended_means_dict = utils.join_dicts( # *[{f'{key} ({device})': value for key, value in rel_means_dict.items()} for device in ['CPU', 'GPU']] # ) # extended_intervals_dict = utils.join_dicts( # *[{f'{key} ({device})': value for key, value in rel_intervals_dict.items()} for device in ['CPU', 'GPU']] # ) # print('times keys:', sorted(list(avg_train_times.keys()))) # print('means keys:', sorted(list(extended_means_dict.keys()))) # # print(f'x_dict = {avg_train_times}') # print(f'y_dict = {extended_means_dict}') title = coll_name_to_title(coll_name) # coll_name_latex = coll_name # for split_name in ['train', 'test']: # if coll_name == f'meta-{split_name}-{task_type_name}': # coll_name_latex = r'$\mathcal{B}^{\mathrm{' + split_name + r'}}_{\mathrm{' + task_type_name + r'}}$' ylabel = ('Shifted geometric mean' if use_geometric_mean else 'Arithmetic mean') + ' of ' if use_ranks: ylabel = ylabel + r'\textbf{ranks}' else: if use_normalized_errors: ylabel = ylabel + r'\textbf{normalized} ' elif use_grinnorm_errors: ylabel = ylabel + r'\textbf{custom-normalized} ' if task_type_name == 'class': if test_metric_name is None or test_metric_name == 'class_error': ylabel = ylabel + r'\textbf{classification errors}' elif test_metric_name == '1-auc_ovr': ylabel = ylabel + r'\textbf{1-AUC(one-vs-rest)}' elif test_metric_name == 'cross_entropy': ylabel = ylabel + r'\textbf{cross-entropies}' else: raise ValueError(f'Test metric {test_metric_name} not implemented') else: if test_metric_name is None or test_metric_name == 'rmse': ylabel = ylabel + r'\textbf{RMSEs}' elif test_metric_name == 'nrmse': ylabel = ylabel + r'\textbf{nRMSEs}' elif test_metric_name == '1-r2': ylabel = ylabel + r'$1-R^2$' else: raise ValueError(f'Test metric {test_metric_name} not implemented') _plot_scatter_with_labels(avg_train_times, extended_means_dict, y_intervals=extended_intervals_dict, xlabel=r'Average training \textbf{time (CPU)} per 1K samples [s]', # + r' ($\downarrow$)', ylabel=ylabel, ax=ax, title=title, name_tfm_func=get_display_name, arrow_alg_names=arrow_alg_names, plot_pareto_frontier=plot_pareto_frontier, alg_names_to_hide=alg_names_to_hide, pareto_frontier_width=pareto_frontier_width, # ylabel=r'Benchmark score relative to best model', # ylabel=r'Error increase in \% vs best ($\downarrow$)', # title=f'Benchmark scores on {coll_name_latex} vs train time', ) def shorten_coll_names(coll_names: List[str]) -> List[str]: coll_name_dict = {'meta-train-class': 'mtrc', 'meta-train-reg': 'mtrr', 'meta-test-class': 'mtec', 'meta-test-reg': 'mter', 'grinsztajn-class-filtered': 'gcf', 'grinsztajn-reg': 'gr'} short_coll_names = [coll_name if coll_name not in coll_name_dict else coll_name_dict[coll_name] for coll_name in coll_names] return short_coll_names def plot_pareto(paths: Paths, tables: ResultsTables, coll_names: List[str], alg_names: List[str], val_metric_name: Optional[str] = None, 
test_metric_name: Optional[str] = None, use_ranks: bool = False, use_normalized_errors: bool = False, filename: Optional[str] = None, filename_suffix: Optional[str] = None, tag: Optional[str] = None, use_grinnorm_errors: bool = False, use_geometric_mean: bool = True, shift_eps: float = 1e-2, use_validation_errors: bool = False, arrow_alg_names: Optional[List[Tuple[str, str]]] = None, plot_pareto_frontier: bool = True, alg_names_to_hide: Optional[List[str]] = None, subfolder: Optional[str] = None, pareto_frontier_width: float = 2., use_2x3: bool = False): print(f'Plotting pareto plot for {coll_names}') sns.set_theme(style="whitegrid", font_scale=2) if len(coll_names) == 1: fig, ax = plt.subplots(1, 1, figsize=(10, 10)) axs_list = [ax] elif len(coll_names) == 2: fig, axs = plt.subplots(1, 2, figsize=(20, 10)) axs_list = [axs[0], axs[1]] elif len(coll_names) == 3: fig, axs = plt.subplots(1, 3, figsize=(30, 10)) axs_list = [axs[0], axs[1], axs[2]] elif len(coll_names) == 4: fig, axs = plt.subplots(2, 2, figsize=(20, 20)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] elif len(coll_names) == 6: if use_2x3: fig, axs = plt.subplots(2, 3, figsize=(30, 20)) axs_list = [axs[0, 0], axs[1, 0], axs[0, 1], axs[1, 1], axs[0, 2], axs[1, 2]] else: fig, axs = plt.subplots(3, 2, figsize=(20, 30)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 0], axs[2, 1]] else: raise ValueError(f'{len(coll_names)=} is not in [1, 2, 3, 4, 6]') for coll_name, ax in zip(coll_names, axs_list): # print(f'{val_metric_name=}, {test_metric_name=}, {coll_name=}') plot_pareto_ax(ax=ax, paths=paths, tables=tables, coll_name=coll_name, alg_names=alg_names, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, tag=tag, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps, use_validation_errors=use_validation_errors, arrow_alg_names=arrow_alg_names, plot_pareto_frontier=plot_pareto_frontier, alg_names_to_hide=alg_names_to_hide, pareto_frontier_width=pareto_frontier_width) mean_name = f'geometric_eps-{shift_eps:g}' if use_geometric_mean else 'arithmetic' if use_ranks: mean_name = 'ranks_' + mean_name elif use_normalized_errors: mean_name = 'normerrors_' + mean_name elif use_grinnorm_errors: mean_name = 'grinnormerrors_' + mean_name name_parts = shorten_coll_names(coll_names) + [mean_name] if use_validation_errors: name_parts = ['validation'] + name_parts if use_2x3: name_parts = ['2x3'] + name_parts plots_path = paths.plots() if subfolder is not None: plots_path = plots_path / subfolder if filename is None: file_path = plots_path / f'pareto_{"_".join(name_parts)}.pdf' else: file_path = plots_path / filename if filename_suffix is not None: file_path = file_path.with_stem(f'{file_path.stem}{filename_suffix}') if len(coll_names) in [4, 6]: labels = ['D = defaults {} {} {} {} {} TD = tuned defaults {} {} {} {} {} HPO = hyperparameter optimization', 'Best/Ensemble: out of XGB, LGBM, CatBoost, (Real)MLP'] r = matplotlib.patches.Rectangle((0, 0), 1, 1, fill=False, edgecolor='none', visible=False) fig.legend(handles=[r] * len(labels), labels=labels, fontsize=30, handlelength=0, handletextpad=0, loc='upper center', bbox_to_anchor=(0.5, 0.0), ncol=1) # plt.tight_layout(rect=[0, 0.09, 1.0, 1.0]) utils.ensureDir(file_path) plt.savefig(file_path, bbox_inches='tight') plt.close(fig) sns.reset_orig() print(f'Created plot {file_path}') def plot_winrates(paths: Paths, tables: ResultsTables, coll_name: 
str, alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None): print(f'Plotting winrate matrix plot for {coll_name}') table = tables.get(coll_name) task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name) simplify_name_fn = get_simplified_name test_table = test_table.rename_algs(simplify_name_fn) test_table = test_table.filter_algs(alg_names) use_task_weighting = coll_name.startswith('meta-train') or coll_name.startswith('uci') if use_task_weighting: separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] task_weights = TaskWeighting(test_table.task_infos, separate_task_names).get_task_weights() else: n_tasks = len(test_table.task_infos) task_weights = np.ones(n_tasks) / n_tasks n_splits = 10 test_table = test_table.filter_n_splits(n_splits) # shape: [n_algs, n_tasks, n_splits] errors = test_table.to_array() # do it once with < and once with <= to make sure that ties count as half a win wins_tensor = 0.5 * ((errors[:, None] <= errors[None, :]).astype(np.float32) + (errors[:, None] < errors[None, :]).astype(np.float32)) avg_wins_per_task = np.mean(wins_tensor, axis=-1) # average over splits # average wins by task weights winrate_matrix = np.einsum('ijt,t->ij', avg_wins_per_task, task_weights) win_percentage_matrix = 100.0 * winrate_matrix perm = np.argsort(np.mean(win_percentage_matrix, axis=-1)) # sort by average winrate win_percentage_matrix = win_percentage_matrix[perm, :][:, perm] alg_names = [test_table.alg_names[i] for i in perm] alg_names = [alg_name.replace('_', r'\_') for alg_name in alg_names] # with matplotlib.rc_context(): # Create a heatmap using seaborn fig = plt.figure(figsize=(10, 8)) sns.set_theme(style="white", font_scale=0.6) mask = np.eye(win_percentage_matrix.shape[0], dtype=bool) heatmap = sns.heatmap(win_percentage_matrix, annot=True, fmt=".1f", cmap="YlGnBu", vmin=0, vmax=100, linewidths=0.5, mask=mask, square=True, cbar_kws={"shrink": 0.8}) display_alg_names = [get_display_name(an) for an in alg_names] # Set labels for rows and columns heatmap.set_xticklabels(display_alg_names, rotation=90, fontsize=8) # Adjust font size heatmap.set_yticklabels(display_alg_names, rotation=0, fontsize=8) # Adjust font size # Remove x and y labels heatmap.set_xlabel('') heatmap.set_ylabel('') # Add a label to the color scale cbar = heatmap.collections[0].colorbar # cbar.set_label("Percentage of row wins", fontsize=10) heatmap.set_title(coll_name_to_title(coll_name) + ', percentage of row wins', fontsize=15) file_path = paths.plots() / f'winrate_matrix_{coll_name}.pdf' utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) sns.reset_orig() def plot_stopping_ax(ax: plt.Axes, paths: Paths, tables: ResultsTables, method: str, classification: bool): esr_list = [10, 20, 50, 100, 300, 1000] ax.set_xscale('log') ax.plot([10, 1000], [0.0, 0.0], 'k--') if classification: combinations = [('meta-train-class', 'stopped on classification error', '', 
'tab:blue'), ('meta-train-class', 'stopped on Brier loss', '_val-brier', 'tab:orange'), ('meta-train-class', 'stopped on cross-entropy loss', '_val-ce', 'tab:green')] else: combinations = [('meta-train-reg', 'stopped on RMSE', '', 'tab:blue')] for coll_name, label, suffix, color in combinations: table = tables.get(coll_name, n_cv=1, tag='paper_early_stopping') # print(f'{table.test_table.alg_names=}') rel_alg_name = method + '_esr-1000' # stopped on standard metric rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [method + suffix + f'_esr-{esr}' for esr in esr_list] results_list = [rel_results[alg_name] for alg_name in alg_names] lower_list = [rel_intervals[alg_name][0] for alg_name in alg_names] upper_list = [rel_intervals[alg_name][1] for alg_name in alg_names] ax.plot(esr_list, results_list, '.-', color=color, label=label) ax.fill_between(esr_list, lower_list, upper_list, color=color, alpha=0.3) ax.set_xlabel('Stopping patience') ax.set_xticks(esr_list, labels=[str(esr) for esr in esr_list]) ax.grid(True) def plot_stopping(paths: Paths, tables: ResultsTables, classification: bool): print(f'Generating stopping plot') with plt.rc_context(figsizes.icml2022_full(height_to_width_ratio=0.9)): fig, axs = plt.subplots(1, 3, sharey='all') for i, method in enumerate(['XGB-TD', 'LGBM-TD', 'CatBoost-TD']): ax = axs[i] ax.set_title(method) plot_stopping_ax(ax, paths, tables, method=method, classification=classification) # axs[1].legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3) axs[0].set_ylabel(r'Error increase in \%') fig.legend(*axs[0].get_legend_handles_labels(), loc='upper center', bbox_to_anchor=(0.5, 0.15), ncol=3) task_type_name = 'class' if classification else 'reg' file_path = paths.plots() / f'stopping_{task_type_name}.pdf' plt.tight_layout(rect=[0, 0.15, 1.0, 1.0]) if classification: y_min, y_max = axs[0].get_ylim() y_max = min(y_max, 15) axs[0].set_ylim(y_min, y_max) utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def get_equidistant_blue_colors(n: int): # cmap = plt.get_cmap('viridis') cmap = sns.color_palette("ch:s=.25,rot=-.25", n) # cmap = sns.color_palette("viridis", n) # norm = matplotlib.colors.Normalize(vmin=0, vmax=n - 1) # colors = [cmap(norm(i)) for i in range(n)] colors = [cmap[i] for i in range(n)] return colors def _create_cumul_abl_plot(file_path: Path, benchmark_results: Dict[str, Dict[str, float]], benchmark_intervals: Dict[str, Dict[str, Tuple[float, float]]], alg_names: List[str], colors: List, contribs: List[str], improv_groups: List[str]): n_benchmarks = len(benchmark_results) n_improvements = len(list(benchmark_results.values())[0]) start_color = mcolors.to_rgb('tab:blue') # Color for vanilla MLP end_color = mcolors.to_rgb('tab:green') # Color for final MLP gradient_colors = [mcolors.to_hex(c) for c in np.linspace(start_color, end_color, n_improvements)] start_alpha = 0.3 end_alpha = 0.6 alpha_cumulative_list = np.linspace(start_alpha, end_alpha, n_improvements) start_alpha_improvement = 0.65 end_alpha_improvement = 1. 
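# Note on the fade scheme used below: the bar colors interpolate from tab:blue (vanilla MLP)
# to tab:green (final RealMLP), and the newly added segment of each improvement is drawn with
# a higher alpha (alpha_improvement_list) than its cumulative-baseline part
# (alpha_cumulative_list), so the per-step change stands out against the running total.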
alpha_improvement_list = np.linspace(start_alpha_improvement, end_alpha_improvement, n_improvements) with plt.rc_context(figsizes.icml2022_half(height_to_width_ratio=1.5)): # Plotting fig, axs = plt.subplots(nrows=1, ncols=n_benchmarks, sharey=True) # for i, col in enumerate(df.columns): for i, (col, results) in enumerate(benchmark_results.items()): ax = axs[i] # values = df[col].values # bar_height = 1.0 # bar_positions = np.arange(len(alg_names), dtype=np.float64)[::-1] * bar_height bar_height = 1.0 bar_positions = np.arange(len(alg_names), dtype=np.float64)[::-1] * (bar_height + 0.1) # Handle empty strings in alg_names to create gaps between bars # mask = df.index != '' mask = [alg_name != '' for alg_name in alg_names] non_empty_indices = np.where(mask)[0] non_empty_alg_names = [alg_name for alg_name in alg_names if alg_name != ''] values = [results[alg_name] if alg_name in results else 0.0 for alg_name in non_empty_alg_names] ax.xaxis.grid(True) # Plot only if the method name is not an empty string non_empty_values = values non_empty_bar_positions = bar_positions[non_empty_indices] print(non_empty_bar_positions) intervals = np.array([benchmark_intervals[col][alg_name] if alg_name in results else (0.0, 0.0) for alg_name in non_empty_alg_names]).transpose() rel_intervals = intervals - non_empty_values errors = np.array([-rel_intervals[0], rel_intervals[1]]) # turn them into (absolute) errors for j in range(len(non_empty_values)): value = non_empty_values[j] last_value = value if j == 0 else non_empty_values[j - 1] ax.barh(non_empty_bar_positions[j], min(value, last_value), align='edge', color=gradient_colors[j], alpha=alpha_cumulative_list[j], height=bar_height) if value > last_value: ax.barh(non_empty_bar_positions[j], value - last_value, left=last_value, align='edge', color=gradient_colors[j], alpha=alpha_improvement_list[j], height=bar_height) elif value < last_value: ax.barh(non_empty_bar_positions[j], last_value - value, left=value, align='edge', color=gradient_colors[j], # color='white', edgecolor='red', hatch='/', linewidth=2, # color='red', fill=False, # color='white', edgecolor='tab:green', hatch='//' * 3, facecolor='none', linewidth=0, alpha=alpha_improvement_list[j], height=bar_height) ax.errorbar(non_empty_values, non_empty_bar_positions + 0.5 * bar_height, xerr=errors, fmt='none', color='gray', linewidth=0.8) # Add method names on the y-axis ax.tick_params(left=False) ax.set_yticks(bar_positions + 0.5 * bar_height) # Get the default font size for y-tick labels default_fontsize = plt.rcParams['ytick.labelsize'] font_properties = {'family': 'sans-serif', 'size': default_fontsize + 1} ax.set_yticklabels(alg_names, fontdict=font_properties) # ax.set_xlabel(r'Error increase in \% vs best ($\downarrow$)') ax.set_title(col) # Remove frame around plot ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Set x-axis ticks and gridlines ax.xaxis.set_ticks_position('bottom') # Highlight x=0 tick and corresponding gridline # ax.axvline(x=-0.09, color='black', linewidth=1.5) #FIXME it'd be better but right now it's a bit off compared to the grid lines color_map = {'New': '#ff7f0e', 'Unusual': '#2ca02c', 'default': (0.35, 0.35, 0.35)} colors_contrib = [color_map.get(key, 'black') for key in contribs] for label, color in zip(ax.get_yticklabels(), colors_contrib): label.set_color(color) max_value = max(max(results.values()) for results in benchmark_results.values()) for ax in axs: 
ax.set_xlim(0, max_value * 1.1) # Add some padding # Identify the unique categories and their y-coordinates # unique values with the right order unique_groups = list(dict.fromkeys(improv_groups)) group_indices = {group: [] for group in unique_groups} # Loop over improvements and store the indices for each group for i, group in enumerate(improv_groups): if group in group_indices: group_indices[group].append(i) # Calculate the bracket positions bracket_positions = {} bracket_widths = {} for group, indices_ in group_indices.items(): # add 1 to the indices to take into account the first bar indices = [i + 1 for i in indices_] start_pos = non_empty_bar_positions[min(indices)] - 0.0 end_pos = non_empty_bar_positions[max(indices)] bracket_positions[group] = (start_pos + end_pos) / 2 + bar_height / 2 + 0.0 bracket_widths[group] = (start_pos - end_pos) * 0.9 + bar_height - 0.4 # Call the draw_bracket function for each unique group text_offset = 0.3 # Offset for the text annotation from the bracket # find the very left of the figure in ax[0] coordinates left = -31.5 # TODO for group in unique_groups: y = bracket_positions[group] width = bracket_widths[group] # show text on the left side of the figure axs[0].annotate(group, xy=(left, y), xytext=(left - text_offset, y), ha='right', va='center', color='black', fontsize='small', rotation=90, arrowprops=dict(arrowstyle=f'-[, widthB={width}, lengthB=0.5', lw=1., color='black'), annotation_clip=False, font_properties={'family': 'sans-serif', 'size': default_fontsize - 1}) # make a legend legend_elements = [mpatches.Patch(facecolor=color_map[key], edgecolor='black', label=key) for key in color_map \ if key != "default"] font_properties = {'family': 'sans-serif', 'size': default_fontsize} fig.legend(handles=legend_elements, loc='lower left', bbox_to_anchor=(0.09, 0.00), prop=font_properties) # # Set common labels and adjust layout fig.text(0.65, -0.02, r'Benchmark score improvement (\%) vs. vanilla', ha='center') # fig.suptitle('Method Performance Comparison', y=1.05) # plt.show() # plt.tight_layout() # break the annotations # utils.ensureDir(file_path) plt.savefig(file_path) plt.close(fig) def plot_cumulative_ablations(paths: Paths, tables: ResultsTables, filename: str = None, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, use_geometric_mean: bool = True, shift_eps: float = 1e-2): print(f'Creating cumulative ablations plot') improvements = { 'vanilla': (r'\textbf{Vanilla MLP}', 'default'), 'robust-scale-smooth-clip': ('Robust scale + smooth clip', 'New', "Preprocessing"), 'one-hot-small-cat': ('One-hot for small cat.', 'default', "Preprocessing"), 'no-early-stop': ('No early stopping', 'default', "Hyperparameters"), 'last-best-epoch': ('Last best epoch', 'Unusual', "Hyperparameters"), 'lr-multi-cycle': (r'$\mathrm{coslog}_4$ lr sched', 'Unusual', "Hyperparameters"), 'beta2-0.95': (r'Adam $\beta_2 = 0.95$', 'Unusual', "Hyperparameters"), 'label-smoothing': (r'Label smoothing (class.)', 'Unusual', "Hyperparameters"), 'output-clipping': (r'Output clipping (reg.)', 'Unusual', "Hyperparameters"), 'ntp': (r'NT parametrization', 'Unusual', "Architecture"), 'different-act': (r'Act. fn. SELU / Mish', 'default', "Architecture"), 'param-act': (r'Parametric act. fn.', 'Unusual', "Architecture"), 'front-scale': (r'Scaling layer', 'New', "Architecture"), 'num-emb-pl': (r'Num. 
embeddings: PL', 'default', "Architecture"), 'num-emb-pbld': (r'PL emb.\ $\to$ PBLD emb.', 'New', "Architecture"), 'alt-pdrop-0.15': (r'Dropout $p=0.15$', 'default', "Regularization"), 'alt-pdrop-flat-cos': (r'Dropout sched: $\mathrm{flat\_cos}$', 'New', "Regularization"), 'alt-wd-0.02': (r'Weight decay wd $= 0.02$', 'default', "Regularization"), 'alt-wd-flat-cos': (r'wd sched: $\mathrm{flat\_cos}$', 'New', "Regularization"), 'alt-bias-init-he+5': (r'Bias init: he+5', 'Unusual', "Initialization"), 'alt-weight-init-std': (r'Weight init: data-driven', 'New', "Initialization"), 'final': (r'\textbf{= RealMLP}', "default") } group_labels = {key: value[0] for key, value in improvements.items()} contribs = [value[1] for key, value in improvements.items()] improv_groups = [value[2] for key, value in improvements.items() if len(value) > 2] coll_names = ['meta-train-class', 'meta-train-reg'] benchmark_results = {} benchmark_intervals = {} for coll_name in coll_names: table = tables.get(coll_name, tag='paper_cumulative_ablations_new') rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, val_metric_name=val_metric_name, test_metric_name=test_metric_name, use_relative_score=False, return_percentages=False, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps) alg_names = list(rel_means_dict.keys()) vanilla_alg_names = [alg_name for alg_name in alg_names if 'vanilla' in alg_name] vanilla_alg_results = [rel_means_dict[alg_name] for alg_name in vanilla_alg_names] best_vanilla_alg_name = vanilla_alg_names[np.argmin(vanilla_alg_results)] # get the results again, but now relative to the best vanilla alg, in percent rel_means_dict, rel_intervals_dict = get_benchmark_results(paths, table=table, coll_name=coll_name, val_metric_name=val_metric_name, test_metric_name=test_metric_name, rel_alg_name=best_vanilla_alg_name, use_geometric_mean=use_geometric_mean, shift_eps=shift_eps) # group different lr values together alg_group_names = [alg_name.split('_')[-2] if len(alg_name.split('_')) >= 2 else '' for alg_name in alg_names] # alg_group_names_unique = list(set(alg_group_names)) rel_means_dict_group = dict() rel_intervals_dict_group = dict() for alg_group_name, display_name in group_labels.items(): # alg names in this group group_alg_names = [an for an, agn in zip(alg_names, alg_group_names) if agn == alg_group_name] if len(group_alg_names) == 0: print(f'No algs for group {alg_group_name}') continue best_alg_name = group_alg_names[np.argmin([rel_means_dict[an] for an in group_alg_names])] print(f'best lr: {best_alg_name.split("_")[-1]} for {alg_group_name}') rel_means_dict_group[alg_group_name] = -rel_means_dict[best_alg_name] low, high = rel_intervals_dict[best_alg_name] rel_intervals_dict_group[alg_group_name] = -high, -low benchmark_results[coll_name] = rel_means_dict_group benchmark_intervals[coll_name] = rel_intervals_dict_group for coll_name in coll_names: for mydict in [benchmark_results, benchmark_intervals]: # copy the last result because we need it twice but we can't have the same dictionary key twice print(f'{list(mydict[coll_name].keys())=}') mydict[coll_name]['final'] = mydict[coll_name]['alt-weight-init-std'] # change keys to descriptions def map_keys(f: Dict, to_be_mapped: Dict): return {f[key]: value for key, value in to_be_mapped.items()} for coll_name in benchmark_results: benchmark_results[coll_name] = map_keys(group_labels, benchmark_results[coll_name]) benchmark_intervals[coll_name] = map_keys(group_labels, benchmark_intervals[coll_name]) 
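# For illustration (hypothetical values): map_keys({'vanilla': 'Vanilla MLP'}, {'vanilla': -3.2})
# returns {'Vanilla MLP': -3.2}, i.e. the short ablation keys are replaced by their display
# labels from group_labels before the results are handed to _create_cumul_abl_plot.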
alg_names = list(benchmark_results['meta-train-class'].keys()) # colors = ['b'] * len(alg_names) colors = get_equidistant_blue_colors(len(list(group_labels.keys()))) # colors = ['tab:blue'] * len(list(group_labels.keys())) if filename is None: filename = f'cumulative_ablations.pdf' file_path = paths.plots() / filename _create_cumul_abl_plot(file_path=file_path, benchmark_results=benchmark_results, benchmark_intervals=benchmark_intervals, alg_names=alg_names, colors=colors, contribs=contribs, improv_groups=improv_groups) def plot_cdd_ax(ax: matplotlib.axes.Axes, paths: Paths, tables: ResultsTables, coll_name: str, alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, tag: Optional[str] = None, use_validation_errors: bool = False): print(f'Creating plot for {coll_name}') table = tables.get(coll_name, n_cv=1, tag=tag or 'paper') simplify_name_fn = get_simplified_name n_splits = 10 task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_type_name = 'class' if task_infos[0].tensor_infos['y'].is_cat() else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=test_metric_name, val_metric_name=val_metric_name, use_validation_errors=use_validation_errors) test_table = test_table.rename_algs(simplify_name_fn) # print(f'{test_table.alg_names=}') # print(f'{filter_alg_names_list=}') test_table = test_table.filter_algs(alg_names) test_table = test_table.filter_n_splits(n_splits) # shape: [n_algs, n_tasks, n_splits] errors = test_table.to_array() errors = np.mean(errors, axis=2) # average over splits # adapted from https://sherbold.github.io/autorank/ data = pd.DataFrame() for i, alg_name in enumerate(test_table.alg_names): data[get_display_name(alg_name)] = errors[i] from autorank import autorank, plot_stats, create_report, latex_table result = autorank(data, alpha=0.05, verbose=False, order='ascending', force_mode='nonparametric') plot_stats(result, ax=ax, allow_insignificant=True) print(create_report(result)) ax.set_title('grinsztajn-class' if coll_name == 'grinsztajn-class-filtered' else coll_name) def plot_cdd(paths: Paths, tables: ResultsTables, coll_names: List[str], alg_names: List[str], val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, filename: Optional[str] = None, tag: Optional[str] = None, use_validation_errors: bool = False): print(f'Plotting critical difference diagram for {coll_names}') old_value = plt.rcParams['text.usetex'] plt.rcParams['text.usetex'] = False # apparently doesn't work with the cdd plot package (autorank) assert len(coll_names) in [1, 2, 4, 6] if len(coll_names) == 1: fig, ax = plt.subplots(1, 1, figsize=(6, 4)) axs_list = [ax] elif len(coll_names) == 2: fig, axs = plt.subplots(1, 2, figsize=(12, 4)) axs_list = [axs[0], axs[1]] elif len(coll_names) == 4: fig, axs = plt.subplots(2, 2, figsize=(12, 8)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] else: fig, axs = plt.subplots(3, 2, figsize=(12, 12)) axs_list = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 0], axs[2, 1]] for coll_name, ax in zip(coll_names, axs_list): plot_cdd_ax(ax=ax, paths=paths, tables=tables, coll_name=coll_name,
alg_names=alg_names, val_metric_name=val_metric_name, test_metric_name=test_metric_name, tag=tag, use_validation_errors=use_validation_errors) name_parts = shorten_coll_names(coll_names) if use_validation_errors: name_parts = ['validation'] + name_parts if filename is None: file_path = paths.plots() / f'cdd_{"_".join(name_parts)}.pdf' else: file_path = paths.plots() / filename utils.ensureDir(file_path) plt.savefig(file_path, bbox_inches='tight') plt.close(fig) plt.rcParams['text.usetex'] = old_value ================================================ FILE: pytabkit/bench/eval/runtimes.py ================================================ from typing import Dict import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.models import utils def get_avg_train_times(paths: Paths, coll_name: str, per_1k_samples: bool = False) -> Dict[str, float]: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) alg_names = [path.name for path in paths.times().iterdir()] result = dict() for alg_name in alg_names: file_paths = [paths.times_alg_task(alg_name, task_desc=task_info.task_desc) / 'times.yaml' for task_info in task_infos] if all(utils.existsFile(file_path) for file_path in file_paths): single_times = [utils.deserialize(file_path, use_yaml=True)['fit_time'] for file_path in file_paths] if per_1k_samples: # use 0.6 since that is the fraction of training samples single_times = [single_time / ((0.6 * task_info.n_samples) / 1000) for single_time, task_info in zip(single_times, task_infos)] mean_time = np.mean(single_times) result[alg_name] = mean_time return result def get_avg_predict_times(paths: Paths, coll_name: str, per_1k_samples: bool = False) -> Dict[str, float]: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) alg_names = [path.name for path in paths.times().iterdir()] result = dict() for alg_name in alg_names: file_paths = [paths.times_alg_task(alg_name, task_desc=task_info.task_desc) / 'times.yaml' for task_info in task_infos] if all(utils.existsFile(file_path) for file_path in file_paths): single_times = [utils.deserialize(file_path, use_yaml=True)['predict_time'] for file_path in file_paths] if per_1k_samples: # use 0.2 since that is the fraction of test samples single_times = [single_time / ((0.2 * task_info.n_samples) / 1000) for single_time, task_info in zip(single_times, task_infos)] mean_time = np.mean(single_times) result[alg_name] = mean_time return result ================================================ FILE: pytabkit/bench/eval/tables.py ================================================ from typing import List, Optional import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.analysis import ResultsTables, get_benchmark_results, get_opt_groups, get_simplified_name, \ get_display_name from pytabkit.bench.eval.evaluation import TaskWeighting, FunctionAlgFilter, MultiResultsTable, DefaultEvalModeSelector from pytabkit.models import utils from pytabkit.models.data.data import TaskType from pytabkit.models.data.nested_dict import NestedDict def _get_table_str(*parts: List[List[str]]): part_rows = [[' & '.join(row) + r' \\' for row in part] for part in parts] n_cols = max(len(row) for part in parts for row in part) begin_table_str = r'\begin{tabular}{' + ('c' * n_cols) + r'}' + '\n' + r'\toprule' end_table_str = r'\bottomrule' + '\n' + r'\end{tabular}' all_row_strs = [begin_table_str] for part
in part_rows[:-1]: all_row_strs.extend(part) all_row_strs.append(r'\midrule') all_row_strs.extend(part_rows[-1]) all_row_strs.append(end_table_str) complete_str = '\n'.join(all_row_strs) return complete_str def generate_ds_table(paths: Paths, task_collection: TaskCollection, include_openml_ids: bool = False): print(f'Generating dataset table for {task_collection.coll_name}') task_infos = task_collection.load_infos(paths) task_infos.sort(key=lambda ti: ti.task_desc.task_name) file_path = paths.plots() / f'datasets_{task_collection.coll_name}.tex' is_classification = any(ti.task_type == TaskType.CLASSIFICATION for ti in task_infos) # columns to include: name, n_samples, n_numerical, n_categorical, largest_category, openml id, # (link), (subsampled), (n_classes), (citation), (weight) table_rows = [['Name', r'\#samples', r'\#num.\ features', r'\#cat.\ features', r'largest \#categories']] if is_classification: table_rows[0].append(r'\#classes') if include_openml_ids: table_rows[0].append('OpenML task ID') for task_info in task_infos: row = [] row.append(task_info.task_desc.task_name.replace('_', r'\_')) row.append(str(task_info.n_samples)) row.append(str(task_info.tensor_infos['x_cont'].get_n_features())) n_cat = task_info.tensor_infos['x_cat'].get_n_features() row.append(str(n_cat)) # subtract 1 for the category that encodes missing values row.append(str(task_info.tensor_infos['x_cat'].get_cat_sizes().max().item() - 1) if n_cat > 0 else '') if is_classification: row.append(str(task_info.tensor_infos['y'].get_cat_size_product())) if include_openml_ids: row.append(str(task_info.more_info_dict.get('openml_task_id', ''))) table_rows.append(row) begin_table_str = r'\begin{tabular}{' + ('c' * len(table_rows[0])) + r'}' + '\n' + r'\toprule' row_strs = [' & '.join(row) + r' \\' for row in table_rows] end_table_str = r'\bottomrule' + '\n' + r'\end{tabular}' all_row_strs = [begin_table_str, row_strs[0], r'\midrule'] + row_strs[1:] + [end_table_str] complete_str = '\n'.join(all_row_strs) utils.writeToFile(file_path, complete_str) def generate_collections_table(paths: Paths): print(f'Creating collections table') coll_display_names = {'meta-train-class': r'$\mathcal{B}^{\operatorname{train}}_{\mathrm{class}}$', 'meta-test-class': r'$\mathcal{B}^{\operatorname{test}}_{\mathrm{class}}$', 'grinsztajn-class-filtered': r'$\mathcal{B}^{\operatorname{Grinsztajn}}_{\mathrm{class}}$', 'meta-train-reg': r'$\mathcal{B}^{\operatorname{train}}_{\mathrm{reg}}$', 'meta-test-reg': r'$\mathcal{B}^{\operatorname{test}}_{\mathrm{reg}}$', 'grinsztajn-reg': r'$\mathcal{B}^{\operatorname{Grinsztajn}}_{\mathrm{reg}}$'} coll_names = list(coll_display_names.keys()) # todo: number of distinct data sets rows = [r'\#datasets', r'\#dataset groups', r'min \#samples', r'max \#samples', r'max \#classes', r'max \#features', r'max \#categories'] table_columns = {'': rows} for coll_name in coll_names: task_collection = TaskCollection.from_name(coll_name, paths) task_infos = task_collection.load_infos(paths) task_infos.sort(key=lambda ti: ti.task_desc.task_name) is_classification = any(ti.task_type == TaskType.CLASSIFICATION for ti in task_infos) n_samples_list = [] n_features_list = [] max_cat_size_list = [] n_classes_list = [] for task_info in task_infos: n_samples_list.append(task_info.n_samples) n_features_list.append(task_info.tensor_infos['x_cont'].get_n_features() + task_info.tensor_infos['x_cat'].get_n_features()) n_cat = task_info.tensor_infos['x_cat'].get_n_features() # subtract 1 for the category that encodes missing values max_cat_size_list.append(
task_info.tensor_infos['x_cat'].get_cat_sizes().max().item() - 1 if n_cat > 0 else 0) if is_classification: n_classes_list.append(task_info.tensor_infos['y'].get_cat_size_product()) else: n_classes_list.append(0) separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] if coll_name.startswith('meta-train'): n_dataset_groups = TaskWeighting(task_infos, separate_task_names).get_n_groups() else: n_dataset_groups = len(task_infos) table_columns[coll_display_names[coll_name]] = \ [str(len(task_infos)), str(n_dataset_groups), str(min(n_samples_list)), str(max(n_samples_list)), str(max(n_classes_list)), str(max(n_features_list)), str(max(max_cat_size_list))] keys = list(table_columns.keys()) n_info_rows = len(table_columns[keys[0]]) table_rows = [keys] + [[table_columns[key][i] for key in keys] for i in range(n_info_rows)] begin_table_str = r'\begin{tabular}{' + ('c' * len(table_rows[0])) + r'}' + '\n' + r'\toprule' row_strs = [' & '.join(row) + r' \\' for row in table_rows] end_table_str = r'\bottomrule' + '\n' + r'\end{tabular}' all_row_strs = [begin_table_str, row_strs[0], r'\midrule'] + row_strs[1:] + [end_table_str] complete_str = '\n'.join(all_row_strs) file_path = paths.plots() / f'collections_summary.tex' utils.writeToFile(file_path, complete_str) def generate_individual_results_table(paths: Paths, tables: ResultsTables, filename: str, coll_name: str, alg_names: List[str], test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None): table = tables.get(coll_name) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=test_metric_name, val_metric_name=val_metric_name, return_percentages=False, use_task_mean=False, use_geometric_mean=False) alg_names = [an for an in alg_names if an in means] table_head = [['Dataset'] + [get_display_name(an) for an in alg_names]] table_body = [] enumerated_task_infos = list(enumerate(table.test_table.task_infos)) enumerated_task_infos.sort(key=lambda tup: tup[1].task_desc.task_name.lower()) print(f'{coll_name=}') print(f'{list(means.keys())=}') for task_idx, task_info in enumerated_task_infos: row_scores = [means[alg_name][task_idx] for alg_name in alg_names] row_errs = [means[alg_name][task_idx] - intervals[alg_name][0][task_idx] for alg_name in alg_names] min_row_score = np.min(row_scores) is_best_list = [score == min_row_score for score in row_scores] is_significant_list = [score <= min_row_score + stderr for score, stderr in zip(row_scores, row_errs)] row_strs = [] for is_best, is_significant, row_score, row_err in zip(is_best_list, is_significant_list, row_scores, row_errs): cur_str = f'{row_score:4.3f}' if is_best: cur_str = r'\textbf{' + cur_str + r'}' elif is_significant: cur_str = r'\underline{' + cur_str + r'}' cur_str = cur_str + r'$\pm$' + f'{row_err:4.3f}' row_strs.append(cur_str) table_body.append([task_info.task_desc.task_name] + row_strs) # escape underscores for latex table_head = [[val.replace('_', r'\_') for val in row] for row in table_head] table_body = [[val.replace('_', r'\_') for val in row] for row in table_body] table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / filename utils.writeToFile(file_path, table_str) def generate_ablations_table(paths: Paths, tables: ResultsTables): print(f'Generating ablations table') # load results from the right tag (maybe with MLP-best-ablation) # problem: relative model should be the best one of the defaults (with best lr) # group by and optimize lrfactor 
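# Sketch of the grouping logic below (the name format is inferred from the parsing code):
# result names look like 'RealMLP-TD-<...>_<ablation-group>_lrfactor-<f>'; splitting off the
# trailing 'lrfactor-<f>' part groups runs by ablation, and within each group the lr factor
# with the lowest benchmark score is selected and reported next to the error increase.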
coll_names = ['meta-train-class', 'meta-train-reg'] # all_group_names = dict() # all_best_lrfactors = dict() abl_names = [ (r'MLP-TD (without ablation)', 'default'), # (r'MLP-TD (fixed lr factor = 1.0)', 'default_lrfactor-1.0'), ('', ''), (r'Num.\ embeddings: PL', 'num-embeddings-pl'), (r'Num.\ embeddings: PLR', 'num-embeddings-plr'), (r'Num.\ embeddings: None', 'num-embeddings-none'), ('', ''), (r'Adam $\beta_2=0.999$ instead of $\beta_2=0.95$', 'beta2-0.999'), ('', ''), ('Learning rate schedule = cosine decay', 'lr-cos-decay'), ('Learning rate schedule = constant', 'lr-constant'), ('', ''), ('No label smoothing', 'no-label-smoothing'), ('', ''), (r'No learnable scaling', 'no-front-scale'), ('', ''), ('Non-parametric activation', 'non-parametric-act'), ('', ''), (r'Activation=Mish', 'act-mish'), (r'Activation=ReLU', 'act-relu'), (r'Activation=SELU', 'act-selu'), ('', ''), ('No dropout', 'pdrop-0.0'), (r'Dropout prob.\ $0.15$ (constant)', 'pdrop-0.15'), ('', ''), ('No weight decay', 'wd-0.0'), # ('Weight decay = 0.02 ($\operatorname{flat\_cos}$)', 'wd-0.02-flatcos'), ('Weight decay = 0.02 (constant)', 'wd-0.02'), ('', ''), (r'Standard param + no weight decay', 'standard-param_no-wd'), ('', ''), ('No data-dependent init', 'normal-init'), ('', ''), ('First best epoch instead of last best', 'first-best-epoch'), ('', ''), ('Only one-hot encoding', 'no-cat-embs'), # ('First best epoch (fixed lr factor = 0.5)', 'first-best-epoch_lrfactor-0.5'), ] results_dict = NestedDict() # index by [short_group_name][coll_name][property] # possible properties: 'score', 'lower', 'upper', 'best_lr_factor', for coll_name in coll_names: table = tables.get(coll_name, n_cv=1, tag='paper_mlp_ablations') results, _ = get_benchmark_results(paths, table=table, coll_name=coll_name, use_relative_score=False, return_percentages=False, simplify_name_fn=lambda x: x.replace(' [bag-1]', '')) default_keys = [key for key in results if 'default' in key] # print(f'{default_keys=}') default_scores = [results[key] for key in default_keys] best_key = default_keys[np.argmin(default_scores)] rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=best_key, simplify_name_fn=lambda x: x.replace(' [bag-1]', '')) keys = list(key for key in rel_results.keys() if key.startswith('RealMLP-TD-')) # keys = list(rel_results.keys()) group_names = list(set([key.split('lrfactor-')[0] for key in keys])) # all_group_names[coll_name] = group_names for group_name in group_names: # remove the 'MLP-TD-reg-ablation_' and last '_' short_group_name = r'_'.join(group_name.split('_')[1:-1]) group_keys = [key for key in keys if key.startswith(group_name)] group_results = [rel_results[key] for key in group_keys] best_key = group_keys[np.argmin(group_results)] # print(f'{best_key=}') results_dict[short_group_name, coll_name, 'best_lr_factor'] = best_key.split('lrfactor-')[1] results_dict[short_group_name, coll_name, 'score'] = rel_results[best_key] best_interval = rel_intervals[best_key] results_dict[short_group_name, coll_name, 'lower'] = best_interval[0] results_dict[short_group_name, coll_name, 'upper'] = best_interval[1] for key in keys: # also add non-optimized versions to the table # add default with default lr # short_group_name = 'default_lrfactor-1.0' # key = [key for key in rel_results.keys() if key.endswith('default_lrfactor-1.0')][0] short_group_name = '_'.join(key.split('_')[1:]) results_dict[short_group_name, coll_name, 'best_lr_factor'] = '' results_dict[short_group_name, coll_name, 'score'] = 
rel_results[key] best_interval = rel_intervals[key] results_dict[short_group_name, coll_name, 'lower'] = best_interval[0] results_dict[short_group_name, coll_name, 'upper'] = best_interval[1] # all_best_lrfactors[coll_name] = best_lrfactors table_head = [[''] + [r'\multicolumn{2}{c}{' + coll_name + r'}' for coll_name in coll_names], ['Ablation'] + [r'Error increase in \%', 'best lr factor'] * len(coll_names)] # all_group_names = sorted(list(results_dict.get_dict().keys())) table_body = [] # for group_name in all_group_names: for label, short_group_name in abl_names: # short_group_name = r'\_'.join(group_name.split('_')[1:-1]) row = [label] for coll_name in coll_names: if (short_group_name, coll_name, 'best_lr_factor') in results_dict: results = results_dict[short_group_name, coll_name] score = results['score'] lower = results['lower'] upper = results['upper'] row.append(f'{score:2.1f} [{lower:2.1f}, {upper:2.1f}]') row.append(results['best_lr_factor']) else: row.append('') row.append('') table_body.append(row) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / 'ablations.tex' utils.writeToFile(file_path, table_str) def generate_refit_table(paths: Paths, tables: ResultsTables, alg_family: str): print(f'Generating refit table for {alg_family}') coll_names = ['meta-train-class', 'meta-test-class', 'meta-train-reg', 'meta-test-reg'] table_head = [['', r'\multicolumn{4}{c}{Error \textbf{reduction} relative to 1 fold in \%}'], ['Method'] + coll_names] methods_labels_names = [ (f' (bagging, 1 model, indiv. stopping)', f'_mean-cv-False_mean-refit-False [bag-1]'), (f' (bagging, 1 model, joint stopping)', f'_mean-cv-True_mean-refit-True [bag-1]'), (f' (bagging, 5 models, indiv. stopping)', f'_mean-cv-False_mean-refit-False [bag-5]'), (f' (bagging, 5 models, joint stopping)', f'_mean-cv-True_mean-refit-True [bag-5]'), (f' (refitting, 1 model, indiv. stopping)', f'_mean-cv-False_mean-refit-False [ens-1]'), (f' (refitting, 1 model, joint stopping)', f'_mean-cv-True_mean-refit-True [ens-1]'), (f' (refitting, 5 models, indiv. 
stopping)', f'_mean-cv-False_mean-refit-False [ens-5]'), (f' (refitting, 5 models, joint stopping)', f'_mean-cv-True_mean-refit-True [ens-5]') ] labels = [f'{alg_family}-TD{label_suffix}' for label_suffix, _ in methods_labels_names] table_body_columns = [labels[0:]] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=5, tag='paper') # print(f'{table.test_table.alg_names=}') task_type_name = 'class' if 'class' in coll_name else 'reg' rel_alg_name = f'{alg_family}-TD-{task_type_name}_mean-cv-False_mean-refit-False [bag-1]' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name, simplify_name_fn=lambda x: x) alg_names = [f'{alg_family}-TD-{task_type_name}{suffix}' for _, suffix in methods_labels_names] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names[0:]: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) not_significantly_worse = (np.min(results_list) >= lower) result_str = f'{-result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' elif not_significantly_worse: result_str = r'\underline{' + result_str + r'}' column.append(result_str + f' [{-upper:2.1f}, {-lower:2.1f}]') table_body_columns.append(column) table_body = utils.shift_dim_nested(table_body_columns, 0, 1) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / f'refit_table_{alg_family}.tex' utils.writeToFile(file_path, table_str) def generate_preprocessing_table(paths: Paths, tables: ResultsTables): print(f'Generating preprocessing table') coll_names = ['meta-train-class', 'meta-train-reg'] table_head = [['', r'\multicolumn{2}{c}{Error \textbf{increase} relative to robust scale + smooth clip in \%}'], ['Method'] + coll_names] methods_labels_names = [ (r'Robust scale + smooth clip', f'RealMLP-TD-S_tfms-mc-rs-sc-oh'), (r'Robust scale', f'RealMLP-TD-S_tfms-mc-rs-oh'), (r'Standardize + smooth clip', f'RealMLP-TD-S_tfms-std-sc-oh'), (r'Standardize', f'RealMLP-TD-S_tfms-std-oh'), (r'Quantile transform (output dist.\ = normal)', f'RealMLP-TD-S_tfms-quantile-oh'), (r'Quantile transform (RTDL version)', f'RealMLP-TD-S_tfms-quantiletabr-oh'), (r'KDI transform ($\alpha = 1$, output dist.\ = normal)', f'RealMLP-TD-S_tfms-kdi1-oh'), ] labels = [label for label, _ in methods_labels_names] table_body_columns = [labels] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=1, tag='paper_preprocessing') # print(f'{table.test_table.alg_names=}') rel_alg_name = f'RealMLP-TD-S_tfms-mc-rs-sc-oh' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [alg_name for _, alg_name in methods_labels_names] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) not_significantly_worse = (np.min(results_list) >= lower) result_str = f'{result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' elif not_significantly_worse: result_str = r'\underline{' + result_str + r'}' column.append(result_str + f' [{lower:2.1f}, {upper:2.1f}]') table_body_columns.append(column) table_body = utils.shift_dim_nested(table_body_columns, 0, 1) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / f'preprocessing_ablation.tex' utils.writeToFile(file_path, table_str) def 
generate_stopping_table(paths: Paths, tables: ResultsTables): print(f'Generating stopping table') coll_names = ['meta-train-class', 'meta-train-reg'] table_head = [['', r'\multicolumn{2}{c}{Error \textbf{increase} relative to no early stopping in \%}'], ['Method'] + coll_names] table_body = [] for i, method in enumerate(['XGB-TD', 'LGBM-TD', 'CatBoost-TD']): esr_list = [1000, 300, 100, 50, 20, 10] labels = [method + f' (patience = {esr})' for esr in esr_list] table_body_columns = [labels] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=1, tag='paper_early_stopping') # print(f'{table.test_table.alg_names=}') rel_alg_name = method + '_esr-1000' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [method + f'_esr-{esr}' for esr in esr_list] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) result_str = f'{result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' column.append(result_str + f' [{lower:2.1f}, {upper:2.1f}]') table_body_columns.append(column) new_rows = utils.shift_dim_nested(table_body_columns, 0, 1) if i > 0: new_rows[0][0] = r'\midrule' + '\n' + new_rows[0][0] table_body.extend(new_rows) table_str = _get_table_str(table_head, table_body) file_path = paths.plots() / f'early_stopping_table.tex' utils.writeToFile(file_path, table_str) def generate_architecture_table(paths: Paths, tables: ResultsTables): print(f'Generating architecture table') coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg'] table_head = [['', r'\multicolumn{4}{c}{Error \textbf{reduction} relative to MLP-D in \%}'], ['Method'] + coll_names] methods_labels_names = [ (r'MLP-D', f'MLP-RTDL-D'), (r'MLP-D (RS+SC)', f'MLP-RTDL-D_rssc'), (r'MLP-D (RS+SC, no wd, meta-tuned lr)', f'MLP-RTDL-reprod'), (r'MLP-D (RS+SC, no wd, meta-tuned lr, PL embeddings)', f'MLP-RTDL-reprod-pl'), (r'MLP-D (RS+SC, no wd, meta-tuned lr, RealMLP architecture)', f'MLP-RTDL-reprod-RealMLP-arch'), (r'RealMLP-TD-S', f'RealMLP-TD-S'), (r'RealMLP-TD', f'RealMLP-TD'), (r'TabR-S-D', f'TabR-S-D'), (r'TabR-S-D (RS+SC)', f'TabR-S-D_rssc'), (r'ResNet-D', f'ResNet-RTDL-D'), (r'ResNet-D (RS+SC)', f'ResNet-RTDL-D_rssc'), ] labels = [label for label, _ in methods_labels_names] table_body_columns = [labels] for coll_name in coll_names: column = [] table = tables.get(coll_name, n_cv=1, tag='paper') # print(f'{table.test_table.alg_names=}') rel_alg_name = f'MLP-RTDL-D' rel_results, rel_intervals = get_benchmark_results(paths, table=table, coll_name=coll_name, rel_alg_name=rel_alg_name) alg_names = [alg_name for _, alg_name in methods_labels_names] results_list = [rel_results[alg_name] for alg_name in alg_names] for alg_name in alg_names: result = rel_results[alg_name] lower, upper = rel_intervals[alg_name] is_best = (result == np.min(results_list)) not_significantly_worse = (np.min(results_list) >= lower) # flip sign result = -result lower, upper = -upper, -lower result_str = f'{result:2.1f}' if is_best: result_str = r'\textbf{' + result_str + r'}' elif not_significantly_worse: result_str = r'\underline{' + result_str + r'}' column.append(result_str + f' [{lower:2.1f}, {upper:2.1f}]') table_body_columns.append(column) table_body = utils.shift_dim_nested(table_body_columns, 0, 1) table_str = _get_table_str(table_head, table_body) table_str = 
table_str.replace('ccccc', 'lcccc') # make first column left-aligned file_path = paths.plots() / f'arch_and_preprocessing.tex' utils.writeToFile(file_path, table_str) ================================================ FILE: pytabkit/bench/run/__init__.py ================================================ ================================================ FILE: pytabkit/bench/run/results.py ================================================ from pathlib import Path from typing import Dict, List import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskInfo from pytabkit.models import utils class ResultManager: """ Stores experimental results and can save and load them. """ def __init__(self): # indexing convention: # self.metrics_dict['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name] = float self.metrics_dict = {} # indexed by ['cv'/'refit'], then for example fields like ['y_preds'], ['fit_params'] # or ['sub_info'] for hyperopt sub-results self.other_dict = {} # should be a numpy array of shape [n_models, n_samples, output_dim] self.y_preds_cv = None self.y_preds_refit = None def add_results(self, is_cv: bool, results_dict: Dict) -> None: """ Add a dictionary of results. :param is_cv: Whether these results are from cross-validation (True) or refitting (False). :param results_dict: Dictionary of results """ cv_str = 'cv' if is_cv else 'refit' if cv_str not in self.metrics_dict: self.metrics_dict[cv_str] = {} if cv_str not in self.other_dict: self.other_dict[cv_str] = {} for key, value in results_dict.items(): if key == 'metrics': self.metrics_dict[cv_str] = value elif key == 'y_preds': if is_cv: self.y_preds_cv = value else: self.y_preds_refit = value else: self.other_dict[cv_str][key] = value def save(self, path: Path) -> None: utils.serialize(path / 'metrics.yaml', self.metrics_dict, use_yaml=True) # random search hpo often generates numpy datatype scalars, but these cannot be saved by msgpack, # so we convert them other_dict = utils.numpy_to_native_rec(self.other_dict) utils.serialize(path / 'other.msgpack.gz', other_dict, use_msgpack=True, compressed=True) # also save as yaml for readability utils.serialize(path / 'other.yaml', other_dict, use_yaml=True) if self.y_preds_cv is not None: np.savez_compressed(path / 'y_preds_cv.npz', y_preds=self.y_preds_cv) if self.y_preds_refit is not None: np.savez_compressed(path / 'y_preds_refit.npz', y_preds=self.y_preds_refit) @staticmethod def load(path: Path, load_other: bool = True, load_preds: bool = True): """ Load results. :param path: Data path. :param load_other: If True, load other_dict. :param load_preds: If True, load the model predictions. 
:return: The loaded ResultManager. """ rm = ResultManager() rm.metrics_dict = utils.deserialize(path / 'metrics.yaml', use_yaml=True) if load_other: rm.other_dict = utils.deserialize(path / 'other.msgpack.gz', use_msgpack=True, compressed=True) for mode in ['cv', 'refit']: if mode in rm.other_dict and 'y_preds' in rm.other_dict[mode]: # other_dict was created by old code and still contains y_preds if mode == 'cv': rm.y_preds_cv = rm.other_dict[mode]['y_preds'] else: rm.y_preds_refit = rm.other_dict[mode]['y_preds'] if load_preds: if utils.existsFile(path / 'y_preds_cv.npz'): rm.y_preds_cv = np.load(path / 'y_preds_cv.npz')['y_preds'] if utils.existsFile(path / 'y_preds_refit.npz'): rm.y_preds_refit = np.load(path / 'y_preds_refit.npz')['y_preds'] return rm def save_summaries(paths: Paths, task_infos: List[TaskInfo], alg_name: str, n_cv: int, rerun=False) -> None: """ Compress the results into result_summaries that can be loaded faster for evaluation. :param paths: Path configuration. :param task_infos: Task infos of tasks that should be summarized. :param alg_name: Name of the method whose results should be summarized. :param n_cv: Number of cross-validation splits for which the results should be summarized. :param rerun: Whether to re-compute the summaries even if summaries are already present. """ for task_info in task_infos: task_desc = task_info.task_desc src_path = paths.results_alg_task(task_desc, alg_name, n_cv) dest_path = paths.summary_alg_task(task_desc, alg_name, n_cv) if not rerun and utils.existsDir(dest_path): continue # indexed by [split_type][split_idx]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_index)][metric_name] metrics_dict = {} for split_type_path in src_path.iterdir(): split_type = split_type_path.name split_id_metrics_list = [] split_id = 0 while True: split_id_path = split_type_path / str(split_id) if not utils.existsDir(split_id_path): break rm = ResultManager.load(split_id_path, load_other=False, load_preds=False) split_id_metrics_list.append(rm.metrics_dict) split_id += 1 if split_id >= 1: # there exists a split metrics_dict[split_type] = split_id_metrics_list if len(metrics_dict) > 0: # shift split_idx dimension to the end results_dict = utils.shift_dim_nested(metrics_dict, 1, 6) # print(f'{results_dict=}') # results_dict[split_type]['cv'/'refit']['train'/'val'/'test'][str(n_models)][str(start_idx)][metric_name][split_idx] utils.serialize(dest_path / 'metrics.msgpack.gz', results_dict, use_msgpack=True, compressed=True) ================================================ FILE: pytabkit/bench/run/task_execution.py ================================================ import shutil import traceback from typing import List, Optional import numpy as np from pytabkit.bench.alg_wrappers.general import AlgWrapper from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskPackage, TaskInfo from pytabkit.bench.run.results import save_summaries, ResultManager from pytabkit.bench.scheduling.schedulers import BaseJobScheduler from pytabkit.models import utils from pytabkit.models.training.logging import StdoutLogger import glob import math from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.training.metrics import Metrics class TabBenchJob(AbstractJob): """ Internal helper class implementing AbstractJob for running tabular benchmarking jobs with our scheduling
code. """ def __init__(self, alg_name: str, alg_wrapper: AlgWrapper, task_package: TaskPackage, paths: Paths, metrics: Optional[Metrics] = None): """ :param alg_name: Unique name of the method (for saving results). :param alg_wrapper: Wrapper implementing the ML method. :param task_package: Task package containing information on dataset and splits. :param paths: Data path configuration. :param metrics: Optional Metrics object that specifies which metrics should be evaluated. """ self.alg_name = alg_name self.alg_wrapper = alg_wrapper self.task_package = task_package self.paths = paths self.metrics = metrics def get_group(self) -> str: """ :return: Group name, in this case just the name of the AlgWrapper class. """ return self.alg_wrapper.__class__.__name__ def __call__(self, assigned_resources: NodeResources) -> bool: """ Run the experiment with the given resources. :param assigned_resources: Assigned resources. :return: True if the job ran normally; False if it completed more quickly because results were partially already saved. """ task_desc = self.task_package.task_info.task_desc print(f'Running {self.alg_name} on {len(self.task_package.split_infos)} splits of dataset {task_desc} ' f'with {assigned_resources.get_n_threads()} threads', flush=True) logger = StdoutLogger() # check whether any data directories exist, i.e. whether data is already available dirs_exist = [utils.existsDir(self.paths.results_alg_task_split(task_desc, self.alg_name, self.task_package.n_cv, split_info.split_type, split_info.id)) for split_info in self.task_package.split_infos] # check whether the run is a normal run which does not have unusually short runtime due to pre-computed data finished_normally = self.task_package.rerun or not any(dirs_exist) # create tmp_folders for saving temporary data in case the run is interrupted and needs to be restarted tmp_folders = [self.paths.results_alg_task_split(task_desc, alg_name=self.task_package.alg_name, n_cv=self.task_package.n_cv, split_type=split_info.split_type, split_id=split_info.id) / 'tmp' for split_info in self.task_package.split_infos] result_managers_dict = self.alg_wrapper.run(self.task_package, logger, assigned_resources, tmp_folders, self.metrics) for alg_name_suffix, result_managers in result_managers_dict.items(): for rm, split_info in zip(result_managers, self.task_package.split_infos): rm.save(self.paths.results_alg_task_split(task_desc, self.alg_name + alg_name_suffix, self.task_package.n_cv, split_info.split_type, split_info.id)) # delete tmp_folders to save disk space for tmp_folder in tmp_folders: if utils.existsDir(tmp_folder): shutil.rmtree(tmp_folder) print(f'Finished running {self.alg_name} on {len(self.task_package.split_infos)} splits of dataset {task_desc}', flush=True) return finished_normally def get_required_resources(self) -> RequiredResources: return self.alg_wrapper.get_required_resources(self.task_package) def get_desc(self) -> str: split_ids = [split_info.id for split_info in self.task_package.split_infos] split_str = f'splits {sorted(split_ids)}' if len(split_ids) == 1: split_str = f'split {split_ids[0]}' elif all([split_id == split_ids[0] + i for i, split_id in enumerate(split_ids)]): # we have a range split_str = f'splits {split_ids[0]}-{split_ids[-1]}' return self.alg_name + f' on {split_str} of task {self.task_package.task_info.task_desc}' class RunConfig: """ This class stores some benchmark settings that a method can be run with.
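    Example (an illustrative configuration, not a recommended default):

        config = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False)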
""" def __init__(self, n_tt_splits: int, n_cv: int = 1, n_refit: int = 0, use_default_split: bool = False, trainval_fraction: float = 0.8, train_fraction: float = 0.75, save_y_pred: bool = False, min_split_idx: int = 0, metrics: Optional[Metrics] = None): """ :param n_tt_splits: Number of trainval-test-splits to evaluate the method with. :param n_cv: Number of cross-validation folds. If n_cv=1, use a single random split. :param n_refit: Number of models that should be refitted (and ensembled) on the training and validation set. :param use_default_split: Whether the default split of the datasets should be used. :param trainval_fraction: Fraction in (0, 1) of the data that should be used for training and validation set. The rest will be used for the test set. :param train_fraction: Only used if n_cv=1. In this case, out of the training+validation data, the given fraction of the data is used for training. :param save_y_pred: Whether the predictions on the whole dataset should be saved (can use a considerable amount of disk storage, e.g. 3 GB for running a single method on meta-train and meta-test benchmarks). :param min_split_idx: Minimum index of the split that should be used. Can be set larger than zero if only a sub-range of the splits should be run. :param metrics: Metrics object that specifies which metrics should be evaluated. """ self.n_tt_splits = n_tt_splits self.n_cv = n_cv self.n_refit = n_refit self.use_default_split = use_default_split self.trainval_fraction = trainval_fraction self.train_fraction = train_fraction self.save_y_pred = save_y_pred self.min_split_idx = min_split_idx self.metrics = metrics class TabBenchJobManager: """ This class can be used to add and run jobs for tabular benchmarks. """ def __init__(self, paths: Paths): """ :param paths: Data path configuration. """ self.paths = paths self.jobs = [] self.save_args = [] def add_jobs(self, task_infos: List[TaskInfo], run_config: RunConfig, alg_name: str, alg_wrapper: AlgWrapper, tags: Optional[List[str]] = None, rerun: bool = False) -> None: """ Add jobs for the given method with the given run configuration on all task infos where results are not already available (except if rerun=True). Will also store the algorithm configuration and copy the current source files to the corresponding algorithm folder. :param task_infos: List of TaskInfo objects representing the datasets on which the method should be run. :param run_config: Run configuration. :param alg_name: Name of the method, should be unique (is used for storing and printing the results) :param alg_wrapper: Wrapper implementing the ML method. :param tags: List of tags associated to the method (can be used for selecting a subset of methods later). :param rerun: If True, run all combinations even if there are already computed results stored for it. (For large reruns, we rather recommend renaming the old method with rename_alg.py and then running the jobs again with the new name and rerun=False. This avoids problems if the rerun crashes and preserves the old results for comparison.) 
""" # todo: update after updating project structure if tags is None: tags = ['default'] dummy_task_package = TaskPackage(task_infos[0], split_infos=task_infos[0].get_random_splits(run_config.n_tt_splits, trainval_fraction=run_config.trainval_fraction, train_fraction=run_config.train_fraction)[ 0:1], n_cv=run_config.n_cv, n_refit=run_config.n_refit, paths=self.paths, rerun=rerun, alg_name=alg_name, save_y_pred=run_config.save_y_pred) # possible versions of the same alg that are generated alg_suffixes = alg_wrapper.get_pred_param_names(dummy_task_package) task_packages = [] for task_info in task_infos: if run_config.use_default_split: tt_split_infos = task_info.get_default_splits(run_config.n_tt_splits) else: tt_split_infos = task_info.get_random_splits(run_config.n_tt_splits, trainval_fraction=run_config.trainval_fraction, train_fraction=run_config.train_fraction) tt_split_infos = tt_split_infos[run_config.min_split_idx:] if not rerun: # filter out splits where results have already been computed tt_split_infos = [split_info for split_info in tt_split_infos if not all(utils.existsFile( self.paths.results_alg_task_split(task_info.task_desc, alg_name + suffix, run_config.n_cv, split_info.split_type, split_info.id) / 'metrics.yaml') for suffix in alg_suffixes)] n_tt_splits = len(tt_split_infos) if n_tt_splits == 0: continue max_n_vectorized = alg_wrapper.get_max_n_vectorized(task_info) n_splits_per_package = min(n_tt_splits, max(1, max_n_vectorized // max(run_config.n_cv, run_config.n_refit))) n_packages_per_task = math.ceil(n_tt_splits / n_splits_per_package) # distribute load more evenly across packages # (e.g. have split sizes (4, 4, 4) instead of (5, 5, 2) for n_tt_splits=12) n_splits_per_package = math.ceil(n_tt_splits / n_packages_per_task) batch_idxs = [n_splits_per_package * i for i in range((n_tt_splits - 1) // n_splits_per_package + 1)] \ + [n_tt_splits] for start, stop in zip(batch_idxs[:-1], batch_idxs[1:]): task_packages.append(TaskPackage(task_info, split_infos=tt_split_infos[start:stop], n_cv=run_config.n_cv, n_refit=run_config.n_refit, paths=self.paths, rerun=rerun, alg_name=alg_name, save_y_pred=run_config.save_y_pred)) for tp in task_packages: self.jobs.append(TabBenchJob(alg_name=alg_name, alg_wrapper=alg_wrapper, task_package=tp, paths=self.paths, metrics=run_config.metrics)) if len(task_packages) > 0: for suffix in alg_suffixes: full_alg_name = alg_name + suffix # store alg info because something is actually being run # todo: this might not work on Windows # copy python files py_files = glob.glob('scripts/*.py') + glob.glob('pytabkit/**/*.py', recursive=True) utils.serialize(self.paths.algs() / full_alg_name / 'wrapper.pkl', alg_wrapper) extended_config = utils.join_dicts(alg_wrapper.config, {'alg_name': alg_name, 'pred_params_name': suffix, 'wrapper_class_name': alg_wrapper.__class__.__name__}) utils.serialize(self.paths.algs() / full_alg_name / 'extended_config.yaml', extended_config, use_yaml=True) utils.serialize(self.paths.algs() / full_alg_name / 'tags.yaml', tags, use_yaml=True) for py_file in py_files: utils.copyFile(py_file, self.paths.algs() / full_alg_name / 'src' / py_file) for suffix in alg_suffixes: rerun_summary = True # always create the summary since a part of the results might have changed. self.save_args.append((self.paths, task_infos, alg_name + suffix, run_config.n_cv, rerun_summary)) def run_jobs(self, scheduler: BaseJobScheduler) -> None: """ Runs the added jobs with the given scheduler. 
After all jobs are done, creates the result summaries for faster loading of results. :param scheduler: Scheduler for running the jobs. """ print(f'Starting scheduler') scheduler.add_jobs(self.jobs) scheduler.run() for args in self.save_args: try: save_summaries(*args) except Exception as e: traceback.print_exc() def run_alg_selection(paths: Paths, config: RunConfig, task_infos: List[TaskInfo], target_alg_name: str, alg_names: List[str], val_metric_name: str, tags: List[str] = ['paper'], rerun: bool = False): n_cv = config.n_cv split_type = SplitType.DEFAULT if config.use_default_split else SplitType.RANDOM assert len(alg_names) > 0 assert config.n_refit == 0 # not implemented otherwise for task_info in task_infos: task_desc = task_info.task_desc for split_id in range(config.n_tt_splits): target_path = paths.results_alg_task_split(task_desc, target_alg_name, n_cv, split_type, split_id) if utils.existsFile(target_path / 'metrics.yaml') and not rerun: continue print(f'Running algorithm selection for {target_alg_name} on split {split_id} of task {task_desc}') best_alg_name = None best_val_score = np.inf best_alg_idx = None # find best alg for i, alg_name in enumerate(alg_names): rm = ResultManager.load(paths.results_alg_task_split(task_desc, alg_name, n_cv, split_type, split_id), load_other=False, load_preds=False) # todo: probably shouldn't use i in both loops val_score = np.mean([rm.metrics_dict['cv']['val']['1'][str(j)][val_metric_name] for j in range(n_cv)]) # print(f'validation score for model {i} with alg_name {alg_name}: {val_score}') # print(f'{val_score=}, {alg_name=}, {i=}') if val_score < best_val_score or best_alg_name is None: best_val_score = val_score best_alg_name = alg_name best_alg_idx = i # print(f'{best_val_score=}, {best_alg_name=}, {best_alg_idx=}') # load full results of best alg and save them to target directory rm = ResultManager.load(paths.results_alg_task_split(task_desc, best_alg_name, n_cv, split_type, split_id)) rm.other_dict['cv']['fit_params'] = dict(best_alg_idx=best_alg_idx, best_alg_name=best_alg_name, sub_fit_params=rm.other_dict['cv']['fit_params']) rm.save(target_path) # save alg in algs folder py_files = glob.glob('scripts/*.py') + glob.glob('pytabkit/**/*.py', recursive=True) # utils.serialize(paths.algs() / target_alg_name / 'wrapper.pkl', alg_wrapper) extended_config = dict(sub_algs=alg_names) utils.serialize(paths.algs() / target_alg_name / 'extended_config.yaml', extended_config, use_yaml=True) utils.serialize(paths.algs() / target_alg_name / 'tags.yaml', tags, use_yaml=True) for py_file in py_files: utils.copyFile(py_file, paths.algs() / target_alg_name / 'src' / py_file) # save summaries print(f'Saving summaries') save_summaries(paths, task_infos, target_alg_name, n_cv=n_cv, rerun=True) ================================================ FILE: pytabkit/bench/scheduling/__init__.py ================================================ ================================================ FILE: pytabkit/bench/scheduling/execution.py ================================================ import os import time import multiprocessing as mp import traceback from typing import Tuple, Optional, List import numpy as np from pytabkit.bench.scheduling.jobs import JobRunner from pytabkit.bench.scheduling.resource_manager import ResourceManager, JobInfo from pytabkit.bench.scheduling.resources import NodeResources, SystemResources from pytabkit.models.utils import FunctionProcess def get_gpu_rams_gb(use_reserved: bool = True): """ Returns: gpu_rams_gb: total GPU memory per 
visible device (GB) gpu_rams_fixed_gb: this process GPU memory per visible device (GB) - reserved (default): torch caching allocator reserved bytes (often matches "process used" better) - allocated: live tensor bytes only """ # do it in torch, it respects CUDA_VISIBLE_DEVICES and doesn't need the pynvml dependency BYTES_TO_GB = 1024.0 ** 3 import torch gpu_rams_gb = [] gpu_rams_fixed_gb = [] n = torch.cuda.device_count() # respects CUDA_VISIBLE_DEVICES ("" => 0) for i in range(n): with torch.cuda.device(i): _free_b, total_b = torch.cuda.mem_get_info() gpu_rams_gb.append(total_b / BYTES_TO_GB) if use_reserved: used_b = torch.cuda.memory_reserved(i) else: used_b = torch.cuda.memory_allocated(i) gpu_rams_fixed_gb.append(used_b / BYTES_TO_GB) return gpu_rams_gb, gpu_rams_fixed_gb def measure_node_resources(node_id: int) -> Tuple[NodeResources, NodeResources]: """ Function that measures available resources. :param node_id: Node ID that will be used to identify the node in the returned NodeResources. :return: Returns a tuple of NodeResources objects. The first one contains the total available resources, and the second one contains the resources that a single process (with PyTorch GPU usage) uses without doing anything. """ import torch n_gpus = torch.cuda.device_count() if n_gpus > 0: # init cuda # alloc dummy tensors to know how much memory PyTorch uses for its runtime dummy_tensors = [torch.ones(1).to(f'cuda:{i}') for i in range(n_gpus)] gpu_rams_gb, gpu_rams_fixed_gb = get_gpu_rams_gb() else: gpu_rams_gb = [] gpu_rams_fixed_gb = [] import psutil import os cpu_ram_gb = psutil.virtual_memory().available / (1024. ** 3) cpu_ram_fixed_gb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3 n_threads = mp.cpu_count() n_physical_cores = n_threads // 2 node_resources = NodeResources(node_id=node_id, n_threads=n_threads, cpu_ram_gb=cpu_ram_gb, gpu_usages=np.ones(n_gpus), gpu_rams_gb=np.asarray(gpu_rams_gb), physical_core_usages=np.ones(n_physical_cores)) fixed_node_resources = NodeResources(node_id=node_id, n_threads=0.0, cpu_ram_gb=cpu_ram_fixed_gb, gpu_usages=np.zeros(n_gpus), gpu_rams_gb=np.asarray(gpu_rams_fixed_gb), physical_core_usages=np.zeros(n_physical_cores)) # print('measure_gpu_resources:', gpu_rams_gb, gpu_rams_fixed_gb) # return np.asarray(gpu_rams_gb), np.asarray(gpu_rams_fixed_gb) return node_resources, fixed_node_resources def node_runner(feedback_queue, job_queue, node_id: int): mp.set_start_method('fork', force=True) # get resources in separate process so CUDA runtime is shut down when the process is terminated # this means that this process will not use up CUDA memory all the time node_resources, fixed_node_resources = FunctionProcess(measure_node_resources, node_id).start().pop_result() feedback_queue.put((node_resources, fixed_node_resources)) processes = [] process_rams_gb = [] # print(f'DEBUG: start loop', flush=True) while True: # get new jobs from queue while not job_queue.empty(): try: job_str = job_queue.get(timeout=0.1) # print(f'DEBUG: got job str', flush=True) except Exception as e: print(traceback.format_exc()) # might have been queue.Empty or ray.util.queue.Empty exception break # queue is empty if job_str is False: # termination signal # cannot use None as termination signal since that is already the timeout signal return # or check if processes are still running? 
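            # Note: the scheduler side serializes each job as
            # dill.dumps((job, job_id, assigned_resources)) in RayJobManager.submit_job,
            # so the tuple below is unpacked in that same order by JobRunner.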
import dill job_data = dill.loads(job_str) # print(f'DEBUG: got job data', flush=True) processes.append(FunctionProcess(JobRunner(*job_data)).start()) process_rams_gb.append(0.0) # check for finished processes for i, p in enumerate(processes): process_rams_gb[i] = max(process_rams_gb[i], p.get_ram_usage_gb()) if p.is_done(): result = p.pop_result() result.set_max_cpu_ram_gb(process_rams_gb[i]) # print(f'Node {node_id}: Before putting result in feedback_queue', flush=True) feedback_queue.put(result) # print(f'Node {node_id}: After putting result in feedback_queue', flush=True) del processes[i] del process_rams_gb[i] # print(f'.', end='', flush=True) time.sleep(0.01) # get RAM statistics of all processes and total RAM usage # if any process is finished, send time and RAM statistics of that process through the feedback queue # maybe have a logging queue? class NodeManager: def start(self): raise NotImplementedError() # start nodes, return queues and node ids? def terminate(self): raise NotImplementedError() # terminate nodes? class RayJobManager(NodeManager): def __init__(self, max_n_threads: Optional[int] = None, available_cpu_ram_multiplier: float = 1.0, available_gpu_ram_multiplier: float = 1.0, **ray_kwargs): self.ray_kwargs = ray_kwargs self.runner_futures = [] # keep node_runner futures for termination self.job_queues = [] self.feedback_queues = [] self.resource_manager: Optional[ResourceManager] = None self.max_n_threads = max_n_threads self.available_cpu_ram_multiplier = available_cpu_ram_multiplier self.available_gpu_ram_multiplier = available_gpu_ram_multiplier def start(self) -> None: import ray # take some ray arguments from os.environ if available for (ray_name, environ_name) in [('address', 'ip_head'), ('_redis_password', 'redis_password')]: if environ_name in os.environ and ray_name not in self.ray_kwargs: self.ray_kwargs[ray_name] = os.environ[environ_name] ray.init(**self.ray_kwargs) from ray.util import queue nodes = ray.nodes() print(f'Nodes: {nodes}') feedback_queues = [queue.Queue() for i in range(len(nodes))] job_queues = [queue.Queue() for i in range(len(nodes))] for i, node in enumerate(nodes): node_id = f'node:{node["NodeManagerAddress"]}' num_gpus = 0 if 'GPU' not in node['Resources'] else round(node['Resources']['GPU']) future = ray.remote(num_gpus=num_gpus)(node_runner).options(resources={node_id: 1.0}) \ .remote(feedback_queue=feedback_queues[i], job_queue=job_queues[i], node_id=i) self.runner_futures.append(future) print(f'Started {len(job_queues)} nodes', flush=True) n_nodes = len(job_queues) total_resources: List[Optional[NodeResources]] = [None] * n_nodes fixed_resources: List[Optional[NodeResources]] = [None] * n_nodes for feedback_queue in feedback_queues: nr, fnr = feedback_queue.get() # should be a NodeResources object total_resources[nr.node_id] = nr fixed_resources[fnr.node_id] = fnr if self.max_n_threads is not None: total_resources[nr.node_id].set_n_threads(min(total_resources[nr.node_id].get_n_threads(), self.max_n_threads)) total_resources[nr.node_id].set_cpu_ram_gb( self.available_cpu_ram_multiplier * total_resources[nr.node_id].get_cpu_ram_gb()) total_resources[nr.node_id].set_gpu_rams_gb( self.available_gpu_ram_multiplier * total_resources[nr.node_id].get_gpu_rams_gb()) print(f'Acquired node resources', flush=True) self.resource_manager = ResourceManager(total_resources=SystemResources(total_resources), fixed_resources=SystemResources(fixed_resources)) self.job_queues = job_queues self.feedback_queues = feedback_queues def 
get_resource_manager(self) -> ResourceManager: if self.resource_manager is None: raise RuntimeError('called get_resource_manager() before start()') return self.resource_manager def submit_job(self, job_info: JobInfo) -> None: import dill if self.resource_manager is None: raise RuntimeError('called submit_job() before start()') job = job_info.job job_id = job_info.job_id assigned_resources = job_info.assigned_resources if assigned_resources is None: raise RuntimeError('assigned_resources for submitted job must not be None') node_id = assigned_resources.node_id print(f'Scheduling job {job.get_desc()} on node {node_id}', flush=True) job_str = dill.dumps((job, job_id, assigned_resources)) self.job_queues[node_id].put(job_str) self.resource_manager.job_started(job_info) def pop_finished_job_infos(self, timeout_s: float = -1.0) -> List[JobInfo]: if self.resource_manager is None: raise RuntimeError('called pop_results() before start()') has_new_result = False start_time = time.time() job_infos = [] while not has_new_result: if timeout_s > 0.0 and time.time() > start_time + timeout_s: # timeout return job_infos for feedback_queue in self.feedback_queues: while not feedback_queue.empty(): job_result = feedback_queue.get() job_info = self.resource_manager.job_finished(job_result) job_infos.append(job_info) has_new_result = True if not has_new_result: time.sleep(0.05) return job_infos def terminate(self) -> None: for jq in self.job_queues: jq.put(False) # termination signal import ray # maybe wait only a bit and then hard terminate otherwise? ray.get(self.runner_futures) ray.shutdown() # class LocalNodeManager(NodeManager): # # start node_runner in a thread # pass ================================================ FILE: pytabkit/bench/scheduling/jobs.py ================================================ import time import traceback import sys from typing import Optional from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models.alg_interfaces.base import RequiredResources class JobResult: """ Helper class to store information about a job that has been run. """ def __init__(self, job_id: int, time_s: float, oom_cpu: bool = False, oom_gpu: bool = False, finished_normally: bool = True, exception_msg: Optional[str] = None): """ :param job_id: Job id. :param time_s: Time in seconds that the job ran for. :param oom_cpu: Whether an out-of-memory error occurred on the CPU. :param oom_gpu: Whether an out-of-memory error occurred on the GPU. :param finished_normally: Whether the job ran normally, such that its time and RAM values are representative of how it would normally run. For example, if the job ran faster because the results were already partially precomputed, it should not count towards the time estimation. Of course, if an exception occurred, we should have finished_normally=False. :param exception_msg: Exception message (if there was any). """ self.job_id = job_id self.time_s = time_s self.oom_cpu = oom_cpu self.oom_gpu = oom_gpu self.finished_normally = finished_normally self.exception_msg = exception_msg self.failed = exception_msg is not None self.max_cpu_ram_gb = 0.0 assert exception_msg is None or not finished_normally def set_max_cpu_ram_gb(self, value: float) -> None: """ Set the maximum RAM usage of the job. :param value: maximum RAM usage in GiB. """ self.max_cpu_ram_gb = value class AbstractJob: """ Abstract base class for jobs that can be scheduled using schedulers in schedulers.py. """ def get_group(self) -> str: """ :return: Should return a "group name" string. 
All jobs with the same "group name" will have a common time factor that is adjusted on-the-fly during scheduling based on already completed jobs. """ raise NotImplementedError() def __call__(self, assigned_resources: NodeResources) -> bool: """ Should perform the main computation of the job. Problematic exceptions should not be caught within this method, they will be caught and printed in the scheduler. :param assigned_resources: Resources that are assigned to this job (conforming with the resources requested in get_required_resources()). :return: Should return True if the execution finished normally such that the timing of this job is representative. In cases where pre-computed results were available such that the job is shorter than usual, return False. """ raise NotImplementedError() def get_required_resources(self) -> RequiredResources: """ :return: Return the resources requested by this job. """ raise NotImplementedError() def get_desc(self) -> str: """ :return: Return a description that can be logged, e.g., when the job is started and when it finishes. """ raise NotImplementedError() class JobRunner: """ Helper class that runs an AbstractJob, catches exceptions, measures time and RAM usage, and returns its result. """ def __init__(self, job: AbstractJob, job_id: int, assigned_resources: NodeResources): """ :param job: The job to be run. :param job_id: An ID that will be returned at the end so that the job can be identified. :param assigned_resources: Assigned resources to run the job. """ self.job = job self.job_id = job_id self.assigned_resources = assigned_resources def __call__(self) -> JobResult: """ Runs the job computation. :return: Returns a JobResult object that includes information about the job. """ start_time = time.time() oom_gpu = False oom_cpu = False exception_msg = None try: finished_normally = self.job(self.assigned_resources) except Exception as e: finished_normally = False exception_msg = traceback.format_exc() print(exception_msg, file=sys.stderr, flush=True) if isinstance(e, MemoryError): oom_cpu = True elif isinstance(e, RuntimeError) and 'cuda out of memory' in exception_msg.lower(): oom_gpu = True elif isinstance(e, KeyboardInterrupt): raise e end_time = time.time() return JobResult(job_id=self.job_id, time_s=end_time-start_time, oom_cpu=oom_cpu, oom_gpu=oom_gpu, finished_normally=finished_normally, exception_msg=exception_msg) ================================================ FILE: pytabkit/bench/scheduling/resource_manager.py ================================================ import copy import enum import time from typing import Optional from pytabkit.bench.scheduling.jobs import AbstractJob, JobResult from pytabkit.bench.scheduling.resources import NodeResources, SystemResources class JobStatus(enum.Enum): REMAINING = 0 RUNNING = 1 SUCCEEDED = 2 FAILED = 3 class JobInfo: def __init__(self, job: AbstractJob, job_id: int, start_time: Optional[float] = None, assigned_resources: Optional[NodeResources] = None, job_result: Optional[JobResult] = None): self.job = job self.job_id = job_id self.start_time = start_time self.assigned_resources = assigned_resources self.required_resources = job.get_required_resources() self.job_result = job_result def get_status(self) -> JobStatus: if self.start_time is None: return JobStatus.REMAINING elif self.job_result is None: return JobStatus.RUNNING elif self.job_result.failed: return JobStatus.FAILED else: return JobStatus.SUCCEEDED def set_started(self, assigned_resources: NodeResources): self.start_time = time.time() 
self.assigned_resources = assigned_resources def set_finished(self, job_result: JobResult): self.job_result = job_result def is_remaining(self): return self.get_status() == JobStatus.REMAINING def is_running(self): return self.get_status() == JobStatus.RUNNING def is_finished(self): return self.get_status() in [JobStatus.FAILED, JobStatus.SUCCEEDED] def is_failed(self): return self.get_status() == JobStatus.FAILED def is_succeeded(self): return self.get_status() == JobStatus.SUCCEEDED class ResourceManager: """ Keeps track of running jobs and available resources. """ def __init__(self, total_resources: SystemResources, fixed_resources: SystemResources): self.total_resources = total_resources self.fixed_resources = fixed_resources self.running_job_infos = dict() # map job_id to job_info def get_fixed_resources(self): return self.fixed_resources def get_total_resources(self): return self.total_resources def get_free_resources(self): free_resources = copy.deepcopy(self.total_resources) for ji in self.running_job_infos.values(): ar = ji.assigned_resources free_resources.resources[ar.node_id] -= ar return free_resources def job_started(self, job_info: JobInfo): job_info.start_time = time.time() if job_info.job_id in self.running_job_infos: raise RuntimeError(f'Trying to start job {job_info.job.get_desc()}, which is already running!') self.running_job_infos[job_info.job_id] = job_info def job_finished(self, job_result: JobResult) -> JobInfo: ji = self.running_job_infos[job_result.job_id] ji.set_finished(job_result) if job_result.exception_msg is not None: print(f'Job failed: {ji.job.get_desc()}\nException: {job_result.exception_msg}') del self.running_job_infos[job_result.job_id] return ji ================================================ FILE: pytabkit/bench/scheduling/resources.py ================================================ from typing import Optional, List import numpy as np import copy from pytabkit.models.alg_interfaces.base import InterfaceResources, RequiredResources # already add fixed GPU RAM in assigned resources? (problem: does try_assign know these fixed resources?) # or have fixed_resources: NodeResources that are added each time? # problem: fixed resources only need to be added to those GPUs that are actually assigned # or maybe a method add_fixed_resources that takes in the fixed GPU RAM assignments class NodeResources: """ Represents available/used/free resources on a compute node.
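    The resources are stored in a single flat numpy array with layout
    [n_threads, cpu_ram_gb, *gpu_usages, *gpu_rams_gb, *physical_core_usages],
    so that adding and subtracting resources reduces to elementwise array arithmetic.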
""" def __init__(self, node_id: int, n_threads: float, cpu_ram_gb: float, gpu_usages: np.ndarray, gpu_rams_gb: np.ndarray, physical_core_usages: np.ndarray): self.node_id = node_id self.n_gpus = len(gpu_usages) self.data: np.ndarray = np.array(np.concatenate( [[n_threads, cpu_ram_gb], gpu_usages, gpu_rams_gb, physical_core_usages])) self.data.setflags(write=True) def get_n_threads(self) -> int: return round(self.data[0]) def set_n_threads(self, n_threads: int): # somehow necessary because self.data can get non-writeable after transmitting it from another ray process self.data = np.copy(self.data) self.data[0] = n_threads def get_cpu_ram_gb(self) -> float: return self.data[1] def set_cpu_ram_gb(self, cpu_ram_gb: float) -> None: # somehow necessary because self.data can get non-writeable after transmitting it from another ray process self.data = np.copy(self.data) self.data[1] = cpu_ram_gb def set_gpu_rams_gb(self, gpu_rams_gb: np.ndarray) -> None: # somehow necessary because self.data can get non-writeable after transmitting it from another ray process self.data = np.copy(self.data) self.data[2+self.n_gpus:2+2*self.n_gpus] = gpu_rams_gb def get_gpu_usages(self) -> np.ndarray: return self.data[2:2+self.n_gpus] def get_gpu_rams_gb(self) -> np.ndarray: return self.data[2+self.n_gpus:2+2*self.n_gpus] def get_physical_core_usages(self) -> np.ndarray: return self.data[2+2*self.n_gpus:] def get_n_physical_cores(self) -> int: return len(self.data) - (2+2*self.n_gpus) def get_total_gpu_ram_gb(self) -> float: return np.sum(self.get_gpu_rams_gb()) def get_total_gpu_usage(self) -> float: return np.sum(self.get_gpu_usages()) def get_used_gpu_ids(self) -> np.ndarray: # todo: naming return np.argwhere(self.get_gpu_usages() > 1e-8)[:, 0] def get_used_physical_cores(self) -> np.ndarray: return np.argwhere(self.get_physical_core_usages() > 1e-8)[:, 0] def get_resource_vector(self) -> np.ndarray: return np.asarray([self.get_n_threads(), self.get_cpu_ram_gb(), self.get_total_gpu_usage(), self.get_total_gpu_ram_gb()]) def get_interface_resources(self) -> InterfaceResources: return InterfaceResources(n_threads=self.get_n_threads(), gpu_devices=[f'cuda:{i}' for i in self.get_used_gpu_ids()]) def __iadd__(self, other: 'NodeResources') -> 'NodeResources': # operator += self.data += other.data # todo: some compatibility checks? 
return self def __isub__(self, other: 'NodeResources') -> 'NodeResources': self.data -= other.data return self def __imul__(self, other: 'NodeResources') -> 'NodeResources': self.data *= other.data return self def __itruediv__(self, other: 'NodeResources') -> 'NodeResources': self.data /= other.data return self def __add__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result += other return result def __sub__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result -= other return result def __mul__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result *= other return result def __truediv__(self, other: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(self) result /= other return result def try_assign(self, required_resources: RequiredResources, fixed_resources: 'SystemResources') -> Optional['NodeResources']: rr = required_resources fr = fixed_resources.resources[self.node_id] if not rr.should_add_fixed_resources(): fr = NodeResources.zeros_like(fr) # todo: distribution across GPUs is potentially suboptimal # CPU stuff n_threads = fr.get_n_threads() + rr.n_threads if self.get_n_threads() < n_threads: return None cpu_ram_gb = fr.get_cpu_ram_gb() + rr.cpu_ram_gb if self.get_cpu_ram_gb() < cpu_ram_gb: return None n_cores = rr.n_explicit_physical_cores physical_core_usages = np.zeros(self.get_n_physical_cores()) if n_cores > 0: free_pcu = self.get_physical_core_usages() free_in_sequence = np.convolve(free_pcu, np.ones(n_cores),'valid') idx = np.argmax(free_in_sequence >= n_cores - 0.5) if free_in_sequence[idx] >= n_cores - 0.5: physical_core_usages[idx:idx+n_cores] = 1.0 else: return None # GPU stuff gpu_usages = np.zeros(self.n_gpus) gpu_rams_gb = np.zeros(self.n_gpus) gpu_usages_all = fr.get_gpu_usages() + rr.gpu_usage gpu_rams_gb_all = fr.get_gpu_rams_gb() + rr.gpu_ram_gb gpu_availability = np.logical_and(gpu_usages_all <= self.get_gpu_usages() + 1e-8, gpu_rams_gb_all <= self.get_gpu_rams_gb()) # print(f'{fr.get_gpu_rams_gb()=}, {rr.gpu_ram_gb=}') # print(f'{gpu_usages_all=}, {gpu_rams_gb_all=}, {self.get_gpu_usages()=}, {self.get_gpu_rams_gb()=}, {gpu_availability=}') available_gpus = np.argwhere(gpu_availability)[:, 0] # squeeze second dimension # sort available gpus by usage available_gpu_usages = self.get_gpu_usages()[available_gpus] # pick gpus with most free resources first available_gpus = available_gpus[np.argsort(available_gpu_usages)[::-1]] # print('gpu selection:', gpu_availability, available_gpu_usages, available_gpus) if len(available_gpus) < rr.n_gpus: return None else: gpu_ids = available_gpus[:rr.n_gpus] for i in gpu_ids: gpu_usages[i] = gpu_usages_all[i] gpu_rams_gb[i] = gpu_rams_gb_all[i] return NodeResources(node_id=self.node_id, n_threads=n_threads, cpu_ram_gb=cpu_ram_gb, gpu_usages=gpu_usages, gpu_rams_gb=gpu_rams_gb, physical_core_usages=physical_core_usages) # todo: maybe a __str__ or __repr__ method for printing? @staticmethod def zeros_like(node_resources: 'NodeResources') -> 'NodeResources': result = copy.deepcopy(node_resources) result.data *= 0 return result class SystemResources: """ System resources, consisting of NodeResources for each node. 
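    The arithmetic operators apply node-wise; for example, ResourceManager.get_free_resources()
    essentially computes total_resources minus the assigned NodeResources of all running jobs.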
""" def __init__(self, resources: List[NodeResources]): self.resources = resources def __getitem__(self, index: int): return self.resources[index] def __len__(self): return len(self.resources) def __iadd__(self, other): for i in range(len(self.resources)): self.resources[i] += other.resources[i] return self def __isub__(self, other): for i in range(len(self.resources)): self.resources[i] -= other.resources[i] return self def __imul__(self, other): for i in range(len(self.resources)): self.resources[i] *= other.resources[i] return self def __itruediv__(self, other): for i in range(len(self.resources)): self.resources[i] /= other.resources[i] return self def __add__(self, other): result = copy.deepcopy(self) result += other return result def __sub__(self, other): result = copy.deepcopy(self) result -= other return result def __mul__(self, other): result = copy.deepcopy(self) result *= other return result def __truediv__(self, other): result = copy.deepcopy(self) result /= other return result def get_n_threads(self): return sum([r.get_n_threads() for r in self.resources]) def get_cpu_ram_gb(self): return sum([r.get_cpu_ram_gb() for r in self.resources]) def get_gpu_usage(self): return sum([r.get_total_gpu_usage() for r in self.resources]) def get_gpu_ram_gb(self): return sum([r.get_total_gpu_ram_gb() for r in self.resources]) def get_num_gpus(self): return sum([r.n_gpus for r in self.resources]) def get_resource_vector(self): return sum([r.get_resource_vector() for r in self.resources]) # todo: maybe a __str__ or __repr__ method for printing? ================================================ FILE: pytabkit/bench/scheduling/schedulers.py ================================================ import copy import sys import time from typing import List, Dict, Union import numpy as np from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.jobs import AbstractJob from pytabkit.bench.scheduling.resource_manager import JobInfo def format_length_s(duration: float) -> str: seconds = int(duration) minutes = seconds // 60 seconds -= minutes * 60 hours = minutes // 60 minutes -= hours * 60 days = hours // 24 hours -= days * 24 result = f'{seconds}s' if minutes > 0: result = f'{minutes}m' + result if hours > 0: result = f'{hours}h' + result if days > 0: result = f'{days}d' + result return result def format_date_s(time_s: float) -> str: return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_s)) class BaseJobScheduler: """ Base scheduler class where the logic for selecting which jobs should be run next still has to be implemented. Contains functionality for printing intermediate states and the main loop in run(). 
""" def __init__(self, job_manager: RayJobManager): self.start_time = time.time() self.job_manager = job_manager self.job_infos: List[JobInfo] = [] def _submit_more_jobs(self) -> None: # to be implemented in subclasses raise NotImplementedError() def add_jobs(self, jobs: List[AbstractJob]): for job in jobs: self.job_infos.append(JobInfo(job, job_id=len(self.job_infos))) def run(self): if len(self.job_infos) == 0: print(f'No jobs to run') return self.job_manager.start() self._print_start() while self._has_unfinished_jobs(): self._submit_more_jobs() self._print_progress() wait_period_s = 30 finished_job_infos = self.job_manager.pop_finished_job_infos(timeout_s=wait_period_s) if len(finished_job_infos) == 0: # no jobs finished after wait_period_s, print a running report and then wait for longer self._print_running_jobs() finished_job_infos = self.job_manager.pop_finished_job_infos() for job_info in finished_job_infos: # update the status of the job infos that have been finished self.job_infos[job_info.job_id] = job_info # todo: register finished job infos in self self._print_end() self.job_manager.terminate() def _has_unfinished_jobs(self) -> bool: return any(not ji.is_finished() for ji in self.job_infos) def _print_start(self): self.start_time = time.time() print( f'############################### START REPORT ##################################\n' f'# Start date: {format_date_s(self.start_time)}\n' f'# Number of jobs: {len(self.job_infos)}\n' f'###############################################################################', flush=True ) def _print_end(self): end_time = time.time() duration = end_time - self.start_time group_stats = self._compute_group_stats() ram_factors = [ji.job_result.max_cpu_ram_gb / ji.assigned_resources.get_cpu_ram_gb() for ji in self.job_infos] ram_factors.sort(reverse=True) if len(ram_factors) > 5: ram_factors = ram_factors[:5] time_factors_string = '\n'.join([f'# Time factor for {key}: {value["time_factor"]}' for key, value in group_stats.items()]) n_jobs_failed = len([ji for ji in self.job_infos if ji.is_failed()]) print( f'################################ END REPORT ###################################\n' f'# Start date: {format_date_s(self.start_time)}\n' f'# End date: {format_date_s(end_time)}\n' f'# Duration: {format_length_s(duration)}\n' f'# Number of failed jobs: {n_jobs_failed}\n' f'# Largest RAM factors: {ram_factors}\n' f'{time_factors_string}\n' f'###############################################################################', flush=True ) def _compute_group_stats(self) -> Dict[str, Dict[str, Union[int, float]]]: job_groups = [ji.job.get_group() for ji in self.job_infos] groups = set(job_groups) group_stats = {} for group in groups: job_infos: List[JobInfo] = [ji for ji, jg in zip(self.job_infos, job_groups) if jg == group] started_job_infos = [ji for ji in job_infos if not ji.is_remaining()] running_job_infos = [ji for ji in started_job_infos if ji.is_running()] finished_job_infos = [ji for ji in job_infos if ji.is_finished()] finished_job_infos_with_time = [ji for ji in finished_job_infos if ji.job_result.finished_normally] n_started = len(started_job_infos) n_running = len(running_job_infos) n_finished = len(finished_job_infos) n_finished_with_time = len(finished_job_infos_with_time) if n_started == 0 or (n_finished_with_time == 0 and n_running == 0): time_factor = 1.0 elif n_finished_with_time == 0: current_time = time.time() elapsed_time = sum([current_time - ji.start_time for ji in running_job_infos]) predicted_time_units = 
sum([ji.required_resources.time_s for ji in running_job_infos]) time_factor = max(1.0, elapsed_time / (predicted_time_units + 1e-8)) else: used_time = sum([ji.job_result.time_s for ji in finished_job_infos_with_time]) predicted_time_units = sum([ji.required_resources.time_s for ji in finished_job_infos_with_time]) time_factor = used_time / (predicted_time_units + 1e-8) group_stats[group] = {'time_factor': time_factor, 'n_started': n_started, 'n_running': n_running, 'n_finished': n_finished, 'n_finished_with_time': n_finished_with_time} return group_stats def _get_time_estimates(self, job_infos: List[JobInfo], group_stats: Dict[str, Dict[str, Union[int, float]]]) \ -> np.ndarray: current_time = time.time() startup_time_s = 1.0 # guessed time_estimates = [] for ji in job_infos: if ji.is_finished(): time_estimates.append(0.0) # job is already finished continue rr = ji.required_resources time_estimate = group_stats[ji.job.get_group()]['time_factor'] * rr.time_s if not ji.is_remaining(): time_estimate = max(0.0, time_estimate - (current_time - ji.start_time)) else: time_estimate += startup_time_s time_estimates.append(time_estimate) return np.asarray(time_estimates) def _print_progress(self): group_stats = self._compute_group_stats() resource_manager = self.job_manager.get_resource_manager() start_time = self.start_time current_time = time.time() elapsed_time = current_time - start_time total_resources = resource_manager.get_total_resources() fixed_resources = resource_manager.get_fixed_resources() average_fixed_resources = (fixed_resources * total_resources).get_resource_vector() \ / (total_resources.get_resource_vector() + 1e-8) job_infos = self.job_infos n_jobs_finished = len([ji for ji in job_infos if ji.is_finished()]) # succeeded and failed ones n_jobs_remaining = len([ji for ji in job_infos if ji.is_remaining()]) n_jobs_failed = len([ji for ji in job_infos if ji.is_failed()]) n_jobs_running = len(job_infos) - n_jobs_finished - n_jobs_remaining time_estimates = self._get_time_estimates(job_infos, group_stats=group_stats) argmax_time_estimate = np.argmax(time_estimates) longest_job_desc = job_infos[argmax_time_estimate].job.get_desc() longest_time_estimate: float = time_estimates[argmax_time_estimate] system_resource_vec = total_resources.get_resource_vector() # estimate \sum_{jobs} job_resources * remaining_job_time # (could also do physical cores, but that should be covered by threads) total_job_time_resource_vec = sum([ji.required_resources.get_resource_vector(average_fixed_resources) * te for ji, te in zip(job_infos, time_estimates)]) # todo: improve this estimate towards the end of a run? remaining_time_estimate = np.max(total_job_time_resource_vec / (system_resource_vec + 1e-8)) elapsed_fraction = elapsed_time / (elapsed_time + remaining_time_estimate) end_date_str = format_date_s(current_time + remaining_time_estimate) # todo: also print predicted system usage in percent (relative to criticality of resources)? # or print current relative resource usages and remaining task relative resource usages # todo: also log this somewhere automatically? 
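        # The estimate above treats each resource (threads, CPU RAM, GPU usage, GPU RAM) as a
        # potential bottleneck: sum over jobs of (job usage * remaining job time) divided by the
        # total capacity gives the time needed if only that resource limited throughput; the
        # maximum over the resources is used as remaining_time_estimate.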
print( f'############################ INTERMEDIATE REPORT ##############################\n' f'# {n_jobs_finished} jobs finished ({n_jobs_failed} failed), {n_jobs_running} jobs running, {n_jobs_remaining} jobs remaining\n' f'# Elapsed: {format_length_s(elapsed_time)} ({elapsed_time:.2f}s)\n' f'# Remaining: {format_length_s(remaining_time_estimate)} ({remaining_time_estimate:.2f}s)\n' f'# Percent completed: {100 * elapsed_fraction:.2f}%\n' f'# Estimated end time: {end_date_str}\n' f'# Current time: {format_date_s(current_time)}\n' f'# Longest remaining job: {longest_job_desc} with {format_length_s(longest_time_estimate)}\n' f'###############################################################################', flush=True ) def _print_running_jobs(self): group_stats = self._compute_group_stats() current_time = time.time() job_infos = self.job_infos n_jobs_finished = len([ji for ji in job_infos if ji.is_finished()]) n_jobs_remaining = len([ji for ji in job_infos if ji.is_remaining()]) n_jobs_running = len(job_infos) - n_jobs_finished - n_jobs_remaining time_estimates = self._get_time_estimates(job_infos, group_stats=group_stats) job_strs = [] sorted_time_idxs = np.argsort(time_estimates) # for ji, time_estimate in zip(job_infos, time_estimates): for i in sorted_time_idxs: ji = job_infos[i] time_estimate = time_estimates[i] if not ji.is_running(): continue # job is not currently running job: AbstractJob = ji.job job_desc = job.get_desc() job_str = (f'# Job {job_desc} has been running for {format_length_s(current_time-ji.start_time)}' f', estimated remaining time: {format_length_s(time_estimate)}') job_strs.append(job_str) print( f'############################### RUNNING REPORT ################################\n' f'# Current time: {format_date_s(current_time)}, {n_jobs_running} jobs are running:\n' + '\n'.join(job_strs) + '\n' + f'###############################################################################', flush=True ) class SimpleJobScheduler(BaseJobScheduler): """ Simple scheduler. Submits jobs with the largest estimated time. If a job doesn't fit, jobs with not too much smaller time can be submitted instead. In the beginning, the scheduler ensures that at least three jobs from each group are run (e.g. 3x XGB, 3x LGBM, 3x MLP). """ def _submit_more_jobs(self) -> None: min_starts_per_group = 3 job_infos = [ji for ji in self.job_infos if ji.is_remaining()] # need running jobs as well for n_started_times? 
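        # Illustration of the policy implemented below (hypothetical numbers): with
        # min_time_factor = 0.1, if a job estimated at 1000s cannot currently be placed on any
        # node, only waiting jobs estimated at >= 100s may be started ahead of it.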
if len(job_infos) == 0: print(f'No job infos remaining') return group_stats = self._compute_group_stats() job_times = self._get_time_estimates(job_infos, group_stats) n_started_times = {key: value['n_running'] + value['n_finished_with_time'] for key, value in group_stats.items()} resource_manager = self.job_manager.get_resource_manager() # n_started_times = [group_stats[ji['job'].get_group()]['n_running'] # + group_stats[ji['job'].get_group()]['n_finished_with_time'] for ji in job_infos] free_resources = copy.deepcopy(resource_manager.get_free_resources()) fixed_resources = resource_manager.get_fixed_resources() if any(value < min_starts_per_group for value in n_started_times.values()): # need to start jobs first from groups where we don't have enough time measurements yet # do this by increasing their job_times estimate job_times_offset = 2 * np.max(job_times) for group, n_started in n_started_times.items(): if n_started < min_starts_per_group: job_idxs = np.asarray([i for i, ji in enumerate(job_infos) if ji.job.get_group() == group], dtype=np.int32) sort_perm = np.argsort(job_times[job_idxs]) n_offset = min(len(sort_perm), min_starts_per_group - n_started) # add job_times_offset to the n_offset jobs from this group with largest time estimate job_times[job_idxs[sort_perm[-n_offset:]]] += job_times_offset # if a job with time estimate t cannot be started, # don't start jobs with time estimate less than min_time_factor * t # the maximum value of t is tracked in max_non_started_time min_time_factor = 0.1 max_non_started_time = 0.0 job_idxs_sorted = np.argsort(job_times)[::-1] # sort descending for job_idx in job_idxs_sorted: if job_times[job_idx] < min_time_factor * max_non_started_time: # don't start too fast jobs if other much slower ones are waiting return job_info = job_infos[job_idx] # otherwise, try assigning the job for node_idx, r in enumerate(free_resources.resources): # print(f'{fixed_resources.__dict__=}') # print(f'{job_info.required_resources.__dict__=}') # print(f'{r.data=}, {r.get_resource_vector()=}, {node_idx=}') assigned_resources = r.try_assign(job_info.required_resources, fixed_resources) # print(f'{bool(assigned_resources)=}') if assigned_resources is not None: job_info.set_started(assigned_resources) self.job_manager.submit_job(job_info) free_resources.resources[node_idx] -= assigned_resources break else: # could not assign the job max_non_started_time = max(max_non_started_time, job_times[job_idx]) class CustomJobScheduler(BaseJobScheduler): """ More complicated scheduler with different heuristics for which jobs to submit first (based on which resources it thinks are scarce, estimated time, which methods have not been run yet, etc.). This scheduler can be slow for a large number of jobs (say 10,000 or more). """ def _submit_more_jobs(self) -> None: # todo: how to handle OOM errors? Reduce total memory of nodes? Or increase memory of jobs? # Or add constants to free_resources? 
# maybe check if last error is at least one minute ago or so # current error handling: count job as finished, don't rerun min_starts_per_group = 3 job_infos = [ji for ji in self.job_infos if not ji.is_finished()] group_stats = self._compute_group_stats() job_times = self._get_time_estimates(job_infos, group_stats) n_started_time = {key: value['n_running'] + value['n_finished_with_time'] for key, value in group_stats.items()} resource_manager = self.job_manager.get_resource_manager() # n_started_time = [group_stats[ji['job'].get_group()]['n_running'] # + group_stats[ji['job'].get_group()]['n_finished_with_time'] for ji in job_infos] total_resources = resource_manager.get_total_resources() free_resources = copy.deepcopy(resource_manager.get_free_resources()) fixed_resources = resource_manager.get_fixed_resources() print('total_resources.get_resource_vector():', total_resources.get_resource_vector()) system_rv = total_resources.get_resource_vector() job_availability = np.asarray([1.0 if ji.is_remaining() else 0.0 for ji in job_infos]) # n_nodes x 4 total_node_rvs = np.asarray([r.get_resource_vector() for r in total_resources.resources]) # shape: 4 average_fixed_rv = (fixed_resources * total_resources).get_resource_vector() \ / (total_resources.get_resource_vector() + 1e-8) job_rvs = np.asarray([ji.required_resources.get_resource_vector(average_fixed_rv) for ji in job_infos]) remaining_job_time_rv = sum([job_rv * job_time for job_rv, job_time in zip(job_rvs, job_times)]) remaining_times_by_resource = remaining_job_time_rv / (system_rv + 1e-10) remaining_distr = remaining_times_by_resource / (np.max(remaining_times_by_resource) + 1e-8) criticality = np.exp(5.0 * remaining_distr) criticality /= np.sum(criticality) # tempered softmax # max_remaining_time = np.max(remaining_times_by_resource) node_job_runability = np.asarray( [[r.try_assign(ji.required_resources, fixed_resources) is not None for ji in job_infos] for r in total_resources.resources]) job_runability = np.any(node_job_runability, axis=0) # print('job_runability:', job_runability) for i in np.argwhere(~job_runability): # job i cannot run on any node, even if they are completely empty resource_vector = job_infos[int(i)].required_resources.get_resource_vector(average_fixed_rv) print(f'The following job does not fit on any node: {job_infos[int(i)].job.get_desc()}' f', its required resource vector is {resource_vector}.', file=sys.stderr, flush=True) job_availability[i] = 0.0 while np.sum(job_availability) > 0.0: # if nodes get full before jobs run out, a return statement in the loop is used used_resources = total_resources - free_resources used_node_rvs = np.asarray([r.get_resource_vector() for r in used_resources.resources]) # All scores will have shape n_nodes x n_jobs or broadcast to it # ----- Assignability ----- assignments = [[r.try_assign(ji.required_resources, fixed_resources) for ji in job_infos] for r in free_resources.resources] assignability_score = np.asarray([[1.0 if a is not None else 0.0 for a in l] for l in assignments]) # ----- Uncertainty score ----- uncertainty_score = np.asarray([ max(0.0, min_starts_per_group - n_started_time[ji.job.get_group()]) for ji in job_infos]) uncertainty_score = uncertainty_score[None, :] # ----- Short Job Penalty ----- # only use still available jobs for remaining partial sums job_times_rvs = job_times[:, None] * job_rvs * job_availability[:, None] perm = np.argsort(job_times) time_rv_partial_sums = np.zeros_like(job_times_rvs) time_rv_partial_sums[perm] = np.cumsum(job_times_rvs[perm], 
axis=0) time_partial_sums = [np.max(trps / (system_rv + 1e-8)) for trps in time_rv_partial_sums] max_time = np.max(job_times) # todo: use times of all jobs, including currently running ones? partial_sum_threshold = 3 * max_time # heuristic # penalty in [0, 1], largest for shortest jobs # shape: n_jobs short_job_penalty = (partial_sum_threshold - time_partial_sums) / partial_sum_threshold short_job_penalty[short_job_penalty < 0.0] = 0.0 short_job_penalty = short_job_penalty[None, :] # extend by node dimension # ----- Time score ----- # could also use max_remaining_time in denominator instead time_score = job_times[None, :] / (max_time + 1e-8) # in [0, 1] # ----- Resource score ----- resource_score = np.sum(job_rvs[None, :, :] * criticality[None, None, :], axis=-1) resource_score /= (np.max(resource_score) + 1e-8) # now in [0, 1] # ----- Utilization score ----- # use as shape: n_nodes x n_jobs x 4 new_resources = used_node_rvs[:, None, :] + job_rvs[None, :, :] new_utilization = new_resources / (total_node_rvs[:, None, :] + 1e-10) # what you could have got with uniform utilization new_opt_resources = np.max(new_utilization, axis=-1, keepdims=True) * total_node_rvs[:, None, :] # multiplying utilization with resources avoids the 0/0 GPU utilization problem new_missed_resources = new_opt_resources - new_resources old_resources = used_node_rvs[:, None, :] old_utilization = old_resources / (total_node_rvs[:, None, :] + 1e-10) # what you could have got with uniform utilization old_opt_resources = np.max(old_utilization, axis=-1, keepdims=True) * total_node_rvs[:, None, :] # multiplying utilization with resources avoids the 0/0 GPU utilization problem old_missed_resources = old_opt_resources - old_resources missing_improvement = np.sum((new_missed_resources - old_missed_resources) * criticality[None, None, :], axis=-1) running_improvement = np.sum(job_rvs[None, :, :] * criticality[None, None, :], axis=-1) # should be in (-\infty, 1] utilization_score = np.max(new_utilization, axis=-1) * missing_improvement / (running_improvement + 1e-8) utilization_score = utilization_score / (1.0 + np.abs(utilization_score)) # now in (-1, 1) # ----- Joint score ----- # print(utilization_score.shape, time_score.shape, resource_score.shape, assignability_score.shape, # short_job_penalty.shape, uncertainty_score.shape) joint_score = utilization_score + 0.3 * time_score + 0.2 * resource_score - 0.5 * assignability_score \ - 5 * short_job_penalty + 1000 * uncertainty_score low_value = np.min(joint_score) - 1 joint_score[:, job_availability <= 0.5] = low_value joint_score[~node_job_runability] = low_value # ----- Find next node-job pair ----- # strategy: find next best node-job pair. # If no assignment possible, terminate. # If job can be run now (assignable), add to list and recompute scores. # If job is not assignable to node, # block all jobs on node and block job on all nodes where it is not assignable. # Then loop back to next best node-job pair. 
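# Illustration with hypothetical numbers: for 2 nodes and 3 jobs, joint_score has shape (2, 3), and
# np.unravel_index(np.argmax(joint_score), joint_score.shape) yields the best (node_idx, job_idx)
# pair, e.g. (1, 0) if running job 0 on node 1 currently has the highest joint score.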
while True: # loop until an assignment is found or all nodes are blocked by unassignable jobs best_idxs = np.unravel_index(np.argmax(joint_score), joint_score.shape) if joint_score[best_idxs] == low_value: print('No job remaining') return node_idx = best_idxs[0] job_idx = best_idxs[1] assigned_resources = assignments[node_idx][job_idx] if assigned_resources is None: # node is too full to run job now print('Node too full') # block node for now joint_score[node_idx, :] = low_value # make sure that job can only be stolen by other nodes if they are assignable joint_score[assignability_score[:, job_idx] == 0.0, job_idx] = low_value else: print('Assigning job') job_availability[job_idx] = 0.0 job_info = job_infos[job_idx] job_info.set_started(assigned_resources) self.job_manager.submit_job(job_info) free_resources.resources[node_idx] -= assigned_resources n_started_time[job_info.job.get_group()] += 1 break # leave inner loop, recompute scores
================================================ FILE: pytabkit/models/__init__.py ================================================
================================================ FILE: pytabkit/models/alg_interfaces/__init__.py ================================================
================================================ FILE: pytabkit/models/alg_interfaces/alg_interfaces.py ================================================
import functools import warnings from pathlib import Path from typing import List, Tuple, Any, Optional, Dict import torch from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.nested_dict import NestedDict from pytabkit.models.hyper_opt.hyper_optimizers import HyperOptimizer from pytabkit.models import utils from pytabkit.models.data.data import DictDataset, TaskType from pytabkit.models.torch_utils import cat_if_necessary from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics class AlgInterface: """ AlgInterface is an abstract base class for tabular ML methods with an interface that offers more possibilities than a standard scikit-learn interface. In particular, it allows for parallelized fitting of multiple models, bagging, and refitting. The idea is as follows: - The dataset can be split into a test set and the remaining data. (We call this a trainval-test split.) The fit() method allows specifying multiple such splits, and some AlgInterface implementations (NNAlgInterface) allow vectorizing computations across these splits. However, for vectorization, we may require that the test set sizes are identical in all splits. - The remaining data can further be split into training and validation data. (We call this a train-val split.) AlgInterface allows fitting with one or multiple train-val splits, which can also be vectorized in NNAlgInterface. Optionally, the function `get_refit_interface()` allows extracting an AlgInterface that can be used for fitting the model on the training+validation set with the best settings found on the validation set in the cross-validation stage (represented by self.fit_params). These "best settings" could be an early stopping epoch or number of trees, or the best hyperparameters found by hyperparameter optimization. We call this refitting. Another feature of AlgInterface is that it provides methods to get (an estimate of) required resources and to evaluate metrics on the training, validation, and test sets.
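A minimal usage sketch (hypothetical; `my_interface` stands for any concrete AlgInterface subclass, with the arguments as documented in fit()):

    results = my_interface.fit_and_eval(ds, idxs_list, interface_resources, logger, tmp_folders, name='my_alg', metrics=metrics, return_preds=False)
    y_pred = my_interface.predict(ds)  # shape [n_models, n_samples, output_shape]
    refit_interface = my_interface.get_refit_interface(n_refit=1)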
""" def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ :param fit_params: This parameter can be used to store the best hyperparameters found during fit() in (cross-)validation mode. These can then be used for fit() in refitting mode. If fit_params is not None, it should be a list with one dictionary per trainval-test split. The dictionaries then contain the obtained hyperparameters for each of the trainval-test splits. Normally, there are no best parameters per train-val split as we might not have the same number of refitted models as train-val splits. :param config: Other parameters. """ self.config = config self.fit_params = fit_params self.curr_pred_params_name = '' def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: """ Fit the models on the given data and splits. Should be overridden by subclasses unless fit_and_eval() is overloaded. In the latter case, this method will by default use fit_and_eval() and discard the evaluation. :param ds: DictDataset representing the dataset. Should be on the CPU. :param idxs_list: List containing one SplitIdxs object per trainval-test split. Indices should be on the CPU. :param interface_resources: Resources assigned to fit(). :param logger: Logger that can be used for logging. :param tmp_folders: List of paths that can be used for storing intermediate data. The paths can be None, in which case methods will try not to save intermediate results. There should be one folder per trainval-test-split (i.e. only one per k-fold CV). :param name: Name of the algorithm (for logging). :return: May return information about different possible fit_params settings that can be used. Say a variable `results` is returned that is not None. Then, results[tt_split_idx][tv_split_idx] should be a list of tuples (params, loss). This is useful for k-fold cross-validation, where the params with the best average loss (averaged over tv_split_idx) can be selected for fit_params. """ if self.__class__.fit_and_eval == AlgInterface.fit_and_eval: raise NotImplementedError() # avoid infinite recursion else: self.fit_and_eval(ds, idxs_list, interface_resources, logger, tmp_folders, name, metrics=None, return_preds=False) return None def fit_and_eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str, metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: """ Run fit() with the given parameters and then return the result of eval() with the given metrics. This method can be overridden instead of fit() if it is more convenient. The idea is that for hyperparameter optimization, one has to evaluate each hyperparameter combination anyway after training it, so it is more efficient to implement fit_and_eval() and return the evaluation of the best method at the end. See the documentation of fit() and eval() for the meaning of the parameters and returned values. 
""" if self.__class__.fit == AlgInterface.fit: raise NotImplementedError() # avoid infinite recursion self.fit(ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folders=tmp_folders, name=name) return self.eval(ds=ds, idxs_list=idxs_list, metrics=metrics, return_preds=return_preds) def eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: """ Evaluates the (already fitted) method using various metrics on training, validation, and test sets. The results will also contain the found fit_params and optionally the predictions on the dataset. This method should normally not be overridden in subclasses. :param ds: Dataset. :param idxs_list: List of indices for the training-validation-test splits, one per trainval-test split as in fit(). :param metrics: Metrics object that defines which metrics should be evaluated. If metrics is None, an empty list will be returned (which might avoid unnecessary computation when implementing fit() through fit_and_eval()). :param return_preds: Whether the predictions on the dataset should be included in the returned results. :return: Returns a list with one NestedDict for every trainval-test split. Denote by `results` such a NestedDict object. Then, `results` will contain the following contents: results['metrics', 'train'/'val'/'test', str(n_models), str(start_idx), metric_name] = metric_value Here, an ensemble of the predictions of models [start_idx:start_idx+n_models] will be used. results['y_preds'] = a list (converted from a tensor) with predictions on the whole dataset, included only if return_preds==True. results['fit_params'] = self.fit_params """ if metrics is None: results = [] # for idxs in idxs_list: # result = NestedDict() # for split_name in ['train', 'val', 'test']: # result['metrics'][split_name]['1']['0'] = dict() # if return_preds: # pass # results.append(dict(metrics)) return results X, y = ds.split_xy() y = y.tensors['y'] y_pred_full = self.predict(X).detach().cpu() # print(f'{y_pred_full[0, idxs_list[0].val_idxs[0, 4]]=}') # print(f'{self.predict(X.get_sub_dataset(idxs_list[0].val_idxs[0]))[0, 4]=}') # print(f'{idxs_list[0].val_idxs[0, 4]=}') # print(f'{y=}') # print(f'{y_pred_full=}') # print(f'{y.shape=}') # print(f'{y_pred_full.shape=}') idx = 0 results_list = [] for split_idx, idxs in enumerate(idxs_list): results = NestedDict() y_preds = y_pred_full[idx:idx + idxs.n_trainval_splits] if return_preds: results['y_preds'] = y_preds.numpy().tolist() idx += idxs.n_trainval_splits if idxs.test_idxs is not None: # print(f'{y_preds.shape=}') # print(f'{y.shape=}') results['metrics', 'test'] = metrics.compute_metrics_dict( y_preds=[y_preds[i, idxs.test_idxs] for i in range(y_preds.shape[0])], y=y[idxs.test_idxs], use_ens=True) train_metrics = NestedDict() val_metrics = NestedDict() for i in range(idxs.n_trainval_splits): train_dict = metrics.compute_metrics_dict([y_preds[i, idxs.train_idxs[i]]], y[idxs.train_idxs[i]], use_ens=False) train_metrics['1', str(i)] = train_dict['1', '0'] if idxs.val_idxs is not None and idxs.val_idxs.shape[-1] > 0: # print(f'{y_preds[0, idxs.val_idxs[0, 4]]=}') val_dict = metrics.compute_metrics_dict([y_preds[i, idxs.val_idxs[i]]], y[idxs.val_idxs[i]], use_ens=False) val_metrics['1', str(i)] = val_dict['1', '0'] # print(f'{val_metrics=}') # print(f'{idxs.val_idxs.shape[-1]=}') # print(f'{torch.min(y_preds[0, idxs.val_idxs[0]]).item()=}') # print(f'{ds.tensors["x_cont"][idxs.val_idxs[0, 4]]=}') # 
print(f'{ds.tensors["x_cat"][idxs.val_idxs[0, 4]]=}') results['metrics', 'train'] = train_metrics if idxs.val_idxs is not None: results['metrics', 'val'] = val_metrics if self.fit_params is not None: results['fit_params'] = self.fit_params[split_idx] results_list.append(results) return results_list def predict(self, ds: DictDataset) -> torch.Tensor: """ Method to predict labels on the given dataset. Override in subclasses. :param ds: Dataset on which to predict labels :return: Returns a tensor of shape [n_trainval_splits * n_splits, ds.n_samples, output_shape] In the classification case, output_shape will be the number of classes (even in the binary case) and the outputs will be logits (i.e., softmax should be applied to get probabilities) In the regression case, output_shape will be the target dimension (often 1). """ raise NotImplementedError() def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': """ Returns another AlgInterface that is configured for refitting on the training and validation data. Override in subclasses. :param n_refit: Number of models that should be refitted (with different seeds) per trainval-test split. :param fit_params: Fit parameters (see the constructor) that should be used for refitting. If fit_params is None, self.fit_params will be used instead. :return: Returns the AlgInterface object for refitting. """ raise NotImplementedError() def get_fit_params(self) -> Optional[List[Dict]]: """ :return: Return self.fit_params. """ return self.fit_params def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: """ Estimate the required resources for fit(). :param ds: Dataset. Does not have to contain tensors. :param n_cv: Number of train-val splits per trainval-test split. :param n_refit: Number of refitted models per trainval-test split. :param n_splits: Number of trainval-test splits. :param split_seeds: Seeds for every trainval-test split. :return: Returns estimated required resources. """ raise NotImplementedError() # ------- for alg interfaces that can predict with multiple versions of an algorithm def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: # override in subclasses if more options are available return {'': dict()} def get_current_predict_params_name(self): return self.curr_pred_params_name def get_current_predict_params_dict(self): return self.get_available_predict_params()[self.curr_pred_params_name] def set_current_predict_params(self, name: str) -> None: self.curr_pred_params_name = name def to(self, device: str) -> None: warnings.warn(f'.to() method does nothing for {self.__class__} (not implemented)') class MultiSplitWrapperAlgInterface(AlgInterface): # todo: do we need the option to run this with a "split batch size" > 1 for the NNInterface? 
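# Wrapper that fits one single-split interface per trainval-test split sequentially and concatenates
# their predictions along the first (split) dimension; self.fit_params collects one entry per split.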
def __init__(self, single_split_interfaces: List[AlgInterface], **config): super().__init__(single_split_interfaces=single_split_interfaces, **config) # todo: could allow parallel evaluation, but not for now self.single_split_interfaces = single_split_interfaces def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': # return interface with the hyperparameters found by cross-validation for refitting # this can only be called if some fit method has been called before with validation data fit_params = fit_params or self.fit_params if fit_params is not None: assert len(fit_params) == len(self.single_split_interfaces) fit_params_list = [[p] for p in fit_params] else: fit_params_list = [None] * len(self.single_split_interfaces) return MultiSplitWrapperAlgInterface([s.get_refit_interface(n_refit, p) for p, s in zip(fit_params_list, self.single_split_interfaces)]) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(self.single_split_interfaces) == len(idxs_list) assert len(idxs_list) == len(tmp_folders) for split_idx in range(len(idxs_list)): self.single_split_interfaces[split_idx].fit(ds, [idxs_list[split_idx]], interface_resources, logger, [tmp_folders[split_idx]], name) self.fit_params = [ssi.fit_params[0] for ssi in self.single_split_interfaces] return None def fit_and_eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str, metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: assert len(self.single_split_interfaces) == len(idxs_list) assert len(idxs_list) == len(tmp_folders) results_list = [] for split_idx in range(len(idxs_list)): results_list.extend(self.single_split_interfaces[split_idx].fit_and_eval( ds, [idxs_list[split_idx]], interface_resources, logger, [tmp_folders[split_idx]], name, metrics, return_preds)) return results_list def predict(self, ds: DictDataset) -> torch.Tensor: return cat_if_necessary([s.predict(ds) for s in self.single_split_interfaces], dim=0) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: single_resources = [ ssi.get_required_resources(ds, n_cv, n_refit, n_splits=1, split_seeds=[split_seeds[i]], n_train=n_train) for i, ssi in enumerate(self.single_split_interfaces)] return RequiredResources.combine_sequential(single_resources) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return self.single_split_interfaces[0].get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) for ssi in self.single_split_interfaces: ssi.set_current_predict_params(name) class SingleSplitAlgInterface(AlgInterface): pass # this class is just to document that the fit() and fit_and_eval() functions can only take one split class OptAlgInterface(SingleSplitAlgInterface): def __init__(self, hyper_optimizer: HyperOptimizer, max_resource_config: Dict, **config): super().__init__(**config) # self.create_alg_interface = create_alg_interface self.hyper_optimizer = hyper_optimizer # a configuration that can be passed to self.create_alg_interface() # which should be used for resource estimation. # E.g. 
for tree-based methods this should involve the maximum depth and maximum n_estimators # that can be used during HPO. self.max_resource_config = max_resource_config # self.fit_params['hyper_fit_params'] will contain the optimized parameters, # self.fit_params['sub_fit_params'] will contain the fit_params of the best fitted alg_interface self.best_alg_interface = None self.opt_step = 0 # list where all results from all optimization steps can be stored (except y_preds, to save memory) # this list will then be included into the final results, such that one can retrospectively simulate # what would have happened if the optimization had been terminated earlier self.results_list = [] def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: raise NotImplementedError() def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': if fit_params is not None: assert len(fit_params) == 1 # single split else: assert self.fit_params is not None fit_params = self.fit_params # print(f'{fit_params=}') alg_interface = self.create_alg_interface(n_refit, **utils.join_dicts(self.config, fit_params[0]['hyper_fit_params'])) # the alg_interface itself may have other hypers that have been fit return alg_interface.get_refit_interface(n_refit, fit_params[0]['sub_fit_params']) def objective(self, params, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folder: Optional[Path], name: str, metrics: Optional[Metrics], return_preds: bool) -> Tuple[float, Tuple[List[NestedDict], AlgInterface]]: self.opt_step += 1 tmp_folder = tmp_folder / f'step_{self.opt_step}' if tmp_folder is not None else None could_load = False # try to load results if tmp_folder is not None and utils.existsFile(tmp_folder / 'DONE'): # should be able to load the results alg_interface = utils.deserialize(tmp_folder / 'alg_interface.pkl', compressed=True) results = utils.deserialize(tmp_folder / 'results.pkl') sub_fit_params = utils.deserialize(tmp_folder / 'fit_params.pkl') loaded_params = utils.deserialize(tmp_folder / 'params.pkl') if loaded_params != params: print('Got different params than the saved ones, ' 'hyperparameter optimizer might be non-deterministic') print(f'{params=}') print(f'{loaded_params=}', flush=True) # logger.log(1, 'Got different params than the saved ones, ' # 'hyperparameter optimizer might be non-deterministic') # don't set could_load to true, recompute utils.delete_file(tmp_folder / 'DONE') else: could_load = True if not could_load: # compute results tmp_folders = [tmp_folder / 'alg_interface' if tmp_folder is not None else None] alg_interface = self.create_alg_interface(idxs_list[0].n_trainval_splits, **utils.join_dicts(self.config, params)) results = alg_interface.fit_and_eval(ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folders=tmp_folders, name=name, metrics=metrics, return_preds=return_preds) sub_fit_params = alg_interface.get_fit_params() # save results if tmp_folder is not None: utils.serialize(tmp_folder / 'alg_interface.pkl', alg_interface, compressed=True) utils.serialize(tmp_folder / 'results.pkl', results) # serialize fit_params separately in case the alg_interface cannot be loaded utils.serialize(tmp_folder / 'fit_params.pkl', sub_fit_params) utils.serialize(tmp_folder / 'params.pkl', params) # save the "DONE" file last to indicate that all other files have been completely written utils.writeToFile(tmp_folder / 'DONE', '') # todo: could do 
sub_fit_params[0] instead since it's only one split anyway? results[0]['fit_params'] = {'hyper_fit_params': params, 'sub_fit_params': sub_fit_params} # store all parameters and results (metrics) without predictions self.results_list.append(utils.update_dict(results[0].get_dict(), remove_keys=['y_preds'])) val_loss = metrics.compute_val_score(results[0]['metrics']['val']) return val_loss, (results, alg_interface)
def fit_and_eval(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str, metrics: Optional[Metrics], return_preds: bool) -> List[NestedDict]: assert len(idxs_list) == 1 # this is a SingleSplitAlgInterface assert len(tmp_folders) == 1 # this is a SingleSplitAlgInterface split_idxs = idxs_list[0] tmp_folder = tmp_folders[0] opt_desc = f'split {split_idxs.split_id} of {name}' if metrics is None: # create metrics because we need to have a validation score task_type = TaskType.CLASSIFICATION if ds.tensor_infos['y'].is_cat() else TaskType.REGRESSION val_metric_name = self.config.get('val_metric_name', Metrics.default_val_metric_name(task_type)) metrics = Metrics(metric_names=[val_metric_name], val_metric_name=val_metric_name, task_type=task_type) self.opt_step = 0 f = functools.partial(self.objective, ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folder=tmp_folder, name=name, metrics=metrics, return_preds=return_preds) hyper_fit_params, (results, best_alg_interface) = self.hyper_optimizer.optimize( f=f, seed=split_idxs.sub_split_seeds[0], opt_desc=opt_desc, logger=logger) self.best_alg_interface = best_alg_interface self.fit_params = [results[0]['fit_params']] results[0]['opt_step_results'] = self.results_list return results
def predict(self, ds: DictDataset) -> torch.Tensor: return self.best_alg_interface.predict(ds)
def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: ref_alg_interface = self.create_alg_interface(n_sub_splits=1, **self.max_resource_config) single_resources = ref_alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=split_seeds, n_train=n_train) single_resources.time_s *= (self.hyper_optimizer.get_n_hyperopt_steps() * n_cv + n_refit) * n_splits return single_resources
class RandomParamsAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ :param model_idx: used for seeding along with the seed given in fit(), so we can do random search HPO by combining multiple RandomParamsAlgInterface objects with different model_idx values. :param fit_params: Fit parameters (stopping epoch for refitting). :param config: Configuration parameters.
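A hypothetical random-search sketch, using the concrete subclass RandomParamsCatBoostAlgInterface from catboost_interfaces.py:

    candidates = [RandomParamsCatBoostAlgInterface(model_idx=i) for i in range(50)]

Since each interface combines the seed given in fit() with its model_idx, the 50 candidates deterministically sample 50 different hyperparameter configurations.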
""" super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def _sample_params(self, is_classification: bool, seed: int, n_train: int): raise NotImplementedError() # override in subclass def _create_interface_from_config(self, n_tv_splits: int, **config): raise NotImplementedError() def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': raise NotImplementedError('Refit is not fully implemented...') # return RandomParamsNNAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, # **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int, n_train: int, n_tv_splits: int): # this is also set in get_required_resources, but okay if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [self._sample_params(is_classification, hparam_seed, n_train)] # todo: need epoch for refit return self._create_interface_from_config(n_tv_splits=n_tv_splits, fit_params=None, **utils.update_dict(self.config, self.fit_params[0])) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 n_tv_splits = idxs_list[0].n_trainval_splits self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed, n_train=idxs_list[0].n_train, n_tv_splits=n_tv_splits) print(f'{self.fit_params[0]=}') self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) self.fit_params[0]['sub_fit_params'] = self.alg_interface.fit_params[0] def predict(self, ds: DictDataset) -> torch.Tensor: self.alg_interface.set_current_predict_params(self.get_current_predict_params_name()) return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0], n_train=n_train, n_tv_splits=n_cv) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) ================================================ FILE: pytabkit/models/alg_interfaces/autogluon_model_interfaces.py ================================================ import copy import os from typing import List, Any, Optional import numpy as np import pandas as pd import torch from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources, InterfaceResources from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.sub_split_interfaces import SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.utils import FunctionProcess class AutoGluonModelAlgInterface(SklearnSubSplitInterface): # parameters: use_gpu?, hp_family?, model_types, max_n_models_per_type # possible values for hp_family: default, zeroshot, zeroshot_hpo, zeroshot_hpo_hybrid, default_FTT, light # possible values for model_types: 'FASTAI', 'NN_TORCH', 'FT_TRANSFORMER', 'XGB', 'CAT', 'GBM', 'RF', 'XT' def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: from autogluon.tabular import TabularPredictor params_config = [] params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 
else gpu_devices[0] val_metric_name = self.config.get('val_metric_name') # todo: random_state? other_kwargs = dict() if self.n_classes > 0: problem_type = 'binary' if self.n_classes == 2 else 'multiclass' if val_metric_name is None or val_metric_name == 'class_error': eval_metric = 'accuracy' elif val_metric_name == 'cross_entropy': eval_metric = 'log_loss' else: raise ValueError(f'{val_metric_name=} not implemented') else: problem_type = 'regression' if val_metric_name is None or val_metric_name == 'rmse': eval_metric = 'rmse' elif val_metric_name.startswith('pinball('): problem_type = 'quantile' eval_metric = 'pinball_loss' other_kwargs = dict(quantile_levels=[float(val_metric_name[len('pinball('):-1])]) else: raise ValueError(f'{val_metric_name=} not implemented') self.eval_metric = eval_metric return TabularPredictor(label='label', eval_metric=eval_metric, problem_type=problem_type, path=self.config.get('tmp_folder', None), verbosity=self.config.get('verbosity', 0), log_to_file=False, **other_kwargs) def _create_df(self, X: pd.DataFrame, y: Optional[np.ndarray]): new_columns = {'input_' + col_name: X[col_name] for col_name in X.columns} if y is not None: new_columns['label'] = y df = pd.DataFrame(new_columns) if y is not None: is_reg = y.dtype.kind == 'f' df['label'] = df['label'].astype('float64' if is_reg else 'category') return df def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 use_gpu = self.config.get('use_gpu', False) model_types = self.config['model_types'] if isinstance(model_types, str): model_types = [model_types] has_ft_transformer = 'FT_TRANSFORMER' in model_types updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_samples': 8e-5, 'n_samples*n_features': 5e-6} ram_params = {'': 0.5 if use_gpu else 3.0, 'ds_onehot_size_gb': 1.5} gpu_ram_params = {'': 0.4, 'ds_onehot_size_gb': 1.5, 'n_features': 3e-2 if has_ft_transformer else 1e-4} if use_gpu else None rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1 if use_gpu else 0, gpu_usage=0.02 if use_gpu else 0.0) return rc.get_required_resources(ds) def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray, cat_col_names: Optional[List[str]] = None): df = self._create_df(x_df, y) # by default, we ignore the validation set since most sklearn methods do not support it n_samples = len(x_df) train_mask = np.ones(shape=(n_samples,), dtype=np.bool_) train_mask[val_idxs] = False hparams_selected = dict() from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config hparams = copy.deepcopy(get_hyperparameter_config(self.config.get('hp_family', 'default'))) interface_resources: InterfaceResources = self.config['interface_resources'] cuda_ids = [device[len('cuda:'):] for device in interface_resources.gpu_devices if device.startswith('cuda:')] use_gpu = len(cuda_ids) > 0 # todo: this is only correct if the variable wasn't already set before # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(cuda_ids) print(f'_fit_sklearn: {torch.cuda.is_initialized()=}') # todo: does it work? 
print(f'{torch.cuda.device_count()=}') print(f'{cuda_ids=}') print(f'{os.getenv("CUDA_VISIBLE_DEVICES")=}') max_n_models_per_type = self.config.get('max_n_models_per_type', 0) hparams_idx = self.config.get('hparams_idx', None) model_types = self.config['model_types'] if isinstance(model_types, str): model_types = [model_types] for key, value in hparams.items(): # if key in ['FASTAI', 'NN_TORCH', 'FT_TRANSFORMER']: if key not in model_types: continue if not isinstance(value, list): value = [value] if hparams_idx is not None: value = [value[hparams_idx]] if max_n_models_per_type > 0 and len(value) > max_n_models_per_type: value = value[:max_n_models_per_type] for config in value: config['ag_args_fit'] = dict(num_gpus=1 if use_gpu else 0) if key == 'FT_TRANSFORMER': config['ag_args_fit']['_max_features'] = 100_000 config['_max_features'] = 100_000 hparams_selected[key] = value print(f'{hparams_selected=}') self.model.fit(df.iloc[train_mask], tuning_data=df.iloc[~train_mask], presets='medium_quality', fit_weighted_ensemble=False, fit_full_last_level_weighted_ensemble=False, hyperparameters=hparams_selected, ) # fit_func = lambda df, hparams_selected, train_mask, model: model.fit(df.iloc[train_mask], tuning_data=df.iloc[~train_mask], # presets='medium_quality', # fit_weighted_ensemble=False, # fit_full_last_level_weighted_ensemble=False, # hyperparameters=hparams_selected, # ) # # print(f'Running fit on autogluon model') # # # fit_func(df, hparams_selected, train_mask, self.model) # self.model = FunctionProcess(fit_func, df, hparams_selected, train_mask, self.model).start().pop_result() # print(f'fit completed') def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict(self._create_df(x_df, None)).to_numpy() def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict_proba(self._create_df(x_df, None)).to_numpy() ================================================ FILE: pytabkit/models/alg_interfaces/base.py ================================================ from typing import Optional, List import numpy as np import torch class SplitIdxs: """ Represents multiple train-validation-test splits for AlgInterface. """ def __init__(self, train_idxs: torch.Tensor, val_idxs: Optional[torch.Tensor], test_idxs: Optional[torch.Tensor], split_seed: int, sub_split_seeds: List[int], split_id: int): """ :param train_idxs: Tensor of shape (n_trainval_splits, n_train_idxs). Each of the train-val splits needs to have the same number of training samples. The elements of the tensor should index the training set elements in a larger dataset. :param val_idxs: Tensor of shape (n_trainval_splits, n_val_idxs), or None if no validation set should be used. :param test_idxs: Tensor of shape (n_test_idxs,). The same test set will be used for all train-val splits. :param split_seed: Random seed for algorithms on this split. :param sub_split_seeds: Separate random seeds for algorithms on each train-val split (length should be n_trainval_splits). :param split_id: ID of this split (for logging/saving purposes). 
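Hypothetical example with two train-val splits over 8 trainval samples plus 2 test samples:

    split = SplitIdxs(train_idxs=torch.tensor([[0, 1, 2, 3, 4, 5], [2, 3, 4, 5, 6, 7]]),
                      val_idxs=torch.tensor([[6, 7], [0, 1]]),
                      test_idxs=torch.tensor([8, 9]),
                      split_seed=0, sub_split_seeds=[1, 2], split_id=0)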
""" self.train_idxs = train_idxs self.val_idxs = val_idxs self.test_idxs = test_idxs self.split_seed = split_seed self.sub_split_seeds = sub_split_seeds self.split_id = split_id self.n_trainval_splits = train_idxs.shape[0] self.n_train = train_idxs.shape[-1] self.n_val = 0 if val_idxs is None else val_idxs.shape[-1] self.n_test = 0 if test_idxs is None else test_idxs.shape[-1] if len(self.sub_split_seeds) != self.n_trainval_splits: raise ValueError('len(self.alg_seeds) != self.n_trainval_splits') if val_idxs is not None and val_idxs.shape[0] != self.n_trainval_splits: raise ValueError('val_idxs.shape[0] != self.n_trainval_splits') def get_sub_split_idxs(self, i: int) -> 'SubSplitIdxs': return SubSplitIdxs(self.train_idxs[i], self.val_idxs[i] if self.val_idxs is not None else None, self.test_idxs, self.sub_split_seeds[i]) def get_sub_split_idxs_alt(self, i: int) -> 'SplitIdxs': return SplitIdxs(self.train_idxs[i:i+1], self.val_idxs[i:i+1] if self.val_idxs is not None else None, self.test_idxs, self.split_seed, self.sub_split_seeds[i:i+1], split_id=self.split_id) class SubSplitIdxs: """ Represents a single trainval-test split with multiple train-val splits """ def __init__(self, train_idxs: torch.Tensor, val_idxs: Optional[torch.Tensor], test_idxs: Optional[torch.Tensor], alg_seed: int): # train_idxs: n_train_idxs # val_idxs: n_val_idxs (optional) # test_idxs: n_test_idxs (optional) self.train_idxs = train_idxs self.val_idxs = val_idxs self.test_idxs = test_idxs self.alg_seed = alg_seed self.n_train = train_idxs.shape[-1] self.n_val = 0 if val_idxs is None else val_idxs.shape[-1] self.n_test = 0 if test_idxs is None else test_idxs.shape[-1] class InterfaceResources: """ Simple class representing resources that a method is allowed to use (number of threads and GPUs). """ def __init__(self, n_threads: int, gpu_devices: List[str], time_in_seconds: Optional[int] = None): self.n_threads = n_threads self.gpu_devices = gpu_devices self.time_in_seconds = time_in_seconds class RequiredResources: """ Represents estimated/requested resources by a method. 
""" def __init__(self, time_s: float, n_threads: float, cpu_ram_gb: float, n_gpus: int = 0, gpu_usage: float = 1.0, gpu_ram_gb: float = 0.0, n_explicit_physical_cores: int = 0): self.n_threads = n_threads self.cpu_ram_gb = cpu_ram_gb self.n_gpus = n_gpus self.gpu_usage = gpu_usage self.gpu_ram_gb = gpu_ram_gb self.time_s = time_s # for liquidSVM, want to have contiguous core indices self.n_explicit_physical_cores = n_explicit_physical_cores def get_resource_vector(self, fixed_resource_vector: np.ndarray): own_resources = np.asarray([self.n_threads, self.cpu_ram_gb, self.gpu_usage, self.gpu_ram_gb]) if self.should_add_fixed_resources(): # do not use fixed cpu ram since that is also measured for GPU usage own_resources += fixed_resource_vector multiplier = np.asarray([1.0, 1.0, self.n_gpus, self.n_gpus]) return multiplier * own_resources def should_add_fixed_resources(self) -> bool: return self.n_gpus > 0 @staticmethod def combine_sequential(resources_list: List['RequiredResources']): return RequiredResources(time_s=sum([r.time_s for r in resources_list]), n_threads=max([r.n_threads for r in resources_list]), cpu_ram_gb=max([r.cpu_ram_gb for r in resources_list]), n_gpus=max([r.n_gpus for r in resources_list]), gpu_usage=max([r.gpu_usage for r in resources_list]), gpu_ram_gb=max([r.gpu_ram_gb for r in resources_list]), n_explicit_physical_cores=max([r.n_explicit_physical_cores for r in resources_list]), ) ================================================ FILE: pytabkit/models/alg_interfaces/calibration.py ================================================ import traceback from pathlib import Path from typing import List, Optional, Tuple, Dict, Any, Callable import numpy as np import scipy import sklearn import torch import torch.nn as nn from dask.array import greater from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import log_loss, make_scorer from sklearn.model_selection import StratifiedKFold, GridSearchCV from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset from pytabkit.models.training.logging import Logger import math class PostHocCalibrationAlgInterface(AlgInterface): def __init__(self, alg_interface: AlgInterface, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.alg_interface = alg_interface self.calibrators = [] self.n_calibs = [] def _transform_probs(self, probs: np.ndarray) -> np.ndarray: offset = self.config.get('calib_input_offset', 0.0) if offset != 0.0: probs = probs + offset probs = probs / np.sum(probs, axis=-1, keepdims=True) return probs def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) \ -> Optional[List[List[List[Tuple[Dict, float]]]]]: self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) y_preds = self.alg_interface.predict(ds) self.n_tv_splits_list_ = [idxs.n_trainval_splits for idxs in idxs_list] if self.config.get('calibrate_per_fold', True): for tt_split_idx, split_idxs in enumerate(idxs_list): for tv_split_idx in range(split_idxs.n_trainval_splits): val_idxs = split_idxs.val_idxs[tv_split_idx] y = ds.tensors['y'][val_idxs] y_pred = y_preds[len(self.calibrators), val_idxs] y_pred_probs = torch.softmax(y_pred, dim=-1) 
import probmetrics.calibrators import probmetrics.distributions calib = probmetrics.calibrators.get_calibrator(**self.config) if self.config.get('calibrate_with_logits', True): calib.fit_torch(y_pred=probmetrics.distributions.CategoricalLogits(y_pred.detach().cpu()), y_true_labels=y[:, 0]) else: calib.fit(self._transform_probs(y_pred_probs.detach().cpu().numpy()), y.cpu().numpy()[:, 0]) self.calibrators.append(calib) self.n_calibs.append(val_idxs.shape[-1]) else: y_pred_idx = 0 for tt_split_idx, split_idxs in enumerate(idxs_list): y_pred_list = [] y_list = [] for tv_split_idx in range(split_idxs.n_trainval_splits): val_idxs = split_idxs.val_idxs[tv_split_idx] y_pred_list.append(y_preds[y_pred_idx, val_idxs]) y_list.append(ds.tensors['y'][val_idxs]) y_pred_idx += 1 y_pred = torch.cat(y_pred_list, dim=0) y = torch.cat(y_list, dim=0) import probmetrics.calibrators import probmetrics.distributions calib = probmetrics.calibrators.get_calibrator(**self.config) if self.config.get('calibrate_with_logits', True): calib.fit_torch(y_pred=probmetrics.distributions.CategoricalLogits(y_pred.detach().cpu()), y_true_labels=y[:, 0].detach().cpu()) else: calib.fit(self._transform_probs(torch.softmax(y_pred, dim=-1).detach().cpu().numpy()), y.cpu().numpy()[:, 0]) self.calibrators.extend([calib] * split_idxs.n_trainval_splits) self.n_calibs.extend([y_pred.shape[0]] * split_idxs.n_trainval_splits) self.fit_params = [dict(sub_fit_params=fp) for fp in self.alg_interface.fit_params] return None def predict(self, ds: DictDataset) -> torch.Tensor: y_preds = self.alg_interface.predict(ds) y_preds_probs = torch.softmax(y_preds, dim=-1) y_preds_calib = [] if self.config.get('ensemble_before_calib', False): start_idx = 0 for n_tv_splits in self.n_tv_splits_list_: avg_probs = y_preds_probs[start_idx:start_idx+n_tv_splits].mean(dim=0, keepdim=True) y_preds_probs[start_idx:start_idx + n_tv_splits] = avg_probs start_idx += n_tv_splits y_preds = torch.log(y_preds_probs + 1e-30) for i in range(y_preds.shape[0]): if self.config.get('calibrate_with_logits', True): from probmetrics.distributions import CategoricalLogits y_pred_calib = self.calibrators[i].predict_proba_torch( CategoricalLogits(y_preds[i].detach().cpu())).get_probs() else: y_pred_calib = self.calibrators[i].predict_proba( self._transform_probs(y_preds_probs[i].detach().cpu().numpy())) # the np.array(...) is for avoiding read-only array warnings y_pred_calib = torch.as_tensor(np.array(y_pred_calib), dtype=torch.float32) if self.config.get('use_calib_offset', False): y_pred_calib += 1. 
/ self.n_calibs[i] y_pred_calib = torch.log(y_pred_calib + 1e-30) y_preds_calib.append(y_pred_calib) result = torch.stack(y_preds_calib, dim=0) # print(f'{y_preds.shape=}, {result.shape=}') return result def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: return self.alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) def to(self, device: str) -> None: self.alg_interface.to(device) ================================================ FILE: pytabkit/models/alg_interfaces/catboost_interfaces.py ================================================ import copy import warnings from pathlib import Path from typing import Optional, Dict, Any, List, Tuple, Union import numpy as np import torch from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.resource_params import ResourceParams from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import TreeBasedSubSplitInterface, \ SingleSplitWrapperAlgInterface, \ SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, \ OptAlgInterface, RandomParamsAlgInterface from pytabkit.models.training.metrics import Metrics class CatBoostSklearnSubSplitInterface(SklearnSubSplitInterface): def _get_cat_indexes_arg_name(self) -> str: return 'cat_features' def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None, 1000), ('depth', ['depth', 'max_depth'], 6), ('random_strength', None, 1.0), ('l2_leaf_reg', None, 3.0), ('depth', ['depth', 'max_depth'], 6), ('learning_rate', ['lr', 'learning_rate', 'eta']), ('one_hot_max_size', None), ('bagging_temperature', None), ('leaf_estimation_iterations', None), ('bootstrap_type', None), ('subsample', None), ('sampling_frequency', None), ('boosting_type', None), ('colsample_bylevel', ['colsample_bylevel', 'rsm'], None), ('min_data_in_leaf', ['min_data_in_leaf', 'min_child_samples'], None), ('grow_policy', None), ('num_leaves', None), ('border_count', ['border_count', 'max_bin']), ('thread_count', ['thread_count', 'n_threads'], n_threads), ('verbose', None, False), ('allow_writing_files', None, False), ] params = utils.extract_params(self.config, params_config) if self.n_classes > 0: from catboost import CatBoostClassifier return CatBoostClassifier(random_state=seed, **params) else: from catboost import CatBoostRegressor return CatBoostRegressor(random_state=seed, **params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_depth=6), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.cb_class_time, cpu_ram_params=ResourceParams.cb_class_ram) return rc.get_required_resources(ds) class CatBoostCustomMetric: # see https://stackoverflow.com/questions/65462220/how-to-create-custom-eval-metric-for-catboost # and https://catboost.ai/en/docs/concepts/python-usages-examples def __init__(self, metric_name: str, is_classification: bool, is_higher_better: bool = 
False, select_pred_col: Optional[int] = None): self.metric_name = metric_name self.is_classification = is_classification self.is_higher_better = is_higher_better self.select_pred_col = select_pred_col def is_max_optimal(self): return self.is_higher_better def evaluate(self, approxes, target, weight): assert len(target) == len(approxes[0]) assert weight is None y = torch.as_tensor(target, dtype=torch.long if self.is_classification else torch.float32) if len(y.shape) == 1: y = y[:, None] y_pred = torch.as_tensor(np.array(approxes), dtype=torch.float32).t() # CatBoost already provides logits in approxes if self.select_pred_col is not None: y_pred = y_pred[:, self.select_pred_col, None] if self.is_classification and y_pred.shape[1] == 1: # binary classification, CatBoost provides logits of the class 1 p = torch.sigmoid(y_pred) y_pred_probs = torch.cat([1. - p, p], dim=1) y_pred = torch.log(y_pred_probs + 1e-30) # print(f'{y.shape=}, {y_pred.shape=}') # print(f'{y_pred=}') loss = Metrics.apply(y_pred, y, self.metric_name).item() weight_sum = y.shape[0] return weight_sum * loss, weight_sum def get_final_error(self, error, weight): return error / (weight + 1e-38) class CatBoostSubSplitInterface(TreeBasedSubSplitInterface): def _get_params(self): # target parameter names, possible source parameter names, default value params_config = [('n_estimators', None, 1000), ('depth', ['depth', 'max_depth'], 6), ('random_strength', None, 1.0), ('l2_leaf_reg', None, 3.0), ('learning_rate', ['lr', 'learning_rate', 'eta']), ('one_hot_max_size', None), ('bagging_temperature', None), ('leaf_estimation_iterations', None), ('bootstrap_type', None), ('subsample', None), ('boosting_type', None, 'Plain'), # fix default to Plain to equalize CPU and GPU ('colsample_bylevel', ['colsample_bylevel', 'rsm'], None), ('min_data_in_leaf', ['min_data_in_leaf', 'min_child_samples'], None), ('grow_policy', None), ('max_leaves', ['max_leaves', 'num_leaves'], None), ('border_count', ['border_count', 'max_bin'], 254), # fix default to 254 for GPU as well ('used_ram_limit', None), ('od_type', 'Iter'), ('od_pval', None), ('od_wait', ['od_wait', 'early_stopping_rounds'], None), ('sampling_frequency', None), ('max_ctr_complexity', None), ('model_size_reg', None), ] params = utils.extract_params(self.config, params_config) params['verbose'] = self.config.get('verbosity', 0) > 0 bootstrap_type = params.get('bootstrap_type', 'Bayesian') if bootstrap_type == 'Bayesian': if 'subsample' in params: del params['subsample'] elif bootstrap_type == 'Bernoulli': if 'bagging_temperature' in params: del params['bagging_temperature'] grow_policy = params.get('grow_policy', 'SymmetricTree') if grow_policy != 'Lossguide': if 'max_leaves' in params: del params['max_leaves'] if grow_policy == 'SymmetricTree': if 'min_data_in_leaf' in params: del params['min_data_in_leaf'] return params def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': assert n_refit == 1 return CatBoostSubSplitInterface(fit_params=fit_params or self.fit_params, **self.config) def _get_eval_metric(self, val_metric_name: Optional[str], n_classes: int) -> Union[str, CatBoostCustomMetric]: if n_classes == 0: if val_metric_name is None or val_metric_name == 'rmse': return 'RMSE' else: return CatBoostCustomMetric(metric_name=val_metric_name, is_classification=n_classes > 0, is_higher_better=False) # else: # raise ValueError(f'Validation metric "{val_metric_name}" is currently not implemented for CatBoost') else: # classification if 
val_metric_name is None or val_metric_name == 'classification_error': return 'ZeroOneLoss' elif val_metric_name == 'cross_entropy': return 'Logloss' if n_classes == 2 else 'MultiClass' elif val_metric_name == 'brier' and n_classes == 2: # catboost doesn't support brier score for multiclass yet return 'BrierScore' else: return CatBoostCustomMetric(metric_name=val_metric_name, is_classification=n_classes > 0, is_higher_better=False) # else: # raise ValueError(f'Validation metric "{val_metric_name}" is currently not implemented for CatBoost') # adapted from https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/catboost_experiment.py def _preprocess_params(self, params: Dict[str, Any], n_classes: int) -> Dict[str, Any]: params = copy.deepcopy(params) device: Optional[str] = params.pop('device', None) if device is not None and device.startswith('cuda'): params['task_type'] = 'GPU' params['devices'] = device.split(':')[1] if device.startswith('cuda:') else '0' if n_classes == 0: train_metric_name = self.config.get('train_metric_name', 'mse') # val_metric_name = self.config.get('val_metric_name', 'rmse') if train_metric_name == 'mse': params['loss_function'] = 'RMSE' elif train_metric_name.startswith('pinball('): quantile_str = train_metric_name[len('pinball('):-1] params['loss_function'] = f'Quantile:alpha={quantile_str}' else: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') elif n_classes == 2: params.update({'loss_function': 'Logloss'}) else: params.update({'loss_function': 'MultiClass', 'classes_count': n_classes}) params['eval_metric'] = self._get_eval_metric(self.config.get('val_metric_name', None), n_classes) params['allow_writing_files'] = False params['use_best_model'] = False # otherwise trees would get removed based only on a single split for key in ['random_strength', 'one_hot_max_size', 'leaf_estimation_iterations']: if key in params: params[key] = int(params[key]) return params def _convert_ds(self, ds: DictDataset) -> Any: import catboost x_df = ds.without_labels().to_df() if self.config.get('shuffle_columns', False): if not hasattr(self, 'col_perm_'): self.col_perm_ = np.random.permutation(x_df.shape[1]) x_df = x_df.iloc[:, self.col_perm_] label = None if 'y' not in ds.tensors else ds.tensors['y'].cpu().numpy() cat_features = x_df.select_dtypes(include='category').columns.tolist() return catboost.Pool(x_df, label, cat_features=cat_features) def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: import catboost # print(f'Fitting CatBoost') n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() params = self._preprocess_params(params, n_classes) params.update({'random_seed': seed, 'thread_count': n_threads}) if val_ds is None: params = utils.update_dict(params, remove_keys=['od_type', 'od_pval', 'od_wait']) if tmp_folder is not None: params.update({'allow_writing_files': True, 'save_snapshot': True, 'snapshot_file': str(tmp_folder / 'catboost_model.cbm'), 'snapshot_interval': 120.0}) # with these parameters, catboost will reload from the model automatically if it is there bst = catboost.CatBoost(params) with warnings.catch_warnings(): warnings.filterwarnings('ignore', message='Can\'t optimize method "evaluate" because self argument is used') bst.fit(self._convert_ds(train_ds), eval_set=None if val_ds is None else self._convert_ds(val_ds)) if val_ds 
is not None: evals_result = bst.get_evals_result() # print(f'{evals_result["validation"]=}') eval_metric = self._get_eval_metric(self.config.get('val_metric_name', None), n_classes) eval_metric_name = eval_metric if isinstance(eval_metric, str) else eval_metric.__class__.__name__ val_errors = evals_result['validation'][eval_metric_name] else: val_errors = None return bst, val_errors def _predict(self, bst, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: # bst should be of type catboost.CatBoost # print(f'CatBoost _predict(): {other_params=}') ntree_end = 0 if other_params is None else other_params['n_estimators'] prediction_type = 'RawFormulaVal' if n_classes == 0 else 'LogProbability' y_pred = torch.as_tensor( bst.predict(self._convert_ds(ds), ntree_end=ntree_end, prediction_type=prediction_type), dtype=torch.float32) if n_classes == 0: y_pred = y_pred.unsqueeze(-1) # print(f'{y_pred.shape=}') # print(f'{y_pred.mean(dim=0)=}') # # if torch.any(y_pred == -np.inf): # y_pred_prob = torch.softmax(y_pred, dim=-1) # # y_pred_prob = y_pred_prob.clamp(1e-10, 1) # y_pred = torch.log(y_pred_prob + 1e-30) # y_pred = torch.clamp(y_pred, -100.0, 100.0) # todo # print(f'min: {torch.min(y_pred).item():g}, max: {torch.max(y_pred).item():g}') return y_pred def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_n_threads=8, max_depth=6), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.cb_class_time, cpu_ram_params=ResourceParams.cb_class_ram) return rc.get_required_resources(ds) class CatBoostHyperoptAlgInterface(OptAlgInterface): def __init__(self, space=None, n_hyperopt_steps: int = 50, **config): from hyperopt import hp default_config = {} max_config = {} # if space is None: # modified space from catboost quality benchmarks # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/catboost_experiment.py # space = { # 'depth': hp.choice('depth', [6]), # # only 'ctr_target_border_count' exists for this catboost version # # 'ctr_border_count': hp.choice('ctr_border_count', [16]), # 'border_count': hp.choice('border_count', [128]), # # deprecated, CounterMax not allowed # # 'ctr_description': hp.choice('ctr_description', [['Borders', 'CounterMax']]), # 'learning_rate': hp.loguniform('learning_rate', -5, 0), # 'random_strength': hp.choice('random_strength', [1, 20]), # 'one_hot_max_size': hp.choice('one_hot_max_size', [0, 25]), # 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), # 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), # 'used_ram_limit': hp.choice('used_ram_limit', [100000000000]), # } # need to add defaults as well if space is None: space = config.get('hpo_space_name', None) if space == 'NODE' or space == 'popov': # space from NODE paper: # Popov, Morozov, and Babenko, Neural oblivious decision ensembles for deep learning on tabular data # the parameter names in the space are for the alg interface, not directly for the GBDT interface! 
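# note: hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the bounds below are in
# natural-log space; e.g. hp.loguniform('learning_rate', -5, 0) yields learning rates in [exp(-5) ~ 0.0067, 1]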
space = { 'learning_rate': hp.loguniform('learning_rate', -5, 0), 'random_strength': hp.quniform('random_strength', 1, 20, 1), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 10, 1), } default_config = dict(n_estimators=2048) max_config['max_depth'] = 6 elif space == 'shwartz-ziv': # from Shwartz-Ziv and Armon, Tabular data: Deep learning is not all you need # same as NODE except higher upper bound for leaf estimation iterations # the parameter names in the space are for the alg interface, not directly for the GBDT interface! space = { 'learning_rate': hp.loguniform('learning_rate', -5, 0), 'random_strength': hp.quniform('random_strength', 1, 20, 1), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), } default_config = dict(n_estimators=2048) # not specified in the paper, so we take the value from NODE max_config['max_depth'] = 6 elif space == 'tabpfn' or space == 'hollmann': # from Hollmann, Müller, Eggensperger, Hutter, # TabPFN: A Transformer That Solves Small Tabular Classification Problems in a Second # similar to shwartz-ziv except that one_hot_max_size is not specified and n_estimators is optimized # the parameter names in the space are for the alg interface, not directly for the GBDT interface! space = { 'n_estimators': hp.quniform('n_estimators', 100, 4000, 1), 'learning_rate': hp.loguniform('learning_rate', -5, 0), 'random_strength': hp.quniform('random_strength', 1, 20, 1), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), } elif space == 'gorishniy': # from Gorishniy, Rubachev, Khrulkov, Babenko, Revisiting Deep Learning Models for Tabular Data space = { 'max_depth': hp.quniform('max_depth', 3, 10, 1), 'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), 0), 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)), 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 10, 1), } default_config = dict(n_estimators=2000) max_config['max_depth'] = 10 config = utils.update_dict(default_config, config) super().__init__(hyper_optimizer=HyperoptOptimizer(space=space, fixed_params=dict(), n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config, max_config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**config) for i in range(n_sub_splits)]) class RandomParamsCatBoostAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from Shwartz-Ziv et al.
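# The sampling idiom used throughout _sample_params: np.exp(rng.uniform(np.log(a), np.log(b)))
# draws log-uniformly from [a, b] (median sqrt(a*b), e.g. 5e-2 for a=5e-3 and b=5e-1), and
# rng.integers(lo, hi, endpoint=True) draws integers from the inclusive range [lo, hi].
# Since rng = np.random.default_rng(seed), the sampled configuration is reproducible per seed.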
hpo_space_name = self.config.get('hpo_space_name', 'shwartz-ziv') if hpo_space_name == 'shwartz-ziv': space = { 'learning_rate': np.exp(rng.uniform(-5, 0)), 'random_strength': rng.integers(1, 20, endpoint=True), 'one_hot_max_size': rng.integers(0, 25, endpoint=True), 'l2_leaf_reg': np.exp(rng.uniform(0, np.log(10))), 'bagging_temperature': rng.uniform(0, 1), 'leaf_estimation_iterations': rng.integers(1, 20, endpoint=True), 'n_estimators': 1000, 'max_depth': 6 } elif hpo_space_name == 'large': # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(5e-1))), # bootstrap 'bootstrap_type': rng.choice(['Bayesian', 'Bernoulli']), # todo: could do more 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.5, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise', 'Lossguide']), 'max_depth': rng.integers(1, 10, endpoint=True), # todo: support more for Lossguide 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128.0)))), # only for Depthwise and Lossguide 'max_leaves': round(np.exp(rng.uniform(np.log(4.0), np.log(128.0)))), # only for Lossguide 'colsample_bylevel': rng.uniform(0.5, 1.0), 'random_strength': rng.uniform(0.0, 20.0), # todo: make log-uniform? 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(2, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(2e0))), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), } elif hpo_space_name == 'large-v2': # slightly shrunk version of large # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.5, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise', 'Lossguide']), # todo: could shrink 'max_depth': rng.integers(2, 10, endpoint=True), # todo: support more for Lossguide 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128.0)))), # only for Depthwise and Lossguide 'max_leaves': round(np.exp(rng.uniform(np.log(4.0), np.log(128.0)))), # only for Lossguide 'colsample_bylevel': rng.uniform(0.5, 1.0), 'random_strength': rng.uniform(0.0, 20.0), # todo: make log-uniform? 
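# note (cf. _preprocess_params above): random_strength is int-cast there together with
# one_hot_max_size and leaf_estimation_iterations, so this uniform float sample appears
# to be truncated to an integer before reaching CatBoost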
'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(2, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(2e0))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v3': # slightly shrunk version of large-v2 (shrunk random_strength and max_depth, colsample_bylevel, model_size_reg) # avoided removing lossguide for now # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.5, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise', 'Lossguide']), # todo: could shrink 'max_depth': rng.integers(4, 10, endpoint=True), # todo: support more for Lossguide 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128.0)))), # only for Depthwise and Lossguide 'max_leaves': round(np.exp(rng.uniform(np.log(4.0), np.log(128.0)))), # only for Lossguide 'colsample_bylevel': rng.uniform(0.6, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(2, 128, endpoint=True), # todo: make logarithmic? 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v4': # slightly shrunk version of large-v3: # removed Lossguide -> also removed max_leaves # shrunk colsample_bylevel, min_data_in_leaf, one_hot_max_size # todo: there should be no harm in tuning nan_mode in ['Min', 'Max'] space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 
'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v5': # large-v4 but with max_depth <= 8 as in tabrepo1 space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v6': # large-v5 but with tabrepo lr search space space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 
'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v7': # large-v5 but with early_stopping_rounds=50 space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 1000, 'early_stopping_rounds': 50, 'max_bin': 254, # added this to be sure 'learning_rate': np.exp(rng.uniform(np.log(3e-2), np.log(8e-2))), # shrunk # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v8-10k': # large-v7 but with 10k estimators and the tabrepo1 lr search space space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'max_bin': 254, # added this to be sure 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 
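# as in large-v4 and later: Lossguide is no longer included in grow_policy, so max_leaves
# (which is only used by Lossguide) is not sampled here either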
'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'random_strength': rng.uniform(0.0, 2.0), # shrunk 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'large-v9-10k': # large-v8-10k but without tuning random_strength space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'max_bin': 254, # added this to be sure 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), # bootstrap 'bootstrap_type': 'Bernoulli', # shrunk # 'bagging_temperature': rng.uniform(0, 4), # can only be used with Bayesian 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! # PerTreeLevel not supported for Lossguide # 'sampling_frequency': rng.choice(['PerTree', 'PerTreeLevel']), # CPU only! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(100.0)))), # only for Depthwise and Lossguide 'colsample_bylevel': rng.uniform(0.85, 1.0), 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(20))), 'leaf_estimation_iterations': round(np.exp(rng.uniform(np.log(1.0), np.log(20.0)))), # categorical features 'one_hot_max_size': rng.integers(8, 128, endpoint=True), 'model_size_reg': np.exp(rng.uniform(np.log(1e-1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk } elif hpo_space_name == 'tabrepo1': space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 8, endpoint=True), 'l2_leaf_reg': rng.uniform(1.0, 5.0), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), 'one_hot_max_size': rng.choice([2, 3, 5, 10]), 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), } elif hpo_space_name == 'tabrepo1-es': space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 8, endpoint=True), 'l2_leaf_reg': rng.uniform(1.0, 5.0), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), 'one_hot_max_size': rng.choice([2, 3, 5, 10]), 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), } elif hpo_space_name == 'tabrepo1-es-10k': space = { 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 8, endpoint=True), 'l2_leaf_reg': rng.uniform(1.0, 5.0), 'max_ctr_complexity': rng.integers(1, 5, endpoint=True), 'one_hot_max_size': rng.choice([2, 3, 5, 10]), 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), } elif hpo_space_name == 'tabarena': space = { 'n_estimators': 10_000, 'early_stopping_rounds': 300, # probably not exactly equivalent to TabArena 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'bootstrap_type': 
'Bernoulli', 'subsample': rng.uniform(0.7, 1.0), # can only be used with Bernoulli (or Poisson)! 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']), 'max_depth': rng.integers(4, 8, endpoint=True), 'colsample_bylevel': rng.uniform(0.85, 1.0), 'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'leaf_estimation_iterations': np.floor(np.exp(rng.uniform(np.log(1.0), np.log(21.0)))), # categorical features 'one_hot_max_size': np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0)))), 'model_size_reg': np.exp(rng.uniform(np.log(0.1), np.log(1.5))), 'max_ctr_complexity': rng.integers(2, 5, endpoint=True), # shrunk 'boosting_type': 'Plain', # avoid Ordered as the default on GPU 'max_bin': 254, # added this to be sure } else: raise ValueError(f'Unknown hpo_space_name "{hpo_space_name}"') return space def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/alg_interfaces/ensemble_interfaces.py ================================================ import copy import time from pathlib import Path from typing import List, Optional, Dict import numpy as np import torch from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset, TaskType from pytabkit.models.torch_utils import cat_if_necessary from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics from pytabkit.models.utils import ObjectLoadingContext class WeightedPrediction: def __init__(self, y_pred_list: List[torch.Tensor], task_type: TaskType): self.task_type = task_type self.y_pred_converted_list = y_pred_list if task_type == TaskType.REGRESSION \ else [torch.softmax(y_pred, dim=-1) for y_pred in y_pred_list] def predict_for_weights(self, weights: np.ndarray): weights = weights.astype(np.float32) norm_weights = weights / np.sum(weights) weighted_sum = sum([w * y_pred for w, y_pred in zip(norm_weights, self.y_pred_converted_list)]) if self.task_type == TaskType.CLASSIFICATION: weighted_sum = torch.log(weighted_sum + 1e-30) return weighted_sum class CaruanaEnsembleAlgInterface(SingleSplitAlgInterface): """ Following a simple variant of Caruana et al.
(2004), "Ensemble selection from libraries of models" without pre-selection of candidates """ def __init__(self, alg_interfaces: List[AlgInterface], fit_params: Optional[List[Dict]] = None, **config): super().__init__(fit_params=fit_params, **config) self.alg_interfaces = alg_interfaces self.task_type = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return CaruanaEnsembleAlgInterface([alg_interface.get_refit_interface(n_refit=n_refit) for alg_interface in self.alg_interfaces], fit_params=fit_params or self.fit_params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 # if tmp_folders is specified, then models will be saved there instead of holding all of them in memory tmp_folder = tmp_folders[0] self.alg_contexts_ = [ObjectLoadingContext(ai, None if tmp_folder is None else tmp_folder / f'model_{i}') for i, ai in enumerate(self.alg_interfaces)] # store copies here, but the ones that will actually be trained are in alg_contexts_ # this means that models should not be held in RAM all the time self.alg_interfaces = copy.deepcopy(self.alg_interfaces) sub_fit_params = [] # train sub-models for alg_idx, alg_ctx in enumerate(self.alg_contexts_): with alg_ctx as alg_interface: sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in tmp_folders] if self.config.get('diversify_seeds', False): sub_idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed + alg_idx, sub_split_seeds=[sss + alg_idx for sss in idxs.sub_split_seeds], split_id=idxs.split_id) for idxs in idxs_list] else: sub_idxs_list = idxs_list alg_interface.fit(ds, sub_idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}') sub_fit_params.append(alg_interface.get_fit_params()[0]) if self.fit_params is not None: # this is the refit stage, there is no validation data set to determine the weights on, # instead the weights are already in fit_params return if idxs_list[0].val_idxs is None: raise ValueError('CaruanaEnsembleAlgInterface.fit(): Neither a validation set ' 'nor ensemble weights were provided') self.task_type = TaskType.CLASSIFICATION if ds.tensor_infos[ 'y'].get_cat_size_product() > 0 else TaskType.REGRESSION val_metric_name = self.config.get('ens_weight_metric_name', self.config.get('val_metric_name', None)) if val_metric_name is None: val_metric_name = Metrics.default_val_metric_name(task_type=self.task_type) n_caruana_steps = self.config.get('n_caruana_steps', 40) # default value is taken from TabRepo paper (IIRC) y_preds_oob_list = [] time_limit_s: Optional[float] = self.config.get('time_limit_s', None) start_time = time.time() for alg_idx, alg_ctx in enumerate(self.alg_contexts_): if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s: break with alg_ctx as alg_interface: y_preds = alg_interface.predict(ds) # get out-of-bag predictions y_preds_oob_list.append(cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]] for j in range(idxs_list[0].val_idxs.shape[0])], dim=0)) # get out-of-bag labels y = ds.tensors['y'] y_oob = cat_if_necessary([y[idxs_list[0].val_idxs[j]] for j in range(idxs_list[0].val_idxs.shape[0])], dim=0) weights = np.zeros(len(self.alg_contexts_), dtype=np.int32) best_weights = np.copy(weights) best_loss = np.inf 
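# Summary of the greedy Caruana loop below: starting from the all-zero integer weight
# vector, each of the n_caruana_steps rounds tries incrementing every model's weight by
# one (and, if allow_negative_weights is set and the weights sum to at least 2, also tries
# decrementing it), scores the weighted ensemble on the out-of-bag predictions, and moves
# to the best candidate; the best weight vector seen overall is stored in fit_params.
# For example, weights [2, 0, 1] are normalized by WeightedPrediction to [2/3, 0, 1/3];
# for classification, the softmax probabilities are averaged with these weights before
# taking the log.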
wp = WeightedPrediction(y_preds_oob_list, self.task_type) allow_negative_weights = self.config.get('allow_negative_weights', False) for step_idx in range(n_caruana_steps): best_step_weights = None best_step_loss = np.inf for weight_idx in range(weights.shape[0]): weights[weight_idx] += 1 y_pred_oob = wp.predict_for_weights(weights) loss = Metrics.apply(y_pred_oob.cpu(), y_oob.cpu(), val_metric_name).item() # print(f'{weights=}, {loss=}') if loss < best_step_loss: best_step_loss = loss best_step_weights = np.copy(weights) weights[weight_idx] -= 1 # negative weights option # check sum(weights) >= 2, allowing for floating-point errors if allow_negative_weights and np.sum(weights) >= 1.5: weights[weight_idx] -= 1 y_pred_oob = wp.predict_for_weights(weights) loss = Metrics.apply(y_pred_oob.cpu(), y_oob.cpu(), val_metric_name).item() # print(f'{weights=}, {loss=}') if loss < best_step_loss: best_step_loss = loss best_step_weights = np.copy(weights) weights[weight_idx] += 1 if best_step_loss < best_loss: best_loss = best_step_loss best_weights = np.copy(best_step_weights) weights = best_step_weights logger.log(2, f'Obtained ensemble weights: {best_weights}') self.fit_params = [dict(alg_weights=best_weights.tolist(), sub_fit_params=sub_fit_params)] def predict(self, ds: DictDataset) -> torch.Tensor: weights = self.fit_params[0]['alg_weights'] sparse_weights = [] sparse_preds = [] for i, w in enumerate(weights): if w != 0: with self.alg_contexts_[i] as alg_interface: sparse_preds.append(alg_interface.predict(ds)) sparse_weights.append(w) wp = WeightedPrediction(sparse_preds, task_type=self.task_type) return wp.predict_for_weights(weights=np.asarray(sparse_weights)) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: single_resources = [ ssi.get_required_resources(ds, n_cv, n_refit, n_splits=n_splits, split_seeds=split_seeds, n_train=n_train) for ssi in self.alg_interfaces] return RequiredResources.combine_sequential(single_resources) def to(self, device: str) -> None: for alg_idx, alg_ctx in enumerate(self.alg_contexts_): with alg_ctx as alg_interface: alg_interface.to(device) class AlgorithmSelectionAlgInterface(SingleSplitAlgInterface): """ Picks the best model out of a list of candidates.
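The selection metric is taken from the alg_sel_metric_name config option (falling back to
val_metric_name); on refit, only the selected sub-interface is trained again, using the
best_alg_idx stored in fit_params. A minimal usage sketch (with hypothetical sub-interface
objects if_a and if_b):

    interface = AlgorithmSelectionAlgInterface([if_a, if_b], val_metric_name='cross_entropy')
    interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name='selection')
    y_pred = interface.predict(ds)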
""" def __init__(self, alg_interfaces: List[AlgInterface], fit_params: Optional[List[Dict]] = None, **config): super().__init__(fit_params=fit_params, **config) self.alg_interfaces = alg_interfaces self.task_type = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': # todo: could use sub_fit_params refit_interfaces = [] for alg_context in self.alg_contexts_: with alg_context as alg_interface: refit_interfaces.append(alg_interface.get_refit_interface(n_refit=n_refit)) return AlgorithmSelectionAlgInterface(refit_interfaces, fit_params=fit_params or self.fit_params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 # if tmp_folders is specified, then models will be saved there instead of holding all of them in memory tmp_folder = tmp_folders[0] self.alg_contexts_ = [ObjectLoadingContext(ai, None if tmp_folder is None else tmp_folder / f'model_{i}') for i, ai in enumerate(self.alg_interfaces)] # store copies here, but the ones that will actually be trained are in alg_contexts_ # this means that models should not be held in RAM all the time self.alg_interfaces = copy.deepcopy(self.alg_interfaces) if self.fit_params is not None: # this is the refit stage, there is no validation data set to determine the best model on, # instead the best model index is already in fit_params best_alg_idx = self.fit_params[0]['best_alg_idx'] sub_tmp_folders = [tmp_folder / str(best_alg_idx) if tmp_folder is not None else None for tmp_folder in tmp_folders] with self.alg_contexts_[best_alg_idx] as alg_interface: alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{best_alg_idx}') return if idxs_list[0].val_idxs is None: raise ValueError('CaruanaEnsembleAlgInterface.fit(): Neither a validation set ' 'nor fit_params were provided') self.task_type = TaskType.CLASSIFICATION if ds.tensor_infos[ 'y'].get_cat_size_product() > 0 else TaskType.REGRESSION val_metric_name = self.config.get('alg_sel_metric_name', self.config.get('val_metric_name', None)) if val_metric_name is None: val_metric_name = Metrics.default_val_metric_name(task_type=self.task_type) # get out-of-bag labels y = ds.tensors['y'] y_oob = cat_if_necessary([y[idxs_list[0].val_idxs[i]] for i in range(idxs_list[0].val_idxs.shape[0])], dim=0) best_alg_idx = 0 best_alg_loss = np.inf best_sub_fit_params = None time_limit_s: Optional[float] = self.config.get('time_limit_s', None) start_time = time.time() for alg_idx, alg_ctx in enumerate(self.alg_contexts_): if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s: break with alg_ctx as alg_interface: sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in tmp_folders] alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}') y_preds = alg_interface.predict(ds) # get out-of-bag predictions y_pred_oob = cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]] for j in range(idxs_list[0].val_idxs.shape[0])], dim=0) loss = Metrics.apply(y_pred_oob.cpu(), y_oob.cpu(), val_metric_name).item() if loss < best_alg_loss: best_alg_loss = loss best_alg_idx = alg_idx best_sub_fit_params = alg_interface.get_fit_params()[0] self.fit_params = [dict(best_alg_idx=best_alg_idx, sub_fit_params=best_sub_fit_params)] logger.log(2, f'Best algorithm has 
index {best_alg_idx}') logger.log(2, f'Algorithm selection fit parameters: {self.fit_params[0]}') def predict(self, ds: DictDataset) -> torch.Tensor: alg_idx = self.fit_params[0]['best_alg_idx'] with self.alg_contexts_[alg_idx] as alg_interface: return alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: # too pessimistic for refit... single_resources = [ ssi.get_required_resources(ds, n_cv, n_refit, n_splits=n_splits, split_seeds=split_seeds, n_train=n_train) for ssi in self.alg_interfaces] return RequiredResources.combine_sequential(single_resources) def to(self, device: str) -> None: for alg_idx, alg_ctx in enumerate(self.alg_contexts_): with alg_ctx as alg_interface: alg_interface.to(device) class PrecomputedPredictionsAlgInterface(SingleSplitAlgInterface): def __init__(self, y_preds_cv: torch.Tensor, y_preds_refit: Optional[torch.Tensor], fit_params_cv: Dict, fit_params_refit: Optional[Dict]): super().__init__() self.y_preds_cv = y_preds_cv self.y_preds_refit = y_preds_refit self.is_refit = None self.fit_params_cv = fit_params_cv self.fit_params_refit = fit_params_refit def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return self # todo: does this work? def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: self.is_refit = idxs_list[0].val_idxs is None self.fit_params = [self.fit_params_refit] if self.is_refit else [self.fit_params_cv] def predict(self, ds: DictDataset) -> torch.Tensor: if ds.n_samples != self.y_preds_cv.shape[1]: raise ValueError('Prediction can only be performed on the exact same dataset ' 'because this uses precomputed predictions') return self.y_preds_refit if self.is_refit else self.y_preds_cv def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: return RequiredResources(time_s=1e-5 * ds.n_samples, cpu_ram_gb=2.0, n_threads=1) ================================================ FILE: pytabkit/models/alg_interfaces/lightgbm_interfaces.py ================================================ import copy from pathlib import Path from typing import Optional, Dict, Tuple, Any, List import numpy as np import torch from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.resource_params import ResourceParams from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import OptAlgInterface, \ AlgInterface, RandomParamsAlgInterface from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import TreeBasedSubSplitInterface, SingleSplitWrapperAlgInterface, \ SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer import warnings from pytabkit.models.training.metrics import Metrics class LGBMCustomMetric: def __init__(self, metric_name: str, is_classification: bool, is_higher_better: bool = False): self.metric_name = metric_name self.is_classification = is_classification self.is_higher_better = is_higher_better def __call__(self, y_pred: np.ndarray, eval_data): # eval_data should be of type lgbm.Dataset y = torch.as_tensor(eval_data.get_label(), 
dtype=torch.long if self.is_classification else torch.float32) if len(y.shape) == 1: y = y[:, None] # print(f'{y_pred.shape=}, {eval_data.get_label().shape=}') y_pred = torch.as_tensor(y_pred, dtype=torch.float32) if len(y_pred.shape) == 1: if self.is_classification: if y_pred.shape[0] == y.shape[0]: # binary classification, transform into both class probabilities y_pred = torch.stack([1. - y_pred, y_pred], dim=-1) else: # bugged multiclass classification, need to reshape # print(y_pred[:7]) y_pred = y_pred.view(-1, y.shape[0]).t().contiguous() # print(y_pred[0, :].sum()) else: y_pred = y_pred[:, None] if self.is_classification: # go from probabilities to logits y_pred = torch.log(y_pred + 1e-30) eval_result = Metrics.apply(y_pred, y, metric_name=self.metric_name) # print(f'LightGBM metric value: {self.metric_name} = {eval_result.item():g}') return self.metric_name, eval_result, self.is_higher_better class LGBMSklearnSubSplitInterface(SklearnSubSplitInterface): def _get_cat_indexes_arg_name(self) -> str: return 'categorical_feature' def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None), ('max_depth', None), ('verbosity', None), ('learning_rate', ['lr', 'learning_rate', 'eta']), ('subsample', ['subsample', 'bagging_fraction']), ('colsample_bytree', ['colsample_bytree', 'feature_fraction']), ('bagging_freq', None), ('min_data_in_leaf', None), ('min_sum_hessian_in_leaf', ['min_sum_hessian_in_leaf', 'min_child_weight']), ('lambda_l1', ['lambda_l1', 'alpha', 'reg_alpha']), ('lambda_l2', ['lambda_l2', 'lambda', 'reg_lambda']), ('num_leaves', None), ('min_child_weight', None), ('boosting_type', None), ('max_bin', None), ('cat_smooth', None), ('cat_l2', None), ('n_jobs', ['n_jobs', 'n_threads'], n_threads), ] params = utils.extract_params(self.config, params_config) if self.n_classes > 0: from lightgbm import LGBMClassifier return LGBMClassifier(random_state=seed, **params) else: from lightgbm import LGBMRegressor return LGBMRegressor(random_state=seed, **params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, num_leaves=31), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.lgbm_class_time, cpu_ram_params=ResourceParams.lgbm_class_ram) return rc.get_required_resources(ds) class LGBMSubSplitInterface(TreeBasedSubSplitInterface): def _get_params(self): params_config = [('n_estimators', None, 1000), ('max_depth', None), ('verbosity', None, -1), ('learning_rate', ['lr', 'learning_rate', 'eta'], 0.1), ('subsample', ['subsample', 'bagging_fraction'], 1.0), ('colsample_bytree', ['colsample_bytree', 'feature_fraction'], 1.0), ('bagging_freq', None, 1), # 1 is not the default in the interface but 0 could be misleading ('min_data_in_leaf', None, 20), ('min_sum_hessian_in_leaf', ['min_sum_hessian_in_leaf', 'min_child_weight'], 1e-3), ('lambda_l1', ['lambda_l1', 'alpha', 'reg_alpha'], 0.0), ('lambda_l2', ['lambda_l2', 'lambda', 'reg_lambda'], 0.0), ('num_leaves', None, 31), ('boosting', ['boosting', 'boosting_type'], None), ('max_bin', None), ('cat_smooth', None), ('cat_l2', None), ('early_stopping_round', ['early_stopping_round', 'early_stopping_rounds'], None), ('extra_trees', None), ('max_cat_to_onehot', None), ('min_data_per_group', None), ] params = 
utils.extract_params(self.config, params_config) return params def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': assert n_refit == 1 return LGBMSubSplitInterface(fit_params=fit_params or self.fit_params, **self.config) # adapted from https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/lightgbm_experiment.py def _preprocess_params(self, params: Dict[str, Any], n_classes: int) -> Dict[str, Any]: params = copy.deepcopy(params) if n_classes == 0: train_metric_name = self.config.get('train_metric_name', 'mse') if train_metric_name == 'mse': params.update({'objective': 'mean_squared_error'}) elif train_metric_name.startswith('pinball('): quantile = float(train_metric_name[len('pinball('):-1]) params.update({'objective': 'quantile', 'alpha': quantile}) else: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') elif n_classes <= 2: params.update({'objective': 'binary'}) elif n_classes > 2: params.update({'objective': 'multiclass', 'num_class': n_classes}) if 'num_leaves' in params: params['num_leaves'] = max(int(params['num_leaves']), 2) if 'min_data_in_leaf' in params: params['min_data_in_leaf'] = int(params['min_data_in_leaf']) return params def _convert_ds(self, ds: DictDataset) -> Any: import lightgbm as lgbm x_cont = ds.tensors['x_cont'].cpu().numpy() label = None if 'y' not in ds.tensors else ds.tensors['y'].cpu().numpy() if label is not None and label.shape[1] == 1: label = label[:, 0] has_cat = 'x_cat' in ds.tensor_infos and ds.tensor_infos['x_cat'].get_n_features() > 0 if not has_cat: # no categorical columns return lgbm.Dataset(x_cont, label=label, categorical_feature=[]) x_df = ds.without_labels().to_df() cat_features = x_df.select_dtypes(include='category').columns.tolist() return lgbm.Dataset(x_df, label, categorical_feature=cat_features) def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: import lightgbm as lgbm from lightgbm import record_evaluation # print(f'Fitting LightGBM') n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() params = self._preprocess_params(params, n_classes) params.update({ 'data_random_seed': 1 + seed, 'feature_fraction_seed': 2 + seed, 'bagging_seed': 3 + seed, 'drop_seed': 4 + seed, 'objective_seed': 5 + seed, 'extra_seed': 6 + seed, 'num_threads': n_threads }) eval_metric = None eval_name = None feval = None if val_ds is not None: if val_metric_name is None: val_metric_name = 'class_error' if n_classes > 0 else 'rmse' if val_metric_name == 'class_error': eval_metric = 'binary_error' if n_classes <= 2 else 'multi_error' elif val_metric_name == 'cross_entropy': eval_metric = 'binary_logloss' if n_classes <= 2 else 'multi_logloss' elif val_metric_name == 'rmse': eval_metric = 'rmse' elif val_metric_name == 'mae': eval_metric = 'mae' else: eval_name = val_metric_name feval = LGBMCustomMetric(val_metric_name, is_classification=n_classes > 0) if eval_metric is None: # specified custom metric, don't use pre-given metric eval_metric = "None" else: eval_name = eval_metric params['metric'] = eval_metric if val_ds is None: params = utils.update_dict(params, remove_keys=['early_stopping_round', 'early_stopping_rounds']) evals = [] if val_ds is None else [self._convert_ds(val_ds)] valid_names = [] if val_ds is None else ['val'] evals_result = {} train_ds = 
self._convert_ds(train_ds) # warning filtering taken from https://auto.gluon.ai/dev/_modules/autogluon/tabular/models/lgb/lgb_model.html with warnings.catch_warnings(): # Filter harmless warnings introduced in lightgbm 3.0, # future versions plan to remove: https://github.com/microsoft/LightGBM/issues/3379 warnings.filterwarnings('ignore', message='Overriding the parameters from Reference Dataset.') warnings.filterwarnings('ignore', message='categorical_column in param dict is overridden.') bst = lgbm.train(utils.update_dict(params, remove_keys=['n_estimators']), train_ds, valid_sets=evals, valid_names=valid_names, feval=feval, callbacks=[record_evaluation(evals_result)], num_boost_round=params['n_estimators']) # print(f'{params["n_estimators"]=}') if val_ds is not None: # print('evals_result val:', evals_result['val'], flush=True) val_errors = evals_result['val'][eval_name] else: val_errors = None return bst, val_errors def _predict(self, bst, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: # bst should be of type lgbm.Booster # print(f'LGBM _predict() with {other_params=}') num_iteration = None if other_params is None else other_params['n_estimators'] y_pred = torch.as_tensor(bst.predict(self._convert_ds(ds).data, num_iteration=num_iteration), dtype=torch.float32) if n_classes == 0: y_pred = y_pred.unsqueeze(-1) elif n_classes <= 2: y_pred = torch.stack([1. - y_pred, y_pred], dim=-1) if n_classes >= 1: y_pred = torch.log(y_pred + 1e-30) # print(f'min: {torch.min(y_pred).item():g}, max: {torch.max(y_pred).item():g}') return y_pred def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, num_leaves=31, max_n_threads=8), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.lgbm_class_time, cpu_ram_params=ResourceParams.lgbm_class_ram) return rc.get_required_resources(ds) class LGBMHyperoptAlgInterface(OptAlgInterface): def __init__(self, space=None, n_hyperopt_steps: int = 50, opt_method: str = 'hyperopt', **config): from hyperopt import hp default_config = {} max_config = dict() if space is None: space = config.get('hpo_space_name', None) if space == 'catboost_quality_benchmarks': # space from catboost quality benchmarks, # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/lightgbm_experiment.py # the parameter names in the space are for the alg interface, not directly for the GBDT interface! space = { 'learning_rate': hp.loguniform('learning_rate', -7, 0), 'num_leaves': hp.qloguniform('num_leaves', 0, 7, 1), 'feature_fraction': hp.uniform('feature_fraction', 0.5, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -16, 5), 'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), 'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]), } default_config = dict(n_estimators=5000) max_config['num_leaves'] = 1000 # about exp(7) elif space == 'tabpfn' or space == 'hollmann': # from Hollmann, Müller, Eggensperger, Hutter, # TabPFN: A Transformer That Solves Small Tabular Classification Problems in a Second # the parameter names in the space are for the alg interface, not directly for the GBDT interface! 
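# note on hp.choice: it places equal prior mass on each option, so
# hp.choice('lambda_l1', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]) below is a uniform prior over
# the grid, while the nested form used in the space above,
# hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), mixes a point
# mass at 0 with a log-uniform branch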
space = { 'n_estimators': hp.quniform('n_estimators', 50, 2000, 1), # in the paper it says that this is not log but that's hard to believe, # especially when e^{-3} is the lower bound 'learning_rate': hp.loguniform('learning_rate', -3, 0), 'num_leaves': hp.qloguniform('num_leaves', np.log(5), np.log(50), 1), 'max_depth': hp.qloguniform('max_depth', np.log(3), np.log(20), 1), 'subsample': hp.uniform('subsample', 0.2, 0.8), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -5, 4), # this is min_child_weight 'lambda_l1': hp.choice('lambda_l1', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]), # this is reg_alpha 'lambda_l2': hp.choice('lambda_l2', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]), # this is reg_lambda } max_config['num_leaves'] = 50 elif space == 'mt-reg': # hand-guessed space for regression if opt_method == 'smac': from ConfigSpace import ConfigurationSpace, Float, Integer space = ConfigurationSpace() space.add_hyperparameters([ Integer('num_leaves', (16, 256), log=True, default=100), Float('feature_fraction', (0.4, 1), default=0.7), Float('bagging_fraction', (0.6, 1), default=1.0), Integer('min_data_in_leaf', (1, 64), log=True, default=3), ]) else: # assume hyperopt space = { 'num_leaves': hp.qloguniform('num_leaves', np.log(16), np.log(256), 1), 'feature_fraction': hp.uniform('feature_fraction', 0.4, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1), np.log(64), 1), } default_config = dict(n_estimators=1000, learning_rate=5e-2, min_sum_hessian_in_leaf=1e-5) max_config['num_leaves'] = 256 elif space == 'mt-reg-2': # hand-guessed space for regression space = { 'num_leaves': hp.qloguniform('num_leaves', np.log(16), np.log(256), 1), 'learning_rate': hp.loguniform('learning_rate', np.log(2.5e-2), np.log(1e-1)), 'feature_fraction': hp.uniform('feature_fraction', 0.4, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1), np.log(64), 1), } default_config = dict(n_estimators=1000, min_sum_hessian_in_leaf=1e-5) max_config['num_leaves'] = 256 config = utils.update_dict(default_config, config) opt_class = SMACOptimizer if opt_method == 'smac' else HyperoptOptimizer super().__init__(hyper_optimizer=opt_class(space=space, fixed_params=dict(), n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config, max_config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**config) for i in range(n_sub_splits)]) class RandomParamsLGBMAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from catboost quality benchmarks hpo_space_name = self.config.get('hpo_space_name', 'cqb') if hpo_space_name == 'cqb': space = { 'learning_rate': np.exp(rng.uniform(-7, 0)), 'num_leaves': round(np.exp(rng.uniform(0, 7))), 'feature_fraction': rng.uniform(0.5, 1), 'bagging_fraction': rng.uniform(0.5, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(0, 6))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(-16, 5)), 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(-16, 2))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(-16, 2))]), 'n_estimators': 1000, } elif hpo_space_name == 'large': space = { 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(5e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0),
np.log(256)))), 'feature_fraction': rng.uniform(0.3, 1), 'bagging_fraction': rng.uniform(0.3, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(128)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(20.0))), 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'n_estimators': 1000, 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(256)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(2.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v2': space = { 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(1e-2), np.log(1e-1))), # shrunk 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'n_estimators': 1000, 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v2-10k': space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(2e-2))), # shrunk 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v3-10k': # v2 but with the lr space of tabrepo1 space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), 
np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v4-10k': # v3-10k but without tuning bagging_fraction space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), # shrunk 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # shrunk 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # shrunk # min_data_in_bin } elif hpo_space_name == 'large-v5-10k': # v3-10k but without tuning all the categorical parameters space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), # shrunk 'feature_fraction': rng.uniform(0.85, 1), # shrunk 'bagging_fraction': rng.uniform(0.7, 1), # shrunk 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), # shrunk but not much 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # shrunk # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'large-v6-10k': # v3-10k but with the tabrepo1 search space for feature_fraction space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': 
round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v7-10k': # v6-10k but with increased min_data_in_leaf space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(2.0), np.log(64)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), # could shrink more but one may want this for classification 'lambda_l1': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1.0)))]), 'lambda_l2': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(20.0)))]), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v8-10k': # v6-10k but without tuning lambda_l1 and lambda_l2 space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'min_sum_hessian_in_leaf': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v9-10k': # v8-10k but without tuning min_sum_hessian_in_leaf space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': 
round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v10-10k': # v9-10k but with num_leaves from tabrepo1 space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(16.0), np.log(255)))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'large-v11-10k': # v9-10k but without tuning bagging_fraction space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'num_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(200)))), 'feature_fraction': rng.uniform(0.4, 1), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(1.0), np.log(64)))), 'bagging_freq': 1, # already the default here but not in original LightGBM # based on https://arxiv.org/abs/2411.04324 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.exp(rng.uniform(np.log(1.0), np.log(200)))), 'cat_l2': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.exp(rng.uniform(np.log(8.0), np.log(100.0)))), # min_data_in_bin } elif hpo_space_name == 'tabrepo1-es': space = { 'early_stopping_rounds': 50, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1.0), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(2.0), np.log(60.0)))), 'num_leaves': round(np.exp(rng.uniform(np.log(16.0), np.log(255)))), 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'tabrepo1-es-10k': space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1.0), 'min_data_in_leaf': round(np.exp(rng.uniform(np.log(2.0), np.log(60.0)))), 'num_leaves': round(np.exp(rng.uniform(np.log(16.0), np.log(255)))), 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'tabrepo1-fixed-es-10k': space = { 'early_stopping_rounds': 50, 'n_estimators': 10_000, 'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1.0), 'min_data_in_leaf': rng.integers(2, 60, endpoint=True), 'num_leaves': rng.integers(16, 255, endpoint=True), 'extra_trees': rng.choice([False, True]), } elif hpo_space_name == 'tabarena': space = { 'early_stopping_rounds': 300, # not exactly equivalent, probably 'n_estimators': 10_000, 'learning_rate': 
np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'feature_fraction': rng.uniform(0.4, 1), 'bagging_fraction': rng.uniform(0.7, 1), 'bagging_freq': 1, # already the default here but not in original LightGBM 'num_leaves': round(np.floor(np.exp(rng.uniform(np.log(2.0), np.log(201))))), # round() casts the float from np.floor to int, matching the other integer-valued entries 'min_data_in_leaf': round(np.floor(np.exp(rng.uniform(np.log(1.0), np.log(65))))), 'extra_trees': rng.choice([False, True]), 'min_data_per_group': round(np.floor(np.exp(rng.uniform(np.log(2.0), np.log(101))))), 'cat_l2': np.exp(rng.uniform(np.log(5e-3), np.log(2.0))), 'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))), 'max_cat_to_onehot': round(np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0))))), 'lambda_l1': np.exp(rng.uniform(np.log(1e-5), np.log(1.0))), 'lambda_l2': np.exp(rng.uniform(np.log(1e-5), np.log(2.0))), } else: raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}') return space def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/alg_interfaces/nn_interfaces.py ================================================ import copy import warnings from pathlib import Path from typing import List, Optional, Dict, Any, Union import numpy as np import torch from pytabkit.models.training.nn_creator import get_realmlp_auto_batch_size try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl import logging from datetime import timedelta from pytabkit.models import utils from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer from pytabkit.models.nn_models.base import Layer, Variable from pytabkit.models.nn_models.models import NNFactory from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.torch_utils import cat_if_necessary from pytabkit.models.training.lightning_modules import TabNNModule, postprocess_multiquantile from pytabkit.models.training.logging import Logger from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, SingleSplitAlgInterface, OptAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources def get_lignting_accel_and_devices(device: str): if device == 'cpu': pl_accelerator = 'cpu' pl_devices = 'auto' elif device == 'mps': pl_accelerator = 'mps' pl_devices = 'auto' elif device == 'cuda': pl_accelerator = 'gpu' pl_devices = [0] elif device.startswith('cuda:'): pl_accelerator = 'gpu' pl_devices = [int(device[len('cuda:'):])] else: raise ValueError(f'Unknown device "{device}"') return pl_accelerator, pl_devices class NNAlgInterface(AlgInterface): def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.model: Optional[TabNNModule] = None self.device = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return NNAlgInterface(fit_params if fit_params is not None else self.fit_params, **self.config) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str): # the code below requires all splits to have the same number of sub-splits assert np.all([idxs_list[i].train_idxs.shape[0] == idxs_list[0].train_idxs.shape[0] for i in range(len(idxs_list))]) # we can then decompose the overall number of sub-splits into
the number of splits # and the number of sub-splits per split # print(f'Starting NN fit') # have the option to change the seeds (for comparing NNs with different random seeds) random_seed_offset = self.config.get('random_seed_offset', 0) if random_seed_offset != 0: idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed + random_seed_offset, sub_split_seeds=[seed + random_seed_offset for seed in idxs.sub_split_seeds], split_id=idxs.split_id) for idxs in idxs_list] if self.config.get('same_seed_for_sub_splits', False): idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed, sub_split_seeds=[idxs.sub_split_seeds[0]] * len(idxs.sub_split_seeds), split_id=idxs.split_id) for idxs in idxs_list] # https://stackoverflow.com/questions/74364944/how-to-get-rid-of-info-logging-messages-in-pytorch-lightning log = logging.getLogger("lightning") log.propagate = False log.setLevel(logging.ERROR) warnings.filterwarnings("ignore", message="You defined a `validation_step` but have no `val_dataloader`.") old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 torch.backends.cuda.matmul.allow_tf32 = False # to be safe wrt rounding errors, but might not be necessary # todo: allow preprocessing on CPU and then only put batches on GPU in data loader? gpu_devices = interface_resources.gpu_devices self.device = gpu_devices[0] if len(gpu_devices) > 0 else 'cpu' ds = ds.to(self.device) fit_params = self.fit_params if self.fit_params is None and 'stop_epoch' in self.config: fit_params = [dict(stop_epoch=self.config['stop_epoch'])] * len(idxs_list) n_epochs = self.config.get('n_epochs', 256) self.model = TabNNModule(**utils.join_dicts({'n_epochs': 256, 'logger': logger}, self.config), fit_params=fit_params) self.model.compile_model(ds, idxs_list, interface_resources) pl_accelerator, pl_devices = get_lignting_accel_and_devices(self.device) max_time = None if interface_resources.time_in_seconds is None else timedelta( seconds=interface_resources.time_in_seconds) self.min_trainer_kwargs = dict( max_time=max_time, accelerator=pl_accelerator, devices=pl_devices, max_epochs=n_epochs, enable_checkpointing=False, enable_progress_bar=False, num_sanity_val_steps=0, enable_model_summary=False, log_every_n_steps=1, ) # don't save the trainer in self, otherwise it stores the dataset trainer = pl.Trainer( max_time=max_time, accelerator=pl_accelerator, devices=pl_devices, callbacks=self.model.create_callbacks(), max_epochs=n_epochs, enable_checkpointing=False, enable_progress_bar=False, num_sanity_val_steps=0, logger=pl.loggers.logger.DummyLogger(), enable_model_summary=False, log_every_n_steps=1, ) trainer.fit( model=self.model, train_dataloaders=self.model.train_dl, val_dataloaders=self.model.val_dl ) if hasattr(self.model, 'fit_params'): self.fit_params = self.model.fit_params torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 # remove all remaining references to GPU tensors, for some reason this can't be done in the model del self.model._trainer # self.model.to('cpu') # to allow serialization without GPU issues, but doesn't work # print(f'Importances (sorted):', self.get_importances().sort()[0]) def predict(self, ds: DictDataset) -> torch.Tensor: pred_dict = self.get_current_predict_params_dict() if 'val_metric_name' in pred_dict: self.model.restore_ckpt_for_val_metric_name(pred_dict['val_metric_name']) old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 
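# The assignment below repeats the save/toggle/restore pattern that fit() uses for the global TF32 flag:
# TF32 matmuls on Ampere+ GPUs trade precision for speed, so the flag is disabled here to be safe
# about rounding errors and restored afterwards, so that other code sees an unchanged global setting.
# A minimal standalone sketch of the same pattern (illustrative only, not part of this module):
#     old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
#     torch.backends.cuda.matmul.allow_tf32 = False  # force full fp32 matmul precision
#     try:
#         ...  # numerically sensitive code
#     finally:
#         torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32  # restore the saved value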
torch.backends.cuda.matmul.allow_tf32 = False self.model.to(self.device) ds = ds.to(self.device) ds_x, _ = ds.split_xy() pl_accelerator, pl_devices = get_lignting_accel_and_devices(self.device) # create new trainer so we don't have to pickle the full trainer that references the dataset somehow # update devices since the model device may have been moved since trainer = pl.Trainer(**(self.min_trainer_kwargs | dict(accelerator=pl_accelerator, devices=pl_devices, logger=pl.loggers.logger.DummyLogger()))) y_pred = trainer.predict(model=self.model, dataloaders=self.model.get_predict_dataloader(ds_x)) y_pred = cat_if_necessary(y_pred, dim=-2).to('cpu') # concat along batch dimension y_pred = postprocess_multiquantile(y_pred, **self.config) # postprocessing in case of multiquantile loss torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 # remove all remaining references to GPU tensors, for some reason this can't be done in the model del self.model._trainer return y_pred def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: val_metric_names = self.config.get('val_metric_names', None) if val_metric_names is None: return {'': dict()} else: return {f'_val-{val_metric_name}': dict(val_metric_name=val_metric_name) for val_metric_name in val_metric_names} def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: tensor_infos = ds.tensor_infos factory = self.config.get('factory', None) if factory is None: factory = NNFactory(**self.config) fitter = factory.create(tensor_infos) static_fitter, dynamic_fitter = fitter.split_off_dynamic() static_tensor_infos = static_fitter.forward_tensor_infos(tensor_infos) n_params = fitter.get_n_params(tensor_infos) n_forward = fitter.get_n_forward(tensor_infos) n_parallel = max(n_cv, n_refit) * n_splits * self.config.get('n_ens', 1) batch_size = self.config.get('batch_size', 256) if batch_size == 'auto': batch_size = get_realmlp_auto_batch_size(n_train) # print(f'{batch_size=}') n_epochs = self.config.get('n_epochs', 256) # per-element RAM usage: # continuous data requires 4 bytes for forward pass and 4 for backward pass # categorical data requires 8 bytes for forward pass (because torch.long is required) and none for backward pass pass_memory = n_forward * batch_size * 8 # initial batch size ignored ds_size_gb = ds.n_samples * sum([ti.get_n_features() * (8 if ti.is_cat() else 4) for ti in static_tensor_infos.values()]) / (1024 ** 3) ds_ram_gb = 5 * ds_size_gb # ds_ram_gb = 3 * task_info.get_ds_size_gb() / (1024**3) param_memory = 5 * n_params * 8 # 5 because of model, model copy, grads, adam mom, adam sq_mom fixed_ram_gb = 0.3 # go safe # print(f'{pass_memory=}, {param_memory=}') # max memory that would be used if the dataset wasn't used init_ram_gb_full = n_forward * ds.n_samples * 8 / (1024 ** 3) init_ram_gb_max = 1.2 # todo: rough estimate, a bit larger than what is allowed in fit_transform_subsample() init_ram_gb = min(init_ram_gb_max, init_ram_gb_full) # init_ram_gb = 1.5 # print(f'{ds_ram_gb=}, {pass_memory/(1024**3)=}, {param_memory/(1024**3)=}, {init_ram_gb=}') factor = 1.2 # to go safe on ram gpu_ram_gb = fixed_ram_gb + ds_ram_gb + max(init_ram_gb, factor * (n_parallel * (pass_memory + param_memory)) / (1024 ** 3)) gpu_usage = min(1.0, n_parallel / 200) # rather underestimate it and use up all the ram on the gpu # go somewhat safe, should be small anyway cpu_ram_gb = 0.3 + ds_ram_gb + 1.3 * (pass_memory + param_memory) / (1024 ** 3) time_approx = 
ds.n_samples * n_epochs * 4e-5 * (2 if n_refit > 0 else 1) if self.config.get('use_gpu', True): return RequiredResources(time_s=time_approx, n_threads=1.0, cpu_ram_gb=cpu_ram_gb, n_gpus=1, gpu_usage=gpu_usage, gpu_ram_gb=gpu_ram_gb) else: return RequiredResources(time_s=time_approx, n_threads=1.0, cpu_ram_gb=cpu_ram_gb + gpu_ram_gb) def get_model_ram_gb(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int]): tensor_infos = ds.tensor_infos factory = self.config.get('factory', None) if factory is None: factory = NNFactory(**self.config) fitter = factory.create(tensor_infos) n_params = fitter.get_n_params(tensor_infos) n_parallel = max(n_cv, n_refit) * n_splits factor = 1.2 # to go safe on ram return factor * n_parallel * n_params * 4 / (1024 ** 3) def to(self, device: str) -> None: # print(f'Move RealMLP model to device {device}') self.model.to(device) self.device = device def get_importances(self) -> torch.Tensor: net: Layer = self.model.model params = net.parameters() scale = None weight = None importances_param = self.config.get('feature_importances', None) for param in params: param: Variable = param scope_str = str(param.context.scope) if scope_str.endswith('layer-0/scale'): scale = param elif scope_str.endswith('layer-0/weight'): weight = param # print(scope_str) assert weight is not None with torch.no_grad(): # shape: (vectorized network dims) x n_features importances = weight.norm(dim=-1) if scale is not None: importances *= scale[..., 0, :].abs() p = self.config.get('importances_exponent', 1.0) importances = importances ** p # # # hard feature selection # n_remove = int(0.9 * importances.shape[-1]) # new_importances = torch.ones_like(importances) # for i in range(importances.shape[0]): # new_importances[i, torch.argsort(importances[i])[:n_remove]] = 0.0 # importances = new_importances # print(importances) if importances_param is not None: print('Using importances_param') importances *= importances_param[..., :] importances /= (importances.norm(dim=-1, keepdim=True) / np.sqrt(importances.shape[-1])) return importances def get_first_layer_weights(self, with_scale: bool) -> torch.Tensor: net: Layer = self.model.model params = net.parameters() scale = None weight = None for param in params: param: Variable = param scope_str = str(param.context.scope) if scope_str.endswith('layer-0/scale'): scale = param elif scope_str.endswith('layer-0/weight'): weight = param assert weight is not None if scale is not None and with_scale: with torch.no_grad(): return weight * scale[..., 0, :, None] else: return weight.data # todo: have option to move to/from GPU class NNHyperoptAlgInterface(OptAlgInterface): def __init__(self, space: Optional[Union[str, Dict[str, Any]]] = None, n_hyperopt_steps: int = 50, opt_method: str = 'hyperopt', **config): from hyperopt import hp default_config = config # todo max_config = copy.copy(default_config) if space == 'default': space = { 'lr': hp.loguniform('lr', np.log(2e-2), np.log(3e-1)), 'num_emb_type': hp.choice('num_emb_type', ['none', 'pl', 'plr', 'pbld']), 'add_front_scale': hp.pchoice('add_front_scale', [(0.6, True), (0.4, False)]), # weighted (probability, value) options need hp.pchoice; hp.choice would sample the tuples themselves 'p_drop': hp.pchoice('p_drop', [(0.3, 0.0), (0.5, 0.15), (0.2, 0.3)]), 'wd': hp.choice('wd', [0.0, 0.02]), 'plr_sigma': hp.loguniform('plr_sigma', np.log(0.05), np.log(0.5)), 'act': hp.choice('act', ['relu', 'selu', 'mish']), 'hidden_sizes': hp.pchoice('hidden_sizes', [(0.6, [256] * 3), (0.2, [512]), (0.2, [64] * 5)]), 'ls_eps': hp.pchoice('ls_eps', [(0.3, 0.0), (0.7, 0.1)]) }
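# Note on the space above: hyperopt's hp.choice samples uniformly from plain options, while
# hp.pchoice takes (probability, value) pairs, as used for the weighted entries here.
# A minimal self-contained sketch of optimizing over such a space (illustrative only; the
# objective is a stand-in, assuming hyperopt is installed):
#     from hyperopt import hp, fmin, tpe
#     demo_space = {'wd': hp.pchoice('wd', [(0.7, 0.0), (0.3, 0.02)]),
#                   'lr': hp.loguniform('lr', np.log(2e-2), np.log(3e-1))}
#     best = fmin(fn=lambda cfg: cfg['lr'] + cfg['wd'], space=demo_space, algo=tpe.suggest, max_evals=10)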
utils.update_dict(default_config, remove_keys=list(space.keys())) elif not isinstance(space, dict): print(f'Unknown hyperparameter space: {space}') config = utils.update_dict(default_config, config) opt_class = SMACOptimizer if opt_method == 'smac' else HyperoptOptimizer super().__init__(hyper_optimizer=opt_class(space=space, fixed_params=default_config, n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return NNAlgInterface(**config) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: required_resources = super().get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) # add n_steps * model_ram_gb to required resources, because these will be stored alg_interface = NNAlgInterface(**self.max_resource_config) model_ram_gb = alg_interface.get_model_ram_gb(ds, n_cv, n_refit, n_splits, split_seeds) required_resources.cpu_ram_gb += self.hyper_optimizer.get_n_hyperopt_steps() * model_ram_gb return required_resources class RealMLPParamSampler: def __init__(self, is_classification: bool, hpo_space_name: str = 'default', **config): self.is_classification = is_classification self.hpo_space_name = hpo_space_name def sample_params(self, seed: int) -> Dict[str, Any]: assert self.hpo_space_name in ['default', 'clr', 'moresigma', 'moresigmadim', 'moresigmadimreg', 'moresigmadimsize', 'moresigmadimlr', 'probclass', 'probclass-mlp', 'large', 'alt1', 'alt2', 'alt3', 'alt4', 'alt5', 'alt6', 'alt7', 'alt8', 'alt9', 'alt10', 'tabarena', 'tabarena-new', 'alt11', 'alt12', 'alt13', 'alt14', 'alt15', 'alt16', 'alt17', 'alt18', 'alt19', 'alt20'] rng = np.random.default_rng(seed=seed) if self.hpo_space_name == 'probclass-mlp': params = {'lr': np.exp(rng.uniform(np.log(1e-4), np.log(1e-2))), 'p_drop': rng.choice([0.0, 0.1, 0.2, 0.3]), 'wd': rng.choice([0.0, 1e-5, 1e-4, 1e-3])} default_params = DefaultParams.VANILLA_MLP_CLASS if self.is_classification else DefaultParams.VANILLA_MLP_REG return utils.join_dicts(default_params, params) hidden_size_options = [[256] * 3, [64] * 5, [512]] params = {'num_emb_type': rng.choice(['none', 'pbld', 'pl', 'plr']), 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), # convert to actual bool so it can be serialized 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'p_drop': rng.choice([0.0, 0.15, 0.3], p=[0.3, 0.5, 0.2]), 'wd': rng.choice([0.0, 2e-2]), 'plr_sigma': np.exp(rng.uniform(np.log(0.05), np.log(0.5))), 'act': rng.choice(['relu', 'selu', 'mish']), 'hidden_sizes': hidden_size_options[rng.choice([0, 1, 2], p=[0.6, 0.2, 0.2])]} if self.is_classification: params['ls_eps'] = rng.choice([0.0, 0.1], p=[0.3, 0.7]) if self.hpo_space_name == 'clr': params['lr'] = np.exp(rng.uniform(np.log(2e-3), np.log(3e-1))) params['lr_sched'] = 'constant' params['use_early_stopping'] = True params['early_stopping_multiplicative_patience'] = 1 params['early_stopping_additive_patience'] = 16 elif self.hpo_space_name == 'moresigma': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) elif self.hpo_space_name == 'moresigmadim': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) elif self.hpo_space_name == 'moresigmadimreg': params['plr_sigma'] = 
np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) params['p_drop'] = rng.choice([0.0, rng.uniform(0.0, 0.5)]) params['wd'] = np.exp(rng.uniform(np.log(1e-5), np.log(4e-2))) elif self.hpo_space_name == 'moresigmadimsize': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) params['hidden_sizes'] = [rng.choice(np.arange(8, 513))] * rng.choice(np.arange(1, 6)) elif self.hpo_space_name == 'moresigmadimlr': params['plr_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) params['plr_hidden_1'] = 2 * round(np.exp(rng.uniform(np.log(1), np.log(32)))) params['plr_hidden_2'] = round(np.exp(rng.uniform(np.log(2), np.log(64)))) params['lr'] = np.exp(rng.uniform(np.log(5e-3), np.log(5e-1))) elif self.hpo_space_name == 'probclass': params['ls_eps'] = rng.choice([0.0, 0.1]) params['wd'] = rng.choice([0.0, 2e-3, 2e-2]) elif self.hpo_space_name == 'large': params = {'num_emb_type': rng.choice(['none', 'pbld', 'pl', 'plr']), 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), 'n_hidden': round(np.exp(rng.uniform(np.log(64), np.log(512)))), 'n_layers': rng.integers(1, 5, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.0, 0.6), 'wd': rng.choice([rng.uniform(0.0, 1e-3), np.exp(rng.uniform(np.log(1e-3), np.log(1e-1)))]), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': rng.choice(['relu', 'selu', 'mish', 'silu', 'gelu']), 'use_parametric_act': rng.choice([False, True]), 'p_drop_sched': rng.choice(['flat_cos', 'constant']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, rng.uniform(0.0, 0.2)]), 'lr_sched': rng.choice(['coslog4', 'cos']), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt1': params = {'num_emb_type': rng.choice(['none', 'pbld']), 'n_hidden': rng.choice([128, 256, 384]), 'n_layers': rng.integers(1, 3, endpoint=True), 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e1))), 'act': rng.choice(['selu', 'mish', 'silu']), # 'use_parametric_act': rng.choice([False, True]), # 'p_drop_sched': rng.choice(['flat_cos', 'constant']), # 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(5e-2)))]), # 'lr_sched': rng.choice(['coslog4', 'cos']), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'use_early_stopping': True, 'early_stopping_multiplicative_patience': 2, 'early_stopping_additive_patience': 20, } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt2': # refined version of large params = {'num_emb_type': 'pbld', 'n_hidden': round(np.exp(rng.uniform(np.log(198), np.log(512)))), 'n_layers': rng.integers(1, 3, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': 
rng.uniform(0.06, 0.6), 'wd': np.exp(rng.uniform(np.log(6e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': rng.choice(['mish', 'silu']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(5e-2)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt3': # refined version of alt2 (better for 20 steps but worse for 50) params = {'num_emb_type': 'pbld', 'n_hidden': round(np.exp(rng.uniform(np.log(323), np.log(480)))), 'n_layers': rng.integers(1, 2, endpoint=True), 'lr': np.exp(rng.uniform(np.log(3e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.1, 0.5), 'wd': np.exp(rng.uniform(np.log(6e-3), np.log(6e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': 'mish', 'wd_sched': 'flat_cos', 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(2e-2)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(4e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(1e-1), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.5), np.log(7.5))), 'p_drop_sched': 'constant', } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt4': # large space for regression params = {'num_emb_type': 'pbld', 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), 'n_hidden': round(np.exp(rng.uniform(np.log(128), np.log(512)))), 'n_layers': rng.integers(1, 4, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': rng.choice(['mish', 'silu', 'elu']), 'use_parametric_act': True, 'p_drop_sched': rng.choice(['flat_cos', 'constant']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'lr_sched': rng.choice(['coslog4', 'cos']), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt5': # refined space for regression params = {'num_emb_type': 'pbld', 'add_front_scale': rng.choice([True, False], p=[0.6, 0.4]), 'n_hidden': round(np.exp(rng.uniform(np.log(128), np.log(512)))), 'n_layers': 4, 'lr': np.exp(rng.uniform(np.log(3e-2), np.log(1e-1))), 'p_drop': rng.uniform(0.0, 0.45), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': 'mish', 'use_parametric_act': True, 'p_drop_sched': 'flat_cos', 'wd_sched': 'flat_cos', 'lr_sched': 'coslog4', 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(3e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(7.5))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt6': # regression, manually adjusted from alt5 params = {'num_emb_type': 'pbld', 'add_front_scale': True, 'n_hidden': 256, 'n_layers': rng.choice([2, 3, 4]), 'lr': np.exp(rng.uniform(np.log(4e-2), np.log(2e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': 
np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), 'act': 'mish', 'use_parametric_act': True, 'p_drop_sched': 'flat_cos', 'wd_sched': 'flat_cos', 'lr_sched': 'coslog4', 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(3e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(7.5))), } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt7': # refined version of alt2 (classification) params = {'num_emb_type': 'pbld', 'n_hidden': 256, 'n_layers': rng.integers(1, 4, endpoint=True), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.0, 0.6), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(30))), 'act': 'mish', 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(2e-1)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } params['hidden_sizes'] = [params['n_hidden']] * params['n_layers'] elif self.hpo_space_name == 'alt8': # version of alt2 (classification) with some new hyperparameters params = {'num_emb_type': 'pbld', 'hidden_sizes': 'rectangular', 'hidden_width': 256, 'ls_eps_sched': 'coslog4', 'tfms': [['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], ['one_hot', 'mean_center', 'l2_normalize', 'embedding']][rng.choice([0, 1])], 'batch_size': [256, 'auto'][rng.choice([0, 1])], 'n_hidden_layers': rng.integers(1, 4, endpoint=True), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(5.0))), 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), 'p_drop': rng.uniform(0.06, 0.6), 'wd': np.exp(rng.uniform(np.log(6e-3), np.log(1e-1))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': rng.choice(['mish', 'silu']), 'wd_sched': rng.choice(['flat_cos', 'constant']), 'ls_eps': rng.choice([0.0, np.exp(rng.uniform(np.log(5e-3), np.log(1e-1)))]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(1e-3), np.log(1e-1))), 'plr_lr_factor': np.exp(rng.uniform(np.log(3e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } elif self.hpo_space_name == 'alt9': # version of alt8 (classification) with reduced search spaces, and increased with space # removed batch_size tuning, tfms tuning params = {'num_emb_type': 'pbld', 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), # added 'ls_eps_sched': 'coslog4', 'n_hidden_layers': rng.integers(1, 3, endpoint=True), # reduced 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), # reduced 'lr': np.exp(rng.uniform(np.log(1e-2), np.log(5e-1))), # todo: could reduce this 'p_drop': rng.uniform(0.0, 0.5), # reduced 'wd': np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), # reduced 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(15))), 'act': rng.choice(['mish', 'silu']), 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), # reduced 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), # reduced 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'constant', } elif self.hpo_space_name == 'alt10': # version 
of alt9, similar to tabrepo params = {'num_emb_type': 'pbld', 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'ls_eps_sched': 'coslog4', 'act': 'mish', 'n_hidden_layers': rng.integers(1, 4, endpoint=True), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'p_drop': rng.uniform(0.0, 0.5), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(2e-1))), 'use_ls': rng.choice([False, True]), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'p_drop_sched': 'flat_cos', } elif self.hpo_space_name == 'tabarena': # common search space params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) } if rng.uniform(0.0, 1.0) > 0.5: # large configs params['plr_hidden_1'] = rng.choice([8, 16, 32, 64]).item() params['plr_hidden_2'] = rng.choice([8, 16, 32, 64]).item() params['n_epochs'] = rng.choice([256, 512]).item() params['use_early_stopping'] = True # set in the defaults of RealMLP in TabArena params['early_stopping_multiplicative_patience'] = 3 params['early_stopping_additive_patience'] = 40 else: # default values, used here to always set the same set of parameters params['plr_hidden_1'] = 16 params['plr_hidden_2'] = 4 params['n_epochs'] = 256 params['use_early_stopping'] = False elif self.hpo_space_name == 'tabarena-new': # common search space params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) # added in tabarena-new compared to tabarena 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 8, "ens_av_before_softmax": False, } if rng.uniform(0.0, 1.0) > 0.5: # large configs params['plr_hidden_1'] = rng.choice([8, 16, 32, 
64]).item() params['plr_hidden_2'] = rng.choice([8, 16, 32, 64]).item() params['n_epochs'] = rng.choice([256, 512]).item() params['use_early_stopping'] = True # set in the defaults of RealMLP in TabArena params['early_stopping_multiplicative_patience'] = 3 params['early_stopping_additive_patience'] = 40 else: # default values, used here to always set the same set of parameters params['plr_hidden_1'] = 16 params['plr_hidden_2'] = 4 params['n_epochs'] = 256 params['use_early_stopping'] = False elif self.hpo_space_name == 'alt11': # tabarena without the large configs params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) } elif self.hpo_space_name == 'alt12': # alt11 with n_hidden_layers=1 in the search space params = { 'n_hidden_layers': rng.integers(1, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) } elif self.hpo_space_name == 'alt13': # alt11 with more categorical hyperparameters params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), } elif self.hpo_space_name == 'alt14': # alt13 with weight_init_mode='normal' params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 
'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'weight_init_mode': 'normal', } elif self.hpo_space_name == 'alt15': # alt13 with tuning momentum (beta1) params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'mom': 1.0 - np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), # tune in [0.7, 0.98] } elif self.hpo_space_name == 'alt16': # alt13 with n_ens=2 params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 2, 'ens_av_before_softmax': True, } elif self.hpo_space_name == 'alt17': # alt13 with n_ens=4 params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': 
np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 4, 'ens_av_before_softmax': True, } elif self.hpo_space_name == 'alt18': # alt17 but with averaging after softmax params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'n_ens': 4, 'ens_av_before_softmax': False, } elif self.hpo_space_name == 'alt19': # alt13 with numerical preprocessing tuning tfms_list = [ ['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], ['one_hot', 'mean_center', 'l2_normalize', 'smooth_clip', 'embedding'], ] params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'tfms': tfms_list[int(rng.choice([0, 1]))], 'smooth_clip_max_abs_value': np.exp(rng.uniform(np.log(1.0), np.log(10.0))) } elif self.hpo_space_name == 'alt20': # alt13 with numerical preprocessing tuning (but without the max_abs_value unlike alt19) tfms_list = [ ['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], ['one_hot', 'mean_center', 'l2_normalize', 'smooth_clip', 'embedding'], ] params = { 'n_hidden_layers': rng.integers(2, 4, endpoint=True), 'hidden_sizes': 'rectangular', 'hidden_width': rng.choice([256, 384, 
512]), 'p_drop': rng.uniform(0.0, 0.5), 'act': 'mish', 'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))), 'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))), 'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))), 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))), 'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))), 'ls_eps_sched': 'coslog4', 'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'p_drop_sched': 'flat_cos', 'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))), 'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))), 'use_ls': rng.choice([False, True]), # use label smoothing (will be ignored for regression) 'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()), 'embedding_size': int(rng.choice([4, 8, 16])), 'tfms': tfms_list[int(rng.choice([0, 1]))], } # print(f'{params=}') default_params = DefaultParams.RealMLP_TD_CLASS if self.is_classification else DefaultParams.RealMLP_TD_REG return utils.join_dicts(default_params, params) class RandomParamsNNAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): # model_idx is used for seeding along with the seed given in fit(), # so we can do HPO by combining multiple RandomParamsNNAlgInterface objects with different model_idx values super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': raise NotImplementedError('Refit is not fully implemented...') # return RandomParamsNNAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, # **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int): # this is also set in get_required_resources, but okay if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [RealMLPParamSampler(is_classification, **self.config).sample_params(hparam_seed)] # todo: need epoch for refit params = utils.join_dicts(self.config, self.fit_params[0], self.config.get('override_params', dict()) or dict()) # params = utils.update_dict(self.fit_params[0], self.config) if 'n_epochs' in self.config: params['n_epochs'] = self.config['n_epochs'] self.fit_params[0] = params return NNAlgInterface(fit_params=None, **params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed) logger.log(1, f'{self.fit_params=}') self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) self.fit_params[0]['sub_fit_params'] = self.alg_interface.fit_params[0] def predict(self, ds: DictDataset) -> torch.Tensor: self.alg_interface.set_current_predict_params(self.get_current_predict_params_name()) return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0]) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) def get_available_predict_params(self) -> Dict[str, Dict[str, 
Any]]: return NNAlgInterface(**self.config).get_available_predict_params() def to(self, device: str) -> None: self.alg_interface.to(device) # class NNHyperoptAlgInterface(OptAlgInterface): # def __init__(self, space=None, n_hyperopt_steps: int = 50, **config): # from hyperopt import hp # default_config = {} # max_config = {} # if space is None: # space = { # 'num_emb_type': hp.choice(['none', 'pl-densenet', 'plr']), # 'add_front_scale': hp.choice([True, False]), # 'lr': hp.loguniform([2e-2, 1.5e-1]), # 'p_drop': hp.choice([0.0, 0.15, 0.3, 0.45]), # 'hidden_sizes': hp.choice([[256]*3, [512]]), # 'act': hp.choice(['selu', 'mish', 'relu']), # 'ls_eps': hp.choice([0.0, 1.0]) # } # # todo: have conversion function? # config = utils.update_dict(default_config, config) # super().__init__(hyper_optimizer=HyperoptOptimizer(space=space, fixed_params=dict(), # n_hyperopt_steps=n_hyperopt_steps, # **config), # max_resource_config=utils.join_dicts(config, max_config), # **config) # # def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: # return NNAlgInterface(**config) ================================================ FILE: pytabkit/models/alg_interfaces/other_interfaces.py ================================================ import os from typing import Any, List, Optional import numpy as np import pandas as pd import torch from sklearn.compose import TransformedTargetRegressor from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, \ GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor from sklearn.neural_network import MLPClassifier, MLPRegressor from sklearn.preprocessing import StandardScaler from pytabkit.models.alg_interfaces.alg_interfaces import RandomParamsAlgInterface from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import SklearnSubSplitInterface, SingleSplitWrapperAlgInterface from pytabkit.models import utils from pytabkit.models.data.data import DictDataset class RFSubSplitInterface(SklearnSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None), ('criterion', None), ('max_depth', None), ('min_samples_split', None), ('max_features', None), ('min_samples_leaf', None), ('bootstrap', None), ('min_impurity_decrease', None), ('min_weight_fraction_leaf', None), ('max_leaf_nodes', None), ('max_samples', None), ('n_jobs', ['n_jobs', 'n_threads'], n_threads), ('verbose', ['verbose', 'verbosity'])] params = utils.extract_params(self.config, params_config) if not params.get('bootstrap', True) and 'max_samples' in params: del params['max_samples'] if self.n_classes > 0: return RandomForestClassifier(random_state=seed, **params) else: train_metric_name = self.config.get('train_metric_name', None) if train_metric_name == 'mse': params['criterion'] = 'squared_error' # is the default anyway elif train_metric_name == 'mae': params['criterion'] = 'absolute_error' elif train_metric_name is not None: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') reg = RandomForestRegressor(random_state=seed, **params) if self.config.get('standardize_target', False): reg = TransformedTargetRegressor(reg, transformer=StandardScaler()) return reg def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], 
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class RandomParamsRFAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config.get('hpo_space_name', 'grinsztajn')
        if hpo_space_name == 'grinsztajn':
            # adapted from Grinsztajn et al. (2022)
            space = {
                'n_estimators': 250,
                'max_depth': rng.choice([None, 2, 3, 4], p=[0.7, 0.1, 0.1, 0.1]),
                'criterion': rng.choice(['gini', 'entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': rng.choice([2, 3], p=[0.95, 0.05]),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(50.5)))),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, 0.01, 0.02, 0.05], p=[0.85, 0.05, 0.05, 0.05]),
                'tfms': ['one_hot'],
            }
        elif hpo_space_name == 'large-v1':
            space = {
                'n_estimators': 300,
                # this wasn't used in the experiments
                # 'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 2, 3, 4, 6, 8, 12, 16]),
                'criterion': rng.choice(['gini', 'entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(128.0)))),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-1)))]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v2':
            # large-v1 but reduced max_depth, criterion, min_samples_leaf, min_impurity_decrease
            # added max_leaf_nodes back in
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v3':
            # large-v2 but not tuning min_impurity_decrease, reduced max_depth, reduced min_samples_split,
            # only 100 estimators
            space = {
                'n_estimators': 100,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v4':
            # large-v2 but only ordinal encoding
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v5':
            # large-v3 but with 300 estimators and only ordinal encoding
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v6':
            # large-v4 but only bootstrap=True
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v7':
            # large-v6 but not tuning max_leaf_nodes
            space = {
                'n_estimators': 300,
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error', 'absolute_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v8':
            # large-v4 but not tuning max_leaf_nodes, not allowing absolute_error
            space = {
                'n_estimators': 300,
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v9':
            # large-v8 but tuning max_leaf_nodes again
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v10':
            # large-v9 but not tuning min_impurity_decrease
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v11':
            # large-v9 but tuning one-hot encoding
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 12, 16]),
                'criterion': rng.choice(['entropy']) if is_classification else rng.choice(['squared_error']),
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0)))),
                'min_samples_leaf': 1,
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(5e-3)))]),
                'bootstrap': rng.choice([True, False]),
                'tfms': [['one_hot'], ['ordinal_encoding']][rng.integers(0, 1, endpoint=True)],
            }
        elif hpo_space_name == 'large-v12':
            # very large space like large-v1 but a bit different
            # only 50 estimators -> use with bagging
            space = {
                'n_estimators': 50,
                'max_depth': rng.choice([6, 8, 12, 16, 20]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.choice(['sqrt', 'log2', 0.2, 0.4, 0.6, 0.8, None]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(64.0)))),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(1e-1)))]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v13':
            # reduced version of large-v12 based on talent-reg-small
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_depth': rng.choice([16, 20]),
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-3), np.log(1e-1)))]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v14':
            # reduced version of large-v13 based on talent-reg-small
            # changed max_features, removed max_depth, changed min_impurity_decrease
            # removed tuning max_samples since it doesn't seem to do much?
            # this doesn't perform very well (target was not standardized for regression)
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.uniform(0.2, 0.9),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
            }
        elif hpo_space_name == 'large-v15':
            # large-v14 but with standardized target
            # better
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.uniform(0.2, 0.9),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v16':
            # large-v15 but don't tune min_impurity_decrease. Also go back to old max_features
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v17':
            # large-v16 but with tuning max_samples (wasn't used)
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v18':
            # large-v16 but with max_depth limit (equivalent to large-v13 without tuning min_impurity_decrease)
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                # 'max_samples': rng.uniform(0.4, 1.0),  # this was accidentally not used
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v19':
            # large-v18 but with tuning max_samples
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v20':
            # large-v19 but with tuning min_impurity_decrease, with 300 estimators, a few more max_depth options
            space = {
                'n_estimators': 300,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([12, 16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(5e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v21':
            # large-v20 but with different max_depth, min_impurity_decrease, and 50 estimators
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v22':
            # large-v21 but without bootstrap=False
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v23':
            # large-v21 but with 100 estimators
            space = {
                'n_estimators': 100,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v24':
            # large-v21 but without tuning min_impurity_decrease
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v25':
            # large-v21 but with different min_impurity_decrease space
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': 1,
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v26':
            # large-v25 but with tuning min_samples_leaf
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'max_depth': rng.choice([16, 20, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(4.5)))),
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v27':
            # inspired by XT but with both bootstrap options
            space = {
                'n_estimators': 50,
                'max_features': ['sqrt', 0.5, 0.75, 1.0][rng.integers(4)],
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': rng.choice([0.0, np.exp(rng.uniform(np.log(1e-5), np.log(1e-3)))]),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'tabrepo1':
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'min_samples_leaf': rng.choice([1, 2, 3, 4, 5, 10, 20, 40, 80]),
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'tfms': ['one_hot'],
            }
        elif hpo_space_name == 'tabrepo1-ordinal':
            space = {
                'n_estimators': 300,
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'min_samples_leaf': rng.choice([1, 2, 3, 4, 5, 10, 20, 40, 80]),
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'tfms': ['ordinal_encoding'],  # failed to fix it
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([RFSubSplitInterface(**config) for i in range(n_tv_splits)])


class ExtraTreesSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [('n_estimators', None), ('criterion', None), ('max_depth', None),
                         ('min_samples_split', None), ('max_features', None), ('min_samples_leaf', None),
                         ('bootstrap', None), ('min_impurity_decrease', None), ('min_weight_fraction_leaf', None),
                         ('max_leaf_nodes', None), ('max_samples', None),
                         ('n_jobs', ['n_jobs', 'n_threads'], n_threads), ('verbose', ['verbose', 'verbosity'])]
        params = utils.extract_params(self.config, params_config)
        if not params.get('bootstrap', True) and 'max_samples' in params:
            del params['max_samples']
        if self.n_classes > 0:
            return ExtraTreesClassifier(random_state=seed, **params)
        else:
            train_metric_name = self.config.get('train_metric_name', None)
            if train_metric_name == 'mse':
                params['criterion'] = 'squared_error'  # is the default anyway
            elif train_metric_name == 'mae':
                params['criterion'] = 'absolute_error'
            elif train_metric_name is not None:
                raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!')
            reg = ExtraTreesRegressor(random_state=seed, **params)
            if self.config.get('standardize_target', False):
                reg = TransformedTargetRegressor(reg, transformer=StandardScaler())
            return reg

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)
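
# Reproducibility note for the _sample_params methods in this file: all random draws come
# from np.random.default_rng(seed), so a (hpo_space_name, seed) pair always maps to the
# same hyperparameter configuration. A minimal standalone sketch of the same pattern
# (names hypothetical, not part of the library):
#
# import numpy as np
#
# def sample_config(seed: int) -> dict:
#     rng = np.random.default_rng(seed)
#     return {'max_depth': rng.choice([None, 12, 16]),
#             'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(8.0))))}
#
# assert sample_config(3) == sample_config(3)  # same seed -> same config
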
class RandomParamsExtraTreesAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config['hpo_space_name']
        if hpo_space_name == 'large-v1':
            space = {
                'n_estimators': 50,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'max_depth': rng.choice([None, 8, 12, 16]),
                'criterion': rng.choice(['gini', 'entropy']) if is_classification else 'squared_error',
                'max_features': rng.choice(['sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(8.0)))),
                'max_samples': float(rng.uniform(0.4, 1.0)),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v2':
            # large-v1 shrunken
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.6), np.log(4.5)))),
                'bootstrap': rng.choice([True, False]),
                'max_samples': rng.uniform(0.4, 1.0),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v3':
            # large-v2 shrunken
            # very good for classification
            # tuning of max_features may be unnecessary, default might work just as well
            # maybe could go even larger with min_samples_split
            space = {
                'n_estimators': 50,
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                # 'max_samples': rng.uniform(0.4, 1.0),  # irrelevant without bootstrap
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),  # could decrease upper bound to 5e-4
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v4':
            # large space for regression tests
            space = {
                'n_estimators': 50,
                'max_leaf_nodes': round(np.exp(rng.uniform(np.log(500), np.log(100_000)))),
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': rng.choice([0.4, 0.6, 0.8, None]),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                'max_samples': float(rng.uniform(0.4, 1.0)),
                'bootstrap': rng.choice([True, False]),
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v5':
            # shrunken version of large-v4 for regression
            # min_impurity_decrease could be shrunk more
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': float(rng.uniform(0.5, 1.0)),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                # 'max_samples': float(rng.uniform(0.4, 1.0)),  # irrelevant without bootstrap
                'bootstrap': False,
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-6), np.log(5e-4))),
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v6':
            # large-v5 without tuning min_impurity_decrease
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': float(rng.uniform(0.5, 1.0)),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                # 'max_samples': float(rng.uniform(0.4, 1.0)),  # irrelevant without bootstrap
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v7':
            # large-v6 with tuning max_leaf_nodes
            # doesn't help
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'max_features': float(rng.uniform(0.5, 1.0)),
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                # 'max_samples': float(rng.uniform(0.4, 1.0)),  # irrelevant without bootstrap
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v8':
            # large-v6 but with different tuning space for max_features
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v9':
            # large-v8 but tuning min_samples_leaf
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v10':
            # large-v9 but without tuning min_samples_split
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': 2,
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(1.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v11':
            # large-v10 but with fixed tuning space for min_samples_leaf
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': 2,
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v12':
            # large-v9 but with fixed tuning space for min_samples_leaf
            space = {
                'n_estimators': 50,
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(32.0)))),
                'min_samples_leaf': round(np.exp(rng.uniform(np.log(0.5), np.log(8.5)))),
                'bootstrap': False,
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v13':
            # large-v3 with different max_features space
            space = {
                'n_estimators': 50,
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                # 'max_samples': rng.uniform(0.4, 1.0),  # irrelevant without bootstrap
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),  # could decrease upper bound to 5e-4
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'large-v14':
            # large-v3 with different max_features space
            space = {
                'n_estimators': 50,
                'max_features': ['sqrt', 0.5, 0.75, 1.0][rng.integers(4)],
                'criterion': 'entropy' if is_classification else 'squared_error',
                'min_samples_split': round(np.exp(rng.uniform(np.log(1.5), np.log(16.0)))),
                'min_samples_leaf': 1,
                'bootstrap': False,
                # 'max_samples': rng.uniform(0.4, 1.0),  # irrelevant without bootstrap
                'min_impurity_decrease': np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))),  # could decrease upper bound to 5e-4
                'tfms': ['ordinal_encoding'],
                'standardize_target': True,
            }
        elif hpo_space_name == 'tabrepo1-mod':
            space = {
                'n_estimators': 50,
                # not completely sure if tabrepo1 uses entropy
                'criterion': 'entropy' if is_classification else 'squared_error',
                'max_leaf_nodes': rng.integers(5000, 50000, endpoint=True),
                'min_samples_leaf': rng.choice([1, 2, 3, 4, 5, 10, 20, 40, 80]),
                'max_features': ['sqrt', 'log2', 0.5, 0.75, 1.0][rng.integers(5)],
                'tfms': ['ordinal_encoding'],
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([ExtraTreesSubSplitInterface(**config) for i in range(n_tv_splits)])


class GBTSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [('n_estimators', None), ('learning_rate', None), ('subsample', None), ('max_depth', None),
                         ('verbose', ['verbose', 'verbosity'])]
        params = utils.extract_params(self.config, params_config)
        if self.n_classes > 0:
            return GradientBoostingClassifier(random_state=seed, **params)
        else:
            train_metric_name = self.config.get('train_metric_name', 'mse')
            if train_metric_name == 'mse':
                pass  # is the default anyway
            elif train_metric_name.startswith('pinball('):
                quantile = float(train_metric_name[len('pinball('):-1])
                params['loss'] = 'quantile'
                params['alpha'] = quantile
            elif train_metric_name == 'mae':
                params['loss'] = 'absolute_error'
            else:
                raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!')
            return GradientBoostingRegressor(random_state=seed, **params)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)
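
# Train-metric convention used by GBTSubSplitInterface above: a train_metric_name like
# 'pinball(0.9)' is parsed into sklearn's quantile loss with alpha=0.9. A small sketch
# of the same string convention (variable names hypothetical):
#
# name = 'pinball(0.9)'
# if name.startswith('pinball('):
#     alpha = float(name[len('pinball('):-1])  # -> 0.9
#     params = {'loss': 'quantile', 'alpha': alpha}
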
class KNNSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [('n_neighbors', None), ('weights', None), ('p', None),
                         ('n_jobs', ['n_jobs', 'n_threads'], n_threads)]
        params = utils.extract_params(self.config, params_config)
        if self.n_classes > 0:
            from sklearn.neighbors import KNeighborsClassifier
            return KNeighborsClassifier(**params)
        else:
            from sklearn.neighbors import KNeighborsRegressor
            return KNeighborsRegressor(**params)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class RandomParamsKNNAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config['hpo_space_name']
        if hpo_space_name == 'v1':
            space = {
                'n_neighbors': int(np.exp(rng.uniform(np.log(1.0), np.log(101.0)))),
                'weights': rng.choice(['uniform', 'distance']),
                # 'p': np.exp(rng.uniform(np.log(0.2), np.log(8.0))),  # values outside of 1 and 2 can be very slow
                'p': rng.choice([1, 2]),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'tabrepo1':
            space = {
                'n_neighbors': rng.choice([3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 20, 30, 40, 50]),
                'weights': rng.choice(['uniform', 'distance']),
                'p': rng.choice([1, 2]),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([KNNSubSplitInterface(**config) for i in range(n_tv_splits)])


class LinearModelSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [
            # ('l1_ratio', None),
            ('fit_intercept', None),
            # ('n_jobs', ['n_jobs', 'n_threads'], n_threads)
        ]
        penalty = self.config.get('penalty', 'l2')
        n_jobs = self.config.get('n_jobs', self.config.get('n_threads', None))
        params = utils.extract_params(self.config, params_config)
        l1_ratio = self.config.get('l1_ratio', 0.5)
        C = self.config.get('C', 1.0)
        if self.n_classes > 0:
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression(random_state=seed, penalty=penalty,
                                      solver='lbfgs' if penalty == 'l2' else 'saga', C=C,
                                      l1_ratio=l1_ratio if penalty == 'elasticnet' else None,
                                      n_jobs=n_jobs, **params)
            # return LogisticRegression(random_state=seed, penalty='l2', solver='newton-cholesky', C=C, **params)
        else:
            alpha = self.config.get('alpha', 1 / C)
            from sklearn.linear_model import Ridge, Lasso, ElasticNet
            if penalty == 'l2':
                return Ridge(random_state=seed, alpha=alpha, **params)
            elif penalty == 'l1':
                return Lasso(random_state=seed, alpha=alpha, **params)
            elif penalty == 'elasticnet':
                return ElasticNet(random_state=seed, alpha=alpha, l1_ratio=l1_ratio, **params)
            else:
                raise ValueError(f'Unknown penalty: {penalty}')
            # from sklearn.linear_model import ElasticNet
            # return ElasticNet(random_state=seed, alpha=alpha, **params)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100, n_threads=1), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class RandomParamsLinearModelAlgInterface(RandomParamsAlgInterface):
    def _sample_params(self, is_classification: bool, seed: int, n_train: int):
        rng = np.random.default_rng(seed)
        hpo_space_name = self.config['hpo_space_name']
        if hpo_space_name == 'v1':
            space = {
                'penalty': rng.choice(['l1', 'l2', 'elasticnet']),
                'l1_ratio': rng.uniform(0.01, 1.0),
                'C': np.exp(rng.uniform(np.log(1e-2), np.log(1e7))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'v2':
            # smaller version of v1
            space = {
                'penalty': rng.choice(['l1', 'l2', 'elasticnet']),
                'l1_ratio': rng.uniform(0.01, 0.8),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e5))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'v3':
            # smaller version of v1
            space = {
                'penalty': rng.choice(['l1', 'l2', 'elasticnet']),
                'l1_ratio': rng.uniform(0.01, 0.5),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e4))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'v4':
            # smaller version of v1
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e5))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'tabrepo1':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['mean_center', 'l2_normalize', 'one_hot'],
            }
        elif hpo_space_name == 'tabrepo1-rssc3':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['median_center', 'robust_scale', 'smooth_clip', 'one_hot'],
                'smooth_clip_max_abs_value': 3,
            }
        elif hpo_space_name == 'tabrepo1-rssc5':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['median_center', 'robust_scale', 'smooth_clip', 'one_hot'],
                'smooth_clip_max_abs_value': 5,
            }
        elif hpo_space_name == 'tabrepo1-rssc10':
            space = {
                'penalty': rng.choice(['l1', 'l2']),
                'C': np.exp(rng.uniform(np.log(1e-1), np.log(1e3))),
                'tfms': ['median_center', 'robust_scale', 'smooth_clip', 'one_hot'],
                'smooth_clip_max_abs_value': 10,
            }
        else:
            raise ValueError(f'Unknown hpo_space_name: {hpo_space_name}')
        return space

    def _create_interface_from_config(self, n_tv_splits: int, **config):
        return SingleSplitWrapperAlgInterface([LinearModelSubSplitInterface(**config) for i in range(n_tv_splits)])
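
# Regularization convention in LinearModelSubSplitInterface above: classification uses
# LogisticRegression's C (larger C = weaker penalty), while the regression estimators
# take alpha, defaulting to alpha = 1 / C when only C is given. This is the mapping used
# here, not an exact theoretical equivalence. Sketch with a hypothetical C:
#
# C = 10.0
# alpha = 1 / C  # 0.1; passed to Ridge/Lasso/ElasticNet unless 'alpha' is set explicitly
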
class SklearnMLPSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = []  # todo: add parameters
        params = utils.extract_params(self.config, params_config)
        if self.n_classes > 0:
            return MLPClassifier(random_state=seed, **params)
        else:
            reg = MLPRegressor(random_state=seed, **params)
            return TransformedTargetRegressor(regressor=reg, transformer=StandardScaler())

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_onehot_size_gb': 10.0, '1/n_threads*n_samples': 4e-5}
        ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)


class KANSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        import imodelsx.kan
        params_config = []  # todo: add parameters
        params = utils.extract_params(self.config, params_config)
        params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0]
        if self.n_classes > 0:
            return imodelsx.kan.KANClassifier(**params)
        else:
            reg = imodelsx.kan.KANRegressor(**params)
            return TransformedTargetRegressor(regressor=reg, transformer=StandardScaler())

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config)
        time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_samples': 8e-5}
        ram_params = {'': 0.15, 'ds_onehot_size_gb': 1.5}
        gpu_ram_params = {'': 0.4, 'n_features': 1e-4}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params,
                               cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02)  # , gpu_ram_params)
        return rc.get_required_resources(ds)

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        n_samples = len(x_df)
        train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
        train_mask[val_idxs] = False
        # give train+valid to KAN since it does its own train+valid split
        # (even though that one uses 20% valid instead of 25%)
        # x_df = x_df.iloc[train_mask, :]
        x_np = x_df.to_numpy()
        # y = y[train_mask]
        if cat_col_names is not None and len(cat_col_names) > 0:
            self.model.fit(x_np, y, **{self._get_cat_indexes_arg_name(): cat_col_names})
        else:
            self.model.fit(x_np, y)

    def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        return self.model.predict(x_df.to_numpy())

    def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        return self.model.predict_proba(x_df.to_numpy())
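
# Target-standardization pattern shared by several regression interfaces above: wrapping
# the estimator in TransformedTargetRegressor(transformer=StandardScaler()) fits on
# standardized targets and inverts the transform at predict time. A minimal
# self-contained sketch (data is synthetic):
#
# import numpy as np
# from sklearn.compose import TransformedTargetRegressor
# from sklearn.linear_model import Ridge
# from sklearn.preprocessing import StandardScaler
#
# X, y = np.random.randn(100, 4), 300.0 + 50.0 * np.random.randn(100)
# reg = TransformedTargetRegressor(regressor=Ridge(), transformer=StandardScaler())
# reg.fit(X, y)  # predictions come back on the original target scale
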
""" def __init__(self, **config): self.config = config def fit(self, X, y, X_val, y_val, cat_features: Optional[List[str]] = None): # params_config = [] # todo: add parameters # params = utils.extract_params(self.config, params_config) params = { 'depth': 5, # tree depth 'n_estimators': 2048, # number of estimators / trees 'learning_rate_weights': 0.005, # learning rate for leaf weights 'learning_rate_index': 0.01, # learning rate for split indices 'learning_rate_values': 0.01, # learning rate for split values 'learning_rate_leaf': 0.01, # learning rate for leaves (logits) 'optimizer': 'adam', # optimizer 'cosine_decay_steps': 0, # decay steps for lr schedule (CosineDecayRestarts) # loss function (default 'crossentropy' for binary & multi-class classification and 'mse' for regression) 'focal_loss': False, # use focal loss {True, False} 'temperature': 0.0, # temperature for stochastic re-weighted GD (0.0, 1.0) 'from_logits': True, # use logits for weighting {True, False} 'use_class_weights': True, # use class weights for training {True, False} 'dropout': 0.0, # dropout rate (here, dropout randomly disables individual estimators of the ensemble during training) 'selected_variables': 0.8, # feature subset percentage (0.0, 1.0) 'data_subset_fraction': 1.0, # data subset percentage (0.0, 1.0) } args = { 'epochs': 1, # number of epochs for training 'early_stopping_epochs': 25, # patience for early stopping (best weights are restored) 'batch_size': 64, # batch size for training 'random_seed': 42, 'verbose': 1, } if issubclass(y.dtype.type, np.floating): print(f'regression') self.is_regression_ = True params['loss'] = 'mse' args['objective'] = 'regression' elif len(np.unique(y)) <= 2: self.is_regression_ = False params['loss'] = 'crossentropy' args['objective'] = 'binary' else: self.is_regression_ = False params['loss'] = 'crossentropy' args['objective'] = 'classification' if cat_features is not None: args['cat_idx'] = [X.columns.get_loc(name) for name in cat_features] else: args['cat_idx'] = [] device = self.config.get('device', 'cpu') if device.startswith('cuda'): gpu_idx_str = device[len('cuda:'):] os.environ['CUDA_VISIBLE_DEVICES'] = gpu_idx_str os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' from GRANDE import GRANDE self.model_ = GRANDE(params=params, args=args) self.model_.fit(X.copy(), y, X_val.copy(), y_val) def predict_proba(self, X): return self.model_.predict(X) def predict(self, X): y_pred = self.model_.predict(X) if not self.is_regression_: return np.argmax(y_pred, axis=1) else: return y_pred class GrandeSubSplitInterface(SklearnSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: model = GrandeWrapper(**self.config, device='cpu' if len(gpu_devices) == 0 else gpu_devices[0]) # if self.n_classes == 0: # doesn't work with validation sets anyway # model = TransformedTargetRegressor(regressor=model, transformer=StandardScaler()) return model def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100), self.config) time_params = {'': 0.5, 'ds_onehot_size_gb': 10.0, '1/n_threads*n_samples': 4e-5} ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0} rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params) return rc.get_required_resources(ds) def _fit_sklearn(self, x_df: pd.DataFrame, y: 
class GrandeSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        model = GrandeWrapper(**self.config, device='cpu' if len(gpu_devices) == 0 else gpu_devices[0])
        # if self.n_classes == 0:  # doesn't work with validation sets anyway
        #     model = TransformedTargetRegressor(regressor=model, transformer=StandardScaler())
        return model

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_onehot_size_gb': 10.0, '1/n_threads*n_samples': 4e-5}
        ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params)
        return rc.get_required_resources(ds)

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        n_samples = len(x_df)
        train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
        train_mask[val_idxs] = False
        x_val_df = x_df.iloc[~train_mask, :]
        y_val_df = y[~train_mask]
        x_df = x_df.iloc[train_mask, :]
        y = y[train_mask]
        if cat_col_names is not None and len(cat_col_names) > 0:
            self.model.fit(x_df, y, x_val_df, y_val_df, cat_features=cat_col_names)
        else:
            self.model.fit(x_df, y, x_val_df, y_val_df)


class TabPFN2SubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [
            # ('n_jobs', ['n_jobs', 'n_threads'], n_threads),
            ('softmax_temperature', None),
            ('average_before_softmax', None),
            ('inference_precision', None),
            ('fit_mode', None),
            ('model_path', None),
        ]
        params = utils.extract_params(self.config, params_config)
        if self.config.get('use_float32', False):
            params['inference_precision'] = torch.float32
        # print(f'{gpu_devices=}')
        if self.n_classes > 0:
            from tabpfn import TabPFNClassifier
            return TabPFNClassifier(random_state=seed,
                                    device=gpu_devices[0] if len(gpu_devices) > 0 else 'cpu',
                                    # device='cuda' if len(gpu_devices) > 0 else 'cpu',
                                    ignore_pretraining_limits=True,
                                    **params)
        else:
            from tabpfn import TabPFNRegressor
            return TabPFNRegressor(random_state=seed,
                                   device=gpu_devices[0] if len(gpu_devices) > 0 else 'cpu',
                                   # device='cuda' if len(gpu_devices) > 0 else 'cpu',
                                   ignore_pretraining_limits=True,
                                   **params)

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        if not self.config.get('fit_on_valid', False):
            n_samples = len(x_df)
            train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
            train_mask[val_idxs] = False
            x_df = x_df.iloc[train_mask, :]
            y = y[train_mask]
        # don't provide a categorical indicator, it should work like this as well
        self.model.fit(x_df, y)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params,
                               n_gpus=1, gpu_usage=1.0, gpu_ram_params={'': 10.0})
        return rc.get_required_resources(ds)
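
# Validation-split convention of the _fit_sklearn overrides in this file: val_idxs indexes
# into the train+val rows, and a boolean mask recovers the training part. Standalone
# sketch (sizes hypothetical):
#
# import numpy as np
#
# n_samples, val_idxs = 10, np.array([7, 8, 9])
# train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
# train_mask[val_idxs] = False
# # rows where train_mask is True are training rows; ~train_mask selects validation rows
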
class TabICLSubSplitInterface(SklearnSubSplitInterface):
    def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any:
        params_config = [
            # ('n_jobs', ['n_jobs', 'n_threads'], n_threads),
            ('n_estimators', None),
            ('softmax_temperature', None),
            ('average_logits', None),
            ('use_amp', None),
            ('batch_size', None),
            ('model_path', None),
            ('allow_auto_download', None),
            ('norm_methods', None)
        ]
        params = utils.extract_params(self.config, params_config)
        if self.config.get('use_float32', False):
            params['inference_precision'] = torch.float32
        # print(f'{gpu_devices=}')
        if self.n_classes > 0:
            if self.config.get('use_tabiclex', False):
                from tabiclv2 import TabICLClassifier
            else:
                from tabicl import TabICLClassifier
            return TabICLClassifier(random_state=seed,
                                    device=gpu_devices[0] if len(gpu_devices) > 0 else 'cpu',
                                    **params)
        else:
            raise ValueError('TabICL for regression does not exist')

    def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
                     cat_col_names: Optional[List[str]] = None):
        # by default, we ignore the validation set since most sklearn methods do not support it
        if not self.config.get('fit_on_valid', False):
            n_samples = len(x_df)
            train_mask = np.ones(shape=(n_samples,), dtype=np.bool_)
            train_mask[val_idxs] = False
            x_df = x_df.iloc[train_mask, :]
            y = y[train_mask]
        x_df = x_df.copy()
        if self.config.get('add_fingerprint_feature', False):
            x_df['__fingerprint_feature'] = np.random.randn(len(x_df))
        if self.config.get('mirror_numerical_features', False):
            self.float_cols_ = x_df.select_dtypes(include=['float']).columns
            print(f'{len(self.float_cols_)=}')
            # Generate random signs (+1 or -1) for each column
            self.signs_ = np.random.choice([-1, 1], size=len(self.float_cols_))
            # Multiply each float column by its random sign
            x_df.loc[:, self.float_cols_] = x_df.loc[:, self.float_cols_] * self.signs_
        # don't provide a categorical indicator, it should work like this as well
        self.model.fit(x_df, y)

    def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        x_df = x_df.copy()
        if self.config.get('add_fingerprint_feature', False):
            x_df['__fingerprint_feature'] = np.random.randn(len(x_df))
        if self.config.get('mirror_numerical_features', False):
            x_df.loc[:, self.float_cols_] = x_df.loc[:, self.float_cols_] * self.signs_
        return super()._predict_sklearn(x_df)

    def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray:
        x_df = x_df.copy()
        if self.config.get('add_fingerprint_feature', False):
            x_df['__fingerprint_feature'] = np.random.randn(len(x_df))
        if self.config.get('mirror_numerical_features', False):
            x_df.loc[:, self.float_cols_] = x_df.loc[:, self.float_cols_] * self.signs_
        return super()._predict_proba_sklearn(x_df)

    def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int],
                               n_train: int) -> RequiredResources:
        assert n_cv == 1
        assert n_refit == 0
        assert n_splits == 1
        updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
        time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
        ram_params = {'': 0.5}
        rc = ResourcePredictor(config=updated_config, time_params=time_params, cpu_ram_params=ram_params,
                               n_gpus=1, gpu_usage=0.999, gpu_ram_params={'': 10.0})
        return rc.get_required_resources(ds)



================================================
FILE: pytabkit/models/alg_interfaces/resource_computation.py
================================================
import numbers
import time
from collections.abc import Callable
from typing import Dict, Union, List, Any, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from pytabkit.models.alg_interfaces.base import RequiredResources
from pytabkit.models import utils
from pytabkit.models.data.data import DictDataset, TensorInfo
from pytabkit.models.nn_models.models import PreprocessingFactory
from pytabkit.models.training.metrics import pinball_loss

# This file contains code to predict required resources (time and RAM) of a ML model on a dataset.
# There are two components:
# - Computing the predicted resources based on a linear model on raw and product features
# - Fitting the linear model coefficients based on evaluations on random parameters.
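
# The model form used throughout this file: a predicted quantity (time or RAM) is a sum of
# coefficients times products of raw features, where a key like '1/n_threads*n_samples'
# denotes the product of those raw features and '' denotes a constant term. Sketch with
# hypothetical coefficients:
#
# raw = {'1/n_threads': 0.25, 'n_samples': 1e5, 'n_estimators': 100.0}
# params = {'': 0.5, '1/n_threads*n_samples*n_estimators': 4e-8}
# time_s = sum(coef * np.prod([raw[f] for f in key.split('*') if f != ''])
#              for key, coef in params.items())  # -> 0.5 + 4e-8 * 0.25 * 1e5 * 100 = 0.6
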
def get_resource_features(config: Dict, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int,
                          **extra_params) -> Dict[str, float]:
    """
    Extracts features that can be used in a linear model for predicting resource usage.
    """
    # in hyperopt method also on number of steps (for time estimation)
    tensor_infos = ds.tensor_infos
    n_samples = ds.n_samples
    n_classes = tensor_infos['y'].get_cat_size_product()
    prep_factory = PreprocessingFactory(**config)
    onehot_factory = PreprocessingFactory(tfms=['one_hot'])
    fitter, out_tensor_infos = prep_factory.create_transform(tensor_infos)
    _, onehot_tensor_infos = onehot_factory.create_transform(tensor_infos)
    n_features = sum([ti.get_n_features() for key, ti in out_tensor_infos.items() if key in ['x_cont', 'x_cat']])
    ds_prep = DictDataset(tensors=None, tensor_infos=out_tensor_infos, device=ds.device, n_samples=n_samples)
    ds_onehot = DictDataset(tensors=None, tensor_infos=onehot_tensor_infos, device=ds.device, n_samples=n_samples)
    cat_size_sum = 0 if 'x_cat' not in out_tensor_infos else out_tensor_infos['x_cat'].get_cat_sizes().sum().item()
    n_cat = ds.tensor_infos['x_cat'].get_n_features()
    ds_size_gb = ds.get_size_gb()
    ds_prep_size_gb = ds_prep.get_size_gb()
    ds_onehot_size_gb = ds_onehot.get_size_gb()
    n_tree_repeats = 1 if n_classes <= 2 else n_classes
    features = dict()
    features['1/n_threads'] = 1 / config.get('n_threads', 1)
    features['ds_size_gb'] = ds_size_gb
    features['ds_prep_size_gb'] = ds_prep_size_gb
    features['ds_onehot_size_gb'] = ds_onehot_size_gb
    features['n_features'] = n_features
    features['n_samples'] = n_samples
    features['n_tree_repeats'] = n_tree_repeats
    features['n_cv_refit'] = n_cv + n_refit
    features['n_splits'] = n_splits
    max_depth = config.get('max_depth', 6)
    if isinstance(max_depth, numbers.Number):
        features['2_power_maxdepth'] = 2 ** max_depth
    features['log_num_leaves'] = np.log(max(1, config.get('num_leaves', 31)))
    features['cat_size_sum'] = cat_size_sum
    features['n_classes'] = n_classes
    features['n_cat'] = n_cat
    return utils.join_dicts(config, features, extra_params)


def process_resource_features(raw_features: Dict[str, Any], feature_spec: List[str]):
    """
    Adds product features to raw features.

    :param raw_features: Raw feature values
    :param feature_spec: List of strings. Each string should be of the form 'feature_1*...*feature_n',
        using the names of the features whose products should be added
    :return: Returns a dictionary of the raw features along with the newly computed product features.
    """
    results = dict()
    for combination in feature_spec:
        # ignore empty factors
        factors = [factor for factor in combination.split('*') if factor != '']
        value = 1.0
        for factor in factors:
            value *= raw_features[factor]
        results[combination] = value
    return results


def eval_linear_product_model(raw_features: Dict[str, Any], params: Dict[str, float]):
    """
    Computes the "inner product" between the feature dictionaries
    (obtained from raw features and products according to the keys in params).

    :return:
    """
    result = 0.0
    for key, param in params.items():
        # ignore empty factors
        factors = [factor for factor in key.split('*') if factor != '']
        value = 1.0
        for factor in factors:
            value *= raw_features[factor]
        result += param * value
    return result
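
# Usage sketch for the two helpers above (feature values hypothetical):
#
# raw = {'1/n_threads': 0.5, 'n_samples': 1000.0, 'ds_size_gb': 0.1}
# process_resource_features(raw, ['', 'ds_size_gb', '1/n_threads*n_samples'])
# # -> {'': 1.0, 'ds_size_gb': 0.1, '1/n_threads*n_samples': 500.0}
# eval_linear_product_model(raw, {'': 2.0, '1/n_threads*n_samples': 1e-3})
# # -> 2.0 + 1e-3 * 0.5 * 1000.0 = 2.5
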
class FeatureSpec:
    """
    Allows to create a list of product feature names from product and powerset operations etc.
    """

    @staticmethod
    def _listify(spec: Union[List, str]):
        if isinstance(spec, list):
            return spec
        elif isinstance(spec, str):
            return [spec]
        else:
            raise ValueError(f'Unsupported spec type {type(spec)}')

    @staticmethod
    def _product_str(first: str, second: str) -> str:
        if len(first) == 0:
            if len(second) == 0:
                return ''
            else:
                return second
        else:
            if len(second) == 0:
                return first
            else:
                return f'{first}*{second}'

    @staticmethod
    def concat(*feature_specs):
        feature_specs = [FeatureSpec._listify(spec) for spec in feature_specs]
        flattened = [spec for lst in feature_specs for spec in lst]
        return flattened

    @staticmethod
    def product(*feature_specs):
        if len(feature_specs) <= 0:
            raise ValueError()
        elif len(feature_specs) == 1:
            return FeatureSpec._listify(feature_specs[0])
        else:
            first, rest = feature_specs[0], feature_specs[1:]
            first_list = FeatureSpec._listify(first)
            rest_product = FeatureSpec.product(*rest)
            return [FeatureSpec._product_str(first_spec, rest_spec)
                    for first_spec in first_list for rest_spec in rest_product]

    @staticmethod
    def powerset_products(*feature_specs):
        if len(feature_specs) == 0:
            return ['']
        elif len(feature_specs) == 1:
            return FeatureSpec.concat('', feature_specs[0])
        else:
            return FeatureSpec.product(FeatureSpec.concat('', feature_specs[0]),
                                       FeatureSpec.powerset_products(*feature_specs[1:]))


# some code for linear regression with different losses, to estimate coefficients for resource prediction

class NormalizedDataRegressor:
    def __init__(self, sub_regressor):
        self.sub_regressor = sub_regressor

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.x_norms_ = np.sqrt(np.mean(X ** 2, axis=0))
        self.y_norm_ = np.sqrt(np.mean(y ** 2))
        self.sub_regressor.fit(X / self.x_norms_[None, :], y / self.y_norm_)

    def get_coefs(self) -> np.ndarray:
        return self.sub_regressor.get_coefs() * self.y_norm_ / self.x_norms_

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.sub_regressor.predict(X / self.x_norms_) * self.y_norm_


class LogLinearModule(nn.Module):
    def __init__(self, n_features: int):
        super().__init__()
        self.params = nn.Parameter(torch.zeros(n_features, dtype=torch.float64))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ torch.exp(self.params)


class LogLinearRegressor:
    def __init__(self, pessimistic: bool):
        self.pessimistic = pessimistic

    def fit(self, X: np.ndarray, y: np.ndarray):
        x = torch.as_tensor(X, dtype=torch.float64)
        y = torch.as_tensor(y, dtype=torch.float64)
        y_log = torch.log(y + 1e-8)
        n_features = x.shape[1]
        self.model_ = LogLinearModule(n_features=n_features)
        opt = torch.optim.Adam(params=self.model_.parameters(), betas=(0.9, 0.95))
        n_it = 10000
        max_lr = 1e-1
        for i in range(n_it):
            for param_group in opt.param_groups:
                # linearly decaying lr schedule
                param_group['lr'] = (1 - i / n_it) * max_lr
            y_pred_log = torch.log(self.model_(x))
            if self.pessimistic:
                loss = pinball_loss(torch.exp(y_pred_log), y, quantile=0.99)
            else:
                loss = ((y_pred_log - y_log) ** 2).mean()
            if i % (n_it // 10) == 0:
                print(f'Loss: {loss.item():g}')
            loss.backward()
            opt.step()
            opt.zero_grad()

    def get_coefs(self) -> np.ndarray:
        return np.exp(self.model_.params.detach().numpy())
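
# Usage sketch for FeatureSpec above; the outputs follow directly from the definitions:
#
# FeatureSpec.product(['a', 'b'], 'c')     # -> ['a*c', 'b*c']
# FeatureSpec.powerset_products('a', 'b')  # -> ['', 'b', 'a', 'a*b']
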
def fit_resource_factors(data: List[Tuple[Dict[str, float], float]], pessimistic: bool, coef_factor: float = 1.0):
    feature_names = list(data[0][0].keys())
    y = np.asarray([data[i][1] for i in range(len(data))])
    X = np.asarray([[data[i][0][feature_names[j]] for j in range(len(feature_names))] for i in range(len(data))])
    # transform data set to implicitly learn with relative mse
    # ((y_pred - y)/y)^2 = ((X/y)c - 1)^2
    # X = X / y[:, None]
    # y = np.ones_like(y)
    # coefs: np.ndarray = np.linalg.lstsq(X, y)[0]
    # always use pessimistic version
    reg = NormalizedDataRegressor(LogLinearRegressor(pessimistic=True))
    reg.fit(X, y)
    coefs = reg.get_coefs()
    coefs[coefs < 0.0] = 0.0
    if pessimistic:
        # rescale to a bit larger than the maximum on the training set
        y_pred = X @ coefs
        coefs *= coef_factor * np.max(y / y_pred)
    else:
        y_pred = X @ coefs
        coefs *= np.mean(y) / np.mean(y_pred)
        # # align their geometric means
        # coefs *= np.exp(np.mean(np.log(y)) - np.mean(np.log(y_pred)))
    return {name: coef for name, coef in zip(feature_names, coefs)}


class TimeWrapper:
    def __init__(self, f: Callable):
        self.f = f

    def __call__(self):
        start_time = time.time()
        self.f()
        end_time = time.time()
        return end_time - start_time


def create_ds(n_samples: int, n_cont: int, n_cat: int, cat_size: int, n_classes: int) -> DictDataset:
    torch.manual_seed(0)
    x_cont = torch.randn(n_samples, n_cont)
    x_cont_info = TensorInfo(feat_shape=[n_cont])
    x_cat = torch.randint(0, cat_size, size=(n_samples, n_cat))
    x_cat_info = TensorInfo(cat_sizes=[cat_size] * n_cat)
    if n_classes > 0:
        y = torch.randint(0, n_classes, size=(n_samples, 1))
        y_info = TensorInfo(cat_sizes=[n_classes])
    else:
        y = torch.randn(n_samples, 1)
        y_info = TensorInfo(feat_shape=[1])
    return DictDataset(tensors=dict(x_cont=x_cont, x_cat=x_cat, y=y),
                       tensor_infos=dict(x_cont=x_cont_info, x_cat=x_cat_info, y=y_info))


class Sampler:
    def sample(self) -> Union[int, float]:
        raise NotImplementedError()


class UniformSampler(Sampler):
    def __init__(self, low: Union[int, float], high: Union[int, float], log=False, is_int=False):
        self.low = low
        self.high = high
        self.log = log
        self.is_int = is_int

    def sample(self) -> Union[int, float]:
        low = self.low
        high = self.high + 1 if self.is_int else self.high  # in the integer case, make the upper bound inclusive
        if self.log:
            sample = np.exp(np.random.uniform(np.log(low), np.log(high)))
        else:
            sample = np.random.uniform(low, high)
        return int(sample) if self.is_int else sample


# class ChoiceSampler:
#     def __init__(self):


def ds_to_xy(ds: DictDataset) -> Tuple[pd.DataFrame, np.ndarray]:
    X = ds.without_labels().to_df()
    y = ds.tensors['y'].numpy()
    return X, y
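
# Usage sketch for UniformSampler above (results vary per call; seeding the global
# np.random state is the caller's responsibility):
#
# np.random.seed(0)
# UniformSampler(1e-4, 1e-1, log=True).sample()  # float in [1e-4, 1e-1], log-uniform
# UniformSampler(1, 10, is_int=True).sample()    # int in {1, ..., 10}
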
""" # in hyperopt method also on number of steps # moreover it should depend on n_threads, and scaling law should be able to be configured # should allow n_threads to depend on the task_info (based on certain thresholds and possibly scaling law) # include a time_factor depending on the method n_samples = ds.n_samples n_classes = ds.tensor_infos['y'].get_cat_sizes()[0].item() ds = DictDataset(tensors=None, tensor_infos=ds.tensor_infos, device='cpu', n_samples=ds.n_samples) raw_features_prelim = get_resource_features(self.config, ds, n_cv=1, n_refit=0, n_splits=1, **extra_params) n_features = raw_features_prelim['n_features'] if 'n_threads' in self.config: n_threads = self.config['n_threads'] else: # for dionis, it's roughly 100k * 60 * 355 = 2_130_000_000 # for robert it's 10k * 7200 * 10 = 720_000_000 # for indoor_loc_building it's roughly 20k * 520 * 3 = 31_200_000 ds_complexity = n_samples * n_features * n_classes thresh = self.config.get('single_thread_complexity_threshold', 200_000_000) # n_threads = min(self.config.get('max_complexity_threads', 128), 1 + int(ds_complexity / thresh)) n_threads = 1 + int(ds_complexity / thresh) config = utils.update_dict(self.config, dict(n_threads=n_threads)) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1, **extra_params) cpu_ram_gb = eval_linear_product_model(raw_features, self.cpu_ram_params) min_threads_per_gb = self.config.get('min_threads_per_gb', 0.3) n_threads = min(self.config.get('max_n_threads', 8), max(n_threads, int(min_threads_per_gb * cpu_ram_gb))) config = utils.update_dict(self.config, dict(n_threads=n_threads)) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1, **extra_params) time_s = eval_linear_product_model(raw_features, self.time_params) cpu_ram_gb = eval_linear_product_model(raw_features, self.cpu_ram_params) gpu_ram_gb = 0.0 if self.gpu_ram_params is None \ else eval_linear_product_model(raw_features, self.gpu_ram_params) # todo: rough correction to prioritize dionis even if it's run with too many threads, # should use better time estimation model time_s += 0.2 * n_threads * time_s return RequiredResources(time_s=time_s, n_threads=n_threads, cpu_ram_gb=cpu_ram_gb, gpu_ram_gb=gpu_ram_gb, n_gpus=self.n_gpus, gpu_usage=0.0 if self.n_gpus == 0 else self.gpu_usage) # if __name__ == '__main__': # features = FeatureSpec.concat('', 'ds_size_gb', # FeatureSpec.product('n_cv_refit', 'n_splits', # FeatureSpec.powerset_products('1/n_threads', 'n_features', # 'n_samples', # 'n_estimators', 'n_tree_repeats'))) # print(features) # print(f'{len(features)=}') ================================================ FILE: pytabkit/models/alg_interfaces/resource_params.py ================================================ class ResourceParams: # determined using estimate_resource_params.py cb_class_time = {'': 1.1074866100217955, 'ds_size_gb': 6.2276292117813865, 'ds_prep_size_gb': 6.2276292117813865, 'ds_onehot_size_gb': 2.0150542417790342e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 2.214973220043591, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 5.1876881836135774e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 3.035559075362487e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 7.13999461225352e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.000849954711796066, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 3.964226717465322e-12, 
'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 2.3531597535778573e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.2994223618739465e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 1.7778533486675952e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 8.378247017832774e-10, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 4.732937240944653e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 5.508439525827261e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 1.285253358050953e-10, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.629510161784679e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 2.627359007275516e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.133320942151551e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 2.651274595052903e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 3.5098969397077584e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 3.673856471950424e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 6.267867148099078e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 2.3903321610037346e-05, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 4.589892590504275e-14, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 2.3930248376103085e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 8.531748659348444e-11} cb_class_ram = {'': 0.9345478156433287, 'ds_size_gb': 1.804116547565268e-05, 'ds_prep_size_gb': 1.804116547565268e-05, 'ds_onehot_size_gb': 0.012758554137232066, 'n_tree_repeats': 4.077606761367131e-09, 'n_samples': 7.243808011863237e-07, 'n_samples*n_tree_repeats': 1.2285638949747794e-07, 'n_features': 3.8863715875356e-09, 'n_features*n_tree_repeats': 1.1947420344843242e-12, 'n_features*n_samples': 3.767039504566679e-08, 'n_features*n_samples*n_tree_repeats': 7.361290583089635e-16, 'max_depth': 0.004088255941858752, 'max_depth*n_tree_repeats': 1.1590969030724202e-09, 'max_depth*n_samples': 1.4477032736637855e-13, 'max_depth*n_samples*n_tree_repeats': 3.3706497906893135e-13, 'max_depth*n_features': 0.0006014917997388746, 'max_depth*n_features*n_tree_repeats': 1.834250929757216e-13, 'max_depth*n_features*n_samples': 4.241634070711833e-09, 'max_depth*n_features*n_samples*n_tree_repeats': 1.197601653926371e-16, '2_power_maxdepth': 2.576133502607949e-09, '2_power_maxdepth*n_tree_repeats': 2.356086562374563e-05, '2_power_maxdepth*n_samples': 1.3036510550142841e-15, '2_power_maxdepth*n_samples*n_tree_repeats': 1.9523394732422347e-09, '2_power_maxdepth*n_features': 7.810833280259485e-12, '2_power_maxdepth*n_features*n_tree_repeats': 6.14544078331367e-15, '2_power_maxdepth*n_features*n_samples': 1.5863977594541182e-13, '2_power_maxdepth*n_features*n_samples*n_tree_repeats': 2.3171956595374328e-17} xgb_class_time = {'': 1.5850150119193643e-06, 'ds_size_gb': 67.40780781613621, 'ds_prep_size_gb': 67.40780781613621, 'ds_onehot_size_gb': 7.555892653328937e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 3.1700300238387285e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 
0.416152219367654, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 1.7981743709586172e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 3.1379386919643983e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 4.361726529019224e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 4.433229074601185e-11, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 3.348195651528877e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 3.4142887744033714e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 9.578781115632407e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 1.2099510988434818e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 1.7180121037111673e-12, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 7.916471324379998e-14, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 0.007922594727428374, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 8.113108001263881e-12, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 6.758297160216264e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.4232541896951673e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 6.35528424560118e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 8.810550042257941e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 7.369774923827121e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 6.186297360838691e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 3.4755127308109863e-05, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 1.1585222842499338e-13, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 2.652000680981318e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.1214153087760665e-11} xgb_class_ram = {'': 0.899804501497566, 'ds_size_gb': 0.41986843027802623, 'ds_prep_size_gb': 0.41986843027802623, 'ds_onehot_size_gb': 7.280007472890875e-06, 'n_tree_repeats': 0.0012854309387287798, 'n_samples': 8.808932580897527e-08, 'n_samples*n_tree_repeats': 8.625259564591089e-10, 'n_features': 1.6375678219943912e-10, 'n_features*n_tree_repeats': 1.302388952570238e-12, 'n_features*n_samples': 3.488627499883473e-11, 'n_features*n_samples*n_tree_repeats': 4.2124781789579334e-11, 'max_depth': 3.280529943711475e-08, 'max_depth*n_tree_repeats': 5.768929558524772e-10, 'max_depth*n_samples': 6.291962320207664e-14, 'max_depth*n_samples*n_tree_repeats': 5.126839919323976e-15, 'max_depth*n_features': 6.35648749681192e-05, 'max_depth*n_features*n_tree_repeats': 1.935402530195678e-13, 'max_depth*n_features*n_samples': 1.28838675675802e-08, 'max_depth*n_features*n_samples*n_tree_repeats': 1.69854661852343e-16, '2_power_maxdepth': 3.26910486762921e-11, '2_power_maxdepth*n_tree_repeats': 1.4676442049665057e-12, '2_power_maxdepth*n_samples': 2.64316777243899e-16, '2_power_maxdepth*n_samples*n_tree_repeats': 1.4901204061072977e-17, '2_power_maxdepth*n_features': 1.140492447521818e-08, '2_power_maxdepth*n_features*n_tree_repeats': 2.404137742885295e-15, '2_power_maxdepth*n_features*n_samples': 3.6325731146686714e-13, 
'2_power_maxdepth*n_features*n_samples*n_tree_repeats': 3.723108372490702e-19} lgbm_class_time = {'': 0.07952271409861912, 'ds_size_gb': 24.914198992356777, 'ds_prep_size_gb': 24.914198992356777, 'ds_onehot_size_gb': 0.6707498854892533, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.15904542819723824, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.75292585133515e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 3.995934332919547e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 4.51061814549484e-13, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.015836831101031235, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 2.885892548234532e-11, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 2.320710370608533e-08, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.006248880421662e-14, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads': 1.6421556695965297e-07, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_tree_repeats': 0.015956943711852814, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples': 2.330829367448416e-12, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.2170171882409568e-13, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features': 0.001802775666445253, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.072475113612503e-12, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples': 3.376112165195102e-07, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 8.92885930282138e-09, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads': 7.505014868911757e-10, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_tree_repeats': 0.00041603300901854167, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples': 9.05403593468941e-15, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 2.3824258787970722e-15, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features': 2.152594512387446e-12, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.26406208478857e-14, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples': 9.221334002333759e-16, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.8809384428115866e-11} lgbm_class_ram = {'': 0.8604627263253337, 'ds_size_gb': 2.0214168208781946, 'ds_prep_size_gb': 2.0214168208781946, 'ds_onehot_size_gb': 3.622669179301401e-06, 'n_tree_repeats': 0.0015219464100389682, 'n_samples': 3.856682701344501e-07, 'n_samples*n_tree_repeats': 1.544688671627044e-10, 'n_features': 3.1028960780988996e-10, 'n_features*n_tree_repeats': 1.4858440058980697e-12, 'n_features*n_samples': 2.5173717397818705e-08, 'n_features*n_samples*n_tree_repeats': 6.656160609292717e-11, 'log_num_leaves': 1.573053922451339e-08, 'log_num_leaves*n_tree_repeats': 1.626145985707e-06, 'log_num_leaves*n_samples': 1.617414150367892e-13, 'log_num_leaves*n_samples*n_tree_repeats': 6.161688826595097e-13, 'log_num_leaves*n_features': 2.930068871528871e-11, 'log_num_leaves*n_features*n_tree_repeats': 2.7540140942935337e-13, 'log_num_leaves*n_features*n_samples': 3.939554526330466e-15, 'log_num_leaves*n_features*n_samples*n_tree_repeats': 3.851475872271092e-15, 'num_leaves': 
7.114807543594747e-11, 'num_leaves*n_tree_repeats': 7.004349205794621e-07, 'num_leaves*n_samples': 6.063719974576439e-16, 'num_leaves*n_samples*n_tree_repeats': 1.1825948996367154e-14, 'num_leaves*n_features': 6.127161836179573e-06, 'num_leaves*n_features*n_tree_repeats': 4.723694325860319e-15, 'num_leaves*n_features*n_samples': 5.682583426130539e-17, 'num_leaves*n_features*n_samples*n_tree_repeats': 2.820814699620109e-14} class ResourceParamsOld: cb_class_time = {'': 0.060695272326207535, 'ds_size_gb': 2.4268955178538847, 'ds_prep_size_gb': 2.4268955178538847, 'ds_onehot_size_gb': 0.040427221672569374, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.12139054465241507, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.0362927572255956e-09, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 5.259225293072914e-06, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.1159977413280863e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.002034550389178136, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 1.972850747965341e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 1.590097554595333e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 2.280000915439824e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 1.374338752023958e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 4.062768369148915e-10, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 1.242824030666801e-12, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 9.32433742185293e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 7.126063129715731e-11, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 3.344879400790812e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 2.631878772648314e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.4077434831895832e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 1.99445077397377e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 1.2520160532307873e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 3.0511461549128756e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 2.873281614024595e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 1.2644593910088394e-05, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 2.235731644015564e-14, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 1.1517663973680398e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 2.4847067022145893e-11} cb_class_ram = {'': 0.8683295939412378, 'ds_size_gb': 2.1956796547330758e-05, 'ds_prep_size_gb': 2.1956796547330758e-05, 'ds_onehot_size_gb': 0.054809311336043706, 'n_tree_repeats': 3.16259450440823e-09, 'n_samples': 5.359259624964122e-07, 'n_samples*n_tree_repeats': 1.817237502556807e-07, 'n_features': 1.728902260462638e-09, 'n_features*n_tree_repeats': 1.1883754249270118e-12, 'n_features*n_samples': 3.2106346545767416e-08, 'n_features*n_samples*n_tree_repeats': 8.080444898120663e-16, 'max_depth': 0.00023942254928693192, 'max_depth*n_tree_repeats': 7.662207891804141e-10, 
'max_depth*n_samples': 2.0135633249657367e-13, 'max_depth*n_samples*n_tree_repeats': 1.9065381412052897e-13, 'max_depth*n_features': 0.0006188384463276942, 'max_depth*n_features*n_tree_repeats': 1.825891231551508e-13, 'max_depth*n_features*n_samples': 4.017104578325911e-09, 'max_depth*n_features*n_samples*n_tree_repeats': 1.2652983818045863e-16, '2_power_maxdepth': 0.0001056123359157812, '2_power_maxdepth*n_tree_repeats': 2.694024798514516e-06, '2_power_maxdepth*n_samples': 1.3780270956209364e-15, '2_power_maxdepth*n_samples*n_tree_repeats': 2.064100170958034e-09, '2_power_maxdepth*n_features': 1.0080022114889349e-10, '2_power_maxdepth*n_features*n_tree_repeats': 6.15051597263584e-15, '2_power_maxdepth*n_features*n_samples': 2.3070275489115195e-12, '2_power_maxdepth*n_features*n_samples*n_tree_repeats': 2.7850591221080067e-17} xgb_class_time = {'': 0.04616911535729873, 'ds_size_gb': 3.47457744189382, 'ds_prep_size_gb': 3.47457744189382, 'ds_onehot_size_gb': 0.0698867127341342, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.09233823071459746, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.035228262559771e-08, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 6.154537890478014e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 8.63288843709104e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 3.291166164590293e-10, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 3.670077849317217e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 1.914319987041818e-13, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 2.926688203905133e-15, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads': 1.68587043083397e-08, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_tree_repeats': 0.0026046534716614215, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples': 3.601942853784541e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.5052320282512473e-14, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features': 0.0007712724349247164, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_tree_repeats': 6.967156404769764e-13, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples': 1.7162683220472862e-09, 'n_cv_refit*n_splits*max_depth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.226904474214378e-10, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads': 9.064818572421352e-11, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_tree_repeats': 6.993000397349683e-07, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples': 2.9578963011700153e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.991428507510768e-16, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features': 2.802431219594177e-06, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_tree_repeats': 9.943166031719296e-15, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples': 5.094046852454207e-14, 'n_cv_refit*n_splits*2_power_maxdepth*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 4.515896055082407e-12} xgb_class_ram = {'': 0.89800664010472, 'ds_size_gb': 0.8958165176491728, 'ds_prep_size_gb': 0.8958165176491728, 'ds_onehot_size_gb': 1.2775211008166364e-05, 'n_tree_repeats': 0.0005355693710144896, 'n_samples': 
7.445989056176149e-08, 'n_samples*n_tree_repeats': 1.1095360093190593e-08, 'n_features': 1.419262523195433e-10, 'n_features*n_tree_repeats': 1.189619404783309e-12, 'n_features*n_samples': 2.1948939540241107e-11, 'n_features*n_samples*n_tree_repeats': 6.761378006837745e-13, 'max_depth': 4.602455291339385e-08, 'max_depth*n_tree_repeats': 5.846802665464209e-10, 'max_depth*n_samples': 6.003527146823594e-14, 'max_depth*n_samples*n_tree_repeats': 5.458849368989926e-15, 'max_depth*n_features': 8.276969896399465e-05, 'max_depth*n_features*n_tree_repeats': 1.73562626225241e-13, 'max_depth*n_features*n_samples': 1.1188204977077247e-08, 'max_depth*n_features*n_samples*n_tree_repeats': 1.2101329730965103e-16, '2_power_maxdepth': 3.500391185762912e-11, '2_power_maxdepth*n_tree_repeats': 1.446703161897511e-12, '2_power_maxdepth*n_samples': 2.6046111134557463e-16, '2_power_maxdepth*n_samples*n_tree_repeats': 1.4647083952656776e-17, '2_power_maxdepth*n_features': 8.730859656468559e-07, '2_power_maxdepth*n_features*n_tree_repeats': 2.253274531849529e-15, '2_power_maxdepth*n_features*n_samples': 5.586329461516387e-11, '2_power_maxdepth*n_features*n_samples*n_tree_repeats': 3.406456640909277e-19} lgbm_class_time = {'': 0.028063263911210914, 'ds_size_gb': 2.970270224525262, 'ds_prep_size_gb': 2.970270224525262, 'ds_onehot_size_gb': 0.09163862856656434, 'n_cv_refit*n_splits*n_estimators*1/n_threads': 0.05612652782242183, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_tree_repeats': 3.7651417047281056e-08, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples': 3.057993467818764e-07, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_samples*n_tree_repeats': 6.264643485181751e-14, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features': 0.0018753906815885733, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_tree_repeats': 2.1257067882553722e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples': 8.471355616223231e-12, 'n_cv_refit*n_splits*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 3.3001370294885434e-15, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads': 6.47442904885375e-08, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_tree_repeats': 0.0011608214817588585, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples': 9.964309915135878e-13, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 2.608150056678177e-14, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features': 0.0001926020481234091, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 4.598542008079632e-13, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples': 1.3986995179321424e-08, 'n_cv_refit*n_splits*log_num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 6.208468162170729e-10, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads': 1.1569746986292633e-09, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_tree_repeats': 7.442820019642213e-05, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples': 4.6777144377244544e-14, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_samples*n_tree_repeats': 1.075739698121751e-15, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features': 2.0127433109741758e-13, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_tree_repeats': 5.291223606102416e-15, 
'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples': 2.39530599680757e-16, 'n_cv_refit*n_splits*num_leaves*n_estimators*1/n_threads*n_features*n_samples*n_tree_repeats': 1.8233627245552183e-12} lgbm_class_ram = {'': 0.8545661661490145, 'ds_size_gb': 2.3080037837801175, 'ds_prep_size_gb': 2.3080037837801175, 'ds_onehot_size_gb': 4.0697094447404033e-07, 'n_tree_repeats': 0.0018080853926450316, 'n_samples': 2.994431799612211e-07, 'n_samples*n_tree_repeats': 1.1377985339470745e-09, 'n_features': 4.08148741723376e-07, 'n_features*n_tree_repeats': 1.4109066020140611e-12, 'n_features*n_samples': 2.3506833903706615e-08, 'n_features*n_samples*n_tree_repeats': 8.047116933926301e-12, 'log_num_leaves': 1.8470627691115034e-08, 'log_num_leaves*n_tree_repeats': 4.350203928522753e-07, 'log_num_leaves*n_samples': 1.4244297306885883e-13, 'log_num_leaves*n_samples*n_tree_repeats': 7.582204707419711e-13, 'log_num_leaves*n_features': 4.90256931677757e-11, 'log_num_leaves*n_features*n_tree_repeats': 2.6408516124748747e-13, 'log_num_leaves*n_features*n_samples': 3.020317664222622e-15, 'log_num_leaves*n_features*n_samples*n_tree_repeats': 2.1876975907194365e-15, 'num_leaves': 1.0490359582375276e-10, 'num_leaves*n_tree_repeats': 1.0650528506541837e-07, 'num_leaves*n_samples': 5.943342181332617e-16, 'num_leaves*n_samples*n_tree_repeats': 1.9123390691308356e-14, 'num_leaves*n_features': 6.105483514684091e-06, 'num_leaves*n_features*n_tree_repeats': 4.533114041820276e-15, 'num_leaves*n_features*n_samples': 3.668665655364504e-17, 'num_leaves*n_features*n_samples*n_tree_repeats': 1.2053037667373442e-13} ================================================ FILE: pytabkit/models/alg_interfaces/rtdl_interfaces.py ================================================ import copy from typing import List, Any, Optional, Dict, Tuple from pathlib import Path import numpy as np import pandas as pd import torch from sklearn.preprocessing import StandardScaler, OrdinalEncoder from sklearn.impute import SimpleImputer from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, SingleSplitAlgInterface, \ RandomParamsAlgInterface from pytabkit.models.alg_interfaces.sub_split_interfaces import SklearnSubSplitInterface, SingleSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import insert_missing_class_columns def allow_single_underscore(params_config: List[Tuple]) -> List[Tuple]: # allow to specify the parameters with __ or with just _ # the reason is that in the sklearn interfaces using __ is problematic # since sklearn thinks these belong to a sub-estimator params_config = copy.deepcopy(params_config) for i in range(len(params_config)): cfg = list(params_config[i]) if cfg[1] is None and '__' in cfg[0]: cfg[1] = [cfg[0], cfg[0].replace('__', '_')] params_config[i] = tuple(cfg) return params_config class SkorchSubSplitInterface(SklearnSubSplitInterface): def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray, cat_col_names: Optional[List[str]] = None): from skorch.helper import predefined_split from skorch.dataset import Dataset # set number of classes if self.n_classes > 0: # classification 
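# note: the skorch model from _create_sklearn_model is apparently built without the class count, so it is set here before fitting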
self.model.set_n_classes(self.n_classes) # get transformed_target from config transformed_target = self.config.get("transformed_target", False) if transformed_target: # do TransformedTargetRegressor by hand (because setting the # validation set in skorch conflicts with TransformedTargetRegressor) self.transformer = StandardScaler() y = self.transformer.fit_transform(y.reshape(-1, 1)) else: self.transformer = None n_samples = len(x_df) train_mask = np.ones(shape=(n_samples,), dtype=np.bool_) train_mask[val_idxs] = False # create val_ds for skorch (see FAQ) # Note that this breaks TransformedTargetRegressor, which is why we do it by hand x_train = np.array(x_df.iloc[train_mask, :], dtype=np.float32) x_val = np.array(x_df.iloc[~train_mask, :], dtype=np.float32) y_train = y[train_mask] y_val = y[~train_mask] if self.n_classes else y[~train_mask].reshape(-1, 1) self.categorical_indicator = None if cat_col_names is not None and len(cat_col_names) > 0: self.categorical_indicator = np.array([name in cat_col_names for name in x_df.columns]) self.model.set_categorical_indicator(self.categorical_indicator) # we do OrdinalEncoder one more time to be sure that there are no "holes" # in the categories # missing values were encoded as zero, we need to make them missing again self.replace_zero_by_nans = SimpleImputer(missing_values=0., strategy="constant", fill_value=np.nan, keep_empty_features=True) x_train[:, self.categorical_indicator] = self.replace_zero_by_nans.fit_transform( x_train[:, self.categorical_indicator]) self.ord_enc = OrdinalEncoder(dtype=np.float32, handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1) x_train[:, self.categorical_indicator] = self.ord_enc.fit_transform(x_train[:, self.categorical_indicator]) x_val[:, self.categorical_indicator] = self.replace_zero_by_nans.transform( x_val[:, self.categorical_indicator]) x_val[:, self.categorical_indicator] = self.ord_enc.transform(x_val[:, self.categorical_indicator]) val_ds = Dataset(x_val, y_val) self.model.set_params(train_split=predefined_split(val_ds)) self.model.fit(x_train, y_train) def predict(self, ds: DictDataset) -> torch.Tensor: # adapted from SklearnSubSplitLearner # should return tensor of shape len(ds) x output_shape if self.tfm is not None: ds = self.tfm.forward_ds(ds) x_df = ds.without_labels().to_df() x_array = np.array(x_df, dtype=np.float32) # added if self.categorical_indicator is not None: x_array[:, self.categorical_indicator] = self.replace_zero_by_nans.transform( x_array[:, self.categorical_indicator]) x_array[:, self.categorical_indicator] = self.ord_enc.transform(x_array[:, self.categorical_indicator]) # skorch doesn't support pandas dataframes if self.n_classes > 0: # classification y_pred = np.log(self.model.predict_proba(x_array) + 1e-30) else: # regression y_pred = self.model.predict(x_array) if len(y_pred.shape) == 1: y_pred = y_pred[:, None] y_pred = torch.as_tensor(y_pred, dtype=torch.float32) # guard against missing classes in the training set # (GBDT interfaces don't need this because they get passed n_classes as a parameter) y_pred = insert_missing_class_columns(y_pred, self.train_ds) # added if self.transformer is not None: y_pred = self.transformer.inverse_transform(y_pred.reshape(-1, 1)) # transform to tensor y_pred = torch.from_numpy(y_pred) return y_pred[None] # add vectorized dimension class RTDL_MLPSubSplitInterface(SkorchSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # the random state is handled by
SklearnSubSplitLearner.fit() which sets # numpy and torch seeds based on self.random_state # which is all we need for skorch, so # we don't need to use seed here params_config = allow_single_underscore([ ("lr_scheduler", None), ("lr", None), ("optimizer", None), ("module__n_layers", None), ("module__d_layers", None), ("module__d_first_layer", None), ("module__d_last_layer", None), ("module__activation", None), ("module__dropout", None), ("module__num_emb_type", None), ("module__num_emb_dim", None), ("module__num_emb_hidden_dim", None), ("module__num_emb_sigma", None), ("module__num_emb_lite", None), ("module__d_embedding", None), ("optimizer__weight_decay", None), ("batch_size", None), ("max_epochs", None), ("use_checkpoints", None), ("es_patience", None), ("lr_patience", None), ("verbose", None), ("checkpoint_dir", "tmp_folder"), ("val_metric_name", None), ]) params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0] if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None: params['checkpoint_dir'] = './rtdl_checkpoints' from pytabkit.models.nn_models.rtdl_resnet import create_mlp_classifier_skorch, create_mlp_regressor_skorch if self.n_classes > 0: return create_mlp_classifier_skorch(**params) else: return create_mlp_regressor_skorch(**params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_samples': 8e-5, 'n_samples*n_features': 8e-7} ram_params = {'': 0.3, 'ds_onehot_size_gb': 3.0} gpu_ram_params = {'': 0.4, 'ds_onehot_size_gb': 1.5, 'n_features': 1.5e-3 if self.config.get('module_num_emb_type', 'none') != 'none' else 1e-4} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds) class ResnetSubSplitInterface(SkorchSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # the random state is handled by SklearnSubSplitLearner.fit() which sets # numpy and torch seeds based on self.random_state # which is all we need for skorch, so # we don't need to use seed here params_config = allow_single_underscore([ ("lr_scheduler", None), ("module__activation", None), ("module__normalization", None), ("module__n_layers", None), ("module__d", None), ("module__d_hidden_factor", None), ("module__hidden_dropout", None), ("module__residual_dropout", None), ("optimizer__weight_decay", None), ("module__d_embedding", None), ("lr", None), ("optimizer", None), ("batch_size", None), ("max_epochs", None), ("use_checkpoints", None), ("es_patience", None), ("lr_patience", None), ("verbose", None), ("checkpoint_dir", "tmp_folder"), ("val_metric_name", None), ]) # allow to specify these parameters with __ or with just _ # the reason is that in the sklearn interfaces using __ is problematic # since sklearn thinks these belong to a sub-estimator # params_config.extend([(key, [key, key.replace('__', '_')], None) for key, source in # ] params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0] if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None: 
params['checkpoint_dir'] = './rtdl_checkpoints' from pytabkit.models.nn_models.rtdl_resnet import create_resnet_classifier_skorch, create_resnet_regressor_skorch if self.n_classes > 0: return create_resnet_classifier_skorch(**params) else: return create_resnet_regressor_skorch(**params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_samples*n_features': 8e-8} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} gpu_ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0, 'n_train': 4e-6, 'n_features': 1e-3, 'cat_size_sum': 1e-3} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) class FTTransformerSubSplitInterface(SkorchSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # the random state is handled by SklearnSubSplitLearner.fit() which sets # numpy and torch seeds based on self.random_state # which is all we need for skorch, so # we don't need to use seed here params_config = allow_single_underscore([ ("lr_scheduler", None), ("module__activation", None), ("module__n_layers", None), ("module__n_heads", None), ("module__token_bias", None), ("module__d_token", None), ("module__d_ffn_factor", None), ("module__attention_dropout", None), ("module__ffn_dropout", None), ("module__residual_dropout", None), ("module__prenormalization", None), ("module__initialization", None), ("module__kv_compression", None, None), ("module__kv_compression_sharing", None, None), ("lr", None), ("optimizer__weight_decay", None), ("optimizer", None), ("batch_size", None), ("max_epochs", None), ("use_checkpoints", None), ("es_patience", None), ("lr_patience", None), ("verbose", None), ("checkpoint_dir", "tmp_folder"), ("val_metric_name", None), ]) params = utils.extract_params(self.config, params_config) params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0] if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None: params['checkpoint_dir'] = './rtdl_checkpoints' from pytabkit.models.nn_models.rtdl_resnet import create_ft_transformer_classifier_skorch, create_ft_transformer_regressor_skorch if self.n_classes > 0: return create_ft_transformer_classifier_skorch(**params) else: return create_ft_transformer_regressor_skorch(**params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 # Bioresponse has 419 features and uses 12.8 GB RAM with batch size 256 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_train*n_features': 8e-6} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} # ram computation: attention matrix is n_layers * n_heads * 4bytes * n_features**2 # (coef = 
4*8*4 * 1e-9 -> just use 1e-7?) # then there is also 3 (QKV) * n_features * d_token * batch_size * n_heads * 4bytes * n_layers * 2(forward+backward) # coef = 3 * 384 * 128 * 8 * 4 * 4 * 2 / (1024)**3 = 3.5e-2 # and embedding: cat_sizes * d_token gpu_ram_params = {'': 0.2, 'ds_onehot_size_gb': 3.0, 'n_train': 4e-6, 'n_features': 3.5e-2, # use slightly smaller value (based on empirical observations) 'n_features*n_features': 4e-6, 'cat_size_sum': 1e-4} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) def choose_batch_size_rtdl(train_size) -> int: # set batch_size depending on the number of samples # as in the rtdl paper # if train_size < 10_000: # return 128 # taken from tabr paper, not used in our paper due to a bug if train_size < 30_000: return 256 elif train_size < 100_000: return 512 else: return 1024 def choose_batch_size_rtdl_new(train_size: int) -> int: # set batch_size depending on the number of samples # as in the rtdl paper if train_size < 10_000: return 128 elif train_size < 30_000: return 256 elif train_size < 100_000: return 512 else: return 1024 class RTDL_MLP_ParamSamplerNew: def __init__(self, is_classification: bool, train_size: int, num_emb_type: str = 'none'): self.is_classification = is_classification self.train_size = train_size self.num_emb_type = num_emb_type def sample_params(self, seed: int) -> Dict[str, Any]: rng = np.random.default_rng(seed=seed) # cutoff to change hp space for large datasets # as in rtdl # the cutoff is between 70K and 300K cutoff_train_size_rtdl = 100_000 is_large_dataset = self.train_size > cutoff_train_size_rtdl params = { # reduced d_layers "module_n_layers": rng.choice(np.arange(1, 17)) if is_large_dataset \ else rng.choice(np.arange(1, 9)), "module_d_layers": rng.choice(np.arange(1, 1025)) if is_large_dataset \ else rng.choice(np.arange(1, 513)), # "Note that the size of the first and the last layers are tuned and set separately, while the size for # “in-between” layers is the same for all of them." from rtdl paper "module_d_first_layer": rng.choice(np.arange(1, 1025)) if is_large_dataset \ else rng.choice(np.arange(1, 513)), "module_d_last_layer": rng.choice(np.arange(1, 1025)) if is_large_dataset \ else rng.choice(np.arange(1, 513)), "module_dropout": rng.choice([rng.uniform(0, 0.5)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-3)))] + [0.] ), "module_d_embedding": rng.choice(np.arange(1, 65)), # have smaller embedding sizes to avoid RAM issues "batch_size": choose_batch_size_rtdl_new(self.train_size), "lr_scheduler": False, "optimizer": "adamw", "max_epochs": 400, "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } # MLP-PLR space from # https://github.com/yandex-research/rtdl-num-embeddings/blob/main/exp/mlp-plr/adult/log_linear_fixed_tuning.toml # lr: loguniform(5e-5, 5e-3) # wd: 0, loguniform(1e-6, 1e-3) # sigma: 1e-3, 1e2 (or 1e-2, 1e2 for a different version) # had one-hot encodings # d_layers: ? 
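# usage sketch (illustrative; 'plr' is an assumed example value for num_emb_type): RTDL_MLP_ParamSamplerNew(is_classification=True, train_size=20_000, num_emb_type='plr').sample_params(seed=0) returns a config with batch_size == 256 (since 10_000 <= 20_000 < 30_000) plus the numerical-embedding parameters added below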
if self.num_emb_type != 'none': params['module_num_emb_type'] = self.num_emb_type params['module_num_emb_dim'] = rng.choice(np.arange(1, 65)) # reduced from upper bound 128 params['module_num_emb_hidden_dim'] = rng.choice(np.arange(1, 65)) # reduced from upper bound 128 params['module_num_emb_sigma'] = np.exp(rng.uniform(np.log(1e-2), np.log(1e1))) if self.is_classification: params["transformed_target"] = False else: params["transformed_target"] = True return params class RTDL_ResNet_ParamSampler: def __init__(self, is_classification: bool, train_size: int): self.is_classification = is_classification self.train_size = train_size def sample_params(self, seed: int) -> Dict[str, Any]: rng = np.random.default_rng(seed=seed) # cutoff to change hp space for large datasets # as in rtdl # the cutoff is between 70K and 300K cutoff_train_size_rtdl = 100_000 is_large_dataset = self.train_size > cutoff_train_size_rtdl params = { "module_n_layers": rng.choice(np.arange(1, 17)) if is_large_dataset \ else rng.choice(np.arange(1, 9)), "module_d": rng.choice(np.arange(64, 1025)) if is_large_dataset \ else rng.choice(np.arange(64, 513)), "module_d_hidden_factor": rng.choice(np.arange(1, 5)), "module_hidden_dropout": rng.uniform(0.0, 0.5), "module_residual_dropout": rng.choice([rng.uniform(0, 0.5)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-3)))] + [0.] ), "module_d_embedding": rng.choice(np.arange(8, 32)), # we go lower (than 64) # because we have smaller datasets with categorical features "batch_size": choose_batch_size_rtdl(self.train_size), "module_activation": "relu", "module_normalization": "batchnorm", "lr_scheduler": False, "optimizer": "adamw", "max_epochs": 400, "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } if self.is_classification: params["transformed_target"] = False else: params["transformed_target"] = True return params class RTDL_ResNet_ParamSamplerNew: def __init__(self, is_classification: bool, train_size: int): self.is_classification = is_classification self.train_size = train_size def sample_params(self, seed: int) -> Dict[str, Any]: rng = np.random.default_rng(seed=seed) # cutoff to change hp space for large datasets # as in rtdl # the cutoff is between 70K and 300K cutoff_train_size_rtdl = 100_000 is_large_dataset = self.train_size > cutoff_train_size_rtdl params = { "module_n_layers": rng.choice(np.arange(1, 17)) if is_large_dataset \ else rng.choice(np.arange(1, 9)), "module_d": rng.choice(np.arange(64, 1025)) if is_large_dataset \ else rng.choice(np.arange(64, 513)), "module_d_hidden_factor": rng.choice(np.arange(1, 5)), "module_hidden_dropout": rng.uniform(0.0, 0.5), "module_residual_dropout": rng.choice([rng.uniform(0, 0.5)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-2))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-3)))] + [0.] 
), "module_d_embedding": rng.choice(np.arange(1, 65)), # use smaller embedding dimensions "batch_size": choose_batch_size_rtdl_new(self.train_size), "module_activation": "relu", "module_normalization": "batchnorm", "lr_scheduler": False, "optimizer": "adamw", "max_epochs": 400, "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } if self.is_classification: params["transformed_target"] = False else: params["transformed_target"] = True return params class RandomParamsResnetAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return RandomParamsResnetAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int, n_train: int): # this is also set in get_required_resources, but okay if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [RTDL_ResNet_ParamSamplerNew(is_classification, n_train).sample_params(hparam_seed)] # self.fit_params = [RTDL_ResNet_ParamSamplerNew(is_classification, n_train).sample_params(hparam_seed)] return SingleSplitWrapperAlgInterface( [ResnetSubSplitInterface(**utils.update_dict(self.config, self.fit_params[0]))]) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed, idxs_list[0].n_train) self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) def predict(self, ds: DictDataset) -> torch.Tensor: return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0], n_train) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train=n_train) class RandomParamsFTTransformerAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed=seed) params = { "module_n_layers": rng.choice(np.arange(1, 5)), "module_d_token": 8 * rng.choice(np.arange(2, 49)), # this is different in https://github.com/yandex-research/rtdl-revisiting-models/blob/main/output/adult/ft_transformer/tuning/0.toml # but used like this in the newer tabr paper spaces "module_d_ffn_factor": rng.uniform(2 / 3, 8 / 3), "module_ffn_dropout": rng.uniform(0.0, 0.5), "module_attention_dropout": rng.uniform(0.0, 0.5), "module_residual_dropout": rng.choice([rng.uniform(0, 0.2)] + [0.]), "lr": np.exp(rng.uniform(np.log(1e-5), np.log(1e-3))), "optimizer_weight_decay": rng.choice( [np.exp(rng.uniform(np.log(1e-6), np.log(1e-4)))] + [0.] 
), "batch_size": choose_batch_size_rtdl_new(n_train), "lr_scheduler": False, "max_epochs": 400, # introduced a limit, like for MLP and ResNet "use_checkpoints": True, "es_patience": 16, 'verbose': 0, 'tfms': ['quantile_tabr'], } if is_classification: params["transformed_target"] = False return utils.join_dicts(DefaultParams.FTT_D_CLASS, params) else: params["transformed_target"] = True return utils.join_dicts(DefaultParams.FTT_D_REG, params) def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([FTTransformerSubSplitInterface(**config) for i in range(n_tv_splits)]) class RandomParamsRTDLMLPAlgInterface(SingleSplitAlgInterface): def __init__(self, model_idx: int, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.model_idx = model_idx self.alg_interface = None def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': return RandomParamsRTDLMLPAlgInterface(model_idx=self.model_idx, fit_params=fit_params or self.fit_params, **self.config) def _create_sub_interface(self, ds: DictDataset, seed: int, n_train: int): if self.fit_params is None: hparam_seed = utils.combine_seeds(seed, self.model_idx) is_classification = not ds.tensor_infos['y'].is_cont() self.fit_params = [RTDL_MLP_ParamSamplerNew(is_classification, n_train, num_emb_type=self.config.get('num_emb_type', 'none') ).sample_params(hparam_seed)] return SingleSplitWrapperAlgInterface( [RTDL_MLPSubSplitInterface(**utils.update_dict(self.config, self.fit_params[0]))]) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None: assert len(idxs_list) == 1 self.alg_interface = self._create_sub_interface(ds, idxs_list[0].split_seed, n_train=idxs_list[0].n_train) print(f'{self.fit_params[0]=}') self.alg_interface.fit(ds, idxs_list, interface_resources, logger, tmp_folders, name) def predict(self, ds: DictDataset) -> torch.Tensor: return self.alg_interface.predict(ds) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert len(split_seeds) == 1 alg_interface = self._create_sub_interface(ds, split_seeds[0], n_train) return alg_interface.get_required_resources(ds, n_cv, n_refit, n_splits, split_seeds, n_train) ================================================ FILE: pytabkit/models/alg_interfaces/sub_split_interfaces.py ================================================ import copy import random from pathlib import Path from typing import List, Optional, Dict, Any, Tuple import numpy as np import pandas as pd import torch from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.data.data import DictDataset from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import insert_missing_class_columns class SingleSplitWrapperAlgInterface(SingleSplitAlgInterface): """ AlgInterface that takes multiple AlgInterfaces that can only handle a single train-val-test split and wraps them to handle a trainval-test split (possibly with multiple train-val splits) """ def __init__(self, sub_split_interfaces: 
List[AlgInterface], fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ :param sub_split_interfaces: Interfaces for each sub-split (train-val split). """ super().__init__(fit_params=fit_params, **config) self.sub_split_interfaces = sub_split_interfaces def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': if fit_params is not None: assert len(fit_params) == 1 # single split required orig_fit_params = fit_params fit_params = fit_params or self.fit_params config = utils.join_dicts(self.sub_split_interfaces[0].config, self.config) if config.get('use_best_mean_iteration_for_refit', True): sub_fit_params = [utils.update_dict(fit_params[0], remove_keys='sub_fit_params')] return SingleSplitWrapperAlgInterface( [self.sub_split_interfaces[0].get_refit_interface( n_refit=1, fit_params=sub_fit_params) for i in range(n_refit)], fit_params=fit_params) else: if n_refit != len(self.sub_split_interfaces): raise ValueError('When use_best_mean_iteration_for_refit==False, we must have n_cv==n_refit, ' f'but got n_cv={len(self.sub_split_interfaces)} and {n_refit=}') if orig_fit_params is not None: raise ValueError('When use_best_mean_iteration_for_refit==False, ' 'fit_params in get_refit_interface() should be None') return SingleSplitWrapperAlgInterface( [ssi.get_refit_interface(n_refit=1) for ssi in self.sub_split_interfaces], fit_params=fit_params) def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 # this is a SingleSplitAlgInterface assert len(tmp_folders) == 1 # this is a SingleSplitAlgInterface if self.config.get('same_seed_for_sub_splits', False): idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs, test_idxs=idxs.test_idxs, split_seed=idxs.split_seed, sub_split_seeds=[idxs.sub_split_seeds[0]] * len(idxs.sub_split_seeds), split_id=idxs.split_id) for idxs in idxs_list] split_idxs = idxs_list[0] tmp_folder = tmp_folders[0] hyper_results_list = [] # todo: this could be parallelized if necessary, but not for now for i in range(split_idxs.n_trainval_splits): sub_split_idxs = [split_idxs.get_sub_split_idxs_alt(i)] sub_tmp_folder = tmp_folder / f'sub_split_{i}' if tmp_folder is not None else None # don't set fit_params here # because we might intentionally not want to set them if use_best_mean_iteration_for_refit==False # see get_refit_interfaces() # if self.fit_params is not None: # self.sub_split_interfaces[i].fit_params = self.fit_params hyper_results = self.sub_split_interfaces[i].fit(ds, sub_split_idxs, interface_resources, logger, [sub_tmp_folder], name=name) hyper_results = hyper_results[0][0] if hyper_results is not None else [] hyper_results_list.append(hyper_results) if self.fit_params is None: # determine best fit parameters (early stopping epoch or so) # by averaging losses across cv splits and then taking the minimum of that n_hyper_results = [len(hyper_result) for hyper_result in hyper_results_list] # print(f'{n_hyper_results=}') # truncate all hyper results to minimum length (could be different in case of early stopping) min_n_hyper_results = min(n_hyper_results) if min_n_hyper_results > 0: for i in range(len(hyper_results_list)): hyper_results_list[i] = hyper_results_list[i][:min_n_hyper_results] n_hyper_results = [len(hyper_result) for hyper_result in hyper_results_list] if not utils.all_equal(n_hyper_results): raise 
RuntimeError(f'Got hyperparameter results of different lengths: {n_hyper_results}') for i in range(n_hyper_results[0]): if not utils.all_equal([frozenset(hyper_result[i][0]) for hyper_result in hyper_results_list]): raise RuntimeError(f'Hyperparameter result lists did not use the same hyperparameters') mean_hyper_results = np.asarray([np.mean([hyper_result[i][1] for hyper_result in hyper_results_list]) for i in range(n_hyper_results[0])]) # use reverse argmin for ties since it sometimes gives better results best_idx = utils.reverse_argmin(mean_hyper_results) self.fit_params = [copy.copy(hyper_results_list[0][best_idx][0])] self.fit_params[0]['sub_fit_params'] = [ssi.fit_params for ssi in self.sub_split_interfaces] # steal the config from the sub_split_interface because it usually gets all the kwargs config = utils.join_dicts(self.sub_split_interfaces[0].config, self.config) if config.get('use_best_mean_iteration_for_cv', False): for ssi in self.sub_split_interfaces: ssi.fit_params = self.fit_params else: self.fit_params = [dict( sub_fit_params=[(ssi.fit_params[0] if ssi.fit_params is not None else None) for ssi in self.sub_split_interfaces])] return None def predict(self, ds: DictDataset) -> torch.Tensor: # todo: pay attention to dimensions return torch.cat([s.predict(ds) for s in self.sub_split_interfaces], dim=0) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_splits == 1 assert n_cv == len(self.sub_split_interfaces) # todo: this is ignoring the refit stage single_resources = [ ssi.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[split_seed], n_train=n_train) for ssi, split_seed in zip(self.sub_split_interfaces, split_seeds)] return RequiredResources.combine_sequential(single_resources) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return self.sub_split_interfaces[0].get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) for ssi in self.sub_split_interfaces: ssi.set_current_predict_params(name) class SklearnSubSplitInterface(SingleSplitAlgInterface): # todo: have another base class """ Base class for AlgInterfaces based on scikit-learn methods. """ def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.tfm = None self.n_classes = None self.model = None self.train_ds = None def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 # print(f'fit(): {torch.cuda.is_initialized()=}') # return List[Tuple[Dict, float]]], i.e., validation scores for every hyperparameter combination # (could be number of trees, early stopping epoch, or hyperparameters from hyperparameter optimization) # if hyperparams is not None, use these and maybe only return one list element? 
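# seed all RNG sources (torch / numpy / random) from the sub-split seed so that fits are reproducible per sub-split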
seed = idxs_list[0].sub_split_seeds[0] torch.manual_seed(seed) # can be useful for label encoding with randomized permutation np.random.seed(seed) random.seed(seed) # print(f'Seeding with seed {seed}') # print(f'{type(seed)=}') self.n_classes = ds.get_n_classes() if idxs_list[0].val_idxs is None: trainval_idxs = idxs_list[0].train_idxs[0] # validation indices such that trainval_idxs[rel_val_idxs] is the val_idxs # can be used to index trainval_ds later rel_val_idxs = torch.zeros(0, dtype=torch.long) else: trainval_idxs = torch.cat([idxs_list[0].train_idxs[0], idxs_list[0].val_idxs[0]], dim=0) rel_val_idxs = torch.arange(idxs_list[0].n_train, trainval_idxs.shape[0], dtype=torch.long) trainval_ds = ds.get_sub_dataset(trainval_idxs) # for filling in missing classes in the train dataset later # might not work when the validation set contains classes that the training set doesn't contain self.train_ds = ds.get_sub_dataset(idxs_list[0].train_idxs[0]) self.config["tmp_folder"] = tmp_folders[0] self.config['interface_resources'] = interface_resources # create preprocessing factory factory = self.config.get('factory', None) if factory is None: factory = PreprocessingFactory(**self.config) # transform according to factory fitter = factory.create(ds.tensor_infos) self.tfm, trainval_ds = fitter.fit_transform(trainval_ds) y = trainval_ds.tensors['y'] self.model = self._create_sklearn_model(seed=seed, n_threads=interface_resources.n_threads, gpu_devices=interface_resources.gpu_devices) if self.n_classes == 0 and trainval_ds.tensor_infos['y'].get_n_features() > 1 \ and self.config.get('use_multioutput_regressor', False): from sklearn.multioutput import MultiOutputRegressor self.model = MultiOutputRegressor(self.model) # todo: test this y = y.numpy() else: y = y[:, 0].numpy() x_df = trainval_ds.without_labels().to_df() cat_col_names = list(x_df.select_dtypes(include='category').columns) self._fit_sklearn(x_df=x_df, y=y, val_idxs=rel_val_idxs.numpy(), cat_col_names=cat_col_names) return None def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray, cat_col_names: Optional[List[str]] = None): # by default, we ignore the validation set since most sklearn methods do not support it n_samples = len(x_df) train_mask = np.ones(shape=(n_samples,), dtype=np.bool_) train_mask[val_idxs] = False x_df = x_df.iloc[train_mask, :] y = y[train_mask] if cat_col_names is not None and len(cat_col_names) > 0: self.model.fit(x_df, y, **{self._get_cat_indexes_arg_name(): cat_col_names}) else: self.model.fit(x_df, y) def predict(self, ds: DictDataset) -> torch.Tensor: # should return tensor of shape len(ds) x output_shape if self.tfm is not None: ds = self.tfm.forward_ds(ds) x_df = ds.without_labels().to_df() if self.n_classes > 0: # classification y_pred = np.log(self._predict_proba_sklearn(x_df) + 1e-30) else: # regression y_pred = self._predict_sklearn(x_df) if len(y_pred.shape) == 1: y_pred = y_pred[:, None] y_pred = torch.as_tensor(y_pred, dtype=torch.float32) # guard against missing classes in the training set # (GBDT interfaces don't need this because they get passed n_classes as a parameter) y_pred = insert_missing_class_columns(y_pred, self.train_ds) return y_pred[None] # add n_models dimension def _predict_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict(x_df) def _predict_proba_sklearn(self, x_df: pd.DataFrame) -> np.ndarray: return self.model.predict_proba(x_df) def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: # override this in 
subclasses raise NotImplementedError() def _get_cat_indexes_arg_name(self) -> str: # override this in subclasses if categorical features are supported raise NotImplementedError() class TreeBasedSubSplitInterface(SingleSplitAlgInterface): # todo: insert more appropriate class to inherit from? """ Base class for tree-based ML models (XGB, LGBM, CatBoost). """ def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) self.config = config self.tfm = None self.n_classes = None self.model = None def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 # return List[Tuple[Dict, float]]], i.e., validation scores for every hyperparameter combination # (could be number of trees, early stopping epoch, or hyperparameters from hyperparameter optimization) # if hyperparams is not None, use these and maybe only return one list element? seed = idxs_list[0].sub_split_seeds[0] torch.manual_seed(seed) # can be useful for label encoding with randomized permutation np.random.seed(seed) random.seed(seed) self.n_classes = ds.get_n_classes() train_idxs = idxs_list[0].train_idxs[0] val_idxs = idxs_list[0].val_idxs[0] if idxs_list[0].val_idxs is not None else None train_ds = ds.get_sub_dataset(train_idxs) is_cv = val_idxs is not None val_ds = ds.get_sub_dataset(val_idxs) if is_cv else None # create preprocessing factory factory = self.config.get('factory', None) if factory is None: factory = PreprocessingFactory(**self.config) # transform according to factory fitter = factory.create(ds.tensor_infos) if is_cv: trainval_ds = ds.get_sub_dataset(torch.cat([train_idxs, val_idxs], dim=0)) else: trainval_ds = train_ds self.tfm = fitter.fit(trainval_ds) train_ds = self.tfm.forward_ds(train_ds) if is_cv: val_ds = self.tfm.forward_ds(val_ds) params = self._get_params() if self.fit_params is not None: params = utils.update_dict(params, self.fit_params[0]) gpu_ids = [int(dev_str[len('cuda:'):]) for dev_str in interface_resources.gpu_devices if dev_str.startswith('cuda:')] if len(gpu_ids) > 0 and self.config.get('allow_gpu', True): params['device'] = f'cuda:{gpu_ids[0]}' # this is for XGBoost 2.0 and CatBoost self.model, val_errors = self._fit(train_ds, val_ds, params=params, seed=seed, n_threads=interface_resources.n_threads, val_metric_name=self.config.get('val_metric_name', None), tmp_folder=tmp_folders[0]) if val_errors is None: return None else: if self.config.get('use_best_checkpoint', True): if isinstance(val_errors, dict): # have multiple errors for different metrics self.fit_params = [dict( n_estimators={key: utils.reverse_argmin(values) + 1 for key, values in val_errors.items()})] else: self.fit_params = [dict(n_estimators=utils.reverse_argmin(val_errors) + 1)] else: self.fit_params = [dict(n_estimators=len(val_errors))] if isinstance(val_errors, dict): return None # not implemented else: return [[[(dict(n_estimators=i + 1), err) for i, err in enumerate(val_errors)]]] def predict(self, ds: DictDataset) -> torch.Tensor: # should return tensor of shape len(ds) x output_shape pred_dict = self.get_current_predict_params_dict() pred_params = dict() if self.fit_params is not None: if 'val_metric_name' in pred_dict: pred_params = dict(n_estimators=self.fit_params[0]['n_estimators'][pred_dict['val_metric_name']]) else: pred_params 
= self.fit_params[0] if self.tfm is not None: ds = self.tfm.forward_ds(ds) return self._predict(self.model, ds, self.n_classes, pred_params)[None] def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: raise NotImplementedError() def _predict(self, bst: Any, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: raise NotImplementedError() def _get_params(self) -> Dict[str, Any]: raise NotImplementedError() def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: val_metric_names = self.config.get('val_metric_names', None) if val_metric_names is None: return {'': dict()} else: return {f'_val-{val_metric_name}': dict(val_metric_name=val_metric_name) for val_metric_name in val_metric_names} ================================================ FILE: pytabkit/models/alg_interfaces/tabm_interface.py ================================================ import functools import math import random from pathlib import Path import scipy import sklearn import torch import numpy as np from pytabkit.models.training.metrics import Metrics from torch import nn from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, RandomParamsAlgInterface from typing import Optional, List, Dict, Any, Union, Tuple, Literal from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.nn_models import rtdl_num_embeddings from pytabkit.models.nn_models.base import Fitter from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.nn_models.tabm import Model, make_parameter_groups from pytabkit.models.training.logging import Logger def get_tabm_auto_batch_size(n_train: int) -> int: # by Yury Gorishniy, inferred from the choices in the TabM paper. 
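    # For example, with the thresholds below: n_train=1_000 -> 32, n_train=5_000 -> 128,
    # n_train=50_000 -> 512, n_train=200_000 -> 1024.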
    if n_train < 2_800:
        return 32
    if n_train < 4_500:
        return 64
    if n_train < 6_400:
        return 128
    if n_train < 32_000:
        return 256
    if n_train < 108_000:
        return 512
    return 1024


class TabMSubSplitInterface(SingleSplitAlgInterface):
    def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config):
        super().__init__(fit_params=fit_params, **config)

    def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface':
        raise NotImplementedError()

    def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources,
            logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[
        List[List[List[Tuple[Dict, float]]]]]:
        assert len(idxs_list) == 1
        assert idxs_list[0].n_trainval_splits == 1
        seed = idxs_list[0].sub_split_seeds[0]
        # print(f'Setting seed: {seed}')
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # hyperparams
        arch_type = self.config.get('arch_type', 'tabm')
        num_emb_type = self.config.get('num_emb_type', 'none')
        n_epochs = self.config.get('n_epochs', 1_000_000_000)
        patience = self.config.get('patience', 16)
        batch_size = self.config.get('batch_size', 256)
        compile_model = self.config.get('compile_model', False)
        lr = self.config.get('lr', 2e-3)
        d_embedding = self.config.get('d_embedding', 16)
        d_block = self.config.get('d_block', 512)
        dropout = self.config.get('dropout', 0.1)
        tabm_k = self.config.get('tabm_k', 32)
        allow_amp = self.config.get('allow_amp', False)
        n_blocks = self.config.get('n_blocks', 'auto')
        num_emb_n_bins = self.config.get('num_emb_n_bins', 48)
        # defaults to False, i.e., each of the k submodels is trained on its own batches
        share_training_batches = self.config.get("share_training_batches", False)
        val_metric_name = self.config.get('val_metric_name', None)
        train_metric_name = self.config.get('train_metric_name', None)
        weight_decay = self.config.get('weight_decay', 0.0)
        gradient_clipping_norm = self.config.get('gradient_clipping_norm', None)

        TaskType = Literal['regression', 'binclass', 'multiclass']
        n_train = idxs_list[0].n_train
        n_classes = ds.get_n_classes()
        cat_cardinalities = ds.tensor_infos['x_cat'].get_cat_sizes().numpy().tolist()
        task_type: TaskType = 'regression' if n_classes == 0 else ('binclass' if n_classes == 2 else 'multiclass')
        device = interface_resources.gpu_devices[0] if len(interface_resources.gpu_devices) >= 1 else 'cpu'
        device = torch.device(device)
        if num_emb_n_bins >= n_train:
            print('Reducing num_emb_n_bins to be smaller than n_train')
            num_emb_n_bins = n_train - 1
        if val_metric_name is None:
            val_metric_name = 'rmse' if task_type == 'regression' else 'class_error'
        if batch_size == "auto":
            batch_size = get_tabm_auto_batch_size(n_train=n_train)

        self.n_classes_ = n_classes
        self.task_type_ = task_type
        self.device_ = device

        # create preprocessing factory
        factory = self.config.get('factory', None)
        if 'tfms' not in self.config:
            self.config['tfms'] = ['quantile_tabr']
        if factory is None:
            factory = PreprocessingFactory(**self.config)
        if idxs_list[0].val_idxs is None:
            raise ValueError('Training without validation set is currently not implemented')
        ds_parts = {'train': ds.get_sub_dataset(idxs_list[0].train_idxs[0]),
                    'val': ds.get_sub_dataset(idxs_list[0].val_idxs[0]),
                    # 'test': ds.get_sub_dataset(idxs_list[0].test_idxs)
                    }
        part_names = ['train', 'val']  # no test
        non_train_part_names = ['val']
        # transform according to factory
        fitter: Fitter = factory.create(ds.tensor_infos)
        self.tfm_, ds_parts['train'] = fitter.fit_transform(ds_parts['train'])
        for part in non_train_part_names:
            ds_parts[part] =
self.tfm_(ds_parts[part]) # filter out numerical columns with only a single value x_cont_train = ds_parts['train'].tensors['x_cont'] for part in part_names: ds_parts[part] = ds_parts[part].to(device) # mask of which columns are not constant self.num_col_mask_ = ~torch.all(x_cont_train == x_cont_train[0:1, :], dim=0) for part in part_names: ds_parts[part].tensors['x_cont'] = ds_parts[part].tensors['x_cont'][:, self.num_col_mask_] # tensor infos are not correct anymore, but might not be used either # update n_cont_features = ds_parts['train'].tensors['x_cont'].shape[1] Y_train = ds_parts['train'].tensors['y'].clone() if task_type == 'regression': assert Y_train.shape[-1] == 1 self.y_mean_ = ds_parts['train'].tensors['y'].mean(dim=0, keepdim=True).item() self.y_std_ = ds_parts['train'].tensors['y'].std(dim=0, keepdim=True, correction=0).item() self.y_max_ = ds_parts['train'].tensors['y'].max().item() self.y_min_ = ds_parts['train'].tensors['y'].min().item() Y_train = (Y_train - self.y_mean_) / (self.y_std_ + 1e-30) data = {part: utils.join_dicts( dict(x_cont=ds_parts[part].tensors['x_cont'], y=ds_parts[part].tensors['y']), dict(x_cat=ds_parts[part].tensors['x_cat']) if ds.tensor_infos['x_cat'].get_n_features() > 0 else dict()) for part in part_names} # adapted from https://github.com/yandex-research/tabm/blob/main/example.ipynb # Automatic mixed precision (AMP) # torch.float16 is implemented for completeness, # but it was not tested in the project, # so torch.bfloat16 is used by default. amp_dtype = ( torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else None ) # Changing False to True will result in faster training on compatible hardware. amp_enabled = allow_amp and amp_dtype is not None and device.type == 'cuda' grad_scaler = torch.cuda.amp.GradScaler() if amp_dtype is torch.float16 else None # type: ignore # fmt: off logger.log(1, f'Device: {device.type.upper()}' f'\nAMP: {amp_enabled} (dtype: {amp_dtype})' f'\ntorch.compile: {compile_model}' ) # fmt: on pass # Choose one of the two configurations below. # TabM bins = None if num_emb_type != 'pwl' or n_cont_features == 0 else rtdl_num_embeddings.compute_bins(data['train']['x_cont'], n_bins=num_emb_n_bins) d_out = n_classes if n_classes > 0 else 1 if train_metric_name is not None and train_metric_name.startswith('multi_pinball'): d_out = train_metric_name.count(',')+1 model = Model( n_num_features=n_cont_features, cat_cardinalities=cat_cardinalities, n_classes=d_out, backbone={ 'type': 'MLP', 'n_blocks': n_blocks if n_blocks != 'auto' else (3 if bins is None else 2), 'd_block': d_block, 'dropout': dropout, }, bins=bins, num_embeddings=( None if bins is None else { 'type': 'PiecewiseLinearEmbeddings', 'd_embedding': d_embedding, 'activation': False, 'version': 'B', } ), arch_type=arch_type, k=tabm_k, share_training_batches=share_training_batches, ).to(device) # import tabm # num_embeddings = None if bins is None else rtdl_num_embeddings.PiecewiseLinearEmbeddings( # bins=bins, # d_embedding=d_embedding, # activation=False, # version='B', # ) # model = tabm.TabM( # n_num_features=n_cont_features, # cat_cardinalities=cat_cardinalities, # d_out = n_classes if n_classes > 0 else 1, # num_embeddings = num_embeddings, # n_blocks=n_blocks if n_blocks != 'auto' else (3 if bins is None else 2), # d_block=d_block, # dropout=dropout, # arch_type=arch_type, # k=tabm_k, # # todo: can introduce activation # share_training_batches=share_training_batches, # todo: disappeared? 
# ) optimizer = torch.optim.AdamW(make_parameter_groups(model), lr=lr, weight_decay=weight_decay) if compile_model: # NOTE # `torch.compile` is intentionally called without the `mode` argument # (mode="reduce-overhead" caused issues during training with torch==2.0.1). model = torch.compile(model) evaluation_mode = torch.no_grad else: evaluation_mode = torch.inference_mode @torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype) # type: ignore[code] def apply_model(part: str, idx: torch.Tensor) -> torch.Tensor: return ( model( data[part]['x_cont'][idx], data[part]['x_cat'][idx] if 'x_cat' in data[part] else None, ) .float() ) if train_metric_name is None: train_metric_name = 'mse' if self.n_classes_ == 0 else 'cross_entropy' if train_metric_name == 'mse': base_loss_fn = torch.nn.functional.mse_loss elif train_metric_name == 'cross_entropy': base_loss_fn = lambda a, b: torch.nn.functional.cross_entropy(a, b.squeeze(-1)) else: base_loss_fn = functools.partial(Metrics.apply, metric_name=train_metric_name) def loss_fn(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor: # TabM produces k predictions per object. Each of them must be trained separately. # (regression) y_pred.shape == (batch_size, k) # (classification) y_pred.shape == (batch_size, k, n_classes) k = y_pred.shape[1] # print(f'{y_pred.flatten(0, 1).shape=}, {y_true.shape=}') return base_loss_fn( y_pred.flatten(0, 1), y_true.repeat_interleave(k) if model.share_training_batches else y_true, ) @evaluation_mode() def evaluate(part: str) -> float: model.eval() # When using torch.compile, you may need to reduce the evaluation batch size. eval_batch_size = 1024 y_pred: torch.Tensor = ( torch.cat( [ apply_model(part, idx) for idx in torch.arange(len(data[part]['y']), device=device).split( eval_batch_size ) ] ) ) if task_type == 'regression': # Transform the predictions back to the original label space. y_pred = y_pred * self.y_std_ + self.y_mean_ # Compute the mean of the k predictions. average_logits = self.config.get('average_logits', False) if average_logits: y_pred = y_pred.mean(dim=1) if task_type != 'regression': # For classification, the mean must be computed in the probability space. y_pred = y_pred.softmax(dim=-1) if not average_logits: y_pred = y_pred.mean(dim=1) y_true = data[part]['y'].cpu() y_pred = y_pred.cpu() if task_type == 'regression' and len(y_true.shape) == 1: y_true = y_true.unsqueeze(-1) if task_type == 'regression' and len(y_pred.shape) == 1: y_pred = y_pred.unsqueeze(-1) # use minus so higher=better score = -Metrics.apply(y_pred, y_true, val_metric_name).item() return float(score) # The higher -- the better. # print(f'Test score before training: {evaluate("test"):.4f}') epoch_size = math.ceil(n_train / batch_size) best = { 'val': -math.inf, # 'test': -math.inf, 'epoch': -1, } best_params = [p.clone() for p in model.parameters()] # Early stopping: the training stops when # there are more than `patience` consecutive bad updates. 
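        # For example, with patience=16, `remaining_patience` counts down 16 -> 15 -> ... -> 0 -> -1
        # over consecutive epochs without a new best validation score, so training stops after 17
        # consecutive non-improving epochs; any improvement resets the counter to `patience`.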
remaining_patience = patience try: if self.config.get('verbosity', 0) >= 1: from tqdm.std import tqdm else: tqdm = lambda arr, desc: arr except ImportError: tqdm = lambda arr, desc: arr logger.log(1, '-' * 88 + '\n') for epoch in range(n_epochs): batches = ( torch.randperm(n_train, device=device).split(batch_size) if model.share_training_batches else [ x.transpose(0, 1).flatten() for x in torch.rand((model.k, n_train), device=device).argsort(dim=1).split(batch_size, dim=1) ] ) model.train() for batch_idx in tqdm(batches, desc=f"Epoch {epoch}"): optimizer.zero_grad(set_to_none=True) preds = apply_model('train', batch_idx) loss = loss_fn(preds, Y_train[batch_idx]) if grad_scaler is None: loss.backward() if gradient_clipping_norm not in (None, 'none'): torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping_norm) # type: ignore optimizer.step() else: grad_scaler.scale(loss).backward() if gradient_clipping_norm not in (None, 'none'): # unscale before clipping so the grads are in FP32 grad_scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping_norm) # type: ignore grad_scaler.step(optimizer) grad_scaler.update() val_score = evaluate('val') # test_score = evaluate('test') # logger.log(1, f'(val) {val_score:.4f} (test) {test_score:.4f}') logger.log(1, f'(val) {val_score:.4f}') if val_score > best['val']: logger.log(1, '🌸 New best epoch! 🌸') # best = {'val': val_score, 'test': test_score, 'epoch': epoch} best = {'val': val_score, 'epoch': epoch} remaining_patience = patience with torch.no_grad(): for bp, p in zip(best_params, model.parameters()): bp.copy_(p) else: remaining_patience -= 1 if remaining_patience < 0: break logger.log(1, '') logger.log(1, '\n\nResult:') logger.log(1, str(best)) logger.log(1, f'Restoring best model') with torch.no_grad(): for bp, p in zip(best_params, model.parameters()): p.copy_(bp) self.model_ = model return None def predict(self, ds: DictDataset) -> torch.Tensor: self.model_.eval() ds = self.tfm_(ds).to(self.device_) ds.tensors['x_cont'] = ds.tensors['x_cont'][:, self.num_col_mask_] eval_batch_size = 1024 with torch.no_grad(): y_pred: torch.Tensor = ( torch.cat( [ self.model_( ds.tensors['x_cont'][idx], ds.tensors['x_cat'][idx] if not ds.tensor_infos['x_cat'].is_empty() else None, ) .float() for idx in torch.arange(ds.n_samples, device=self.device_).split( eval_batch_size ) ] ) ) if self.task_type_ == 'regression': # Transform the predictions back to the original label space. y_pred = y_pred.mean(1) y_pred = y_pred * self.y_std_ + self.y_mean_ if self.config.get('clamp_output', False): y_pred = torch.clamp(y_pred, self.y_min_, self.y_max_) else: average_logits = self.config.get('average_logits', False) if average_logits: y_pred = y_pred.mean(1) else: # For classification, the mean must be computed in the probability space. 
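                # With k heads producing logits z_1, ..., z_k: average_logits=True returns mean_i(z_i),
                # while the default path below returns log(mean_i(softmax(z_i)) + 1e-30), i.e., the
                # log-probabilities of the probability-space average of the k predictions.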
y_pred = torch.log(torch.softmax(y_pred, dim=-1).mean(1) + 1e-30) return y_pred[None].cpu() # add n_models dimension def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_samples*n_features': 8e-8} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} gpu_ram_params = {'': 0.5, 'ds_onehot_size_gb': 5.0, 'n_train': 6e-6, 'n_features': 1.5e-3, # reduced from 2e-3 'cat_size_sum': 1e-4} # reduced from 1e-3 rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.02) # , gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) class RandomParamsTabMAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from Grinsztajn et al. (2022) hpo_space_name = self.config.get('hpo_space_name', 'default') if hpo_space_name == 'default': params = { "batch_size": "auto", "patience": 16, "allow_amp": True, "arch_type": "tabm-mini", "tabm_k": 32, # "gradient_clipping_norm": 1.0, # wasn't correctly implemented so we remove it in v1.7.0 # this makes it probably slower with numerical embeddings, and also more RAM intensive # according to the paper it's not very important but should be a bit better (?) "share_training_batches": False, "lr": np.exp(rng.uniform(np.log(1e-4), np.log(3e-3))), "weight_decay": rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-1)))]), "n_blocks": rng.choice([1, 2, 3, 4]), "d_block": rng.choice([i for i in range(64, 1024 + 1) if i % 16 == 0]), "dropout": rng.choice([0.0, rng.uniform(0.0, 0.5)]), # numerical embeddings "num_emb_type": "pwl", "d_embedding": rng.choice([i for i in range(8, 32 + 1) if i % 4 == 0]), "num_emb_n_bins": rng.integers(2, 128, endpoint=True), } elif hpo_space_name == 'tabarena': params = { "batch_size": "auto", "patience": 16, "allow_amp": False, # only for GPU, maybe we should change it to True? "arch_type": "tabm-mini", "tabm_k": 32, # "gradient_clipping_norm": 1.0, # wasn't correctly implemented so we remove it in v1.7.0 # this makes it probably slower with numerical embeddings, and also more RAM intensive # according to the paper it's not very important but should be a bit better (?) 
"share_training_batches": False, "lr": np.exp(rng.uniform(np.log(1e-4), np.log(3e-3))), "weight_decay": rng.choice([0.0, np.exp(rng.uniform(np.log(1e-4), np.log(1e-1)))]), # removed n_blocks=1 according to Yury Gurishniy's advice "n_blocks": rng.choice([2, 3, 4, 5]), # increased lower limit from 64 to 128 according to Yury Gorishniy's advice "d_block": rng.choice([i for i in range(128, 1024 + 1) if i % 16 == 0]), "dropout": rng.choice([0.0, rng.uniform(0.0, 0.5)]), # numerical embeddings "num_emb_type": "pwl", "d_embedding": rng.choice([i for i in range(8, 32 + 1) if i % 4 == 0]), "num_emb_n_bins": rng.integers(2, 128, endpoint=True), } else: raise ValueError(f'Unknown {hpo_space_name=}') return params def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([TabMSubSplitInterface(**config) for i in range(n_tv_splits)]) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return TabMSubSplitInterface(**self.config).get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) ================================================ FILE: pytabkit/models/alg_interfaces/tabr_interface.py ================================================ from typing import List, Any, Optional, Dict, Tuple from pathlib import Path import numpy as np import pandas as pd import torch from sklearn.preprocessing import StandardScaler, OrdinalEncoder from sklearn.impute import SimpleImputer from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, SingleSplitAlgInterface, \ RandomParamsAlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources, SubSplitIdxs from pytabkit.models.alg_interfaces.rtdl_interfaces import choose_batch_size_rtdl_new from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.logging import Logger from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.nn_models.tabr import TabrLightning, TabrModel from pytabkit.models.nn_models.tabr_context_freeze import TabrModelContextFreeze, TabrLightningContextFreeze from pytabkit.models.training.metrics import insert_missing_class_columns import torch.utils.data try: import lightning.pytorch as pl from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint except ImportError: import pytorch_lightning as pl from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint class ExceptionPrintingCallback(pl.callbacks.Callback): def on_exception(self, trainer, pl_module, exception): import traceback print(f'caught exception') traceback.print_exception(exception) class TabRSubSplitInterface(AlgInterface): def __init__(self, **config): super().__init__(**config) self.tfm = None self.n_classes = None self.model = None self.train_ds = None def create_model(self, n_num_features, n_bin_features, cat_cardinalities, n_classes, freeze_contexts_after_n_epochs: Optional[int]) -> Any: params_config = [ ('num_embeddings', None, None), ('d_main', None), ('d_multiplier', None), ('encoder_n_blocks', None), ('predictor_n_blocks', None), ('mixer_normalization', None), 
('context_dropout', None), ('dropout0', None), ('dropout1', None), ('normalization', None), ('activation', None), # The following options should be used only when truly needed. ('memory_efficient', None), ('candidate_encoding_batch_size', None), ('add_scaling_layer', None), ('scale_lr_factor', None), ('use_ntp_linear', None), ('linear_init_type', None), ('use_ntp_encoder', None), ] params = utils.extract_params(self.config, params_config) if freeze_contexts_after_n_epochs is not None: return TabrModelContextFreeze( n_num_features=n_num_features, n_bin_features=n_bin_features, cat_cardinalities=cat_cardinalities, n_classes=n_classes, **params ) else: return TabrModel( n_num_features=n_num_features, n_bin_features=n_bin_features, cat_cardinalities=cat_cardinalities, n_classes=n_classes, **params ) def infer_batch_size(self, n_samples_train: int) -> int: # taken from tabr paper table 14 # the cutoffs might not be exactly the same if n_samples_train < 10_000: return 128 elif n_samples_train < 30_000: return 256 elif n_samples_train < 200_000: return 512 else: return 1024 def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 pl.seed_everything(idxs_list[0].sub_split_seeds[0]) use_deterministic_before = torch.are_deterministic_algorithms_enabled() torch.use_deterministic_algorithms(False) self.n_classes = ds.get_n_classes() train_idxs = idxs_list[0].train_idxs[0] val_idxs = idxs_list[0].val_idxs[0] if idxs_list[0].val_idxs is not None else None train_ds = ds.get_sub_dataset(train_idxs) self.train_ds = train_ds is_cv = val_idxs is not None val_ds = ds.get_sub_dataset(val_idxs) if is_cv else None # create preprocessing factory factory = self.config.get('factory', None) if factory is None: factory = PreprocessingFactory(**self.config) # transform according to factory fitter = factory.create(ds.tensor_infos) if is_cv: trainval_ds = ds.get_sub_dataset(torch.cat([train_idxs, val_idxs], dim=0)) else: trainval_ds = train_ds self.tfm = fitter.fit(trainval_ds) train_ds = self.tfm.forward_ds(train_ds) if is_cv: val_ds = self.tfm.forward_ds(val_ds) y = train_ds.tensors['y'] if is_cv: y_val = val_ds.tensors['y'] # equivalent of sklearn's TransformedTargetRegressor transformed_target = self.config.get("transformed_target", False) if transformed_target: #do TransformedTargetRegressor by hand (because setting the # validation set in skorch conflicts with TransformedTargetRegressor) self.transformer_mean = y.mean() self.transformer_std = y.std() y = (y - self.transformer_mean) / self.transformer_std if is_cv: y_val = (y_val - self.transformer_mean) / self.transformer_std else: self.transformer_mean = None self.transformer_std = None # create datasets for pytorch lightning X_num = train_ds.tensors['x_cont'] X_cat = train_ds.tensors['x_cat'] # separate bin and cat cat_sizes = train_ds.tensor_infos['x_cat'].get_cat_sizes() cat_sizes = cat_sizes - 1 # cat sizes contains the size + 1 for unknown values #TODO: I think we could do something cleaner binary_indicator = cat_sizes == 2 to_drop_indicator = cat_sizes <= 1 #TODO: this should be dealt with in the converter or the factory cat_indicator = (~to_drop_indicator) & (~binary_indicator) X_bin = train_ds.tensors['x_cat'][:, binary_indicator] X_cat = train_ds.tensors['x_cat'][:, cat_indicator] cat_sizes_nonbinary = 
cat_sizes[cat_indicator].tolist()

        # create validation dataset
        if is_cv:
            X_num_val = val_ds.tensors['x_cont']
            X_cat_val = val_ds.tensors['x_cat']
            # separate bin and cat
            X_bin_val = val_ds.tensors['x_cat'][:, binary_indicator]
            X_cat_val = val_ds.tensors['x_cat'][:, cat_indicator]

        # We need to do ordinal encoding again here to prevent holes in the categories
        if X_cat.shape[1] > 0:
            # missing values were encoded as 0 in ToDictDatasetConverter, we need to make them missing again
            self.replace_zero_by_nans = SimpleImputer(missing_values=0., strategy="constant", fill_value=np.nan)
            self.ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1,
                                          encoded_missing_value=-1)
            # apparently it doesn't work on the integer tensor
            X_cat = self.replace_zero_by_nans.fit_transform(X_cat.float())
            X_cat = torch.from_numpy(self.ord_enc.fit_transform(X_cat))
            if is_cv:
                X_cat_val = self.replace_zero_by_nans.transform(X_cat_val.float())
                X_cat_val = torch.from_numpy(self.ord_enc.transform(X_cat_val))
        if X_bin.shape[1] > 0:
            # the ToDictDatasetConverter encoded binary features as 1 and 2
            # we need to encode them as 0 and 1
            X_bin = X_bin - 1
            assert torch.logical_or(
                torch.logical_or(
                    (X_bin == -1),  # missing values were encoded as 0
                    (X_bin == 0)
                ),
                (X_bin == 1)).all()
            # replace -1 by 0.5
            X_bin[X_bin == -1] = 0.5
            if is_cv:
                X_bin_val = X_bin_val - 1
                X_bin_val[X_bin_val == -1] = 0.5

        from skorch.dataset import Dataset

        class TabrDataset(Dataset):
            def __init__(self, X_num, X_bin, X_cat, Y):
                self.data = {
                    "Y": Y.reshape(-1)
                }
                if X_num.shape[1] > 0:
                    self.data["X_num"] = X_num.float()
                if X_bin.shape[1] > 0:
                    self.data["X_bin"] = X_bin.long()
                if X_cat.shape[1] > 0:
                    self.data["X_cat"] = X_cat.long()
                self.size = len(Y)

            def __len__(self):
                return self.size

            def __getitem__(self, idx):
                return {"indices": idx}

        train_dataset = TabrDataset(
            X_num,
            X_bin,
            X_cat,
            y,
        )
        if is_cv:
            val_dataset = TabrDataset(
                X_num_val,
                X_bin_val,
                X_cat_val,
                y_val,
            )
        else:
            raise NotImplementedError('training without a validation set is not implemented')
        n_train = idxs_list[0].n_train
        min_context_freeze_train_size = self.config.get('min_context_freeze_train_size', 0)
        freeze_contexts_after_n_epochs = self.config.get('freeze_contexts_after_n_epochs', None)
        if n_train < min_context_freeze_train_size:
            freeze_contexts_after_n_epochs = None  # don't freeze
        torch_model = self.create_model(
            n_num_features=X_num.shape[1],
            n_bin_features=X_bin.shape[1],
            cat_cardinalities=cat_sizes_nonbinary,  # we could save a little memory
            # by recomputing the cardinality on train only, but let's keep it simple
            n_classes=self.n_classes if self.n_classes > 0 else None,
            freeze_contexts_after_n_epochs=freeze_contexts_after_n_epochs
        )
        # set batch size if auto
        if self.config.get('batch_size', None) == 'auto':
            self.config['batch_size'] = self.infer_batch_size(len(y))
        self.config["n_threads"] = interface_resources.n_threads
        self.config["verbosity"] = self.config.get("verbosity", 0)
        class_to_use = TabrLightningContextFreeze if freeze_contexts_after_n_epochs is not None else TabrLightning
        self.model = class_to_use(
            torch_model,
            train_dataset,
            val_dataset,
            C=self.config,
            n_classes=self.n_classes,
        )
        if self.n_classes > 0:
            val_metric_name = self.config.get('val_metric_name', 'class_error')
            if val_metric_name == 'class_error':
                es_callback = EarlyStopping(monitor='val_accuracy', patience=self.config["patience"], mode='max')
                checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_accuracy", mode="max",
                                                      dirpath=tmp_folders[0])
            elif val_metric_name == 'cross_entropy':
                print('Early stopping on cross-entropy loss')
                es_callback = EarlyStopping(monitor='val_loss', patience=self.config["patience"], mode='min')
                checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min",
                                                      dirpath=tmp_folders[0])
            else:
                raise ValueError(f'Validation metric {val_metric_name} not implemented for TabR')
        else:
            es_callback = EarlyStopping(monitor='val_loss', patience=self.config["patience"], mode='min')
            checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min",
                                                  dirpath=tmp_folders[0])
        gpu_devices = interface_resources.gpu_devices
        print("gpu_devices", gpu_devices)
        self.device = gpu_devices[0] if len(gpu_devices) > 0 else 'cpu'
        if self.device == 'cpu':
            pl_accelerator = 'cpu'
            pl_devices = 'auto'
        elif self.device == 'mps':
            pl_accelerator = 'mps'
            pl_devices = 'auto'
        elif self.device == 'cuda':
            pl_accelerator = 'gpu'
            pl_devices = [0]
        elif self.device.startswith('cuda:'):
            pl_accelerator = 'gpu'
            pl_devices = [int(self.device[len('cuda:'):])]
        else:
            raise ValueError(f'Unknown device "{self.device}"')
        self.trainer = pl.Trainer(
            accelerator=pl_accelerator,
            devices=pl_devices,
            deterministic=True,
            callbacks=[es_callback, checkpoint_callback, ExceptionPrintingCallback()],
            max_epochs=self.config["n_epochs"],
            enable_progress_bar=self.config["verbosity"] > 0,
            enable_model_summary=self.config["verbosity"] > 0,
            logger=pl.loggers.logger.DummyLogger(),
        )
        self.trainer.fit(self.model)
        if self.config["verbosity"] > 0:
            print("path to best model", checkpoint_callback.best_model_path)  # prints the path to the best model's checkpoint
            print("best score", checkpoint_callback.best_model_score)  # and prints its score
        # load best model
        class_to_use = TabrLightningContextFreeze if freeze_contexts_after_n_epochs is not None else TabrLightning
        self.model = class_to_use.load_from_checkpoint(checkpoint_callback.best_model_path,
                                                       model=torch_model,
                                                       train_dataset=train_dataset,
                                                       val_dataset=val_dataset,
                                                       C=self.config,
                                                       n_classes=self.n_classes,
                                                       )
        torch.use_deterministic_algorithms(use_deterministic_before)
        return None

    def predict(self, ds: DictDataset) -> torch.Tensor:
        # adapted from SklearnSubSplitLearner
        # should return tensor of shape len(ds) x output_shape
        use_deterministic_before = torch.are_deterministic_algorithms_enabled()
        torch.use_deterministic_algorithms(False)
        if self.tfm is not None:
            ds = self.tfm.forward_ds(ds)
        X_num = ds.tensors['x_cont']
        X_cat = ds.tensors['x_cat']
        # separate bin and cat
        cat_sizes = ds.tensor_infos['x_cat'].get_cat_sizes()
        cat_sizes = cat_sizes - 1  # cat sizes contains the size + 1 for missing values
        binary_indicator = cat_sizes == 2
        to_drop_indicator = cat_sizes <= 1
        cat_indicator = (~to_drop_indicator) & (~binary_indicator)
        X_bin = ds.tensors['x_cat'][:, binary_indicator]
        X_cat = ds.tensors['x_cat'][:, cat_indicator]
        # We need to do ordinal encoding again here to prevent holes in the categories
        if X_cat.shape[1] > 0:
            X_cat = self.replace_zero_by_nans.transform(X_cat.float())
            X_cat = torch.from_numpy(self.ord_enc.transform(X_cat))
        if X_bin.shape[1] > 0:
            # the ToDictDatasetConverter encoded binary features as 1 and 2
            # we need to encode them as 0 and 1
            X_bin = X_bin - 1
            assert torch.logical_or(
                torch.logical_or(
                    (X_bin == -1),  # missing values were encoded as 0
                    (X_bin == 0)
                ),
                (X_bin == 1)).all()
            # replace -1 by 0.5
            X_bin[X_bin == -1] = 0.5
        from skorch.dataset import Dataset

        class TabrDatasetTest(Dataset):
            def __init__(self, X_num, X_bin, X_cat):
                self.data = {}
                if X_num.shape[1] > 0:
                    self.data["X_num"] = X_num.float()
                    self.size = len(X_num)
                if X_bin.shape[1] > 0:
self.data["X_bin"] = X_bin.long() self.size = len(X_bin) if X_cat.shape[1] > 0: self.data["X_cat"] = X_cat.long() self.size = len(X_cat) def __len__(self): return self.size def __getitem__(self, idx): return { key: self.data[key][idx] for key in self.data } test_dataset = TabrDatasetTest( X_num, X_bin, X_cat, ) # create a dataloader test_dataloader = torch.utils.data.DataLoader( test_dataset, batch_size=self.config["eval_batch_size"], shuffle=False, num_workers=0, #min(self.config["n_threads"] - 1, 16) ) y_pred = self.trainer.predict(self.model, test_dataloader) y_pred = torch.cat(y_pred, dim=0) # guard against missing classes in the training set # (GBDT interfaces don't need this because they get passed n_classes as a parameter) y_pred = insert_missing_class_columns(y_pred, self.train_ds) # inverse transform for y (like in TransformedTargetRegressor) if self.transformer_mean is not None: y_pred = y_pred * self.transformer_std + self.transformer_mean torch.use_deterministic_algorithms(use_deterministic_before) return y_pred[None] # add vectorized dimension def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 has_num_emb = self.config.get('num_embeddings', None) is not None if has_num_emb: num_emb_dict = self.config['num_embeddings'] num_emb_size_factor = 1.0 + 0.2 * (num_emb_dict.get('n_frequencies', 8) + num_emb_dict.get('d_embedding', 4)) else: num_emb_size_factor = 1.0 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=1), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 1e-4} ram_params = {'': 4, 'ds_onehot_size_gb': 1.5} gpu_ram_params = {'': 5, 'n_features': num_emb_size_factor * 1e-4, "n_train": 3e-5, 'n_features*n_train': num_emb_size_factor * 0.5e-7, 'n_classes': 0.04} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.3) #, gpu_ram_params) return rc.get_required_resources(ds, n_train=n_train) class RandomParamsTabRAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) hpo_space_name = self.config.get('hpo_space_name', 'tabr') if hpo_space_name == 'tabr': params = { # reduced d_layers "d_main": rng.choice(np.arange(96, 385)), "context_dropout": rng.uniform(0.0, 0.6), "dropout0": rng.uniform(0.0, 0.6), "dropout1": 0.0, "optimizer": { "type": "AdamW", "lr": np.exp(rng.uniform(np.log(3e-5), np.log(1e-3))), "weight_decay": rng.choice([0, np.exp(rng.uniform(np.log(1e-6), np.log(1e-4)))]) # paper says 1e-3 but logs on github say 1e-4 for upper bound }, "encoder_n_blocks": rng.choice([0, 1]), "predictor_n_blocks": rng.choice([1, 2]), "num_embeddings": { "type": "PLREmbeddings", "n_frequencies": rng.choice(np.arange(16, 97)), "d_embedding": rng.choice(np.arange(16, 65)), "frequency_scale": np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), "lite": True, }, } if is_classification: params = utils.join_dicts(DefaultParams.TABR_S_D_CLASS, params) else: params = utils.join_dicts(DefaultParams.TABR_S_D_REG, params) elif hpo_space_name == 'realtabr': tfms_list = [['quantile_tabr'], ['median_center', 'robust_scale', 'smooth_clip']] params = { # reduced d_layers "d_main": rng.choice(np.arange(96, 385)), "context_dropout": rng.uniform(0.0, 0.6), "dropout0": rng.uniform(0.0, 0.6), "dropout1": 0.0, "optimizer": { 
"type": "AdamW", "lr": np.exp(rng.uniform(np.log(3e-5), np.log(1e-3))), "weight_decay": rng.choice([0, np.exp(rng.uniform(np.log(1e-6), np.log(1e-4)))]), # paper says 1e-3 but logs on github say 1e-4 for upper bound "betas": (0.9, rng.choice([0.95, 0.999])), }, "encoder_n_blocks": rng.choice([0, 1]), "predictor_n_blocks": rng.choice([1, 2]), "num_embeddings": { "type": "PBLDEmbeddings", # use factor 2 since it results in the same hidden dimension # as for PLR without the factor 2 because of the concat(sin, cos) thing "n_frequencies": 2*rng.choice(np.arange(16, 97)), "d_embedding": rng.choice(np.arange(16, 65)), "frequency_scale": np.exp(rng.uniform(np.log(1e-2), np.log(1e2))), }, "ls_eps": rng.choice([0.0, 0.1]), 'tfms': tfms_list[rng.choice(np.arange(len(tfms_list)))], 'add_scaling_layer': rng.choice([True, False]), 'scale_lr_factor': 96, } if is_classification: params = utils.join_dicts(DefaultParams.RealTABR_D_CLASS, params) else: params = utils.join_dicts(DefaultParams.RealTABR_D_REG, params) else: raise ValueError(f'Unknown HPO space name "{hpo_space_name}"') return params def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/alg_interfaces/xgboost_interfaces.py ================================================ import copy from pathlib import Path from typing import Optional, Dict, Any, Tuple, List, Union import numpy as np import torch from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.resource_params import ResourceParams from pytabkit.models import utils from pytabkit.models.alg_interfaces.base import RequiredResources from pytabkit.models.alg_interfaces.sub_split_interfaces import TreeBasedSubSplitInterface, \ SingleSplitWrapperAlgInterface, \ SklearnSubSplitInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer from pytabkit.models.alg_interfaces.alg_interfaces import OptAlgInterface, AlgInterface, RandomParamsAlgInterface from pytabkit.models.training.metrics import Metrics class XGBCustomMetric: def __init__(self, metric_names: Union[str, List[str]], is_classification: bool, is_higher_better: bool = False): self.metric_names = metric_names self.is_classification = is_classification self.is_higher_better = is_higher_better def __call__(self, y_pred: np.ndarray, dtrain): # dtrain should be of type xgb.DMatrix y = torch.as_tensor(dtrain.get_label(), dtype=torch.long if self.is_classification else torch.float32) if len(y.shape) == 1: y = y[:, None] # print(f'{y_pred.shape=}, {eval_data.get_label().shape=}') y_pred = torch.as_tensor(y_pred, dtype=torch.float32) if len(y_pred.shape) == 1: if self.is_classification: if y_pred.shape[0] == y.shape[0]: # binary classification, transform into both class probabilities y_pred = torch.stack([1. 
- y_pred, y_pred], dim=-1) else: # bugged multiclass classification in LightGBM, need to reshape # print(y_pred[:7]) y_pred = y_pred.view(-1, y.shape[0]).t().contiguous() # print(y_pred[0, :].sum()) else: y_pred = y_pred[:, None] if self.is_classification: # go from probabilities to logits y_pred = torch.log(y_pred + 1e-30) # print(f'{y_pred[4]=}') # print(f'{torch.min(y_pred).item()=}') # print(f'{np.asarray(dtrain.get_data())[4]=}') # print(f'{y_pred.shape=}, {y.shape=}') # print(f'{y_pred=}, {y=}') if isinstance(self.metric_names, str): return self.metric_names, Metrics.apply(y_pred, y, metric_name=self.metric_names).item() elif isinstance(self.metric_names, list): results = [(metric_name, Metrics.apply(y_pred, y, metric_name=metric_name).item()) for metric_name in self.metric_names] # print(results) return results else: raise ValueError(f'Invalid {type(self.metric_names)=}') class XGBSklearnSubSplitInterface(SklearnSubSplitInterface): def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str]) -> Any: params_config = [('n_estimators', None), ('verbosity', None), ('max_depth', None), ('eta', ['lr', 'learning_rate', 'eta']), ('subsample', None), ('colsample_bytree', None), ('colsample_bylevel', None), ('colsample_bynode', None), ('alpha', ['alpha', 'reg_alpha']), ('lambda', ['lambda', 'reg_lambda']), ('gamma', ['gamma', 'reg_gamma']), ('tree_method', None), ('min_child_weight', None), ('max_delta_step', None), ('max_cat_to_onehot', ['max_cat_to_onehot', 'max_onehot_cat_size', 'one_hot_max_size'], None), ('num_parallel_tree', None), ('max_bin', None), ('nthread', ['nthread', 'n_threads'], n_threads), ] params = utils.extract_params(self.config, params_config) if self.n_classes > 0: from xgboost import XGBClassifier return XGBClassifier(random_state=seed, **params) else: from xgboost import XGBRegressor return XGBRegressor(random_state=seed, **params) def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_depth=6), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.xgb_class_time, cpu_ram_params=ResourceParams.xgb_class_ram) return rc.get_required_resources(ds) class XGBSubSplitInterface(TreeBasedSubSplitInterface): # for RF: https://xgboost.readthedocs.io/en/latest/tutorials/rf.html def _get_params(self): # n_estimators is not set in params but directly in bst.fit() below params_config = [('verbosity', None, 0), ('max_depth', None, 6), ('eta', ['lr', 'learning_rate', 'eta'], 0.3), ('subsample', None, 1.0), ('colsample_bytree', None, 1.0), ('colsample_bylevel', None, 1.0), ('colsample_bynode', None, 1.0), ('alpha', ['alpha', 'reg_alpha'], 0.0), ('lambda', ['lambda', 'reg_lambda'], 1.0), ('gamma', ['gamma', 'reg_gamma'], 0.0), ('tree_method', None, 'auto'), ('min_child_weight', None), ('max_delta_step', None), ('max_cat_to_onehot', ['max_cat_to_onehot', 'max_onehot_cat_size', 'one_hot_max_size'], None), ('num_parallel_tree', None), ('max_bin', None), ('multi_strategy', None), ('grow_policy', None), ('max_leaves', None), ] params = utils.extract_params(self.config, params_config) if self.config.get('use_gpu', False): params['tree_method'] = 'gpu_hist' return params def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': assert n_refit == 1 return 
XGBSubSplitInterface(fit_params=fit_params or self.fit_params, **self.config) # adapted from https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/xgboost_experiment.py def _preprocess_params(self, params: Dict[str, Any], n_classes: int) -> Dict[str, Any]: params = copy.deepcopy(params) if n_classes == 0: train_metric_name = self.config.get('train_metric_name', 'mse') # val_metric_name = self.config.get('val_metric_name', 'rmse') if train_metric_name == 'mse': params['objective'] = 'reg:squarederror' # params['eval_metric'] = 'rmse' elif train_metric_name.startswith('pinball('): quantile = float(train_metric_name[len('pinball('):-1]) params['objective'] = 'reg:quantileerror' params['quantile_alpha'] = quantile else: raise ValueError(f'Train metric "{train_metric_name}" is currently not supported!') # params.update({'objective': 'reg:squarederror', 'eval_metric': 'rmse'}) elif n_classes == 2: params.update({'objective': 'binary:logistic'}) elif n_classes > 2: params.update({'objective': 'multi:softprob', 'num_class': n_classes}) if n_classes <= 2 and 'multi_strategy' in params: del params['multi_strategy'] # could use gpu using # param['gpu_id'] = 0 # param['tree_method'] = 'gpu_hist' params['max_depth'] = int(params['max_depth']) return params def _convert_ds(self, ds: DictDataset) -> Any: import xgboost as xgb label = None if 'y' not in ds.tensors else ds.tensors['y'].cpu().numpy() has_cat = 'x_cat' in ds.tensor_infos and ds.tensor_infos['x_cat'].get_n_features() > 0 x_df = ds.without_labels().to_df() # print(f'{x_df.iloc[4 if ds.n_samples < 1000 else 240]=}') # print([x_df[col].cat.categories.tolist() for col in x_df.select_dtypes(include="category").columns]) return xgb.DMatrix(x_df, label, enable_categorical=has_cat) def _fit(self, train_ds: DictDataset, val_ds: Optional[DictDataset], params: Dict[str, Any], seed: int, n_threads: int, val_metric_name: Optional[str] = None, tmp_folder: Optional[Path] = None) -> Tuple[Any, Optional[List[float]]]: import xgboost as xgb # print(f'Fitting XGBoost') n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() params = self._preprocess_params(params, n_classes) params.update({'seed': seed, 'nthread': n_threads}) evals = [] if val_ds is None else [(self._convert_ds(val_ds), 'val')] evals_result = {} custom_metric = None eval_metric_name = None val_metric_names = self.config.get('val_metric_names', None) if val_ds is not None: # print(f'{val_ds.n_samples=}') if val_metric_names is not None: eval_metric_name = val_metric_names[0] custom_metric = XGBCustomMetric(val_metric_names, is_classification=n_classes > 0) else: # single validation metric if val_metric_name is None: val_metric_name = 'class_error' if n_classes > 0 else 'rmse' if val_metric_name == 'class_error': eval_metric_name = 'error' if n_classes == 2 else 'merror' elif val_metric_name == 'cross_entropy': eval_metric_name = 'logloss' if n_classes == 2 else 'mlogloss' elif val_metric_name == 'rmse': eval_metric_name = 'rmse' elif val_metric_name == 'mae': eval_metric_name = 'mae' else: eval_metric_name = val_metric_name custom_metric = XGBCustomMetric(val_metric_name, is_classification=n_classes > 0) if custom_metric is None: params['eval_metric'] = eval_metric_name else: params['disable_default_eval_metric'] = True extra_train_params = {} if val_ds is not None and 'early_stopping_rounds' in self.config: extra_train_params['early_stopping_rounds'] = self.config['early_stopping_rounds'] n_estimators = self.config.get('n_estimators', 1000) if 'n_estimators' in params: 
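            # (refit path: fit() stores fit_params = [dict(n_estimators=best_iteration + 1)] based on
            # the validation scores, and get_refit_interface passes them on to be joined into params,
            # so the refit trains for exactly the validated number of boosting rounds)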
# can happen for refit because fit_params are directly joined into params n_estimators = int(params['n_estimators']) bst = xgb.train(params, self._convert_ds(train_ds), evals=evals, evals_result=evals_result, custom_metric=custom_metric, num_boost_round=n_estimators, verbose_eval=False, **extra_train_params) # print(f'xgb train completed') # import psutil # import os # print(f'Memory: {psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3} GB') if val_ds is not None: if val_metric_names is not None: val_errors = {vmn: evals_result['val'][vmn] for vmn in val_metric_names} # for vmn in val_metric_names: # print(f'{vmn=}, {np.argmin(val_errors[vmn])=}, {np.min(val_errors[vmn])=}') else: val_errors = evals_result['val'][eval_metric_name] # print(f'{np.min(val_errors)=}') # print(f'{val_ds.tensors["x_cont"][4]=}') # print(f'{val_ds.tensors["x_cat"][4]=}') # print(f'{self._predict(bst, val_ds, n_classes, dict(n_estimators=np.argmin(val_errors)+1))[4]=}') else: val_errors = None return bst, val_errors def _predict(self, bst, ds: DictDataset, n_classes: int, other_params: Dict[str, Any]) -> torch.Tensor: # bst should be of type xgb.Booster # print(f'XGB _predict() with {other_params=}') # print(f'predict with {ds.n_samples=}, {ds.tensors["x_cont"][4]=}, {ds.tensors["x_cat"][4]=}, {ds.tensors["x_cont"][240]=}, {ds.tensors["x_cat"][240]=}') iteration_range = (0, 0) if other_params is None else (0, int(other_params['n_estimators'])) y_pred = torch.as_tensor(bst.predict(self._convert_ds(ds), iteration_range=iteration_range), dtype=torch.float32) if n_classes == 0: y_pred = y_pred.unsqueeze(-1) elif n_classes == 2: y_pred = torch.stack([1. - y_pred, y_pred], dim=-1) if n_classes >= 2: y_pred = torch.log(y_pred + 1e-30) # print(f'min: {torch.min(y_pred).item():g}, max: {torch.max(y_pred).item():g}') return y_pred def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=1000, max_depth=6, max_n_threads=8), self.config) rc = ResourcePredictor(config=updated_config, time_params=ResourceParams.xgb_class_time, cpu_ram_params=ResourceParams.xgb_class_ram) return rc.get_required_resources(ds) class XGBHyperoptAlgInterface(OptAlgInterface): def __init__(self, space=None, n_hyperopt_steps: int = 50, **config): from hyperopt import hp default_config = {} max_config = dict() if space is None: space = config.get('hpo_space_name', None) if space == 'catboost_quality_benchmarks': # space from catboost quality benchmarks # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/xgboost_experiment.py # the parameter names in the space are for the alg interface, not directly for the GBDT interface! 
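            # hyperopt sampler semantics used in these spaces: hp.loguniform(label, low, high) draws
            # exp(Uniform(low, high)), so hp.loguniform('eta', -7, 0) yields eta in [exp(-7), 1] ~= [9.1e-4, 1.0];
            # hp.quniform('max_depth', 2, 10, 1) yields floats, which is why _preprocess_params casts
            # max_depth to int before training.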
space = { 'eta': hp.loguniform('eta', -7, 0), 'max_depth': hp.quniform('max_depth', 2, 10, 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]) } default_config = dict(n_estimators=5000) max_config['max_depth'] = 10 elif space == 'NODE' or space == 'popov': # space from NODE paper: # Popov, Morozov, and Babenko, Neural oblivious decision ensembles for deep learning on tabular data # the parameter names in the space are for the alg interface, not directly for the GBDT interface! # same as catboost_quality_benchmarks except with smaller n_estimators space = { 'eta': hp.loguniform('eta', -7, 0), 'max_depth': hp.quniform('max_depth', 2, 10, 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]) } default_config = dict(n_estimators=2048) max_config['max_depth'] = 10 elif space == 'shwartz-ziv': # from Shwartz-Ziv and Armon, Tabular data: Deep learning is not all you need # the TabPFN-Paper uses the same configuration space = { 'n_estimators': hp.quniform('n_estimators', 100, 4000, 1), 'eta': hp.loguniform('eta', -7, 0), 'max_depth': hp.quniform('max_depth', 1, 10, 1), 'subsample': hp.uniform('subsample', 0.2, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1), 'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]) } max_config['max_depth'] = 10 elif space == 'kadra': # from Kadra, Lindauer, Hutter, and Grabocka, Well-tuned Simple Nets Excel on Tabular Datasets space = { 'n_estimators': hp.quniform('n_estimators', 1, 1000, 1), 'eta': hp.loguniform('eta', np.log(1e-3), 0), 'max_depth': hp.quniform('max_depth', 1, 20, 1), 'subsample': hp.uniform('subsample', 0.01, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bynode': hp.uniform('colsample_bynode', 0.1, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 1), 'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(20.0)), 'max_delta_step': hp.quniform('max_delta_step', 0, 10, 1), 'reg_alpha': hp.loguniform('alpha', np.log(1e-10), 0), 'reg_lambda': hp.loguniform('lambda', np.log(1e-10), 0), 'reg_gamma': hp.loguniform('gamma', np.log(1e-10), 0) } max_config['max_depth'] = 20 elif space == 'grinsztajn': # from Grinsztajn, Oyallon, Varoquaux, # Why do tree-based models still outperform deep learning on typical tabular data? 
# they have early-stopping-rounds=20 # they also use XGBClassifier / XGBRegressor from scikit-learn # they also start the random searches with the default hyperparameters of the model # see https://github.com/LeoGrin/tabular-benchmark/blob/main/src/configs/model_configs/xgb_config.py space = { 'eta': hp.loguniform('eta', np.log(1e-5), np.log(0.7)), 'max_depth': hp.quniform('max_depth', 1, 11, 1), 'min_child_weight': hp.qloguniform('min_child_weight', 0.0, np.log(100.0), 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'reg_alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1e-2)), 'reg_lambda': hp.loguniform('lambda', np.log(1.0), np.log(4.0)), 'reg_gamma': hp.loguniform('gamma', np.log(1e-8), np.log(7.0)) } default_config = dict(n_estimators=1000) max_config['max_depth'] = 11 elif space == 'gorishniy': # from Gorishniy, Rubachev, Khrulkov, Babenko, Revisiting Deep Learning Models for Tabular Data # they also have booster = "gbtree" (default), early-stopping-rounds=50, # n_hyperopt_steps=100 space = { 'eta': hp.loguniform('eta', np.log(1e-5), np.log(1.0)), 'max_depth': hp.quniform('max_depth', 3, 10, 1), 'min_child_weight': hp.qloguniform('min_child_weight', np.log(1e-8), np.log(1e5), 1), 'subsample': hp.uniform('subsample', 0.5, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'reg_alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', np.log(1e-8), np.log(1e2))]), 'reg_lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', np.log(1e-8), np.log(1e2))]), 'reg_gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', np.log(1e-8), np.log(1e2))]) } default_config = dict(n_estimators=2000) max_config['max_depth'] = 10 elif space == 'custom-v1': space = { 'eta': hp.loguniform('eta', np.log(2e-3), np.log(0.5)), 'max_depth': hp.quniform('max_depth', 1, 10, 1), 'min_child_weight': hp.qloguniform('min_child_weight', np.log(1e-5), np.log(100.0), 1), 'subsample': hp.uniform('subsample', 0.4, 1), 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1), 'reg_alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)), 'reg_lambda': hp.loguniform('lambda', np.log(1e-8), np.log(4.0)), 'reg_gamma': hp.loguniform('gamma', np.log(1e-8), np.log(7.0)) } default_config = dict(n_estimators=1000) max_config['max_depth'] = 11 config = utils.update_dict(default_config, config) super().__init__(hyper_optimizer=HyperoptOptimizer(space=space, fixed_params=dict(), n_hyperopt_steps=n_hyperopt_steps, **config), max_resource_config=utils.join_dicts(config, max_config), **config) def create_alg_interface(self, n_sub_splits: int, **config) -> AlgInterface: return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**config) for i in range(n_sub_splits)]) class RandomParamsXGBAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): rng = np.random.default_rng(seed) # adapted from Grinsztajn et al. 
(2022) hpo_space_name = self.config.get('hpo_space_name', 'grinsztajn') if hpo_space_name == 'grinsztajn': params = { 'eta': np.exp(rng.uniform(np.log(1e-5), np.log(0.7))), 'max_depth': rng.integers(1, 11, endpoint=True), 'min_child_weight': round(np.exp(rng.uniform(0.0, np.log(100.0)))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-8), np.log(1e-2))), 'reg_lambda': np.exp(rng.uniform(np.log(1.0), np.log(4.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-8), np.log(7.0))) } elif hpo_space_name == 'probclass': params = { 'eta': np.exp(rng.uniform(np.log(1e-3), np.log(0.7))), 'max_depth': rng.integers(1, 11, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-5), np.log(100.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-5), np.log(5.0))) } elif hpo_space_name == 'large': params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(1e-3), np.log(0.7))), 'max_depth': rng.integers(1, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-4), np.log(1.0))) } elif hpo_space_name == 'large-v2': # modified (mostly larger) version of large params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(1e-2), np.log(0.2))), # shrunk 'max_depth': rng.integers(1, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bytree': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.5, 1), 'colsample_bynode': rng.uniform(0.5, 1), # added 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_gamma': np.exp(rng.uniform(np.log(1e-4), np.log(1.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), # added # hard to meta-optimize since 0 is the default # 'max_leaves': round(np.exp(rng.uniform(np.log(1e-1), np.log(64)))) # 'multi_strategy' # 'num_parallel_tree' # makes things slower # 'max_bin' # also makes things slower } elif hpo_space_name == 'large-v3': # shrunk version of large-v2: removed gamma, colsample_bytree params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(1e-2), np.log(8e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), # shrunk 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.6, 1), # shrunk 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), } elif hpo_space_name == 'large-v4': # modified version of large-v3 params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(3e-2), np.log(5e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), 
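# note: endpoint=True makes the upper bound inclusive, so max_depth is sampled uniformly from {3, ..., 10}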
'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(32.0))), # expanded 'subsample': rng.uniform(0.6, 1), # shrunk 'colsample_bylevel': rng.uniform(0.7, 1), # shrunk 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-3), np.log(10.0))), # modified 'reg_lambda': np.exp(rng.uniform(np.log(1e-3), np.log(20.0))), # modified 'grow_policy': rng.choice(['depthwise', 'lossguide']), 'max_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(2048.0)))) # added } elif hpo_space_name == 'large-v5': # shrunk version of large-v3 but without the extra stuff from large-v4 params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(3e-2), np.log(5e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(32.0))), # expanded 'subsample': rng.uniform(0.6, 1), # shrunk 'colsample_bylevel': rng.uniform(0.7, 1), # shrunk 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), # modified 'reg_lambda': np.exp(rng.uniform(np.log(1e-3), np.log(20.0))), # modified } elif hpo_space_name == 'large-v6': # shrunk version of large-v4 params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(3e-2), np.log(5e-2))), # shrunk 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(32.0))), # expanded 'subsample': rng.uniform(0.65, 1), # shrunk 'colsample_bylevel': rng.uniform(0.7, 1), # shrunk 'colsample_bynode': rng.uniform(0.9, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), # modified 'reg_lambda': np.exp(rng.uniform(np.log(1e-3), np.log(20.0))), # modified 'grow_policy': rng.choice(['lossguide']), 'max_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(2048.0)))) # added } elif hpo_space_name == 'large-v7-10k': # large-v3 but with tabrepo lr space and with 10k estimators params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.6, 1), 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), } elif hpo_space_name == 'large-v8-10k': # large-v7-10k but really tuning grow_policy this time (it wasn't picked up before) # also tuning max_leaves (which also wasn't picked up before) params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.6, 1), 'colsample_bynode': rng.uniform(0.5, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), # added 'max_leaves': round(np.exp(rng.uniform(np.log(2.0), np.log(2048.0)))) # added } elif hpo_space_name == 'large-v9-10k': # large-v8-10k but with smaller max_leaves space, # larger lower bound for colsample_bynode and colsample_bylevel params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': 
np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(3, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.5, 1), 'colsample_bylevel': rng.uniform(0.7, 1), 'colsample_bynode': rng.uniform(0.6, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), 'max_leaves': round(np.exp(rng.uniform(np.log(16), np.log(1024.0)))) # added } elif hpo_space_name == 'tabrepo1-es': params = { 'n_estimators': 1000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(0.5), np.log(1.5))), 'colsample_bytree': rng.uniform(0.5, 1), # there is enable_categorical, but I don't know how it makes sense to tune it } elif hpo_space_name == 'tabrepo1-es-10k': params = { 'n_estimators': 10_000, 'early_stopping_rounds': 50, 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(0.5), np.log(1.5))), 'colsample_bytree': rng.uniform(0.5, 1), # there is enable_categorical, but I don't know how it makes sense to tune it } elif hpo_space_name == 'tabarena': params = { 'n_estimators': 10_000, 'early_stopping_rounds': 300, # probably not exactly equivalent to TabArena 'eta': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))), 'max_depth': rng.integers(4, 10, endpoint=True), 'min_child_weight': np.exp(rng.uniform(np.log(1e-3), np.log(5.0))), 'subsample': rng.uniform(0.6, 1), 'colsample_bylevel': rng.uniform(0.6, 1), 'colsample_bynode': rng.uniform(0.6, 1), 'reg_alpha': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'reg_lambda': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))), 'grow_policy': rng.choice(['depthwise', 'lossguide']), 'max_cat_to_onehot': int(np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0)))).item()), 'max_leaves': int(np.floor(np.exp(rng.uniform(np.log(8.0), np.log(1025.0)))).item()), } else: raise ValueError(f'Unknown {hpo_space_name=}') return params def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**config) for i in range(n_tv_splits)]) def get_available_predict_params(self) -> Dict[str, Dict[str, Any]]: return XGBSubSplitInterface(**self.config).get_available_predict_params() def set_current_predict_params(self, name: str) -> None: super().set_current_predict_params(name) ================================================ FILE: pytabkit/models/alg_interfaces/xrfm_interfaces.py ================================================ import contextlib import random from pathlib import Path from typing import Optional, List, Any, Tuple, Dict import numpy as np import torch from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface, \ RandomParamsAlgInterface from pytabkit.models.alg_interfaces.base import RequiredResources, SplitIdxs, InterfaceResources from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.data.data import DictDataset from pytabkit.models.nn_models.base import Fitter from pytabkit.models.nn_models.models import PreprocessingFactory from pytabkit.models.torch_utils import 
get_available_memory_gb from pytabkit.models.training.logging import Logger class xRFMSubSplitInterface(SingleSplitAlgInterface): def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): super().__init__(fit_params=fit_params, **config) def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface': raise NotImplementedError() def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources, logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> Optional[ List[List[List[Tuple[Dict, float]]]]]: assert len(idxs_list) == 1 assert idxs_list[0].n_trainval_splits == 1 torch.set_float32_matmul_precision('highest') seed = idxs_list[0].sub_split_seeds[0] # print(f'Setting seed: {seed}') torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) n_train = idxs_list[0].n_train n_classes = ds.get_n_classes() device = interface_resources.gpu_devices[0] if len(interface_resources.gpu_devices) >= 1 else 'cpu' self.n_classes_ = n_classes self.device_ = device # create preprocessing factory factory = self.config.get('factory', None) if 'tfms' not in self.config: self.config['tfms'] = ['mean_center', 'l2_normalize', 'one_hot'] if factory is None: print("factory is None, creating factory") factory = PreprocessingFactory(**self.config) if idxs_list[0].val_idxs is None: raise ValueError(f'Training without validation set is currently not implemented') ds_train = ds.get_sub_dataset(idxs_list[0].train_idxs[0]) ds_val = ds.get_sub_dataset(idxs_list[0].val_idxs[0]) num_numerical = ds_train.tensor_infos['x_cont'].get_n_features() raw_cat_sizes = ds_train.tensor_infos['x_cat'].get_cat_sizes() if isinstance(raw_cat_sizes, torch.Tensor): raw_cat_sizes = raw_cat_sizes.tolist() else: raw_cat_sizes = [int(size) for size in raw_cat_sizes] if 'factory' in self.config or 'one_hot' not in self.config['tfms']: cat_sizes = [] # don't apply fast_categorical stuff else: use_missing_zero = self.config.get('use_missing_zero', True) use_binary_drop = self.config.get('use_1d_binary_onehot', True) cat_sizes = [] for size in raw_cat_sizes: adjusted = size - 1 if use_missing_zero else size if adjusted == 2 and use_binary_drop: adjusted = 1 cat_sizes.append(adjusted) # transform according to factory fitter: Fitter = factory.create(ds.tensor_infos) self.tfm_, ds_train = fitter.fit_transform(ds_train) ds_val = self.tfm_(ds_val) # print("Expected shape from ds_train: ", ds_train.tensors['x_cont'].shape) numerical_indices, categorical_indices, categorical_vectors = None, None, None if 'one_hot' in self.config['tfms']: # Simpler categorical_info construction (no standardization): # - Treat one-hots for categories with <=100 levels as numerical features # - Only provide identity vectors for categories with >100 levels numerical_block = torch.arange(num_numerical) categorical_indices = [] categorical_vectors = [] numerical_indices_parts = [] idx = num_numerical for cat_size in cat_sizes: cat_idxs = torch.arange(idx, idx + cat_size) if cat_size > 100: categorical_indices.append(cat_idxs) categorical_vectors.append(torch.eye(cat_size)) else: numerical_indices_parts.append(cat_idxs) idx += cat_size if len(numerical_indices_parts) > 0: numerical_indices = torch.cat([numerical_block] + numerical_indices_parts) else: numerical_indices = numerical_block # assume categoricals are encoded x_train = ds_train.tensors['x_cont'].to(device) x_val = ds_val.tensors['x_cont'].to(device) y_train = ds_train.tensors['y'].to(device) y_val = 
ds_val.tensors['y'].to(device) if self.n_classes_ == 0: # regression assert ds.tensor_infos['y'].get_n_features() == 1 self.y_mean_ = y_train.mean().item() self.y_std_ = y_train.std(correction=0).item() y_train = (y_train - self.y_mean_) / (self.y_std_ + 1e-30) y_val = (y_val - self.y_mean_) / (self.y_std_ + 1e-30) else: y_train = y_train.long() y_val = y_val.long() bandwidth = self.config.get('bandwidth', 10) p_interp = self.config.get('p_interp', 0.0) exponent = self.config.get('exponent', 1.0) reg = self.config.get('reg', 1e-3) iters = self.config.get('rfm_iters', 5) diag = self.config.get('diag', True) min_subset_size = self.config.get('max_leaf_samples', self.config.get('min_subset_size', 60_000)) early_stop_rfm = self.config.get('early_stop_rfm', True) early_stop_multiplier = self.config.get('early_stop_multiplier', 1.1) classification_mode = self.config.get('classification_mode', 'prevalence') fast_categorical = self.config.get('fast_categorical', True) M_batch_size = self.config.get('M_batch_size', 'auto') overlap_fraction = self.config.get('overlap_fraction', 0.1) use_temperature_tuning = self.config.get('use_temperature_tuning', True) temp_tuning_space = self.config.get('temp_tuning_space', None) bandwidth_mode = self.config.get('bandwidth_mode', 'constant') kernel_type = self.config.get('kernel_type', 'l2') split_method = self.config.get('split_method', 'top_vector_agop_on_subset') if bandwidth_mode in ['constant', 'adaptive']: pass elif bandwidth_mode == 'sqrtd': bandwidth *= np.sqrt(x_train.shape[0]) else: raise ValueError() if M_batch_size == 'auto': if kernel_type in ['gen_laplace', 'l1-laplace', 'lpq-laplace', 'l1', 'lpq', 'lpq_kermac']: # heuristic for storing a (n_train, M_batch_size, n_features) tensor in memory # 4 bytes per float full_tensor_size_per_elem_gb = (4 * n_train * ds_train.tensor_infos['x_cont'].get_n_features()) / ( 1024 ** 3) full_tensor_size_per_elem_gb *= 12 # just a heuristic M_batch_size = max(1, min(8192, round(get_available_memory_gb(device) / full_tensor_size_per_elem_gb))) # M_batch_size = 512 if n_train <= 10_000 else (256 if n_train <= 20_000 else 64) else: M_batch_size = 8192 print(f'{kernel_type=}, {M_batch_size=}') model_params, fit_params = {}, {} model_params['kernel'] = kernel_type model_params['bandwidth'] = bandwidth model_params['exponent'] = exponent model_params['norm_p'] = exponent + (2-exponent)*p_interp model_params['bandwidth_mode'] = bandwidth_mode model_params['diag'] = diag model_params['fast_categorical'] = fast_categorical fit_params['reg'] = reg fit_params['iters'] = iters fit_params['verbose'] = True fit_params['early_stop_rfm'] = early_stop_rfm fit_params['early_stop_multiplier'] = early_stop_multiplier fit_params['M_batch_size'] = M_batch_size if self.n_classes_ == 2: fit_params['solver'] = self.config.get('binary_solver', 'solve') else: fit_params['solver'] = 'solve' rfm_params = {'model': model_params, 'fit': fit_params} if 'one_hot' in self.config['tfms']: # Provide identity vectors only for high-cardinality categoricals; treat others as numerical categorical_info = { 'numerical_indices': numerical_indices.to(device), 'categorical_indices': [i.to(device) for i in categorical_indices], 'categorical_vectors': [v.to(device) for v in categorical_vectors], } else: # treat cats like numerical features categorical_info = None classification = self.n_classes_ > 0 val_metric_name = self.config.get('val_metric_name', 'class_error' if classification else 'mse') metric_name_to_metric_class = { '1-auroc-ovr': 'auc', 'class_error': 
'accuracy', 'mse': 'mse', 'rmse': 'rmse', 'logloss': 'logloss', 'cross_entropy': 'logloss', 'brier': 'mse', } tuning_metric = metric_name_to_metric_class[val_metric_name] from xrfm import xRFM self.model_ = xRFM(rfm_params, device=device, min_subset_size=min_subset_size, tuning_metric=tuning_metric, categorical_info=categorical_info, classification_mode=classification_mode, split_method=split_method, overlap_fraction=overlap_fraction, use_temperature_tuning=use_temperature_tuning, temp_tuning_space=temp_tuning_space) self.model_.fit(x_train, y_train, x_val, y_val) return None def predict(self, ds: DictDataset) -> torch.Tensor: ds = self.tfm_(ds).to(self.device_) x_cont = ds.tensors['x_cont'] if self.n_classes_ > 0: with torch.cuda.device(self.device_) if self.device_.startswith('cuda') else contextlib.nullcontext(): y_pred = torch.from_numpy(self.model_.predict_proba(x_cont)).to(self.device_) y_pred = torch.log(y_pred) else: with torch.cuda.device(self.device_) if self.device_.startswith('cuda') else contextlib.nullcontext(): y_pred = torch.from_numpy(self.model_.predict(x_cont)).to(self.device_) y_pred = y_pred * self.y_std_ + self.y_mean_ return y_pred[None] # add n_models dimension def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int, split_seeds: List[int], n_train: int) -> RequiredResources: assert n_cv == 1 assert n_refit == 0 assert n_splits == 1 updated_config = utils.join_dicts(dict(n_estimators=100, max_n_threads=2), self.config) time_params = {'': 10, 'ds_onehot_size_gb': 10.0, 'n_train': 8e-5, 'n_samples*n_features': 8e-8} ram_params = {'': 0.15, 'ds_onehot_size_gb': 2.0} # gpu_ram_params = {'': 0.3, 'ds_onehot_size_gb': 1.0, 'n_train': 1e-6, 'n_features': 3e-4, # 'cat_size_sum': 2e-3} # gpu_ram_params = {'': 0.2, 'ds_onehot_size_gb': 5.0, 'n_train': 6e-5, 'n_features': 2e-3, # 'n_train*n_train': 20.0 / (1024 ** 3), 'n_train*n_features': 20.0 / (1024 ** 3)} gpu_ram_params = {'': 0.0, 'ds_onehot_size_gb': 0.0, 'n_train': 0.0, 'n_features': 0.0, 'n_train*n_train': 0.0, 'n_train*n_features': 0.0} rc = ResourcePredictor(config=updated_config, time_params=time_params, gpu_ram_params=gpu_ram_params, cpu_ram_params=ram_params, n_gpus=1, gpu_usage=1) # print("rc.get_required_resources(ds, n_train=n_train)") # rr = rc.get_required_resources(ds, n_train=n_train) # print("rr.n_threads = ", rr.n_threads) # print("rr.cpu_ram_gb = ", rr.cpu_ram_gb) # print("rr.n_gpus = ", rr.n_gpus) # print("rr.gpu_usage = ", rr.gpu_usage) # print("rr.gpu_ram_gb = ", rr.gpu_ram_gb) # print("rr.time_s = ", rr.time_s) # exit() return rc.get_required_resources(ds, n_train=n_train) # return RequiredResources(time_s=10.0, n_threads=16, cpu_ram_gb=50, n_gpus=0) def sample_xrfm_params(seed: int, hpo_space_name: str = 'default'): rng = np.random.default_rng(seed) if hpo_space_name == 'default': # similar or identical to the search space used in TabArena # (but here we also tune the categorical preprocessing) num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth': np.exp(rng.uniform(np.log(0.5), np.log(200.0))), 'reg': np.exp(rng.uniform(np.log(1e-6), np.log(10.))), 'exponent': rng.uniform(0.7, 1.4), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq', 'l2'], p=[0.8, 0.2]), # don't set these here so they can be 
overridden # they're the default values anyway # 'bandwidth_mode': rng.choice(['constant']), # 'min_subset_size': 60_000, # 'rfm_iters': 5, # 'classification_mode': 'prevalence', # 'binary_solver': 'solve', # 'early_stop_rfm': True, # 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) # 'split_method': 'top_vector_agop_on_subset', } elif hpo_space_name == 'only_l2': num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth': np.exp(rng.uniform(np.log(0.5), np.log(200.0))), 'reg': np.exp(rng.uniform(np.log(1e-6), np.log(10.))), 'exponent': rng.uniform(0.7, 1.4), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), # don't set these here so they can be overridden # 'bandwidth_mode': rng.choice(['constant']), # 'kernel_type': 'l2', # 'min_subset_size': 60_000, # 'rfm_iters': 5, # 'classification_mode': 'prevalence', # 'binary_solver': 'solve', # 'early_stop_rfm': True, # 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) # 'split_method': 'top_vector_agop_on_subset', } elif hpo_space_name == 'paper-large': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', # todo: adjust general solver? 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'top_vector_agop_on_subset', 'overlap_fraction': 0.0, 'use_temperature_tuning': False, } elif hpo_space_name == 'paper-large-pca': # like paper-large, but with pca splitting num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, 'rfm_iters': 5, # don't put it here, it's the default anyway and can be overridden 'classification_mode': 'zero_one', 'binary_solver': 'solve', # todo: adjust general solver? 
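# the fixed (non-sampled) settings below are the same as in paper-large; only split_method is changed to 'pca'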
'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'pca', # changed compared to paper-large 'overlap_fraction': 0.0, 'use_temperature_tuning': False, } elif hpo_space_name == 'large-soft': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'top_vector_agop_on_subset', # 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, } elif hpo_space_name == 'large-soft-pca': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'pca', # 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, } elif hpo_space_name == 'large-temptune': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'top_vector_agop_on_subset', 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, 'temp_tuning_space': [0.0] + list(np.logspace(np.log10(0.025), np.log10(4.5), num=15)) } elif hpo_space_name == 'large-temptune-pca': # used on
meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'pca', 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, 'temp_tuning_space': [0.0] + list(np.logspace(np.log10(0.025), np.log10(4.5), num=15)) } elif hpo_space_name == 'large-temptune-rf': # used on meta-test in the paper num_tfms_list = [['mean_center', 'l2_normalize']] num_tfms = num_tfms_list[rng.integers(len(num_tfms_list))] cat_tfms_list = [['ordinal_encoding'], ['one_hot']] cat_tfms = cat_tfms_list[rng.integers(len(cat_tfms_list))] params = { 'bandwidth_mode': rng.choice(['constant', 'adaptive']), 'bandwidth': np.exp(rng.uniform(np.log(0.4), np.log(80.0))), 'reg': np.exp(rng.uniform(np.log(1e-5), np.log(50.))), 'exponent': rng.uniform(0.7, 1.3), 'p_interp': rng.uniform(0., 0.8), 'tfms': num_tfms + cat_tfms, 'diag': rng.choice([False, True]), 'kernel_type': rng.choice(['lpq_kermac', 'l2'], p=[0.8, 0.2]), # 'max_leaf_samples': 60_000, # don't put it here, it's the default anyway and can be overridden 'rfm_iters': 5, 'classification_mode': 'zero_one', 'binary_solver': 'solve', 'early_stop_rfm': True, 'early_stop_multiplier': 1.1, # early stop if val metric > esm * best val metric (for loss) 'split_method': 'rf_criterion', 'overlap_fraction': 0.0, # 'use_temperature_tuning': False, 'temp_tuning_space': [0.0] + list(np.logspace(np.log10(0.025), np.log10(4.5), num=15)) } else: raise ValueError(f'Unknown {hpo_space_name=}') return params class RandomParamsxRFMAlgInterface(RandomParamsAlgInterface): def _sample_params(self, is_classification: bool, seed: int, n_train: int): return sample_xrfm_params(seed, self.config.get('hpo_space_name', 'default')) def _create_interface_from_config(self, n_tv_splits: int, **config): return SingleSplitWrapperAlgInterface([xRFMSubSplitInterface(**config) for i in range(n_tv_splits)]) ================================================ FILE: pytabkit/models/data/__init__.py ================================================ ================================================ FILE: pytabkit/models/data/conversion.py ================================================ import warnings from typing import Union, List, Optional import numpy as np import pandas as pd import torch from pandas import Index from sklearn.compose import ColumnTransformer, make_column_selector from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer from pytabkit.models.data.data import DictDataset, TensorInfo class ToDictDatasetConverter: def __init__(self, cat_features: Optional[Union[List[bool], np.ndarray]] = None, verbosity: int = 0): self.cat_features = cat_features if cat_features is None else np.asarray(cat_features, dtype=np.bool_) self.num_tf = None 
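# num_tf / cat_tf: scikit-learn ColumnTransformers for the numerical and categorical columns, created and fitted in fit_transform()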
self.cat_tf = None self.fitted = False self.tensor_infos = None self.fitted_columns = None self.fitted_type = None self.verbosity = verbosity def fit_transform(self, x: Union[np.ndarray, pd.DataFrame, pd.Series, DictDataset]) -> DictDataset: self.fitted = True self.fitted_type = type(x) if isinstance(x, DictDataset): return x x = pd.DataFrame(x) self.fitted_columns = set(x.columns) if self.cat_features is not None: cat_columns = list(x.columns[self.cat_features]) num_columns = list(x.columns[~self.cat_features]) self.num_tf = ColumnTransformer(transformers=[ ('continuous', FunctionTransformer(), num_columns), ]) self.cat_tf = ColumnTransformer(transformers=[ ('categorical', OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), cat_columns) ]) else: self.num_tf = ColumnTransformer(transformers=[ ('continuous', FunctionTransformer(), make_column_selector(dtype_include='number')), # ('continuous', FunctionTransformer(), make_column_selector(dtype_exclude=["string", "object", "category", "boolean"])), # todo: include this if we can make skrub a dependency # ('datetime', DatetimeEncoder(), make_column_selector(dtype_include=['datetime', 'datetimetz'])) ]) self.cat_tf = ColumnTransformer(transformers=[ ('categorical', OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), make_column_selector(dtype_include=["string", "object", "category", "boolean"])) ]) x_cont = torch.as_tensor(self.num_tf.fit_transform(x), dtype=torch.float32) x_cat = torch.as_tensor(self.cat_tf.fit_transform(x) + 1, dtype=torch.long) # print(f'{self.num_tf.transformers_=}') # print(f'{self.cat_tf.transformers_=}') selected_cols = [] for col_tfm in [self.num_tf, self.cat_tf]: for name, tfm, cols in col_tfm.transformers_: if tfm != 'drop': selected_cols.extend(list(cols)) if self.verbosity >= 1: print(f'Columns classified as {name}: {list(cols)}') non_selected_cols = self.fitted_columns.difference(set(selected_cols)) if len(non_selected_cols) >= 1: warnings.warn(f'The following columns are not used due to their data type: {list(non_selected_cols)}') cat_sizes = torch.max(x_cat, dim=0)[0] + 1 self.tensor_infos = {'x_cont': TensorInfo(feat_shape=x_cont.shape[1:]), 'x_cat': TensorInfo(cat_sizes=cat_sizes)} return DictDataset(tensors={'x_cont': x_cont, 'x_cat': x_cat}, tensor_infos=self.tensor_infos) def transform(self, x: Union[np.ndarray, pd.DataFrame, pd.Series, DictDataset]) -> DictDataset: if not self.fitted: raise ValueError("Call fit() first to fit the converter.") if not isinstance(x, self.fitted_type): raise ValueError(f'Different input types during fit and predict: {self.fitted_type} and {type(x)}') if isinstance(x, DictDataset): # todo: could check whether cat_sizes etc. match? 
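# DictDataset inputs are passed through unchanged, consistent with fit_transform()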
return x x = pd.DataFrame(x) # print(set(x.columns), self.fitted_columns) if set(x.columns) != self.fitted_columns: print('Raising column error') # second line is to satisfy the sklearn test # check_n_features_in_after_fitting in scikit-learn >= 1.6 raise ValueError(f'Different columns during fit() and predict(): {self.fitted_columns} and {set(x.columns)}\n' f'X has {len(x.columns)} features, but estimator is expecting {len(self.fitted_columns)} features as input') x_cont = torch.as_tensor(self.num_tf.transform(x), dtype=torch.float32) x_cat = torch.as_tensor(self.cat_tf.transform(x) + 1, dtype=torch.long) return DictDataset(tensors={'x_cont': x_cont, 'x_cat': x_cat}, tensor_infos=self.tensor_infos) if __name__ == '__main__': data = {'Continuous1': [1.2, 2.3, 3.4, 4.5, 5.6], 'Continuous2': [5.6, 6.7, 7.8, 8.9, 10.0], 'Category1': ['A', 'B', 'A', 'C', None], 'Category2': ['X', 'Y', None, 'X', None]} df = pd.DataFrame(data) df['Category2'] = df['Category2'].astype('category') print(set(df.columns) == set(df.columns)) print(ToDictDatasetConverter(cat_features=[True, False, True, True]).fit_transform(df).tensors) print(ToDictDatasetConverter().fit_transform(df).tensors) ================================================ FILE: pytabkit/models/data/data.py ================================================ import math from typing import Optional, Union, List, Dict, Tuple import numpy as np import pandas as pd import torch from pytabkit.models import utils from pytabkit.models.torch_utils import seeded_randperm, batch_randperm class TaskType: CLASSIFICATION = 'classification' REGRESSION = 'regression' # todo: add info which values might be missing? # todo: use np arrays instead of torch.Tensor? need to convert back a lot of .item()... class TensorInfo: def __init__(self, feat_shape: Optional[Union[List, np.ndarray, torch.Tensor]] = None, cat_sizes: Optional[Union[List, np.ndarray, torch.Tensor]] = None): self.feat_shape = feat_shape self.cat_sizes = cat_sizes if isinstance(self.feat_shape, torch.Tensor): self.feat_shape = self.feat_shape.detach().cpu().numpy() def get_feat_shape(self) -> np.ndarray: if self.feat_shape is None and self.cat_sizes is not None: self.feat_shape = np.asarray(self.cat_sizes).shape return np.asarray(self.feat_shape) # return torch.as_tensor(self.feat_shape) def get_cat_sizes(self) -> torch.Tensor: if self.cat_sizes is None and self.feat_shape is not None: self.cat_sizes = torch.zeros(*self.feat_shape, dtype=torch.long) return torch.as_tensor(self.cat_sizes) def get_n_features(self) -> int: return np.prod(self.get_feat_shape()) def get_cat_size_product(self) -> int: return torch.prod(self.get_cat_sizes()).item() def is_empty(self) -> bool: return self.get_n_features() == 0 def is_cont(self) -> bool: return self.cat_sizes is None or len(self.cat_sizes) == 0 or self.cat_sizes[ 0] == 0 # todo: might not work for multi-dimensional tensors def is_cat(self) -> bool: return not self.is_cont() def to_dict(self) -> Dict: # convert to list for yaml serialization return {'feat_shape': self.get_feat_shape().tolist(), 'cat_sizes': self.get_cat_sizes().numpy().tolist()} @staticmethod def from_dict(data: Dict) -> 'TensorInfo': return TensorInfo(data['feat_shape'], data['cat_sizes']) @staticmethod def concat(tensor_infos: List['TensorInfo']) -> 'TensorInfo': """ Create the TensorInfo that corresponds to concatenating the tensors. 
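For categorical infos, the cat_sizes are concatenated; for continuous infos, the feature counts are added (e.g., concatenating continuous infos with 3 and 5 features yields one with 8 features).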
:param tensor_infos: :return: """ assert len(tensor_infos) > 0 if tensor_infos[0].is_cat(): return TensorInfo(cat_sizes=torch.cat([ti.get_cat_sizes() for ti in tensor_infos], dim=0)) else: return TensorInfo(feat_shape=sum([ti.get_feat_shape() for ti in tensor_infos])) class DictDataset: # todo: add conversion methods to/from pandas dataframe? # also to/from numpy/torch tensors? def __init__(self, tensors: Optional[Dict[str, torch.Tensor]], tensor_infos: Dict[str, TensorInfo], device: Optional[Union[str, torch.device]] = None, n_samples: Optional[int] = None): """ :param tensors: Can be None, but then device and n_samples must be specified. :param tensor_infos: Information (shape, category sizes) for each tensor. :param device: Device that tensors is on. If tensors is specified, this will be computed automatically. :param n_samples: Number of samples. If tensors is specified, this will be computed automatically. """ self.device = device if device is not None else next(iter(tensors.values())).device self.n_samples = n_samples if n_samples is not None else next(iter(tensors.values())).shape[0] self.tensors = None if tensors is None else {key: t.to(device) for key, t in tensors.items()} self.tensor_infos = tensor_infos def split_xy(self) -> Tuple['DictDataset', 'DictDataset']: y_keys = [key for key in self.tensors if key.startswith('y')] x_keys = [key for key in self.tensors if key not in y_keys] return self[x_keys], self[y_keys] def without_labels(self) -> 'DictDataset': return self.split_xy()[0] def to_df(self) -> pd.DataFrame: tensor_dfs = [] for key in self.tensors: val_np = self.tensors[key].detach().cpu().numpy() col_names = [f'{key}_{i}' for i in range(val_np.shape[1])] if self.tensor_infos[key].is_cat(): cat_sizes = self.tensor_infos[key].get_cat_sizes().numpy() df = pd.DataFrame( {col_names[i]: pd.Categorical(val_np[:, i], categories=list(range(cat_sizes[i]))) for i in range(len(col_names))}) else: df = pd.DataFrame(val_np, columns=col_names) tensor_dfs.append(df) return pd.concat(tensor_dfs, axis=1) def get_batch(self, idxs) -> Dict[str, torch.Tensor]: return {key: t[idxs, :] for key, t in self.tensors.items()} def get_sub_dataset(self, idxs) -> 'DictDataset': return DictDataset(self.get_batch(idxs), self.tensor_infos, device=self.device) def get_shuffled(self, seed) -> 'DictDataset': return self.get_sub_dataset(seeded_randperm(self.n_samples, self.device, seed)) def get_size_gb(self) -> float: """ :return: RAM usage in Gigabytes """ return self.n_samples * sum([ti.get_n_features() * (8 if ti.is_cat() else 4) for ti in self.tensor_infos.values()]) / (1024 ** 3) @staticmethod def join(*datasets): return DictDataset(utils.join_dicts(*[ds.tensors for ds in datasets]), utils.join_dicts(*[ds.tensor_infos for ds in datasets])) def to(self, device): return DictDataset(self.tensors, self.tensor_infos, device=device) def __getitem__(self, key): if isinstance(key, list): return DictDataset({k: self.tensors[k] for k in key}, {k: self.tensor_infos[k] for k in key}, device=self.device, n_samples=self.n_samples) return DictDataset({key: self.tensors[key]}, {key: self.tensor_infos[key]}, device=self.device, n_samples=self.n_samples) def get_n_classes(self): """ :return: Returns the number of classes, given by the category size of the first feature of the y tensor. This only makes sense if there is a y tensor, and it does not check if y has more than one feature. 
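For regression targets, y is continuous and its category size is zero, so this method returns 0.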
""" return self.tensor_infos['y'].get_cat_sizes()[0].item() class ParallelDictDataLoader: def __init__(self, ds: DictDataset, idxs: torch.Tensor, batch_size: int, shuffle: bool = False, adjust_bs: bool = False, drop_last: bool = False, output_device: Optional[Union[str, torch.device]] = None): """ :param dataset: A TaskData instance :param batch_size: default batch size, might be automatically adjusted :param shuffle: whether the dataset should be shuffled before each epoch :param adjust_bs: whether the batch_size may be lowered so that the batches are of more equal size while keeping the number of batches the same :param drop_last: whether the last batch should be omitted if it is smaller than the other ones :param output_device: The device that the returned data should be on (if None, take the device where the data already is) """ self.ds = ds self.idxs = idxs.to(ds.device) self.n_parallel = idxs.shape[0] self.n_samples = idxs.shape[1] self.output_device = ds.device if output_device is None else output_device self.adjust_bs = adjust_bs self.shuffle = shuffle self.drop_last = drop_last self.specified_batch_size = batch_size self.batch_size = min(batch_size, self.n_samples) if self.drop_last: self.n_batches = math.floor(self.n_samples / self.batch_size) if adjust_bs: self.batch_size = math.floor(self.n_samples / self.n_batches) self.sep_idxs = [self.batch_size * i for i in range(self.n_batches + 1)] else: self.n_batches = math.ceil(self.n_samples / self.batch_size) if adjust_bs: self.batch_size = math.ceil(self.n_samples / self.n_batches) self.sep_idxs = [self.batch_size * i for i in range(self.n_batches)] + [self.n_samples] def get_num_samples(self): return self.n_samples def get_num_iterated_samples(self): if self.drop_last: return self.n_batches * self.batch_size return self.get_num_samples() def __len__(self): return self.n_batches def __iter__(self): if self.shuffle: perms = batch_randperm(self.n_parallel, self.n_samples, device=self.ds.device) for start, stop in zip(self.sep_idxs[:-1], self.sep_idxs[1:]): batches = self.ds.get_batch(idxs=self.idxs.gather(1, perms[:, start:stop])) yield {key: t.to(self.output_device) for key, t in batches.items()} else: for start, stop in zip(self.sep_idxs[:-1], self.sep_idxs[1:]): batches = self.ds.get_batch(idxs=self.idxs[:, start:stop]) yield {key: t.to(self.output_device) for key, t in batches.items()} class ValDictDataLoader: def __init__(self, ds: DictDataset, val_idxs: torch.Tensor, val_batch_size=256): """ Create a Prediction Dataloader from Dataset and validation indices """ ds_x, ds_y = ds.split_xy() self.val_x_dl = ParallelDictDataLoader(ds_x, val_idxs, batch_size=val_batch_size) self.val_idxs = val_idxs self.val_y = ds_y.get_batch(val_idxs).get('y', None) self.n_samples = val_idxs.shape[1] def __len__(self): return self.n_samples def __iter__(self): return self.val_x_dl.__iter__() ================================================ FILE: pytabkit/models/data/nested_dict.py ================================================ from typing import Union, List, Tuple, Dict from pytabkit.models import utils class NestedDict: """ Dictionary that can be used with multiple indices. 
Instead of d = dict() d['first'] = dict() d['first']['second'] = 1.0 we can use d = NestedDict() d['first', 'second'] = 1.0 """ def __init__(self, data_dict=None): self.data_dict = data_dict if data_dict is not None else {} def __getitem__(self, idxs): if not isinstance(idxs, tuple): idxs = (idxs,) d = self.data_dict for idx in idxs: d = d[idx] return d def __setitem__(self, idxs, value): if not isinstance(idxs, tuple): idxs = (idxs,) if isinstance(value, NestedDict): value = value.data_dict # allow to properly "hang in" value in the case that value is of type NestedDict? d = self.data_dict for i, idx in enumerate(idxs): if idx not in d or i+1 == len(idxs): v = value for rev_idx in idxs[:i:-1]: v = {rev_idx: v} d[idx] = v return d = d[idx] def __contains__(self, item: Union[List, Tuple]): current_dict = self.data_dict for elem in item: if elem not in current_dict: return False current_dict = current_dict[elem] return True def get(self, idxs, default=None): try: return self[idxs] except KeyError: return default def _dict_update_rec(self, d1: dict, d2: dict): for key in d2: if key in d1: self._dict_update_rec(d1[key], d2[key]) else: d1[key] = d2[key] def update(self, other: 'NestedDict'): self._dict_update_rec(self.data_dict, other.data_dict) def __str__(self): return str(self.data_dict) def __repr__(self): return f'NestedDict({str(self)})' def get_dict(self) -> Dict: return self.data_dict @staticmethod def from_kwargs(**kwargs): return NestedDict( {key: (value.data_dict if isinstance(value, NestedDict) else value) for key, value in kwargs.items()} ) if __name__ == '__main__': nd = NestedDict() nd['test', 'test'] = 1 print(nd['test']) ================================================ FILE: pytabkit/models/data/splits.py ================================================ import math from typing import Tuple, List, Optional import torch from pytabkit.models import utils from pytabkit.models.data.data import DictDataset from pytabkit.models.torch_utils import seeded_randperm # splits should not reference tasks, since tasks should only be loaded in the respective processes in the DevicePool, # while splits are loaded earlier class Split: def __init__(self, ds: DictDataset, idxs: Tuple[torch.Tensor, torch.Tensor]): """ :param ds: The dataset that is split into parts :param idxs: Tuple of Tensors containing indices of the different parts of ds """ self.ds = ds self.idxs = idxs def get_sub_ds(self, i): return self.ds.get_sub_dataset(self.idxs[i]) def get_sub_idxs(self, i): return self.idxs[i] class Splitter: def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError() def split_ds(self, ds: DictDataset) -> Split: idxs = self.get_idxs(ds) return Split(ds, idxs) def get_split_sizes(self, n_samples: int) -> Tuple: raise NotImplementedError() class RandomSplitter(Splitter): def __init__(self, seed, first_fraction=0.8, max_n_first: Optional[int] = None): self.seed = seed self.first_fraction = first_fraction self.max_n_first = max_n_first def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: # use ceil such that e.g. in the case of 1 sample, the sample ends up in the training set. 
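# e.g., for n_samples=5 and first_fraction=0.8: split_idx = ceil(4.0) = 4, i.e., 4 samples in the first part and 1 in the second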
split_idx = int(math.ceil(self.first_fraction * ds.n_samples)) if self.max_n_first is not None: split_idx = min(split_idx, self.max_n_first) perm = seeded_randperm(ds.n_samples, ds.device, self.seed) return perm[:split_idx], perm[split_idx:] def get_split_sizes(self, n_samples: int) -> Tuple: split_idx = int(math.ceil(self.first_fraction * n_samples)) if self.max_n_first is not None: split_idx = min(split_idx, self.max_n_first) return split_idx, n_samples-split_idx class IndexSplitter(Splitter): def __init__(self, index): self.index = index def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: idxs = torch.arange(ds.n_samples, device=ds.device, dtype=torch.long) return idxs[:self.index], idxs[self.index:] def get_split_sizes(self, n_samples: int) -> Tuple: return self.index, n_samples-self.index class AllNothingSplitter(Splitter): def get_idxs(self, ds: DictDataset) -> Tuple[torch.Tensor, torch.Tensor]: all = torch.arange(ds.n_samples, device=ds.device, dtype=torch.long) nothing = torch.zeros(0, device=ds.device, dtype=torch.long) return all, nothing def split_ds(self, ds: DictDataset) -> Split: idxs = self.get_idxs(ds) return Split(ds, idxs) def get_split_sizes(self, n_samples: int) -> Tuple: return n_samples, 0 class MultiSplitter: def get_idxs(self, ds: DictDataset) -> List[Tuple[torch.Tensor, torch.Tensor]]: raise NotImplementedError() def split_ds(self, ds: DictDataset) -> List[Split]: idxs_list = self.get_idxs(ds) return [Split(ds, idxs) for idxs in idxs_list] class KFoldSplitter(MultiSplitter): def __init__(self, k: int, seed: int, stratified=False): if k <= 1: raise ValueError(f'KFoldSplitter: required k>=2, but received {k=}') self.k = k self.seed = seed self.stratified = stratified def get_idxs(self, ds: DictDataset) -> List[Tuple[torch.Tensor, torch.Tensor]]: idxs = seeded_randperm(ds.n_samples, device=ds.device, seed=self.seed) if self.stratified: # do it with random shuffling such that elements of the same class are still shuffled perm = torch.argsort(ds.tensors['y'][idxs, 0]) idxs = idxs[perm] fold_len = (ds.n_samples // self.k) * self.k fold_idxs = [idxs[start:fold_len:self.k] for start in range(self.k)] rest_idxs = idxs[fold_len:] idxs_list = [] for i in range(self.k): idxs_1 = torch.cat([fold_idxs[j] for j in range(self.k) if j != i] + [rest_idxs], dim=-1) idxs_list.append((idxs_1, fold_idxs[i])) return idxs_list def get_split_sizes(self, n_samples: int) -> Tuple: n_val = n_samples // self.k return n_samples - n_val, n_val class SplitInfo: def __init__(self, splitter: Splitter, split_type: str, id: int, alg_seed: int, train_fraction: float = 0.75): self.splitter = splitter self.split_type = split_type # one of "random", "default" self.id = id self.alg_seed = alg_seed self.train_fraction = train_fraction def get_sub_seed(self, split_idx: int, is_cv: bool): return utils.combine_seeds(self.alg_seed, 2 * split_idx + int(is_cv)) # return self.alg_seed + 5000 * int(is_cv) + 10000 * split_idx def get_sub_splits(self, ds: DictDataset, n_splits: int, is_cv: bool) -> List[Split]: if not is_cv: split = AllNothingSplitter().split_ds(ds) return [split] * n_splits if n_splits <= 1: return [RandomSplitter(seed=self.alg_seed, first_fraction=self.train_fraction).split_ds(ds)] else: is_classification = ds.tensor_infos['y'].get_cat_sizes()[0].item() > 0 return KFoldSplitter(n_splits, seed=self.alg_seed, stratified=is_classification).split_ds(ds) def get_train_and_val_size(self, n_samples: int, n_splits: int, is_cv: bool) -> Tuple[int, int]: n_trainval, n_test = 
self.splitter.get_split_sizes(n_samples) if not is_cv: return n_trainval, 0 elif n_splits <= 1: return RandomSplitter(seed=self.alg_seed, first_fraction=self.train_fraction).get_split_sizes(n_trainval) else: # stratified doesn't influence split sizes return KFoldSplitter(n_splits, seed=self.alg_seed, stratified=False).get_split_sizes(n_samples) ================================================ FILE: pytabkit/models/hyper_opt/__init__.py ================================================ ================================================ FILE: pytabkit/models/hyper_opt/coord_opt.py ================================================ from pathlib import Path import numpy as np from typing import Union, Callable, Any, Optional, Dict, Tuple from pytabkit.models import utils from pytabkit.models.hyper_opt.hyper_optimizers import HyperOptimizer # implementing a custom coordinate-descent style hyperparameter optimizer def identity(x): return x class Hyperparameter: def __init__(self, start_value: Union[int, float], min_step_size: Union[int, float], importance: float, log_scale: bool = False, only_int: bool = False, min_value: Union[int, float] = -np.inf, max_value: Union[int, float] = np.inf, out_func: Callable[[Any], Any] = None, max_step_size: float = np.inf): # if log_scale=True, min_value, max_value, min_step_size, and max_step_size are on the log scale, # i.e., min_value can still be negative # in this case, the values will be exponentiated at the end self.start_value = start_value self.min_step_size = min_step_size self.max_step_size = max_step_size self.importance = importance self.log_scale = log_scale self.only_int = only_int self.min_value = min_value self.max_value = max_value self.out_func = out_func or identity self.tfm = (lambda x: np.exp(x)) if log_scale else identity self.inv_tfm = (lambda x: np.log(x)) if log_scale else identity self.quant_tfm = (lambda x: round(x)) if only_int else identity # if log_scale: # self.min_value = np.log(min_value) if 0 < min_value < np.inf else -np.inf # self.max_value = np.log(max_value) if 0 < max_value < np.inf else np.inf if self.log_scale and self.only_int: # need to avoid having values < 0 for which round(exp(value)) = 0, which is not representable in log-space self.min_value = max(self.min_value, 0.0) def adjust_step_size(self, current_value: float, step_size: float) -> Optional[float]: # should return suggested step size that satisfies all constraints, or None if no suitable step size is found # We have three constraints: step size limit, min_value/max-value, and quantization. # Updating each of them could violate one of the others. 
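# (e.g., clamping the step at min_value/max_value can shrink it below min_step_size, and quantization can change the effective step again)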
# do a loop and check if all three are satisfied # if it doesn't work after a certain number of iterations, we fail and return None for i in range(5): updated = False step_size_sign = np.sign(step_size) # check min_step_size / max_step_size if np.abs(step_size) < self.min_step_size - 1e-8: step_size = step_size_sign * self.min_step_size updated = True if np.abs(step_size) > self.max_step_size + 1e-8: step_size = step_size_sign * self.max_step_size updated = True # check min_value / max_value candidate = current_value + step_size if candidate < self.min_value - 1e-8: candidate = self.min_value updated = True elif candidate > self.max_value + 1e-8: candidate = self.max_value updated = True step_size = candidate - current_value print(f'CoordOpt: {self.min_value=}, {self.max_value=}, {self.start_value=}') print(f'CoordOpt: {current_value=}, {candidate=}') curr_t = self.tfm(current_value) cand_t = self.tfm(candidate) curr_q = self.quant_tfm(curr_t) cand_q = self.quant_tfm(cand_t) if curr_q == cand_q: cand_q = curr_q + step_size_sign if self.log_scale and self.only_int and cand_q <= 0.5: return None # curr_q is 1 and we want to make cand_q = 0 but this doesn't exist in log scale step_size = self.inv_tfm(cand_q) - current_value updated = True if not updated: # step size fulfilled all three constraints in this loop and hence has not been updated return step_size return None # did not find a step size that fulfills all constraints def apply_tfms(self, x: Any) -> Any: return self.out_func(self.quant_tfm(self.tfm(x))) class CoordOptimizerImpl: # potential improvements: # increase the importances in an UCB-style # in coord_opt_idx allow to explore the reverse direction if the first step in the previous direction fails def __init__(self, f: Callable[[Dict], Tuple[float, Any]], space: Dict[str, Hyperparameter], n_steps: int, beta: float = 0.5, step_dec_factor: float = 0.5, step_inc_factor: float = 2.0, initial_step_multiplier: float = 8.0): self.f = f self.space = space self.n_steps = n_steps self.n_f_evals = 0 if n_steps <= 0: raise ValueError(f'CoordOptimizerImpl: Got {n_steps=} but need n_steps > 0') # hyperparameters of the HPO method self.beta = beta self.step_dec_factor = step_dec_factor self.step_inc_factor = step_inc_factor self.initial_step_multiplier = initial_step_multiplier self.max_coord_opt_steps = 10 self.keys = [k for k, v in space.items()] # preserve the order in space self.d = len(self.keys) self.hps = [space[key] for key in self.keys] self.prior_importances = [hp.importance for hp in self.hps] self.priorities = np.argsort(np.asarray(self.prior_importances))[::-1] self.importances = np.zeros(self.d) self.min_step_sizes = np.asarray([hp.min_step_size for hp in self.hps]) self.hp_values = np.asarray([hp.start_value for hp in self.hps]) self.step_sizes = self.initial_step_multiplier * self.min_step_sizes for idx in range(self.d): # adjust direction of step sizes if self.hp_values[idx] - self.hps[idx].min_value > self.hps[idx].max_value - self.hp_values[idx]: # there is more space in the negative direction, start in the other direction self.step_sizes[idx] *= -1 # current best hyperparameter values (before transformation, i.e., can be in log-space) self.evaluated_hp_values = [] # to avoid evaluating the same point twice # eval loss on starting values self.loss, self.additional_info = self.eval(self.hp_values) self.blocked_directions = np.zeros(self.d, dtype=np.int32) def suggest(self, new_hp_values) -> float: # return loss difference, update optimum if necessary etc. 
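# a negative returned loss difference means the new hyperparameters improved on the current best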
# unblock variables if new optimum is found new_loss, new_additional_info = self.eval(new_hp_values) loss_diff = new_loss - self.loss if new_loss < self.loss: # update parameters self.loss = new_loss self.additional_info = new_additional_info self.hp_values = new_hp_values # unblock all coordinates print(f'CoordOpt: Unblocking all coordinates') self.blocked_directions = np.zeros(self.d, dtype=np.int32) return loss_diff def convert_hp_values(self, values: np.ndarray) -> Dict[str, Any]: return {key: hp.apply_tfms(value) for (key, value, hp) in zip(self.keys, values, self.hps)} def eval(self, new_hp_values: np.ndarray) -> Tuple[float, Any]: # convert hyperparameters, call function, increase step counter, raise error if step count is full if self.n_f_evals >= self.n_steps: raise StopIteration() self.n_f_evals += 1 print(f'CoordOpt: Evaluating hyperparameters in step {self.n_f_evals}: {new_hp_values}') self.evaluated_hp_values.append(new_hp_values) converted = {key: hp.apply_tfms(value) for (key, value, hp) in zip(self.keys, new_hp_values, self.hps)} return self.f(converted) def already_evaluated(self, new_hp_values: np.ndarray) -> bool: """ :param new_hp_values: New hyperparameter values that should be tried. :return: True if these hyperparameters have already been evaluated before """ for old_hp_values in self.evaluated_hp_values: if np.allclose(new_hp_values, old_hp_values): return True return False def coord_opt_idx(self, idx: int): # implicitly update importance # keep track of step? or use an exception to break when the step count has finished? for i in range(self.max_coord_opt_steps): print(f'CoordOpt: Optimizing coordinate {idx}, step {i}') # loop while line search over coordinate still finds an improvement # adjust step size adj_step = self.hps[idx].adjust_step_size(current_value=self.hp_values[idx], step_size=self.step_sizes[idx]) if adj_step is None: print(f'CoordOpt: adj_step is None') # no suitable step size was found, for example because the boundary is reached self.step_sizes[idx] = -self.step_dec_factor * self.step_sizes[idx] # if this would bring us below the minimum step size, block the variable if np.abs(self.step_sizes[idx]) < self.hps[idx].min_step_size: print(f'CoordOpt: Blocking coordinate {idx}') self.blocked_directions[idx] += 1 return # make step with suggest() new_hp_values = np.copy(self.hp_values) new_hp_values[idx] += adj_step if self.already_evaluated(new_hp_values): print(f'CoordOpt: Already evaluated hyperparameters') self.step_sizes[idx] = -self.step_dec_factor * adj_step self.blocked_directions[idx] += 1 return loss_diff = self.suggest(new_hp_values) # update importance self.importances[idx] = self.beta * self.importances[idx] + (1 - self.beta) * np.abs(loss_diff) if loss_diff < 0: print(f'CoordOpt: Loss decreased') self.step_sizes[idx] = self.step_inc_factor * adj_step else: print(f'CoordOpt: Loss did not decrease') # if loss didn't reduce, *= - step_dec_factor, return self.step_sizes[idx] = -self.step_dec_factor * adj_step # if this would bring us below the minimum step size, block the variable if np.abs(self.step_sizes[idx]) < self.hps[idx].min_step_size: print(f'CoordOpt: Blocking coordinate {idx}') self.blocked_directions[idx] += 1 return def run(self) -> None: # wrap everything in try/catch for termination try: while True: # select best index according to importance if np.all(self.blocked_directions >= 2): print(f'CoordOpt: Reached a local optimum') return if len(self.priorities) > 0: hp_idx = self.priorities[0] self.priorities = self.priorities[1:] 
else: # print(f'{self.importances=}') importances = np.copy(self.importances) importances[self.blocked_directions >= 2] = -1.0 hp_idx = np.argmax(importances) if self.blocked_directions[hp_idx] >= 2: raise RuntimeError('CoordOpt: selected blocked index, this should not occur') # run coord_opt_idx on the index self.coord_opt_idx(hp_idx) except StopIteration: return class CoordOptimizer(HyperOptimizer): class CoordOptFuncWrapper: def __init__(self, f: Callable[[dict], Tuple[float, Any]], fixed_params: Dict[str, Any]): self.f = f self.fixed_params = fixed_params def __call__(self, params: Dict[str, Any], seed: int = 0): params = utils.join_dicts(params, self.fixed_params) loss, additional_info = self.f(params) return np.inf if np.isnan(loss) else loss, None def __init__(self, space: Dict[str, Hyperparameter], fixed_params: Dict[str, Any], n_hyperopt_steps: int = 50, **config): super().__init__(n_hyperopt_steps=n_hyperopt_steps) self.space = space self.n_hyperopt_steps = n_hyperopt_steps self.fixed_params = fixed_params self.config = config def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: fn = CoordOptimizer.CoordOptFuncWrapper(f, self.fixed_params) opt = CoordOptimizerImpl(fn, self.space, n_steps=self.n_hyperopt_steps) opt.run() ================================================ FILE: pytabkit/models/hyper_opt/hyper_optimizers.py ================================================ import time from pathlib import Path from typing import Callable, Tuple, Any, Dict, Union, Optional import numpy as np from pytabkit.models import utils from pytabkit.models.training.logging import Logger class FunctionEvaluationTracker: """ Helper class to keep track of where the function to be optimized is evaluated and what are the best parameters """ def __init__(self, f: Callable[[dict], Tuple[float, Any]], n_steps: int, opt_desc: str, logger: Logger): self.f = f self.n_steps = n_steps self.opt_desc = opt_desc self.logger = logger self.best_params = None self.best_result = None self.n_calls = 0 def __call__(self, params: dict) -> Tuple[float, Any]: # params = utils.join_dicts(params, self.fixed_params) start_time = time.time() result = self.f(params) if np.isnan(result[0]): result = (np.inf, result[1]) eval_time = time.time() - start_time if self.best_result is None or (result[0] <= self.best_result[0]): # print(f'new best result') self.best_params = params self.best_result = result self.n_calls += 1 self.logger.log(-1, f'Hyperopt step {self.n_calls}/{self.n_steps} on {self.opt_desc} took {eval_time:g} s') # don't return the second part of result as HPO libraries might store all of them, causing RAM problems return result[0], None def get_best_params_and_result(self) -> Tuple[Dict, Tuple[float, Any]]: return self.best_params, self.best_result class HyperOptimizer: def __init__(self, n_hyperopt_steps: int): self.n_hyperopt_steps = n_hyperopt_steps def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: # override this in subclasses raise NotImplementedError() def optimize(self, f: Callable[[dict], Tuple[float, Any]], seed: int, opt_desc: str, logger: Logger) \ -> Tuple[Dict, Any]: """ :param f: Function to minimize. 
It should take a dict of parameters and return a tuple containing the validation loss and additional information about the run (additional information could for example be the early stopping epoch found in this particular run, for example {'n_estimators': best_n_estimators}) :param seed: Random seed for optimization :param opt_desc: name of the optimized algorithm / optimization problem (used for printing optimization intermediate state) :param logger: Logger used for printing information :return: Returns a tuple containing a dictionary with the optimal parameters and the additional info generated by the function at the optimal parameters """ # todo: could also add verbosity level # todo: may need to be able to treat failures, hence make the tuple optional? # todo: could allow to pass the iteration number to the function tracker = FunctionEvaluationTracker(f, n_steps=self.n_hyperopt_steps, opt_desc=opt_desc, logger=logger) self._optimize_impl(tracker, seed=seed) best_params, best_result = tracker.get_best_params_and_result() return best_params, best_result[1] def get_n_hyperopt_steps(self) -> int: return self.n_hyperopt_steps # todo: have one class that does performance tracking of all intermediate steps (or do that in HyperoptAlgInterface?) # and maybe also do logging separately? # maybe have wrapper function / callable class that tracks it? # Then implement something like _optimize() that gets the wrapped function? class ConstantHyperOptimizer(HyperOptimizer): def __init__(self, params: dict): super().__init__(n_hyperopt_steps=1) self.params = params def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: f(self.params) def f_unpack_dict(dct): """ Unpacks all sub-dictionaries in given dictionary recursively. There should be no duplicated keys across all nested subdictionaries, or some instances will be lost without warning Source: https://www.kaggle.com/fanvacoolt/tutorial-on-hyperopt Parameters: ---------------- dct : dictionary to unpack Returns: ---------------- : unpacked dictionary """ res = {} for (k, v) in dct.items(): if isinstance(v, dict): res = {**res, **f_unpack_dict(v)} else: res[k] = v return res class HyperoptOptimizer(HyperOptimizer): class HyperoptFuncWrapper: def __init__(self, f: Callable[[dict], Tuple[float, Any]], fixed_params: dict): self.f = f self.fixed_params = fixed_params def __call__(self, params: dict): params = f_unpack_dict(params) # for nested/conditional params from hyperopt import STATUS_FAIL, STATUS_OK params = utils.join_dicts(params, self.fixed_params) loss, additional_info = self.f(params) return {'loss': loss, 'additional_info': additional_info, 'status': STATUS_FAIL if np.isnan(loss) else STATUS_OK, 'params': params.copy()} def __init__(self, space, fixed_params, n_hyperopt_steps: int = 50, **config): super().__init__(n_hyperopt_steps=n_hyperopt_steps) self.space = space self.fixed_params = fixed_params self.config = config def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: import hyperopt trials = hyperopt.Trials() # todo: could serialize the trials object for restarting algo_name = self.config.get('hyperopt_algo', 'tpe') if algo_name == 'tpe': algo = hyperopt.tpe.suggest elif algo_name == 'atpe': # atpe seems to be not deterministic even when setting the seed... 
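            # (hyperopt does ship hyperopt.atpe.suggest, see the commented-out line below,
            # but we refuse it here for the sake of reproducibility)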
raise ValueError('atpe for hyperopt is not implemented since it is not deterministic and can throw errors') # print(f'Using atpe', flush=True) # algo = hyperopt.atpe.suggest elif algo_name == 'rand': print(f'Using rand', flush=True) algo = hyperopt.rand.suggest else: raise ValueError(f'Unknown hyperopt_algo name "{algo_name}"') fn = HyperoptOptimizer.HyperoptFuncWrapper(f, self.fixed_params) time_limit_s: Optional[float] = self.config.get('time_limit_s', None) _ = hyperopt.fmin(fn=fn, timeout=None if time_limit_s is None else int(time_limit_s), space=self.space, algo=algo, max_evals=self.n_hyperopt_steps, trials=trials, rstate=np.random.default_rng(seed=seed), verbose=False, show_progressbar=False) class SMACOptimizer(HyperOptimizer): class SMACFuncWrapper: def __init__(self, f: Callable[[dict], Tuple[float, Any]], fixed_params: Dict[str, Any]): self.f = f self.fixed_params = fixed_params def __call__(self, params, seed: int = 0): # params should be of type ConfigSpace.Configuration params = params.get_dictionary() params = utils.join_dicts(params, self.fixed_params) loss, additional_info = self.f(params) return np.inf if np.isnan(loss) else loss def __init__(self, space, fixed_params: Dict[str, Any], n_hyperopt_steps: int = 50, tmp_folder: Union[str, Path] = 'smac3_output', **config): super().__init__(n_hyperopt_steps=n_hyperopt_steps) self.space = space self.n_hyperopt_steps = n_hyperopt_steps self.fixed_params = fixed_params self.config = config self.tmp_folder = tmp_folder def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> None: use_gp = self.config.get('smac_surrogate', 'RF') == 'GP' fn = SMACOptimizer.SMACFuncWrapper(f, self.fixed_params) import smac scenario = smac.Scenario(self.space, deterministic=True, n_trials=self.n_hyperopt_steps, seed=seed, use_default_config=True, output_directory=self.tmp_folder) max_ratio = 0.25 n_configs_per_hyperparameter = 8 if use_gp else 10 if 'n_initial_design' in self.config: max_ratio = self.config['n_initial_design'] / self.n_hyperopt_steps n_configs_per_hyperparameter = self.config['n_initial_design'] from smac.initial_design import SobolInitialDesign initial_design = SobolInitialDesign( scenario=scenario, n_configs=None, n_configs_per_hyperparameter=n_configs_per_hyperparameter, max_ratio=max_ratio, additional_configs=[], ) # Now we use SMAC to find the best hyperparameters if use_gp: print(f'Using SMAC with GP surrogate') facade = smac.BlackBoxFacade( scenario=scenario, target_function=fn.__call__, overwrite=True, logging_level=False, initial_design=initial_design ) else: facade = smac.HyperparameterOptimizationFacade( scenario, fn.__call__, # We pass the target function here overwrite=True, # Overrides any previous results that are found that are inconsistent with the meta-data logging_level=False, # no logging initial_design=initial_design, ) facade.optimize() ================================================ FILE: pytabkit/models/nn_models/__init__.py ================================================ ================================================ FILE: pytabkit/models/nn_models/activations.py ================================================ import torch import torch.nn.functional as F from typing import Dict # ------ from fastai2 from torch.jit import script from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import Variable, Fitter, FitterFactory, FunctionFitter, Layer @script def _swish_jit_fwd(x): return x.mul(torch.sigmoid(x)) @script def _swish_jit_bwd(x, 
grad_output): x_sigmoid = torch.sigmoid(x) return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid))) class _SwishJitAutoFn(torch.autograd.Function): @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return _swish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_variables[0] return _swish_jit_bwd(x, grad_output) # don't use the optimized version since this seems to behave slightly differently for Pytorch Lightning # def swish(x): return _SwishJitAutoFn.apply(x) def swish(x): return x * torch.sigmoid(x) @script def _mish_jit_fwd(x): return x.mul(torch.tanh(F.softplus(x))) @script def _mish_jit_bwd(x, grad_output): x_sigmoid = torch.sigmoid(x) x_tanh_sp = F.softplus(x).tanh() return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp)) class MishJitAutoFn(torch.autograd.Function): @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return _mish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_tensors[0] return _mish_jit_bwd(x, grad_output) # don't use the optimized version since this seems to behave slightly differently for Pytorch Lightning # def mish(x): return MishJitAutoFn.apply(x) def mish(x): return x.mul(torch.tanh(F.softplus(x))) def golu(x): return x * torch.exp(-torch.exp(-torch.clamp(x, min=-10))) # ----- end fastai2 class ParametricActivationLayer(Layer): def __init__(self, f, weight): super().__init__() self.f = f self.weight = weight def forward_cont(self, x): # print(f'{self.weight.mean().item()=:g}') return x + (self.f(x) - x) * self.weight def _stack(self, layers): return ParametricActivationLayer(self.f, Variable.stack([l.weight for l in layers])) class ParametricActivationFitter(Fitter): def __init__(self, f, **config): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.f = f self.act_lr_factor = config.get('act_lr_factor', 1.0) self.act_wd_factor = config.get('act_wd_factor', 1.0) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: n_cont = ds.tensor_infos['x_cont'].get_n_features() return ParametricActivationLayer(self.f, Variable(torch.ones(1, n_cont, device=ds.device), trainable=True, hyper_factors={'lr': self.act_lr_factor, 'wd': self.act_wd_factor})) class ActivationFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config def _create(self, tensor_infos) -> Fitter: # todo: implement more activations, also parametric ones act_name = self.config.get('act_name', self.config.get('act', 'relu')) if act_name == 'relu': f = torch.relu elif act_name == 'selu': f = torch.selu elif act_name == 'swish' or act_name == 'silu': f = swish elif act_name == 'sswish': # normalized by output variance f = lambda x: 1.6765 * swish(x) elif act_name == 'mish': f = mish elif act_name == 'smish': # normalized by output variance f = lambda x: 1.6 * mish(x) elif act_name == 'gelu': f = F.gelu elif act_name == 'elu': f = F.elu elif act_name == 'golu': f = golu else: raise ValueError(f'Activation {act_name} unknown') if self.config.get('use_parametric_act', False): return ParametricActivationFitter(f, **self.config) else: return FunctionFitter(f) ================================================ FILE: pytabkit/models/nn_models/base.py ================================================ from pytabkit.models import torch_utils, utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.training.coord import 
HyperparamManager import torch import torch.nn.functional as F import torch.nn as nn from torch._C import _disabled_torch_function_impl import numpy as np import threading import re import copy from contextlib import contextmanager from typing import Optional, List, Union, Dict, Tuple # have a layer that allows to split/merge DictDatasets? # need something like numerical_preprocess # could specify a input to output mapping, e.g. {'x_cont': None, 'x_cat': 'x_cont'}, which could also allow to merge # or just have a ParallelFitter that merges outputs, with a tensor subselection beforehand # e.g. # num_pipeline = SequentialFactory([FilterFactory('x_cont'), PreprocessingFactory(), NumericalEmbeddingFactory()]) # cat_pipeline = SequentialFactory([FilterFactory('x_cat', 'y'), OneHotFactory(), PreprocessingFactory(), EmbeddingFactory()] # pipeline = ConcatFactory([num_pipeline, cat_pipeline]) # theoretically, could allow to split off fitters by max RAM usage / max num features depending on size of dataset # then, small datasets could be preprocessed in advance even with heavy parallelization # but this would require a parallelized version of DictDataset... # fitter.fit() should pass context to Variable - how? After returning! # But pass scope into fit already, otherwise parent scope will not be known # fit() should then take scope, hp_manager - how to pass that on to sub-fitters? Or have nn.ModuleList-like system? # in the latter case, could have a set_context() function - but then would need to ensure that this is called... # problem: setting later to layer needs to be done still before layer is called - if fit_transform is implemented, # this will not work # use context manager instead? # use self.create_variable()? (could also easily be forgotten) # what about if each layer takes a fitter? # layer could also forget to pass scope to variable # should a variable have hyper_getter itself instead of having it in the optimizer? # Fitter constructor should have an attribute scope_name or so # use context managers at factory creation, then replicate context manager in create() and fit() # and implement fit_impl() and create_impl() then grab context in Layer() and Variable() constructors? # in order to let factory set its own context as well (e.g. weight), include constructor parameter? # need separate context for HyperparamManager around fit()? # can we have a thread-local context? # can those contexts also be used to select configs? (like for first_layer_config etc.) # could use linux-like scope /first_layer/block/weight or /pipeline/1/robust_scale and then filter it using regexes # run in problems with stack() and register_hypers() twice? or no problem because of new naming convention? # or should stack() not call register_hypers() again but use a list of getters? # (that would be good for having different hypers for different parallel layers, # but bad for dropout implementation and maybe speed) # could have a simplify() function in Fitter to remove Identity layers and empty SequentialLayers recursively # then could maybe save the IdentityLayer check in SequentialLayer # todo: does multiple inheritance from Fitter and FitterFactory work with contexts? 
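# A minimal usage sketch of the scope mechanism defined below (names are illustrative, not from the library):
#
#   with sub_scope_context('first_layer'):
#       with sub_scope_context('weight'):
#           scope = TrainContext.get_global_context().scope
#           str(scope)                      # -> '/first_layer/weight'
#           scope.matches('/first_layer')   # -> True (re.match against the path string)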
class Scope: def __init__(self, names: Optional[List[str]] = None): self._names = names or [] def get_sub_scope(self, name: str) -> 'Scope': return Scope(self._names + [name]) def __str__(self): return '/' + '/'.join(self._names) def matches(self, regex: Union[str, re.Pattern]) -> bool: if isinstance(regex, str): regex = re.compile(regex) return bool(regex.match(str(self))) class TrainContext: # see https://stackoverflow.com/questions/51849395/how-can-we-associate-a-python-context-manager-to-the-variables-appearing-in-it _data = threading.local() def __init__(self, scope: Optional[Scope] = None, hp_manager: Optional[HyperparamManager] = None): self.scope = scope or Scope() self.hp_manager = hp_manager def clone(self): return TrainContext(copy.deepcopy(self.scope), self.hp_manager) @staticmethod def get_global_context() -> 'TrainContext': if not hasattr(TrainContext._data, 'context'): TrainContext._data.context = TrainContext() return TrainContext._data.context @contextmanager def sub_scope_context(name: str): current_context = TrainContext.get_global_context() old_scope = current_context.scope current_context.scope = old_scope.get_sub_scope(name) yield current_context.scope = old_scope @contextmanager def sub_scopes_context(names: List[str]): current_context = TrainContext.get_global_context() old_scope = current_context.scope new_scope = old_scope for name in names: new_scope = new_scope.get_sub_scope(name) current_context.scope = new_scope yield current_context.scope = old_scope @contextmanager def set_scope_context(scope: Scope): current_context = TrainContext.get_global_context() old_scope = current_context.scope current_context.scope = scope yield current_context.scope = old_scope @contextmanager def set_hp_context(hp_manager: Optional[HyperparamManager]): current_context = TrainContext.get_global_context() old_hp_manager = current_context.hp_manager if hp_manager: current_context.hp_manager = hp_manager yield current_context.hp_manager = old_hp_manager class ContextAware: def __init__(self, scope_names: Optional[List[str]] = None): super().__init__() # needed in case of multiple inheritance from ContextAware and another base class self.scope_names = scope_names or [] def add_scope(self, name: str): self.scope_names.append(name) return self def add_others_scope(self, other: 'ContextAware'): self.scope_names.extend(other.scope_names) return self @contextmanager def set_context(self): with sub_scopes_context(self.scope_names): yield class ContextRecorder: def __init__(self): super().__init__() # needed in case of multiple inheritance from ContextRecorder and another base class self.context = TrainContext.get_global_context().clone() @contextmanager def set_context(self): with set_scope_context(self.context.scope): with set_hp_context(self.context.hp_manager): yield class StringConvertible: def __init__(self): super().__init__() # for multiple inheritance def __repr__(self): return str(self) def __str__(self): return self.__class__.__name__ + '(' \ + ', '.join([f'{key} = {value}' for key, value in self.__dict__.items()]) + ')' class Variable(ContextRecorder, nn.Parameter): def __new__(cls, data=None, trainable=True, requires_grad=None, hyper_factors=None): if data is None: data = torch.Tensor() if requires_grad is None: requires_grad = trainable obj = super().__new__(cls, data, requires_grad) obj.hyper_factors = hyper_factors or dict() obj.trainable = trainable return obj def __init__(self, data=None, trainable=True, requires_grad=None, hyper_factors=None): super().__init__() def 
__deepcopy__(self, memo): if id(self) in memo: return memo[id(self)] else: result = type(self)(self.data.clone(memory_format=torch.preserve_format), self.trainable, self.requires_grad, self.hyper_factors) memo[id(self)] = result return result def __repr__(self): return f'Variable(trainable={self.trainable}) containing:\n' + super(Variable, self).__repr__() __torch_function__ = _disabled_torch_function_impl @staticmethod def stack(vars: List['Variable'], dim=0): # vars must not be an empty list # todo: could make hyper_factors stackable with vars[0].set_context(): with torch.no_grad(): return Variable(torch.stack(vars, dim=dim), trainable=vars[0].trainable, requires_grad=vars[0].requires_grad, hyper_factors=vars[0].hyper_factors) # ------- Layers ------- class Layer(ContextRecorder, StringConvertible, nn.Module): """ Extended version of nn.Module, allowing vectorization, processing data sets with multiple tensors, using Variable instead of Parameter, ... The following methods need to be overridden: - forward_tensor_infos (but if the output is constant, we can just set new_tensor_infos in the constructor) - forward_tensor or forward_cont (the latter if only x_cont is changed) - _stack() - optionally __repr__() and __str__() """ def __init__(self, new_tensor_infos: Optional[Dict[str, TensorInfo]] = None, fitter: Optional['Fitter'] = None, remove_keys: Optional[Union[str, List[str]]] = None): """ Constructor. Puts the layer in eval mode, since it might be used inside the fit_transform() of the Fitter. The parameters provide different opportunities to specify a default implementation for forward_tensor_infos(). The default implementation is: ``` if self.fitter is not None: return self.fitter.forward_tensor_infos(tensor_infos) return utils.update_dict(tensor_infos, self.new_tensor_infos, remove_keys=self.remove_keys) ``` """ super().__init__() self.new_tensor_infos = {} if new_tensor_infos is None else new_tensor_infos self.remove_keys = remove_keys self.fitter = fitter self.hp_manager = None # don't put in eval mode, so we have realistic behavior during fit_transform() self.eval() # todo: remove def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: """ Override this method if the information from constructor is not sufficient. :param tensor_infos: Tensor infos (shapes etc.) :return: Transformed tensor infos. """ if self.fitter is not None: return self.fitter.forward_tensor_infos(tensor_infos) return utils.update_dict(tensor_infos, self.new_tensor_infos, remove_keys=self.remove_keys) def forward(self, data: Union[DictDataset, Dict[str, torch.Tensor]]) -> Union[DictDataset, Dict[str, torch.Tensor]]: """ This is an implementation of the nn.Module forward() function, which is called by __call__(). Don't override this method. :param data: data set or dict of tensors. :return: Transformed version of the data set or dict of tensors. 
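        (Dispatches to forward_ds() for DictDataset inputs and to forward_tensors() for dicts of tensors.)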
""" if isinstance(data, DictDataset): return self.forward_ds(data) else: return self.forward_tensors(data) def forward_ds(self, ds: DictDataset) -> DictDataset: # default implementation return DictDataset(None if ds.tensors is None else self.forward_tensors(ds.tensors), self.forward_tensor_infos(ds.tensor_infos), device=ds.device, n_samples=ds.n_samples) def forward_cont(self, x: torch.Tensor) -> torch.Tensor: # only needs to be overridden if the default implementation of forward_tensors() is used # we check this to avoid infinite recursion if forward_tensors() is not overridden if self.__class__.forward_tensors != Layer.forward_tensors: return self.forward_tensors({'x_cont': x})['x_cont'] raise NotImplementedError() def forward_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Transforms the given tensors. :param tensors: :return: """ # default implementation just updates x_cont using self.forward_cont() # print(f'{self.__class__.__name__}: {tensors.keys()=}') return utils.join_dicts(tensors, {'x_cont': self.forward_cont(tensors['x_cont'])}) def _stack(self, layers: List['Layer']) -> 'Layer': """ Implementation of stack(). Can be overridden. Vectorizes the given layers. The given layers should all have the same structure. If layers[0] has no parameters (trainable or buffer), then the default implementation simply returns layers[0]. Override if another implementation is desired. :param layers: Layers that should be stacked for vectorization. :return: Returns the stacked Layer object """ # this needs to be overridden by some classes if len(list(layers[0].state_dict())) == 0: # no parameters, can simply vectorize by taking the first layer return layers[0] else: raise NotImplementedError() def stack(self, layers: List['Layer']) -> 'Layer': """ Vectorizes the given layers. The given layers should all have the same structure. Do not override this method, override _stack() instead. :param layers: Layers that should be stacked for vectorization. :return: Returns the stacked Layer object """ with self.set_context(): return self._stack(layers) def __setattr__(self, name, value): # adapted from nn.Module.__setattr__ # first checks whether the value is a Variable, otherwise uses nn.Module.__setattr__ def remove_from(*dicts_or_sets): for d in dicts_or_sets: if name in d: if isinstance(d, dict): del d[name] else: d.discard(name) if isinstance(value, Variable): if value.trainable: if self.__dict__.get('_parameters') is None: raise AttributeError( "cannot assign parameters before Module.__init__() call") remove_from(self.__dict__, self._parameters, self._buffers, self._modules, self._non_persistent_buffers_set) self.register_parameter(name, value) else: if self.__dict__.get('_buffers') is None: raise AttributeError( "cannot assign parameters before Module.__init__() call") remove_from(self.__dict__, self._parameters, self._buffers, self._modules, self._non_persistent_buffers_set) self.register_buffer(name, value) else: super(Layer, self).__setattr__(name, value) class IdentityLayer(Layer): # Attention: do not inherit from IdentityLayer since this might mess with optimizations in SequentialLayer! 
def forward_tensors(self, x): return x class SequentialLayer(Layer): def __init__(self, tfms: List[Layer]): super().__init__() self.tfms = nn.ModuleList([tfm for tfm in tfms if not isinstance(tfm, IdentityLayer)]) def forward_tensor_infos(self, tensor_infos): for tfm in self.tfms: tensor_infos = tfm.forward_tensor_infos(tensor_infos) return tensor_infos def forward_ds(self, ds: DictDataset): for tfm in self.tfms: ds = tfm.forward_ds(ds) return ds def forward_tensors(self, tensors): for tfm in self.tfms: tensors = tfm(tensors) return tensors def _stack(self, seq_tfms): return SequentialLayer([seq_tfms[0].tfms[i].stack([seq_tfm.tfms[i] for seq_tfm in seq_tfms]) for i in range(len(seq_tfms[0].tfms))]) def __repr__(self): return str(self) def __str__(self): sub_strings = [' ' + line for tfm in self.tfms for line in str(tfm).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class ResidualLayer(Layer): def __init__(self, inner_layer: Layer): super().__init__() self.inner_layer = inner_layer def forward_tensor_infos(self, tensor_infos): return self.inner_layer.forward_tensor_infos(tensor_infos) def forward_tensors(self, tensors: Dict[str, torch.Tensor]): new_tensors = self.inner_layer.forward_tensors(tensors) new_tensors['x_cont'] = tensors['x_cont'] + new_tensors['x_cont'] return new_tensors def _stack(self, seq_tfms): return ResidualLayer(seq_tfms[0].inner_layer.stack([seq_tfm.inner_layer for seq_tfm in seq_tfms])) def __repr__(self): return str(self) def __str__(self): sub_strings = [' ' + line for line in str(self.inner_layer).split('\n')] return f'ResidualLayer [\n' + '\n'.join(sub_strings) + '\n]\n' class ConcatParallelLayer(Layer): """ Executes all layers on the given input and combines the resulting output tensors by concatenating along the last dimension (as in DenseNet, for example). Not all layers need to output the same tensors, e.g., one can output only 'x_cont' and the other can output 'x_cont' and 'y', in which case 'y' will not be concatenated with another tensor. 
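    For example (illustrative): if one inner layer outputs an 'x_cont' of width 3 and another one of width 5,
    the combined 'x_cont' has width 8.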
""" def __init__(self, layers: List[Layer], fitter: 'Fitter'): super().__init__(fitter=fitter) self.layers = nn.ModuleList(layers) def forward_tensors(self, tensors): out_tensors = [layer.forward_tensors(tensors) for layer in self.layers] out_keys = {key for t in out_tensors for key in t.keys()} # print(f'{[t["x_cont"].shape for t in out_tensors]=}') return {key: torch_utils.cat_if_necessary([t[key] for t in out_tensors if key in t], dim=-1) for key in out_keys} def _stack(self, tfms: List[Layer]): return ConcatParallelLayer([tfms[0].layers[i].stack([tfm.layers[i] for tfm in tfms]) for i in range(len(tfms[0].layers))], fitter=tfms[0].fitter) def __repr__(self): return str(self) def __str__(self): sub_strings = [' ' + line for tfm in self.layers for line in str(tfm).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class FilterTensorsLayer(Layer): """ Only returns those tensors whose name is in a list of names """ def __init__(self, include_keys: Optional[List[str]], exclude_keys: Optional[List[str]], fitter: 'Fitter'): """ :param keys: List of tensor names that is allowed to pass through """ super().__init__(fitter=fitter) self.include_keys = include_keys self.exclude_keys = exclude_keys def forward_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: # return {key: value for key, value in tensors.items() if key in self.keys} result = {key: (value if (self.include_keys is None or key in self.include_keys) and (self.exclude_keys is None or key not in self.exclude_keys) else value[..., :0]) for key, value in tensors.items()} # print(result) return result class FunctionLayer(Layer): def __init__(self, f): super().__init__() self.f = f def forward_cont(self, x: torch.Tensor) -> torch.Tensor: return self.f(x) class BiasLayer(Layer): def __init__(self, bias: Variable, factor: float = 1.0): super().__init__() self.bias = bias self.factor = factor def forward_cont(self, x): if self.factor != 1.0: x = x + self.factor * self.bias else: x = x + self.bias return x def _stack(self, tfms): return BiasLayer(Variable.stack([tfm.bias for tfm in tfms]), factor=tfms[0].factor) class ScaleLayer(Layer): def __init__(self, scale: Variable): super().__init__() self.scale = scale def forward_cont(self, x): # print(f'{x.norm().item()=:g}, {self.scale.norm().item()=:g}') return x * self.scale def _stack(self, tfms): return ScaleLayer(Variable.stack([tfm.scale for tfm in tfms])) class WeightLayer(Layer): def __init__(self, weight: Variable, factor: float = 1.0): super().__init__(new_tensor_infos={'x_cont': TensorInfo(feat_shape=[weight.shape[-1]])}) # weight should be x in_features x out_features unlike in nn.Linear self.weight = weight self.factor = factor def forward_cont(self, x): x = x.matmul(self.weight) if self.factor != 1.0: x = self.factor * x return x def _stack(self, tfms): return WeightLayer(Variable.stack([tfm.weight for tfm in tfms]), factor=tfms[0].factor) class RenameTensorLayer(Layer): def __init__(self, old_name: str, new_name: str, fitter: 'Fitter'): super().__init__(fitter=fitter) self.old_name = old_name self.new_name = new_name def forward_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: if self.old_name not in tensors: return tensors elif self.new_name not in tensors: return utils.update_dict(tensors, {self.new_name: tensors[self.old_name]}, remove_keys=self.old_name) else: # print(f'{tensors[self.new_name].shape=}, {tensors[self.old_name].shape=}') new_tensor = torch.cat([tensors[self.new_name], 
tensors[self.old_name]], dim=-1) return utils.update_dict(tensors, {self.new_name: new_tensor}, remove_keys=self.old_name) def _stack(self, layers: List['Layer']) -> 'Layer': return layers[0] # ------ Fitters ------ class Fitter(ContextAware, StringConvertible): """ Fitters produce Layer objects given a data set (of inputs to the fitter at initialization) """ def __init__(self, needs_tensors: bool = True, is_individual: bool = True, scope_names: Optional[List[str]] = None, modified_tensors: Optional[List[str]] = None): """ :param needs_tensors: Set to true if the fitter needs to have the tensors in fit() or fit_transform(). If false, then in fit(ds) or fit_transform(ds), ds.tensors is allowed to be None. :param is_individual: Set to false if fit(ds) deterministically produces a non-trainable layer. (In this case, this Fitter only needs to be called once in k-fold CV on the train+val set.) :param scope_names: List of names to add to the scope (will be present in the names of Variables constructed in this Fitter) :param modified_tensors: List of names of tensors that are modified by this Fitter, e.g., ['x_cont']. This is used for the default implementation of get_n_forward(), which is used to get a RAM estimate for the forward pass. The default RAM estimate is simply the size of all modified tensors. """ super().__init__(scope_names=scope_names) # needs_data=False specifies that in fit(ds), ds.tensors is allowed to be None # is_individual=False specifies that fit(ds) deterministically produces a non-trainable layer self.needs_tensors = needs_tensors self.is_individual = is_individual self.modified_tensors = modified_tensors def _get_n_values(self, tensor_infos: Dict[str, TensorInfo], relevant_tensors: Optional[List[str]]): """ Helper function that can be used internally to get the number of elements of a list of tensors. Should not be overridden. :param tensor_infos: Tensor infos of the data set :param relevant_tensors: List of tensor names that should be considered. If None, 0 is returned. :return: Returns the number of components of a list of tensors (per batch element). """ if relevant_tensors is None: return 0 return sum([ti.get_n_features() for key, ti in tensor_infos.items() if key in relevant_tensors]) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: """ Should be overridden if the fitter produces layers with trainable parameters. :param tensor_infos: Tensor infos. :return: Returns the number of parameters of the fitted layer for the given tensor_infos. """ return 0 def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: """ Should be overridden if the fitter does more than just one operation. :param tensor_infos: Ingoing tensor infos. :return: Should return the number of bytes used in the forward pass per batch element """ if self.modified_tensors is None: return 0 return self._get_n_values(self.forward_tensor_infos(tensor_infos), self.modified_tensors) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: """ Should be overridden if the fitter changes the tensor shapes. :param tensor_infos: Tensor infos (for shapes and category sizes). :return: Transformed tensor infos. """ return tensor_infos # should be overridden by subclasses if tensor_infos change def fit(self, ds: DictDataset) -> Layer: """ Produces a layer initialized based on a given data set. This method should not be overridden, override _fit() instead. :param ds: Data set. :return: Layer object. 
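        Typical usage (illustrative): ``layer = fitter.fit(ds)``, then ``layer.forward_ds(ds)`` to apply it.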
""" with self.set_context(): return self._fit(ds) def fit_transform(self, ds: DictDataset, needs_tensors: bool = True) -> Tuple[Layer, DictDataset]: """ Produces a layer initialized based on a given data set. This method should not be overridden, override _fit_transform() instead. :param ds: Data set. :param needs_tensors: Whether the transformed data set should also contain transformed tensors (compared to only transformed tensor_infos). :return: Layer object and the data set transformed by the Layer. """ with self.set_context(): return self._fit_transform(ds, needs_tensors) def fit_transform_subsample(self, ds: DictDataset, ram_limit_gb: float, needs_tensors: bool = True) \ -> Tuple[Layer, DictDataset]: """ Similar to fit_transform(), but may subsample the data set in order to stay within a given RAM limit. This method should not be overridden, override _fit_transform_subsample() instead. :param ds: Data set. :param ram_limit_gb: RAM limit in GB. :param needs_tensors: Whether the transformed tensors should be output. :return: Tuple of the resulting Layer and the transformed DictDataset. """ with self.set_context(): return self._fit_transform_subsample(ds, ram_limit_gb, needs_tensors) def _fit(self, ds: DictDataset) -> Layer: """ Implementation of fit(). At least one of _fit() or _fit_transform() should be overridden by subclasses. :param ds: Data set. :return: Initialized Layer object. """ if self.__class__._fit_transform != Fitter._fit_transform: # avoid infinite recursion if the method is not overridden tfm, ds = self._fit_transform(ds, False) return tfm elif self.__class__._fit_transform_subsample != Fitter._fit_transform_subsample: # avoid infinite recursion if the method is not overridden tfm, ds = self._fit_transform_subsample(ds, ram_limit_gb=np.inf, needs_tensors=False) return tfm if isinstance(self, Layer): return self raise NotImplementedError() def _fit_transform(self, ds: DictDataset, needs_tensors: bool) -> Tuple[Layer, DictDataset]: """ Implementation of fit_transform(). At least one of _fit() or _fit_transform() should be overridden by subclasses. :param ds: Data set. :param needs_tensors: Whether the transformed data set should also contain transformed tensors (compared to only transformed tensor_infos). 
:return: Initialized Layer object and transformed data set """ if self.__class__._fit_transform_subsample != Fitter._fit_transform_subsample: return self._fit_transform_subsample(ds, ram_limit_gb=np.inf, needs_tensors=needs_tensors) else: tfm = self._fit(ds) if needs_tensors: return tfm, tfm.forward_ds(ds) else: return tfm, DictDataset(None, tfm.forward_tensor_infos(ds.tensor_infos), ds.device, ds.n_samples) def _fit_transform_subsample(self, ds: DictDataset, ram_limit_gb: float, needs_tensors: bool = True) \ -> Tuple[Layer, DictDataset]: n_forward = self.get_n_forward(ds.tensor_infos) # check if subsampling is necessary if ram_limit_gb < np.inf and n_forward > 0 and ds.tensors is not None and (self.needs_tensors or needs_tensors): # optimistically assume 4 bytes per number, while 8 are needed for categorical values max_n_samples = max(1, int(ram_limit_gb * (1024 ** 3) / (4 * n_forward))) if max_n_samples < ds.n_samples: # subsample the data set subsample_idxs = torch.randperm(ds.n_samples, device=ds.device)[:max_n_samples] ds = ds.get_sub_dataset(subsample_idxs) return self._fit_transform(ds, needs_tensors) def split_off_dynamic(self) -> Tuple['Fitter', 'Fitter']: """ Can be overridden by subclasses if a trivial split based on self.needs_tensors and self.is_individual is not desired. :return: Returns a tuple of a static and a dynamic transform such that self is equivalent to SequentialFitter([static, dynamic]) and such that the static transform does not need data and is not trainable. The idea is that in the vectorized setting, the static transform only needs to be applied once to the data set, while the dynamic transform needs to be applied separately for each of the vectorized models. """ if self.needs_tensors or self.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() def split_off_individual(self): """ Can be overridden by subclasses if a trivial split based on self.is_individual is not desired. :return: Returns a tuple of a non-individual and an individual transform such that self is equivalent to SequentialFitter([non_individual, individual]) and such that the non_individual transform deterministically produces a non-trainable layer. The idea is that the non-individual transform only needs to be applied once in k-fold cross-validation. 
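        For example (illustrative), a deterministic normalization fitter would fall into the non-individual part,
        while a fitter that randomly initializes trainable weights belongs to the individual part.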
""" if self.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() class IdentityFitter(Fitter): def __init__(self, **config): super().__init__(needs_tensors=False, is_individual=False) def _fit(self, ds: DictDataset) -> Layer: return IdentityLayer() class SequentialFitter(Fitter): def __init__(self, fitters: List[Fitter], **config): super().__init__(needs_tensors=np.any([f.needs_tensors for f in fitters]), is_individual=np.any([f.is_individual for f in fitters])) self.fitters = fitters # print(f'Creating SequentialFitter with fitters {fitters} and {self.needs_tensors=}') def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]): for f in self.fitters: tensor_infos = f.forward_tensor_infos(tensor_infos) return tensor_infos def get_n_params(self, tensor_infos: Dict[str, TensorInfo]): n_params = 0 for f in self.fitters: n_params += f.get_n_params(tensor_infos) tensor_infos = f.forward_tensor_infos(tensor_infos) return n_params def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]): forward_bytes = 0 for f in self.fitters: forward_bytes += f.get_n_forward(tensor_infos) tensor_infos = f.forward_tensor_infos(tensor_infos) return forward_bytes def _fit_transform(self, ds: DictDataset, needs_tensors: bool = True): needs_tensors_list = [f.needs_tensors for f in self.fitters] + [needs_tensors] max_tensors_idx = np.max(np.argwhere(needs_tensors_list)) if np.any(needs_tensors_list) else 0 tfms = [] for i, fitter in enumerate(self.fitters): tfm, ds = fitter.fit_transform(ds, needs_tensors=(i < max_tensors_idx)) tfms.append(tfm) return SequentialLayer(tfms), ds def _fit_transform_subsample(self, ds: DictDataset, ram_limit_gb: float, needs_tensors: bool = True) \ -> Tuple[Layer, DictDataset]: needs_tensors_list = [f.needs_tensors for f in self.fitters] + [needs_tensors] max_tensors_idx = np.max(np.argwhere(needs_tensors_list)) if np.any(needs_tensors_list) else 0 tfms = [] for i, fitter in enumerate(self.fitters): tfm, ds = fitter.fit_transform_subsample(ds, ram_limit_gb=ram_limit_gb, needs_tensors=(i < max_tensors_idx)) tfms.append(tfm) return SequentialLayer(tfms), ds def split_off_dynamic(self): is_dynamic = [f.needs_tensors or f.is_individual for f in self.fitters] if np.any(is_dynamic): first_dynamic = np.min(np.argwhere(is_dynamic)) static, dynamic = self.fitters[first_dynamic].split_off_dynamic() return SequentialFitter(self.fitters[:first_dynamic] + [static]).add_others_scope(self), \ SequentialFitter([dynamic] + self.fitters[first_dynamic + 1:]).add_others_scope(self) else: return self, IdentityFitter() def split_off_individual(self): is_individual = [f.is_individual for f in self.fitters] if np.any(is_individual): first_indiv = np.min(np.argwhere(is_individual)) non_indiv, indiv = self.fitters[first_indiv].split_off_individual() return SequentialFitter(self.fitters[:first_indiv] + [non_indiv]).add_others_scope(self), \ SequentialFitter([indiv] + self.fitters[first_indiv + 1:]).add_others_scope(self) else: return self, IdentityFitter() def __str__(self): sub_strings = [' ' + line for fitter in self.fitters for line in str(fitter).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class ResidualFitter(Fitter): def __init__(self, inner_fitter: Fitter, **config): super().__init__(needs_tensors=inner_fitter.needs_tensors, is_individual=inner_fitter.is_individual) self.inner_fitter = inner_fitter def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]): return 
self.inner_fitter.forward_tensor_infos(tensor_infos) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]): return self.inner_fitter.get_n_params(tensor_infos) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]): return self.inner_fitter.get_n_forward(tensor_infos) + self._get_n_values(tensor_infos, ['x_cont']) def _fit_transform(self, ds: DictDataset, needs_tensors=True): layer = ResidualLayer(self.inner_fitter.fit(ds)) if needs_tensors: ds = layer.forward_ds(ds) return layer, ds def split_off_dynamic(self): if self.inner_fitter.needs_tensors or self.inner_fitter.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() def split_off_individual(self): if self.inner_fitter.is_individual: return IdentityFitter(), self else: return self, IdentityFitter() def __str__(self): sub_strings = [' ' + line for fitter in [self.inner_fitter] for line in str(fitter).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class FunctionFitter(Fitter): def __init__(self, f, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont']) self.f = f def _fit(self, ds: DictDataset): return FunctionLayer(self.f) class ConcatParallelFitter(Fitter): # todo: could implement better _fit_transform_subsample() def __init__(self, fitters: List[Fitter]): super().__init__(needs_tensors=np.any([f.needs_tensors for f in fitters]), is_individual=np.any([f.is_individual for f in fitters])) self.fitters = fitters def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: out_tensor_infos = self.forward_tensor_infos(tensor_infos) # pessimistic bound assuming that all tensors need to get concatenated concat_space = self._get_n_values(out_tensor_infos, relevant_tensors=list(out_tensor_infos.keys())) return sum([f.get_n_forward(tensor_infos) for f in self.fitters]) + concat_space def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return sum([f.get_n_params(tensor_infos) for f in self.fitters]) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: out_tensor_infos_list = [f.forward_tensor_infos(tensor_infos) for f in self.fitters] out_keys = {key for ti in out_tensor_infos_list for key in ti.keys()} return {key: TensorInfo(cat_sizes=torch_utils.cat_if_necessary([ti[key].get_cat_sizes() for ti in out_tensor_infos_list if key in ti], dim=-1)) for key in out_keys} def _fit(self, ds: DictDataset) -> Layer: return ConcatParallelLayer([f.fit(ds) for f in self.fitters], fitter=self) # ------ Factory ------- class FitterFactory(ContextAware, StringConvertible): """ Class that allows to create Fitter objects depending on tensor_infos (the shape and category sizes of the tensors). """ def __init__(self, scope_names: Optional[List[str]] = None): super().__init__(scope_names=scope_names) def create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: """ Creates a Fitter object with the scope given in the constructor. Do not override this method, override _create() or _create_transform() instead. :param tensor_infos: Tensor infos (shapes etc.) :return: Fitter object. """ fitter = self._create(tensor_infos) if fitter is self: return fitter return fitter.add_others_scope(self) def create_transform(self, tensor_infos: Dict[str, TensorInfo]) -> Tuple[Fitter, Dict[str, TensorInfo]]: """ Creates a Fitter object with the scope given in the constructor. Do not override this method, override _create() or _create_transform() instead. 
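        (Note: if the subclass is itself a Fitter and _create() returns self, the scope is not added again.)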
:param tensor_infos: Tensor infos (shapes etc.) :return: Fitter object and the transformed tensor infos. """ fitter, tensor_infos = self._create_transform(tensor_infos) if fitter is self: return fitter, tensor_infos return fitter.add_others_scope(self), tensor_infos def _create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: """ If the subclass also inherits from Fitter, this will just return self. Otherwise, override at least one of _create() or _create_transform(). :param tensor_infos: Tensor infos. :return: Fitter object. """ if self.__class__._create_transform != FitterFactory._create_transform: # don't have to worry about infinite recursion return self._create_transform(tensor_infos)[0] if isinstance(self, Fitter): return self raise NotImplementedError() def _create_transform(self, tensor_infos: Dict[str, TensorInfo]) -> Tuple[Fitter, Dict[str, TensorInfo]]: fitter = self._create(tensor_infos) return fitter, fitter.forward_tensor_infos(tensor_infos) class SequentialFactory(FitterFactory): def __init__(self, factories: List[FitterFactory]): super().__init__() self.factories = factories def _create_transform(self, tensor_infos: Dict[str, TensorInfo]): fitters = [] for f in self.factories: fitter, tensor_infos = f.create_transform(tensor_infos) fitters.append(fitter) return SequentialFitter(fitters), tensor_infos def __str__(self): sub_strings = [' ' + line for factory in self.factories for line in str(factory).split('\n')] return f'{self.__class__.__name__} [\n' + '\n'.join(sub_strings) + '\n]\n' class IdentityFactory(FitterFactory): def _create(self, tensor_infos): return IdentityFitter() class FunctionFactory(FitterFactory): def __init__(self, f): super().__init__() self.f = f def _create(self, tensor_infos): return FunctionFitter(self.f) class ConcatParallelFactory(FitterFactory): def __init__(self, factories: List[FitterFactory]): super().__init__() self.factories = factories def _create(self, tensor_infos) -> Fitter: return ConcatParallelFitter([factory.create(tensor_infos) for factory in self.factories]) class FilterTensorsFactory(Fitter, FitterFactory): def __init__(self, include_keys: Optional[List[str]] = None, exclude_keys: Optional[List[str]] = None): super().__init__(needs_tensors=False, is_individual=False) self.include_keys = include_keys self.exclude_keys = exclude_keys def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: return {key: (ti if ((self.include_keys is None or key in self.include_keys) and (self.exclude_keys is None or key not in self.exclude_keys)) else TensorInfo(feat_shape=0 * ti.get_feat_shape())) for key, ti in tensor_infos.items()} def _fit(self, ds: DictDataset) -> Layer: return FilterTensorsLayer(include_keys=self.include_keys, exclude_keys=self.exclude_keys, fitter=self) class RenameTensorFactory(Fitter, FitterFactory): def __init__(self, old_name: str, new_name: str, **config): super().__init__(needs_tensors=False, is_individual=False) self.old_name = old_name self.new_name = new_name def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: if self.old_name in tensor_infos and self.new_name in tensor_infos: return self._get_n_values(tensor_infos, [self.old_name, self.new_name]) else: return 0 def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: if self.old_name not in tensor_infos: return tensor_infos elif self.new_name not in tensor_infos: return utils.update_dict(tensor_infos, {self.new_name: tensor_infos[self.old_name]}, remove_keys=self.old_name) 
else: # both names exist in tensor_infos new_tensor_info = TensorInfo.concat([tensor_infos[self.new_name], tensor_infos[self.old_name]]) return utils.update_dict(tensor_infos, {self.new_name: new_tensor_info}, remove_keys=self.old_name) def _fit(self, ds: DictDataset) -> Layer: return RenameTensorLayer(old_name=self.old_name, new_name=self.new_name, fitter=self) ================================================ FILE: pytabkit/models/nn_models/categorical.py ================================================ from typing import Iterable, List, Dict, Tuple, Any, Callable, Optional, Union import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import FitterFactory, IdentityFitter, Layer, Fitter, Variable from pytabkit.models.torch_utils import cat_if_necessary class SingleEncodingFactory(FitterFactory): def __init__(self, create_fitter, min_cat_size=0, max_cat_size=-1): super().__init__() self.min_cat_size = min_cat_size self.max_cat_size = max_cat_size self.create_fitter = create_fitter def apply_on(self, cat_size: int, n_classes: int): # can be overridden return cat_size >= self.min_cat_size and (self.max_cat_size < 0 or cat_size <= self.max_cat_size) def _create(self, tensor_infos): if 'x_cat' not in tensor_infos: return IdentityFitter() x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() if len(x_cat_sizes) != 1: raise ValueError( 'SingleEncoderFactory has to be applied to a single category but was applied to category sizes ' + str(x_cat_sizes)) cat_size = x_cat_sizes[0] n_classes = tensor_infos['y'].get_cat_sizes()[0].item() if self.apply_on(cat_size, n_classes): return self.create_fitter(tensor_infos) return IdentityFitter() class EncodingLayer(Layer): def __init__(self, single_enc_layers: Iterable[Layer], enc_output_name: str, fitter): super().__init__(fitter=fitter) self.emb_layers = nn.ModuleList(single_enc_layers) self.enc_output_name = enc_output_name def forward_tensors(self, tensors): x_cat = tensors['x_cat'] prev_output_tensors = [tensors[self.enc_output_name]] if self.enc_output_name in tensors else [] new_tensors = [] for i, l in enumerate(self.emb_layers): sub_x_cat = x_cat[tuple([slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)])] sub_tensors = {'x_cat': sub_x_cat} if 'y' in tensors: sub_tensors['y'] = tensors['y'] new_tensors.append(l.forward_tensors(sub_tensors)) output_tensors = prev_output_tensors + [t['x_cont'] for t in new_tensors if 'x_cont' in t] if len(output_tensors) == 0: # create empty tensor new_conts = torch.zeros(*x_cat.shape[:-1], 0, device=x_cat.device, dtype=torch.float32) else: new_conts = cat_if_necessary(output_tensors, dim=-1) cat_tensors = [t['x_cat'] for t in new_tensors if 'x_cat' in t] if len(cat_tensors) > 0: new_cats = torch.cat(cat_tensors, dim=-1) return utils.update_dict(tensors, {self.enc_output_name: new_conts, 'x_cat': new_cats}) else: return utils.update_dict(tensors, {self.enc_output_name: new_conts}, remove_keys='x_cat') def _stack(self, layers: List['EncodingLayer']): return EncodingLayer([layers[0].emb_layers[i].stack([layers[j].emb_layers[i] for j in range(len(layers))]) for i in range(len(layers[0].emb_layers))], layers[0].enc_output_name, layers[0].fitter) class EncodingFitter(Fitter): def __init__(self, single_encoder_fitters: List[Fitter], enc_output_name: str = 'x_cont', **config): super().__init__(needs_tensors=any([enc.needs_tensors for enc in 
single_encoder_fitters]), is_individual=any([enc.is_individual for enc in single_encoder_fitters])) self.single_encoder_fitters = single_encoder_fitters self.enc_output_name = enc_output_name # allow to have something other than x_cont assert enc_output_name != 'x_cat' def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return sum([f.get_n_params(ti) for f, ti in zip(self.single_encoder_fitters, self._sub_tensor_infos(tensor_infos))]) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: # for splitting categories forward_bytes = self._get_n_values(tensor_infos, ['x_cat']) forward_bytes += sum([f.get_n_forward(ti) for f, ti in zip(self.single_encoder_fitters, self._sub_tensor_infos(tensor_infos))]) # for concat forward_bytes += self._get_n_values(self.forward_tensor_infos(tensor_infos), [self.enc_output_name, 'x_cat']) return forward_bytes def _sub_tensor_infos(self, tensor_infos): x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() if 'y' in tensor_infos: return [{'x_cat': TensorInfo(cat_sizes=[cat_sz]), 'y': tensor_infos['y']} for cat_sz in x_cat_sizes] return [{'x_cat': TensorInfo(cat_sizes=[cat_sz])} for cat_sz in x_cat_sizes] def forward_tensor_infos(self, tensor_infos): x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() n_cont = tensor_infos[self.enc_output_name].get_n_features() \ if self.enc_output_name in tensor_infos else 0 out_cat_sizes = [] for cat_sz, enc in zip(x_cat_sizes, self.single_encoder_fitters): ti = {'x_cat': TensorInfo(cat_sizes=[cat_sz])} out_ti = enc.forward_tensor_infos(ti) if 'x_cont' in out_ti: n_cont += out_ti['x_cont'].get_n_features() else: out_cat_sizes.append(out_ti['x_cat'].get_cat_sizes()[0].item()) if len(out_cat_sizes) > 0: return utils.update_dict(tensor_infos, {self.enc_output_name: TensorInfo(feat_shape=[n_cont]), 'x_cat': TensorInfo(cat_sizes=out_cat_sizes)}) else: return utils.update_dict(tensor_infos, {self.enc_output_name: TensorInfo(feat_shape=[n_cont])}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: x_cat_sizes = ds.tensor_infos['x_cat'].get_cat_sizes().numpy() enc_layers = [] for i in range(len(x_cat_sizes)): enc = self.single_encoder_fitters[i] if enc.needs_tensors: tensors = {'x_cat': ds.tensors['x_cat'][:, i:i+1]} if 'y' in ds.tensors: tensors['y'] = ds.tensors['y'] else: tensors = None tensor_infos = {'x_cat': TensorInfo(cat_sizes=[x_cat_sizes[i]])} if 'y' in ds.tensor_infos: tensor_infos['y'] = ds.tensor_infos['y'] enc_layers.append(enc.fit(DictDataset(tensors, tensor_infos, ds.device, ds.n_samples))) return EncodingLayer(enc_layers, self.enc_output_name, self) # def split_off_dynamic(self): # splits = [f.split_off_dynamic() for f in self.single_encoder_fitters] # s0 = [s[0] for s in splits] # s1 = [s[1] for s in splits] # # todo class EncodingFactory(FitterFactory): def __init__(self, single_encoder_factory, enc_output_name: str = 'x_cont'): super().__init__() self.single_encoder_factory = single_encoder_factory self.enc_output_name = enc_output_name def _create(self, tensor_infos): if 'x_cat' not in tensor_infos or tensor_infos['x_cat'].get_n_features() == 0: return IdentityFitter() x_cat_sizes = tensor_infos['x_cat'].get_cat_sizes().numpy() single_encoder_fitters = [self.single_encoder_factory.create({'x_cat': TensorInfo(cat_sizes=[cat_sz]), 'y': tensor_infos['y']}) for cat_sz in x_cat_sizes] return EncodingFitter(single_encoder_fitters, enc_output_name=self.enc_output_name) # ----- One-Hot ------ class SingleOneHotLayer(Layer): def __init__(self, fitter: Fitter, onoff, 
cat_size, use_missing_zero: bool, use_1d_binary_onehot: bool): super().__init__(fitter=fitter) self.onoff = onoff self.cat_size = cat_size self.use_missing_zero = use_missing_zero self.use_1d_binary_onehot = use_1d_binary_onehot def _binary(self, x_cat, values): src = torch.as_tensor(values, dtype=torch.float32, device=x_cat.device) # add other dimensions to match those of x_cat src = src[tuple([None] * (x_cat.dim()-1) + [slice(None)])].expand(*(list(x_cat.shape[:-1]) + [-1])) return src.gather(dim=-1, index=x_cat) def _multiple(self, x_cat, on_value, off_value): cont_shape = (*x_cat.shape[:-1], self.cat_size) cont = torch.full(cont_shape, off_value, dtype=torch.float32, device=x_cat.device) src = torch.full([1] * x_cat.dim(), on_value, dtype=torch.float32, device=x_cat.device).expand(*x_cat.shape) cont.scatter_(dim=-1, index=x_cat, src=src) return cont def forward_tensors(self, tensors): x_cat = tensors['x_cat'] # default_slices = [slice(None)] * (x_cat_sq.dim() - 1) on_value = self.onoff[0] off_value = self.onoff[1] if self.use_missing_zero: if self.cat_size == 2 and self.use_1d_binary_onehot: # should not be used with use_missing_zero anyway cont = self._binary(x_cat, [-on_value, on_value]) elif self.cat_size == 3 and self.use_1d_binary_onehot: cont = self._binary(x_cat, [off_value, on_value, -on_value]) else: cont = self._multiple(x_cat, on_value=on_value, off_value=off_value) # cont = cont[[slice(None)] * (x_cat.dim() - 1) + [slice(1, None)]] cont = cont[..., 1:] # cut off the dimension for the missing value one-hot else: if self.cat_size == 2 and self.use_1d_binary_onehot: cont = self._binary(x_cat, [off_value, on_value]) else: cont = self._multiple(x_cat, on_value=on_value, off_value=off_value) return utils.update_dict(tensors, {'x_cont': cont}, remove_keys='x_cat') class SingleOneHotFitter(Fitter): def __init__(self, use_missing_zero: bool, bin_onoff: Tuple[float, float], multi_onoff: Tuple[float, float], use_1d_binary_onehot: bool): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont', 'x_cat']) self.use_missing_zero = use_missing_zero self.bin_onoff = bin_onoff self.multi_onoff = multi_onoff self.use_1d_binary_onehot = use_1d_binary_onehot def forward_tensor_infos(self, tensor_infos): cat_size = tensor_infos['x_cat'].get_cat_sizes()[0].item() if self.use_missing_zero: cat_size -= 1 if cat_size == 2 and self.use_1d_binary_onehot: cat_size = 1 return utils.update_dict(tensor_infos, {'x_cont': TensorInfo(feat_shape=[cat_size])}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: cat_size = ds.tensor_infos['x_cat'].get_cat_sizes()[0].item() is_binary = cat_size - int(self.use_missing_zero) <= 2 return SingleOneHotLayer(self, onoff=self.bin_onoff if is_binary else self.multi_onoff, cat_size=cat_size, use_missing_zero=self.use_missing_zero, use_1d_binary_onehot=self.use_1d_binary_onehot) class SingleOneHotFactory(SingleEncodingFactory): def __init__(self, use_missing_zero=True, bin_onoff=(1.0, 0.0), multi_onoff=(1.0, 0.0), min_one_hot_cat_size=0, max_one_hot_cat_size=-1, max_one_hot_size_by_n_classes=False, use_1d_binary_onehot: bool = True, **config): super().__init__(create_fitter=lambda tensor_infos: SingleOneHotFitter(use_missing_zero=use_missing_zero, bin_onoff=bin_onoff, multi_onoff=multi_onoff, use_1d_binary_onehot=use_1d_binary_onehot), min_cat_size=min_one_hot_cat_size, max_cat_size=max_one_hot_cat_size) self.max_one_hot_size_b_n_classes = max_one_hot_size_by_n_classes def apply_on(self, cat_size: int, n_classes: int): if 
self.max_one_hot_size_b_n_classes: return cat_size <= n_classes else: return super().apply_on(cat_size, n_classes) # ------ Embedding -------- class SingleEmbeddingLayer(Layer): def __init__(self, emb: Variable): super().__init__(new_tensor_infos={'x_cont': TensorInfo(feat_shape=[emb.shape[-1]])}, remove_keys='x_cat') # emb.shape should be (parallel dims) x cat_size x emb_size # print(f'{emb.numel()=}') self.emb = emb def forward_tensors(self, tensors): x_cat = tensors['x_cat'] # print(f'{x_cat.shape=}') x_cat = x_cat.squeeze(-1) # squeeze feature dimension, we assume that there is only one feature parallel_dims = self.emb.dim() - 2 # subtract category and feature dimension # idxs = [] # for dim in range(parallel_dims): # # todo: could cache these and not create them newly every time? # view_shape = [1] * (parallel_dims+1) # view_shape[dim] = self.emb.shape[dim] # idxs.append(torch.arange(self.emb.shape[dim], dtype=torch.long, device=self.emb.device).view(*view_shape)) # idxs.append(x_cat) # x_cont = self.emb[idxs] # code using index_select which is faster than fancy indexing # put all parallel dimensions into the batch dimension cat_size = self.emb.shape[-2] n_flattened_idxs = cat_size n_batch = x_cat.shape[-1] # shape: (n_parallel * cat_size) x n_features emb_flat = self.emb.reshape(-1, self.emb.shape[-1]) while x_cat.dim() > 1: # merge batch dimension with all parallel dimensions n_parallel = x_cat.shape[-2] parallel_idxs = torch.arange(x_cat.shape[-2], dtype=torch.long, device=self.emb.device) # add offsets to parallel dimension x_cat = x_cat + n_flattened_idxs * parallel_idxs[:, None] # merge parallel and batch dimension x_cat = x_cat.reshape(*x_cat.shape[:-2], -1) # now the indexes span a larger range n_flattened_idxs *= n_parallel # for dim in range(parallel_dims): # # todo: # pass # print(f'{x_cat.shape=}, {emb_flat.shape=}, {n_flattened_idxs=}, {x_cat.max().item()=}') x_cont = emb_flat.index_select(0, x_cat) x_cont = x_cont.reshape(*self.emb.shape[:-2], n_batch, self.emb.shape[-1]) # print(f'{torch.norm(x_cont)=}, {torch.norm(x_cont-x_cont_other)=}') return utils.update_dict(tensors, {'x_cont': x_cont}, remove_keys='x_cat') def _stack(self, layers: List['SingleEmbeddingLayer']): return SingleEmbeddingLayer(Variable.stack([layer.emb for layer in layers])) def fastai_emb_size_fn(n_cat: int): return min(600, round(1.6 * n_cat ** 0.56)) class ConstantFunction: def __init__(self, value: Any): self.value = value def __call__(self, *args, **kwargs) -> Any: return self.value def get_embedding_size(fn: Optional[Union[int, str, Callable[[int], int]]]) -> Callable[[int], int]: if fn is None: fn = 'fastai' if isinstance(fn, int): return ConstantFunction(value=fn) elif isinstance(fn, str): if fn == 'howard' or fn == 'fastai': # heuristic by Jeremy Howard in fastai return fastai_emb_size_fn else: raise ValueError(f'Unknown embedding_size name "{fn}"') else: return fn class SingleEmbeddingFitter(Fitter): def __init__(self, embedding_size=None, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont', 'x_cat']) # default option is taken from fastai2 self.size_func = get_embedding_size(embedding_size) if embedding_size is not None \ else fastai_emb_size_fn self.emb_init_mode = config.get('emb_init_mode', 'normal') self.emb_init_gain = config.get('emb_init_gain', 1.0) self.emb_reduce_norm = config.get('emb_reduce_norm', False) self.emb_lr_factor = config.get('emb_lr_factor', 1.0) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: cat_sz = 
tensor_infos['x_cat'].get_cat_sizes()[0].item() return cat_sz * self.size_func(cat_sz) def forward_tensor_infos(self, tensor_infos): new_info = TensorInfo(feat_shape=[self.size_func(tensor_infos['x_cat'].get_cat_sizes()[0].item())]) return utils.update_dict(tensor_infos, {'x_cont': new_info}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: cat_size = ds.tensor_infos['x_cat'].get_cat_sizes()[0].item() emb_size = self.size_func(cat_size) if self.emb_init_mode == 'normal': emb = torch.randn(cat_size, emb_size, device=ds.device) elif self.emb_init_mode == 'uniform': emb = 2*torch.rand(cat_size, emb_size, device=ds.device) - 1 elif self.emb_init_mode == 'kaiming-uniform-t': # as in the RTDL nets, use 1/sqrt(out_features) emb = (1./np.sqrt(emb_size)) * (2 * torch.rand(cat_size, emb_size, device=ds.device) - 1) emb[0, :] = 0.0 # set unknown/missing category to 0 else: raise ValueError(f'Unknown emb_init_mode: {self.emb_init_mode}') # todo: should emb_reduce_norm be used differently as for NTK param (Adam vs not Adam)? emb_factor = self.emb_init_gain * (np.sqrt(1.0/emb_size) if self.emb_reduce_norm else 1.0) return SingleEmbeddingLayer(Variable(emb_factor * emb, trainable=True, hyper_factors={'lr': self.emb_lr_factor})) class SingleEmbeddingFactory(SingleEncodingFactory): def __init__(self, embedding_size=None, min_embedding_cat_size=0, max_embedding_cat_size=-1, **config): super().__init__(create_fitter=lambda tensor_infos: SingleEmbeddingFitter(embedding_size=embedding_size, **config), min_cat_size=min_embedding_cat_size, max_cat_size=max_embedding_cat_size) # ------- Target Encoding (a kind of fixed embedding) ------- class SingleTargetEncodingFitter(Fitter): def __init__(self, n_classes, **config): super().__init__(is_individual=False, modified_tensors=['x_cont', 'x_cat']) self.n_classes = n_classes def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_classes = tensor_infos['y'].get_cat_sizes()[0].item() emb_sz = 1 if n_classes <= 2 else n_classes cat_sz = tensor_infos['x_cat'].get_cat_sizes()[0].item() return emb_sz * cat_sz def forward_tensor_infos(self, tensor_infos): new_info = TensorInfo(feat_shape=[1 if self.n_classes <= 2 else self.n_classes]) return utils.update_dict(tensor_infos, {'x_cont': new_info}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: x_cat = ds.tensors['x_cat'].squeeze(-1) x_cat_size = ds.tensor_infos['x_cat'].get_cat_sizes()[0].item() y = ds.tensors['y'] y_cat_sizes = ds.tensor_infos['y'].get_cat_sizes().numpy() if y_cat_sizes[0] > 2: # multi-class classification y = F.one_hot(y[:, 0], num_classes=y_cat_sizes[0]).float() elif y_cat_sizes[0] == 2: # binary classification y = y.float() # convert int to float prior = y.mean(dim=-2) # mean over batch dimension sums = torch.zeros(x_cat_size, y.shape[-1], device=y.device) # In the following, scatter_add_ executes sums[x_cat[:, i][j], k] += y[j, k] # see also https://discuss.pytorch.org/t/pytorch-equivalent-to-tf-unsorted-segment-sum/25275/5 sums.scatter_add_(0, x_cat[:,None].expand(-1, y.shape[-1]), y) frequencies = torch.bincount(x_cat, minlength=x_cat_size) # could also give the prior a different weight, this is just an option emb = (sums + prior[None, :]) / (frequencies[:, None] + 1) return SingleEmbeddingLayer(Variable(emb, trainable=False)) class SingleTargetEncodingFactory(SingleEncodingFactory): def __init__(self, min_targetenc_cat_size=0, max_targetenc_cat_size=-1, **config): create_fitter = lambda tensor_infos: 
SingleTargetEncodingFitter(n_classes=tensor_infos['y'].get_cat_sizes()[0].item()) super().__init__(create_fitter=create_fitter, min_cat_size=min_targetenc_cat_size, max_cat_size=max_targetenc_cat_size) # ------- Label Encoding ------- class SingleOrdinalEncodingLayer(Layer): def __init__(self, fitter, cat_size: int, permute_ordinal_encoding: bool = False): super().__init__(fitter=fitter) self.cat_size = cat_size self.permute_ordinal_encoding = permute_ordinal_encoding self.perm = None if permute_ordinal_encoding: self.perm = Variable(torch.randperm(cat_size, dtype=torch.long), trainable=False) def forward_tensors(self, tensors): x_cat = tensors['x_cat'] if self.permute_ordinal_encoding: x_cat = self.perm[x_cat] return utils.update_dict(tensors, {'x_cont': x_cat.type(torch.float32)}, remove_keys='x_cat') class SingleOrdinalEncodingFitter(Fitter): def __init__(self, permute_ordinal_encoding: bool = False, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont', 'x_cat']) self.permute_ordinal_encoding = permute_ordinal_encoding def forward_tensor_infos(self, tensor_infos): return utils.update_dict(tensor_infos, {'x_cont': tensor_infos['x_cat']}, remove_keys='x_cat') def _fit(self, ds: DictDataset) -> Layer: return SingleOrdinalEncodingLayer(self, cat_size=ds.tensor_infos['x_cat'].get_cat_sizes()[0].item(), permute_ordinal_encoding=self.permute_ordinal_encoding) class SingleOrdinalEncodingFactory(SingleEncodingFactory): def __init__(self, min_labelenc_cat_size=0, max_labelenc_cat_size=-1, **config): super().__init__(create_fitter=lambda tensor_infos: SingleOrdinalEncodingFitter(**config), min_cat_size=min_labelenc_cat_size, max_cat_size=max_labelenc_cat_size) ================================================ FILE: pytabkit/models/nn_models/models.py ================================================ import copy import functools from typing import Dict, Tuple import numpy as np import torch from sklearn.preprocessing import QuantileTransformer from pytabkit.models.nn_models.activations import ActivationFactory from pytabkit.models.nn_models.base import FitterFactory, SequentialFitter, ResidualFitter, Fitter, RenameTensorFactory, \ FunctionFactory, \ SequentialFactory, FilterTensorsFactory, ConcatParallelFactory from pytabkit.models.nn_models.categorical import EncodingFactory, SingleOneHotFactory, SingleEmbeddingFactory, \ SingleOrdinalEncodingFactory, \ SingleTargetEncodingFactory from pytabkit.models.nn_models.nn import DropoutFitter, WeightFitter, BiasFitter, ScaleFitter, NoiseFitter, \ PLREmbeddingsFactory, ScaleFactory, \ PeriodicEmbeddingsFactory, RFFeatureImportanceFactory, LabelSmoothingFactory, StochasticLabelNoiseFactory, \ StochasticGateFactory, FeatureImportanceFactory, FixedWeightFactory, AntisymmetricInitializationFactory, \ NormalizeOutputFactory, ClampOutputFactory from pytabkit.models.nn_models.pipeline import MedianCenterFactory, RobustScaleFactory, MeanCenterFactory, \ GlobalScaleNormalizeFactory, \ L2NormalizeFactory, L1NormalizeFactory, ThermometerCodingFactory, CircleCodingFactory, SklearnTransformFactory, \ RobustScaleV2Factory, MinMaxScaleFactory from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo from pytabkit.models.utils import TabrQuantileTransformer class BlockFactory(FitterFactory): def __init__(self, out_features: int, block_str: str = 'w-b-a', **config): super().__init__() # could also make this a SequentialFactory if there were factories for all the individual fitters # or a LambdaFactory 
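# Illustration (an assumed example, not from the original code): block_str is a small
# mini-language parsed left to right in _create_transform() below: 'w' = weight, 'b' = bias,
# 'a' = activation, 'd' = dropout, 's' = scale, 'noise' = noise layer, and 'r' wraps the
# fitters built so far in a residual connection. E.g., BlockFactory(out_features=256,
# block_str='w-b-a-d') builds linear -> bias -> activation -> dropout, while NNFactory
# overrides the last layer with block_str='w-b' to get a plain affine output layer.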
self.block_str = block_str self.out_features = out_features self.config = config def _create_transform(self, tensor_infos): in_features = tensor_infos['x_cont'].get_n_features() fitters = [] for layer_str in self.block_str.split('-'): # todo: mixup layer? if layer_str in ['a', 'act', 'activation']: fitters.append(ActivationFactory(**self.config).create(tensor_infos).add_scope('act')) elif layer_str in ['d', 'drop', 'dropout']: fitters.append(DropoutFitter()) elif layer_str in ['w', 'weight']: fitters.append(WeightFitter(self.out_features, **self.config).add_scope('weight')) elif layer_str in ['b', 'bias']: fitters.append(BiasFitter(in_features=in_features, **self.config).add_scope('bias')) # elif layer_str == 'D': # alpha-dropout for self-normalizing neural networks # pass # todo elif layer_str in ['s', 'scale']: fitters.append(ScaleFitter(**self.config).add_scope('scale')) # elif layer_str == 'n': # pass # todo: batchnorm elif layer_str in ['noise']: fitters.append(NoiseFitter(**self.config)) elif layer_str in ['r', 'res', 'residual']: out_tensor_infos = SequentialFitter(fitters).forward_tensor_infos(tensor_infos) if np.array_equal(tensor_infos['x_cont'].get_feat_shape(), out_tensor_infos['x_cont'].get_feat_shape()): # can use residual connection fitters = [ResidualFitter(SequentialFitter(fitters))] else: raise ValueError(f'BlockFactory: Unknown layer string {layer_str}') tensor_infos = fitters[-1].forward_tensor_infos(tensor_infos) return SequentialFitter(fitters), tensor_infos def smooth_clip_func(x, max_abs_value: float = 3.0): return x / (1 + (1 / (max_abs_value ** 2)) * x ** 2).sqrt() def tanh_clip_func(x): return 5 * torch.tanh(0.2 * x) class PreprocessingFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config def _create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: tfm_factories = [] for tfm in self.config.get('tfms', []): if tfm == 'one_hot': tfm_factories.append(EncodingFactory(SingleOneHotFactory(**self.config), enc_output_name='x_one_hot')) tfm_factories.append(RenameTensorFactory(old_name='x_one_hot', new_name='x_cont')) elif tfm == 'median_center': tfm_factories.append(MedianCenterFactory(**self.config)) elif tfm == 'robust_scale': tfm_factories.append(RobustScaleFactory(**self.config)) elif tfm == 'smooth_clip': tfm_factories.append(FunctionFactory(functools.partial(smooth_clip_func, max_abs_value=self.config.get( 'smooth_clip_max_abs_value', 3.0)))) elif tfm == 'tanh_5_clip': tfm_factories.append(FunctionFactory(tanh_clip_func)) elif tfm == 'mean_center': tfm_factories.append(MeanCenterFactory(**self.config)) elif tfm == 'embedding': tfm_factories.append(EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb')) elif tfm == 'global_scale_normalize': tfm_factories.append(GlobalScaleNormalizeFactory(**self.config)) elif tfm == 'l2_normalize': tfm_factories.append(L2NormalizeFactory(**self.config)) elif tfm == 'l1_normalize': tfm_factories.append(L1NormalizeFactory(**self.config)) elif tfm == 'minmax': tfm_factories.append(MinMaxScaleFactory(**self.config)) elif tfm == 'thermometer_coding': tfm_factories.append(ThermometerCodingFactory(**self.config)) elif tfm == 'circle_coding': tfm_factories.append(CircleCodingFactory(**self.config)) elif tfm == 'ordinal_encoding': tfm_factories.append(EncodingFactory(SingleOrdinalEncodingFactory(**self.config))) elif tfm == 'target_encoding': tfm_factories.append(EncodingFactory(SingleTargetEncodingFactory(**self.config))) elif tfm == 'kdi': from kditransform import KDITransformer tfm =
KDITransformer(alpha=self.config.get('kdi_alpha', 1.0), output_distribution=self.config.get('kdi_output_distribution', 'normal'), random_state=0) tfm_factories.append(SklearnTransformFactory(tfm)) elif tfm == 'quantile': tfm = QuantileTransformer(output_distribution=self.config.get('quantile_output_distribution', 'normal'), random_state=0) tfm_factories.append(SklearnTransformFactory(tfm)) elif tfm == "quantile_tabr": tfm = TabrQuantileTransformer(random_state=0) tfm_factories.append(SklearnTransformFactory(tfm)) else: raise NotImplementedError(f"Transformation '{tfm}' is not implemented.") # old interface, using 'tfms' is preferred if self.config.get('use_one_hot', False): tfm_factories.append(EncodingFactory(SingleOneHotFactory(**self.config))) if self.config.get('use_median_center', False): tfm_factories.append(MedianCenterFactory(**self.config)) if self.config.get('use_robust_scale', False): tfm_factories.append(RobustScaleFactory(**self.config)) if self.config.get('use_robust_scale_v2', False): tfm_factories.append(RobustScaleV2Factory(**self.config)) if self.config.get('use_smooth_clip', False): tfm_factories.append(FunctionFactory(lambda x: x / (1 + (1 / 9) * x ** 2).sqrt())) if self.config.get('use_mean_center', False): tfm_factories.append(MeanCenterFactory(**self.config)) if self.config.get('use_embedding', False): tfm_factories.append(EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb')) if self.config.get('use_global_scale_normalize', False): tfm_factories.append(GlobalScaleNormalizeFactory(**self.config)) return SequentialFactory(tfm_factories).add_scope('tfms').create(tensor_infos=tensor_infos) class NNFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config if 'use_embedding' not in config: # dirty fix to not miss out on categorical values here, # but do not use this as a default in PreprocessingFactory since that is also used for GBDTs # that can have native categorical processing capabilities self.config['use_embedding'] = True def _create_transform(self, tensor_infos: Dict[str, TensorInfo]) -> Tuple[Fitter, Dict[str, TensorInfo]]: y_cat_sizes = tensor_infos['y'].get_cat_sizes().numpy() n_classes = y_cat_sizes[0] factories = [] net_factories = [] if 'one_hot' in self.config.get('tfms', []) or self.config.get('use_one_hot', False): # do it already here so it can get done once instead of per batch factories.append(EncodingFactory(SingleOneHotFactory(**self.config), enc_output_name='x_one_hot')) prep_factory = PreprocessingFactory(**self.config) num_emb_type = self.config.get('num_emb_type', None) num_emb_config = copy.copy(self.config) if num_emb_type is None or num_emb_type == 'ignore': pass # don't modify the other configuration parameters elif num_emb_type == 'none': num_emb_config['use_plr_embeddings'] = False num_emb_config['use_periodic_emb'] = False elif num_emb_type == 'pl': num_emb_config['use_plr_embeddings'] = True num_emb_config['plr_use_densenet'] = False num_emb_config['plr_use_cos_bias'] = False num_emb_config['plr_act_name'] = 'linear' elif num_emb_type == 'plr': num_emb_config['use_plr_embeddings'] = True num_emb_config['plr_use_densenet'] = False num_emb_config['plr_use_cos_bias'] = False num_emb_config['plr_act_name'] = 'relu' elif num_emb_type == 'pbld': num_emb_config['use_plr_embeddings'] = True num_emb_config['plr_use_densenet'] = True num_emb_config['plr_use_cos_bias'] = True num_emb_config['plr_act_name'] = 'linear' elif num_emb_type == 'pblrd': num_emb_config['use_plr_embeddings'] = True
num_emb_config['plr_use_densenet'] = True num_emb_config['plr_use_cos_bias'] = True num_emb_config['plr_act_name'] = 'relu' else: raise ValueError(f'Unknown numerical embedding type: {num_emb_type=}') if num_emb_config.get('use_plr_embeddings', False): plr_factory = PLREmbeddingsFactory(**num_emb_config).add_scope('plr') if num_emb_config.get('use_plr_scale', False): plr_factory = SequentialFactory([ScaleFactory(**num_emb_config), plr_factory]) num_factory = SequentialFactory([ FilterTensorsFactory(include_keys=['x_cont']), prep_factory, plr_factory ]) cat_factory = SequentialFactory([ FilterTensorsFactory(exclude_keys=['x_cont']), # EncodingFactory(SingleOneHotFactory(**self.config)), prep_factory, # EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb') ]) factories.append(ConcatParallelFactory([num_factory, cat_factory])) elif num_emb_config.get('use_periodic_emb', False): periodic_emb_factory = PeriodicEmbeddingsFactory(**num_emb_config).add_scope('periodic_emb') num_factory = SequentialFactory([ FilterTensorsFactory(include_keys=['x_cont']), prep_factory, periodic_emb_factory ]) cat_factory = SequentialFactory([ FilterTensorsFactory(exclude_keys=['x_cont']), # EncodingFactory(SingleOneHotFactory(**self.config)), prep_factory, # EncodingFactory(SingleEmbeddingFactory(**self.config)).add_scope('emb') ]) factories.append(ConcatParallelFactory([num_factory, cat_factory])) else: factories.append(prep_factory) if self.config.get('use_rf_importances', False): factories.append(RFFeatureImportanceFactory()) if self.config.get('use_ls', False) and n_classes > 0: factories.append(LabelSmoothingFactory(**self.config)) if self.config.get('use_sln', False) and n_classes > 0: factories.append(StochasticLabelNoiseFactory()) if self.config.get('use_sg', False) and n_classes > 0: factories.append(StochasticGateFactory()) if self.config.get('add_importance_layer', False): factories.append(FeatureImportanceFactory()) if self.config.get('add_fixed_weight_layer', False): factories.append(FixedWeightFactory()) hidden_sizes = self.config.get('hidden_sizes', [256] * 3) if hidden_sizes == 'rectangular': hidden_sizes = [self.config.get('hidden_width', 256)] * self.config.get('n_hidden_layers', 3) train_metric_name = self.config.get('train_metric_name', None) if isinstance(train_metric_name, str) and train_metric_name.startswith('multi_pinball('): out_factor = train_metric_name.count(',') + 1 else: out_factor = 1 out_sizes = hidden_sizes + [len(y_cat_sizes) * out_factor if n_classes == 0 else n_classes] for i in range(len(out_sizes)): layer_position = 'middle' block_scope_2 = f'layer-{i}' config = self.config if i + 1 == len(out_sizes): config = utils.join_dicts(config, {'block_str': 'w-b'}, config.get('last_layer_config', {})) layer_position = 'last' elif i == 0: config = utils.join_dicts(config, config.get('first_layer_config', {})) if config.get('add_front_scale', False): config['block_str'] = 's-' + config.get('block_str', 'w-b-a-d') first_layer_lr_factor = config.get('first_layer_lr_factor', None) if first_layer_lr_factor is not None: config['weight_lr_factor'] = config.get('weight_lr_factor', 1.0) * first_layer_lr_factor config['bias_lr_factor'] = config.get('bias_lr_factor', 1.0) * first_layer_lr_factor layer_position = 'first' block_scope = layer_position + '_layer' net_factories.append( BlockFactory(out_features=out_sizes[i], layer_position=layer_position, **config).add_scope(block_scope).add_scope(block_scope_2)) factories.append(SequentialFactory(net_factories).add_scope('net')) if 
self.config.get('use_antisymmetric_initialization', False): factories = [AntisymmetricInitializationFactory(SequentialFactory(factories), **self.config)] if self.config.get('output_factor', 1.0) != 1.0: factories.append(FunctionFactory(lambda x, c=self.config['output_factor']: c * x)) if self.config.get('normalize_output', False): factories.append(NormalizeOutputFactory(**self.config)) if self.config.get('clamp_output', False): # use clamp after normalization! factories.append(ClampOutputFactory(**self.config)) factory = SequentialFactory(factories) return factory.create_transform(tensor_infos) ================================================ FILE: pytabkit/models/nn_models/nn.py ================================================ import copy from typing import Dict import numpy as np import torch import torch.nn.functional as F from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import Fitter, Variable, WeightLayer, BiasLayer, ScaleLayer, FitterFactory, Layer, \ TrainContext, sub_scope_context, SequentialFitter, SequentialLayer, FunctionLayer from pytabkit.models.torch_utils import gauss_cdf class WeightFitter(Fitter): def __init__(self, out_features, **config): super().__init__(modified_tensors=['x_cont']) self.out_features = out_features self.weight_init_mode = config.get('weight_init_mode', 'normal') self.weight_init_gain = config.get('weight_init_gain', 1.0) self.weight_lr_factor = config.get('weight_lr_factor', 1.0) self.weight_l2_factor = config.get('weight_l2_factor', 1.0) self.weight_l1_factor = config.get('weight_l1_factor', 1.0) self.weight_wd_factor = config.get('weight_wd_factor', 1.0) # use abc parameterization here? 
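# Illustration (an assumed example with in_features=256, added for clarity): the
# weight_param options handled in _fit() below decide where the 1/sqrt(fan_in) factor
# lives. With weight_param='standard', sqrt(1/256) is folded into init_factor and the
# forward-pass factor stays at weight_gain; with weight_param='ntk', init_factor stays at
# weight_init_gain and the forward factor becomes weight_gain * sqrt(1/256) = weight_gain / 16.
# The initial output distribution is the same in both cases, but gradients w.r.t. the
# stored weights (and hence effective per-layer learning rates) differ.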
# todo: ntk param can imply different learning rate factors for different optimizers # also, the influence of Adam's epsilon can be different # maybe this can be resolved using abc-style parameterization self.use_ntk_param = config.get('use_ntk_param', False) self.use_ntk_param_v2 = config.get('use_ntk_param_v2', False) self.use_ntk_param_v3 = config.get('use_ntk_param_v3', False) self.weight_param = config.get('weight_param', 'standard') if self.use_ntk_param: raise ValueError(f'use_ntk_param is discontinued, use weight_param="ntk" instead') if self.use_ntk_param_v2: raise ValueError(f'use_ntk_param_v2 is discontinued, use weight_param="ntk-v2" instead') if self.use_ntk_param_v3: raise ValueError(f'use_ntk_param_v3 is discontinued, use weight_param="ntk-v3" instead') self.use_norm_weight = config.get('use_norm_weight', False) self.norm_weight_transpose = config.get('norm_weight_transpose', False) self.layer_position = config.get('layer_position', None) self.weight_gain = config.get('weight_gain', 1.0) super().__init__(needs_tensors=self.weight_init_mode in ['std']) # todo: adjust for some weight init modes def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self.out_features * self._get_n_values(tensor_infos, ['x_cont']) def forward_tensor_infos(self, tensor_infos): return utils.update_dict(tensor_infos, {'x_cont': TensorInfo(feat_shape=[self.out_features])}) def _fit(self, ds: DictDataset): in_features = ds.tensor_infos['x_cont'].get_n_features() init_factor = self.weight_init_gain * np.sqrt(1.0 / in_features) lr_factor = self.weight_lr_factor wd_factor = self.weight_wd_factor weight_gain = self.weight_gain l2_factor = self.weight_l2_factor l1_factor = self.weight_l1_factor if self.weight_param == 'xavier': # todo: this is not a parametrization, use weight_init_mode instead init_factor = self.weight_init_gain * np.sqrt(2.0 / (in_features + self.out_features)) elif self.weight_param == 'ntk' or self.weight_param == 'ntk-v3': weight_gain = self.weight_gain * np.sqrt(1.0 / in_features) init_factor = self.weight_init_gain elif self.weight_param == 'ntk-old': lr_factor *= weight_gain * np.sqrt(1.0 / in_features) init_factor *= weight_gain weight_gain = 1.0 elif self.weight_param == 'ntk-v2': lr_factor = self.weight_lr_factor * weight_gain * np.sqrt(1.0 / in_features) init_factor *= weight_gain weight_gain = 1.0 # this is chosen because wd is multiplied by lr when performing weight decay, # and the effective wd step size should not scale with in_features wd_factor = self.weight_wd_factor * np.sqrt(in_features) / weight_gain # print(f'{self.weight_gain=}, {lr_factor=}, {wd_factor=}') elif self.weight_param == 'ntk-adam': init_factor = self.weight_init_gain * self.weight_gain * np.sqrt(1.0 / in_features) lr_factor = self.weight_lr_factor * self.weight_gain / in_features / np.sqrt(self.out_features) wd_factor = self.weight_wd_factor l2_factor = self.weight_l2_factor * np.sqrt(1.0 / self.out_features) l1_factor = self.weight_l1_factor * np.sqrt(1.0 / self.out_features) weight_gain = 1.0 elif self.weight_param == 'mup-adam': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor / in_features elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-adam: {self.layer_position}')
elif self.weight_param == 'mup-sgd': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor * self.out_features elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-sgd: {self.layer_position}') elif self.weight_param == 'mup-adam-custom': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor / in_features elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor / in_features elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-adam-custom: {self.layer_position}') elif self.weight_param == 'mup-adam-custom-2': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" # with custom weight decay factors if self.layer_position == 'first': lr_factor = self.weight_lr_factor / in_features wd_factor = self.weight_wd_factor * np.sqrt(in_features) elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor / in_features wd_factor = self.weight_wd_factor * np.sqrt(in_features) elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features # unclear if this wd is the right one, # but here the lr_factor is already on the scale of the initialization wd_factor = self.weight_wd_factor else: raise ValueError(f'Unknown layer_position for mup-adam-custom-2: {self.layer_position}') elif self.weight_param == 'mup-sgd-custom': # following Table 3 in "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" if self.layer_position == 'first': lr_factor = self.weight_lr_factor elif self.layer_position == 'middle': lr_factor = self.weight_lr_factor elif self.layer_position == 'last': init_factor = self.weight_init_gain / in_features lr_factor = self.weight_lr_factor / in_features else: raise ValueError(f'Unknown layer_position for mup-sgd-custom: {self.layer_position}') elif self.weight_param == 'standard': pass # standard parameterization else: raise ValueError(f'Unknown weight_param "{self.weight_param}"') # pytorch default is # for weights: # kaiming_uniform from unif[-bound, bound] # bound = sqrt(3) * gain / sqrt(in_features) # gain = sqrt(2 / (1 + sqrt(5)^2)) = sqrt(1/3) # therefore bound = 1 / sqrt(in_features) # for biases it's also unif[-1/sqrt(in_features), 1/sqrt(in_features)] if self.weight_init_mode == 'normal': weight = torch.randn(in_features, self.out_features, device=ds.device) elif self.weight_init_mode == 'uniform': # include np.sqrt(3) to ensure variance = 1 weight = np.sqrt(3) * (2 * torch.rand(in_features, self.out_features, device=ds.device) - 1) elif self.weight_init_mode == 'zeros' or self.weight_init_mode == 'zero': weight = torch.zeros(in_features, self.out_features, device=ds.device) elif self.weight_init_mode == 'std': weight = torch.randn(in_features, self.out_features, device=ds.device) x = ds.tensors['x_cont'] weight = weight / x.matmul(weight_gain * init_factor * weight).std(dim=-2, correction=0, keepdim=True) elif self.weight_init_mode == 'sqmom': weight =
torch.randn(in_features, self.out_features, device=ds.device) x = ds.tensors['x_cont'] weight = weight / x.matmul(weight_gain * init_factor * weight).square().sum(dim=-2, keepdim=True).sqrt() else: raise ValueError(f'Unknown weight_init_mode: {self.weight_init_mode}') # print(f'{repr(weight)=}') # print(f'{hash_tensor(weight)=}') if self.use_norm_weight: factor = np.sqrt(self.out_features / in_features) if self.norm_weight_transpose else 1.0 return NormWeightLayer(Variable(init_factor * weight, trainable=True, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l2': l2_factor, 'l1': l1_factor}), factor=weight_gain * factor, fitter=self, transpose=self.norm_weight_transpose) else: return WeightLayer(Variable(init_factor * weight, trainable=True, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l2': l2_factor, 'l1': l1_factor}), factor=weight_gain) class BiasFitter(Fitter): def __init__(self, **config): super().__init__(modified_tensors=['x_cont']) self.in_features = config.get('in_features', None) self.bias_init_mode = config.get('bias_init_mode', 'zeros') self.bias_init_gain = config.get('bias_init_gain', 1.0) self.bias_lr_factor = config.get('bias_lr_factor', 1.0) self.bias_l1_reg_factor = config.get('bias_l1_reg_factor', 1.0) self.bias_l2_reg_factor = config.get('bias_l2_reg_factor', 1.0) self.bias_wd_factor = config.get('bias_wd_factor', 1.0) self.bias_param = config.get('bias_param', 'standard') self.layer_position = config.get('layer_position', None) self.bias_gain = config.get('bias_gain', 1.0) # todo: adjust for some bias init modes super().__init__( needs_tensors=self.bias_init_mode in ['he+5', 'mean', 'neg-uniform-dynamic', 'neg-uniform-dynamic-2', 'normal-dynamic']) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def heplus_bias(self, x, n_simplex): idxs = torch.randint(0, x.shape[0], size=(x.shape[1], n_simplex), device=x.device) simplex_weights = torch.distributions.Exponential(1.0).sample((x.shape[1], n_simplex)) simplex_weights = simplex_weights.to(x.device) simplex_weights /= simplex_weights.sum(dim=1)[:, None] out_selected = torch.stack([x[idxs[:, i], torch.arange(x.shape[1], device=x.device)] for i in range(n_simplex)], dim=1) return -(out_selected * simplex_weights).sum(dim=1) def _fit(self, ds: DictDataset): n_features = ds.tensor_infos['x_cont'].get_n_features() lr_factor = self.bias_lr_factor bias_gain = self.bias_gain l2_factor = self.bias_l2_reg_factor l1_factor = self.bias_l1_reg_factor wd_factor = self.bias_wd_factor if self.bias_param == 'mup-sgd' and self.layer_position == 'first': # corresponds to fan_out in Table 3 of "Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer" lr_factor *= self.in_features elif self.bias_param == 'ntk-adam': lr_factor = self.bias_lr_factor / np.sqrt(n_features) l1_factor = self.bias_l1_reg_factor / np.sqrt(n_features) l2_factor = self.bias_l2_reg_factor / np.sqrt(n_features) if self.bias_init_mode == 'zeros' or self.bias_init_mode == 'zero': bias = torch.zeros(n_features, device=ds.device) elif self.bias_init_mode == 'normal': bias = torch.randn(n_features, device=ds.device) elif self.bias_init_mode == 'uniform': # include np.sqrt(3) to ensure variance = 1 bias = np.sqrt(3) * (2 * torch.rand(n_features, device=ds.device) - 1) elif self.bias_init_mode == 'neg-uniform': bias = np.sqrt(3) * (-torch.rand(n_features, device=ds.device)) elif self.bias_init_mode == 'neg-uniform-dynamic': mean = ds.tensors['x_cont'].mean(dim=-2) std = 
ds.tensors['x_cont'].std(dim=-2, correction=0) bias = -std * (mean + np.sqrt(3) * torch.rand(n_features, device=ds.device)) elif self.bias_init_mode == 'neg-uniform-dynamic-2': mean = ds.tensors['x_cont'].mean(dim=-2) std = ds.tensors['x_cont'].std(dim=-2, correction=0) bias = -mean - std * np.sqrt(3) * torch.rand(n_features, device=ds.device) elif self.bias_init_mode == 'normal-dynamic': mean = ds.tensors['x_cont'].mean(dim=-2) std = ds.tensors['x_cont'].std(dim=-2, correction=0) bias = -mean + std * torch.randn(n_features, device=ds.device) elif self.bias_init_mode == 'he+5': bias = self.heplus_bias(ds.tensors['x_cont'], 5) elif self.bias_init_mode == 'mean': bias = -ds.tensors['x_cont'].mean(dim=-2) elif self.bias_init_mode == 'pytorch-default': bias = np.sqrt(1.0 / self.in_features) * (2 * torch.rand(n_features, device=ds.device) - 1) else: raise ValueError(f'Unknown bias_init_mode: {self.bias_init_mode}') # print(f'{repr(bias)=}') # print(f'{hash_tensor(bias)=}') return BiasLayer(Variable(self.bias_init_gain * bias[None, :] / bias_gain, trainable=True, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l1_reg': l1_factor, 'l2_reg': l2_factor}), factor=bias_gain) class ScaleFitter(Fitter): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) self.scale_init_gain = config.get('scale_init_gain', 1.0) self.scale_lr_factor = config.get('scale_lr_factor', 1.0) self.scale_wd_factor = config.get('scale_wd_factor', 1.0) self.scale_l2_reg_factor = config.get('scale_l2_reg_factor', 1.0) self.scale_l1_reg_factor = config.get('scale_l1_reg_factor', 1.0) self.scale_trainable = config.get('scale_trainable', True) self.scale_param = config.get('scale_param', 'standard') def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset): in_features = ds.tensor_infos['x_cont'].get_n_features() lr_factor = self.scale_lr_factor init_gain = self.scale_init_gain wd_factor = self.scale_wd_factor l2_reg_factor = self.scale_l2_reg_factor l1_reg_factor = self.scale_l1_reg_factor if self.scale_param == 'mup-adam-custom': lr_factor = self.scale_lr_factor / in_features elif self.scale_param == 'ntk-v2': lr_factor = self.scale_lr_factor / np.sqrt(in_features) elif self.scale_param == 'ntk-adam': lr_factor = self.scale_lr_factor / np.sqrt(in_features) elif self.scale_param == 'ntk-adam-v2': lr_factor = self.scale_lr_factor / np.sqrt(in_features) l2_reg_factor = self.scale_l2_reg_factor / np.sqrt(in_features) l1_reg_factor = self.scale_l1_reg_factor / np.sqrt(in_features) n_features = ds.tensor_infos['x_cont'].get_n_features() scale = init_gain * torch.ones(n_features, device=ds.device) return ScaleLayer(Variable(scale[None, :], trainable=self.scale_trainable, hyper_factors={'lr': lr_factor, 'wd': wd_factor, 'l2_reg': l2_reg_factor, 'l1_reg': l1_reg_factor})) class ScaleFactory(FitterFactory): def __init__(self, **config): super().__init__() self.config = config def _create(self, tensor_infos: Dict[str, TensorInfo]) -> Fitter: return ScaleFitter(**self.config) class DropoutLayer(Layer): def __init__(self): super().__init__() self.hyper_getter = self.context.hp_manager.register_hyper('p_drop', self.context.scope) def forward_cont(self, x): p_drop = self.hyper_getter() if p_drop == 0.0: return x return F.dropout(x, p_drop, training=self.training) class DropoutFitter(Fitter): def __init__(self): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) 
-> Layer: return DropoutLayer() class NoiseLayer(Layer): def __init__(self): super().__init__() self.sigma_getter = self.context.hp_manager.register_hyper('layer_noise_sigma', self.context.scope) def forward_cont(self, x): sigma = self.sigma_getter() if sigma == 0.0 or not self.training: return x return x + sigma * torch.randn_like(x) class NoiseFitter(Fitter): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: return NoiseLayer() # ------ Regression output rescaling / clamping ------- class ClampLayer(Layer): def __init__(self, low: Variable, high: Variable): super().__init__() self.low = low self.high = high def forward_cont(self, x): if self.training: return x else: return torch.min(torch.max(x, self.low), self.high) def _stack(self, layers): return ClampLayer(Variable.stack([l.low for l in layers]), Variable.stack([l.high for l in layers])) class ClampOutputFactory(Fitter, FitterFactory): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) self.config = config def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 2 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: y = TrainContext.get_global_context().hp_manager.get_more_info_dict()['trainval_ds'].tensors['y'] return ClampLayer(low=Variable(y.min(dim=-2, keepdim=True)[0], trainable=False), high=Variable(y.max(dim=-2, keepdim=True)[0], trainable=False)) class NormalizeOutputLayer(Layer): def __init__(self, mean: Variable, std: Variable): super().__init__() self.mean = mean self.std = std def forward_tensors(self, tensors): tensors = copy.copy(tensors) # shallow copy if self.training: assert 'y' in tensors tensors['y'] = (tensors['y'] - self.mean) / (self.std + 1e-30) else: tensors['x_cont'] = tensors['x_cont'] * self.std + self.mean return tensors def _stack(self, layers): return NormalizeOutputLayer(mean=Variable.stack([l.mean for l in layers]), std=Variable.stack([l.std for l in layers])) class NormalizeOutputFactory(Fitter, FitterFactory): def __init__(self, **config): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 2 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: y = TrainContext.get_global_context().hp_manager.get_more_info_dict()['trainval_ds'].tensors['y'] return NormalizeOutputLayer(mean=Variable(y.mean(dim=-2, keepdim=True), trainable=False), std=Variable(y.std(dim=-2, correction=0, keepdim=True), trainable=False)) class NormWeightLayer(Layer): def __init__(self, weight: Variable, factor: float, fitter: Fitter, transpose=False): super().__init__(fitter=fitter) self.weight = weight self.factor = factor self.transpose = transpose def forward_cont(self, x): return x.matmul(self.factor * self.weight / self.weight.norm(dim=-1 if self.transpose else -2, keepdim=True)) def _stack(self, layers): return NormWeightLayer(weight=Variable.stack([l.weight for l in layers]), factor=layers[0].factor, fitter=layers[0].fitter, transpose=layers[0].transpose) class FixedScaleFactory(Fitter, FitterFactory): def __init__(self, scale: torch.Tensor): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.scale = scale def _fit(self, ds: DictDataset) -> Layer: return ScaleLayer(Variable(self.scale, trainable=False)) class FeatureImportanceFactory(Fitter, FitterFactory): def __init__(self): 
super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: scale = TrainContext.get_global_context().hp_manager.get_more_info_dict()['feature_importances'][None, :] return ScaleLayer(Variable(scale.to(ds.device), trainable=False)) class FixedWeightFactory(Fitter, FitterFactory): def __init__(self): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: weight = TrainContext.get_global_context().hp_manager.get_more_info_dict()['fixed_weights'] return WeightLayer(Variable(weight.to(ds.device), trainable=False)) class RFFeatureImportanceFactory(Fitter, FitterFactory): def __init__(self): super().__init__(needs_tensors=True, is_individual=True, modified_tensors=['x_cont']) def _fit(self, ds: DictDataset) -> Layer: x = ds.tensors['x_cont'].cpu().numpy() y = ds.tensors['y'].cpu().numpy() n_estimators = 50 if ds.tensor_infos['y'].is_cont(): # assume it's regression model = RandomForestRegressor(n_estimators=n_estimators, n_jobs=1) else: # assume it's classification model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=1) model.fit(x, y) scale = torch.as_tensor(model.feature_importances_, dtype=torch.float32, device=ds.device) # print(f'RF feature importances: {scale}') scale *= np.sqrt(scale.shape[0]) / scale.norm(dim=-1) return ScaleLayer(Variable(scale[None, :], trainable=False)) # ------ Mixup and Label smoothing ------ class PLREmbeddingsFactory(Fitter, FitterFactory): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings def __init__(self, plr_sigma: float = 1.0, plr_hidden_1: int = 8, plr_hidden_2: int = 8, plr_lr_factor: float = 1.0, plr_lr_factor_1: float = 1.0, plr_lr_factor_2: float = 1.0, plr_wd_factor: float = 1.0, plr_act_name: str = 'relu', plr_use_densenet: bool = False, plr_use_cos_bias: bool = False, **config): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.plr_sigma = plr_sigma self.plr_hidden_1 = plr_hidden_1 self.plr_hidden_2 = plr_hidden_2 self.plr_lr_factor = plr_lr_factor self.plr_lr_factor_1 = plr_lr_factor_1 self.plr_lr_factor_2 = plr_lr_factor_2 self.plr_wd_factor = plr_wd_factor self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet self.plr_use_cos_bias = plr_use_cos_bias if not plr_use_cos_bias: assert plr_hidden_1 % 2 == 0 def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_cont = self._get_n_values(tensor_infos, ['x_cont']) hidden_2 = self.plr_hidden_2 if self.plr_use_densenet: hidden_2 -= 1 # don't count densenet output for parameters if self.plr_use_cos_bias: return n_cont * (2 * self.plr_hidden_1 + (self.plr_hidden_1 + 1) * hidden_2) else: return n_cont * (self.plr_hidden_1 // 2 + (self.plr_hidden_1 + 1) * hidden_2) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: hidden_2 = self.plr_hidden_2 if self.plr_use_densenet: # for before the torch.cat() and after the torch.cat() hidden_2 = 2 * hidden_2 - 1 if self.plr_act_name != 'linear': hidden_2 += self.plr_hidden_2 n_cont = self._get_n_values(tensor_infos, ['x_cont']) if self.plr_use_cos_bias: # 3 for wx, wx+b, cos(wx+b) return n_cont * (3 * self.plr_hidden_1 + hidden_2) else: # in first hidden layer, have wx, sin(wx), cos(wx), cat(...) 
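# Worked example (added for clarity, assuming the default plr_hidden_1=8): weight_1 has
# plr_hidden_1 // 2 = 4 columns, so per continuous feature the first hidden layer
# materializes wx (4 values), cos(wx) (4), sin(wx) (4) and their concatenation (8),
# i.e. 20 = 2.5 * plr_hidden_1 values, which is where the factor 2.5 below comes from.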
return n_cont * (int(2.5 * self.plr_hidden_1) + hidden_2) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: return utils.update_dict(tensor_infos, {'x_cont': TensorInfo( feat_shape=[tensor_infos['x_cont'].get_n_features() * self.plr_hidden_2])}) def _fit(self, ds: DictDataset) -> Layer: n_cont = ds.tensor_infos['x_cont'].get_n_features() # assuming that the shape is rank 1 hyper_factors_1 = {'lr': self.plr_lr_factor * self.plr_lr_factor_1, 'wd': self.plr_wd_factor} hyper_factors_2 = {'lr': self.plr_lr_factor * self.plr_lr_factor_2, 'wd': self.plr_wd_factor} if self.plr_use_cos_bias: with sub_scope_context('weight_1'): weight_1 = Variable(self.plr_sigma * torch.randn(n_cont, 1, self.plr_hidden_1, device=ds.device), hyper_factors=hyper_factors_1) with sub_scope_context('bias_1'): # use uniform [-pi, pi] instead of uniform [0, 2pi] for smaller values in case of weight decay bias_1 = Variable(np.pi * (-1 + 2 * torch.rand(n_cont, 1, self.plr_hidden_1, device=ds.device)), hyper_factors=hyper_factors_1) else: # normal initialization as in the paper with sub_scope_context('weight_1'): weight_1 = Variable(self.plr_sigma * torch.randn(n_cont, 1, self.plr_hidden_1 // 2, device=ds.device), hyper_factors=hyper_factors_1) # kaiming init from nn.Linear in_features = self.plr_hidden_1 hidden_2 = self.plr_hidden_2 if self.plr_use_densenet: hidden_2 -= 1 with sub_scope_context('weight_2'): weight_2 = Variable( (-1 + 2 * torch.rand(n_cont, self.plr_hidden_1, hidden_2, device=ds.device)) / np.sqrt(in_features), hyper_factors=hyper_factors_2) with sub_scope_context('bias_2'): bias_2 = Variable( (-1 + 2 * torch.rand(n_cont, 1, hidden_2, device=ds.device)) / np.sqrt(in_features), hyper_factors=hyper_factors_2) if self.plr_use_cos_bias: return PLREmbeddingsLayerCosBias(fitter=self, weight_1=weight_1, weight_2=weight_2, bias_1=bias_1, bias_2=bias_2, plr_act_name=self.plr_act_name, plr_use_densenet=self.plr_use_densenet) else: return PLREmbeddingsLayer(fitter=self, weight_1=weight_1, weight_2=weight_2, bias_2=bias_2, plr_act_name=self.plr_act_name, plr_use_densenet=self.plr_use_densenet) class PLREmbeddingsLayer(Layer): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings # see https://github.com/yandex-research/rtdl-num-embeddings/tree/main/package def __init__(self, fitter: Fitter, weight_1: Variable, weight_2: Variable, bias_2: Variable, plr_act_name: str, plr_use_densenet: bool = False): super().__init__(fitter=fitter) self.weight_1 = weight_1 self.weight_2 = weight_2 self.bias_2 = bias_2 self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet def forward_cont(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight_1) # matmul is automatically batched x = torch.cat([torch.cos(x), torch.sin(x)], dim=-1) x = x.matmul(self.weight_2) # matmul is automatically batched x = x + self.bias_2 if self.plr_act_name == 'relu': x = torch.relu(x) elif self.plr_act_name == 'linear': pass else: raise ValueError(f'Unknown plr_act_name "{self.plr_act_name}"') # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.plr_use_densenet: x = torch.cat([x, x_orig], dim=-1) return x def _stack(self, layers): return 
PLREmbeddingsLayer(fitter=layers[0].fitter, weight_1=Variable.stack([l.weight_1 for l in layers]), weight_2=Variable.stack([l.weight_2 for l in layers]), bias_2=Variable.stack([l.bias_2 for l in layers]), plr_act_name=layers[0].plr_act_name, plr_use_densenet=layers[0].plr_use_densenet) class PLREmbeddingsLayerCosBias(Layer): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings # see https://github.com/yandex-research/rtdl-num-embeddings/tree/main/package def __init__(self, fitter: Fitter, weight_1: Variable, bias_1: Variable, weight_2: Variable, bias_2: Variable, plr_act_name: str, plr_use_densenet: bool = False): super().__init__(fitter=fitter) self.weight_1 = weight_1 self.weight_2 = weight_2 self.bias_1 = bias_1 self.bias_2 = bias_2 self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet def forward_cont(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight_1) # matmul is automatically batched x = x + self.bias_1 # x = torch.sin(x) x = torch.cos(x) x = x.matmul(self.weight_2) # matmul is automatically batched x = x + self.bias_2 if self.plr_act_name == 'relu': x = torch.relu(x) elif self.plr_act_name == 'linear': pass else: raise ValueError(f'Unknown plr_act_name "{self.plr_act_name}"') # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.plr_use_densenet: x = torch.cat([x, x_orig], dim=-1) return x def _stack(self, layers): return PLREmbeddingsLayerCosBias(fitter=layers[0].fitter, weight_1=Variable.stack([l.weight_1 for l in layers]), weight_2=Variable.stack([l.weight_2 for l in layers]), bias_1=Variable.stack([l.bias_1 for l in layers]), bias_2=Variable.stack([l.bias_2 for l in layers]), plr_act_name=layers[0].plr_act_name, plr_use_densenet=layers[0].plr_use_densenet) class PeriodicEmbeddingsFactory(Fitter, FitterFactory): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings def __init__(self, periodic_emb_sigma: float = 1.0, periodic_emb_dim: int = 8, periodic_emb_lr_factor: float = 1.0, periodic_emb_wd_factor: float = 1.0, periodic_emb_only_cos: bool = False, periodic_emb_densenet: bool = False, **config): super().__init__(needs_tensors=False, is_individual=True, modified_tensors=['x_cont']) self.periodic_emb_sigma = periodic_emb_sigma self.periodic_emb_dim = periodic_emb_dim self.periodic_emb_lr_factor = periodic_emb_lr_factor self.periodic_emb_wd_factor = periodic_emb_wd_factor self.periodic_emb_only_cos = periodic_emb_only_cos self.periodic_emb_densenet = periodic_emb_densenet def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_params_single = self.periodic_emb_dim if self.periodic_emb_densenet: n_params_single -= 1 if self.periodic_emb_only_cos: n_params_single *= 2 else: n_params_single //= 2 return self._get_n_values(tensor_infos, ['x_cont']) * n_params_single def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: n_cont = self._get_n_values(tensor_infos, ['x_cont']) # factor 2 * for sin, cos, x, and concat return 2 * n_cont * self.periodic_emb_dim def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]) -> Dict[str, TensorInfo]: return utils.update_dict(tensor_infos, {'x_cont': TensorInfo( 
feat_shape=[tensor_infos['x_cont'].get_n_features() * self.periodic_emb_dim])}) def _fit(self, ds: DictDataset) -> Layer: n_cont = ds.tensor_infos['x_cont'].get_n_features() # assuming that the shape is rank 1 hyper_factors = {'lr': self.periodic_emb_lr_factor, 'wd': self.periodic_emb_wd_factor} param_dim = self.periodic_emb_dim if self.periodic_emb_densenet: param_dim -= 1 if self.periodic_emb_only_cos: # not implemented because it turned out to be not so good to omit the linear layer afterward raise NotImplementedError() else: if param_dim % 2 == 1: raise ValueError(f'Wrong parity for periodic_emb_dim, got {self.periodic_emb_dim=}') param_dim //= 2 with sub_scope_context('weight'): weight = Variable( self.periodic_emb_sigma * torch.randn(n_cont, 1, param_dim, device=ds.device), hyper_factors=hyper_factors) return PeriodicEmbeddingsLayerSinCos(self, weight, periodic_emb_densenet=self.periodic_emb_densenet) class PeriodicEmbeddingsLayerSinCos(Layer): # an implementation of https://github.com/yandex-research/tabular-dl-num-embeddings # see https://github.com/yandex-research/rtdl-num-embeddings/tree/main/package def __init__(self, fitter: Fitter, weight: Variable, periodic_emb_densenet: bool): super().__init__(fitter=fitter) self.weight = weight self.periodic_emb_densenet = periodic_emb_densenet def forward_cont(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight) # matmul is automatically batched x = torch.cat([torch.cos(x), torch.sin(x)], dim=-1) # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.periodic_emb_densenet: x = torch.cat([x, x_orig], dim=-1) return x def _stack(self, layers): return PeriodicEmbeddingsLayerSinCos(fitter=layers[0].fitter, weight=Variable.stack([l.weight for l in layers]), periodic_emb_densenet=layers[0].periodic_emb_densenet) class ToSoftLabelLayer(Layer): def __init__(self, y_tensor_info, fitter: Fitter): super().__init__(fitter=fitter) self.y_tensor_info = y_tensor_info def forward_tensors(self, tensors): if 'y' not in tensors: return tensors else: y = tensors['y'] y_cs = self.y_tensor_info.get_cat_sizes().numpy() new_y_cols = [] for i, cs in enumerate(y_cs): if cs == 0: # already continuous new_y_cols.append(y[tuple([slice(None)] * (y.dim() - 1) + [slice(i, i + 1)])]) else: # make continuous # todo: is there a better one-hot function without the long -> float conversion? 
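                    #  (a possible alternative, as an untested sketch: write the float
                    #  one-hot directly with scatter_, avoiding the long -> float cast:
                    #      idx = y[tuple([slice(None)] * (y.dim() - 1) + [slice(i, i + 1)])]
                    #      one_hot = torch.zeros(*idx.shape[:-1], cs, device=y.device)
                    #      one_hot.scatter_(-1, idx, 1.0)
                    #  here idx keeps a trailing dimension of size 1)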
new_y_cols.append(F.one_hot(y[tuple([slice(None)] * (y.dim() - 1) + [i])],
                                                num_classes=cs).float())
        return utils.join_dicts(tensors, {'y': torch.cat(new_y_cols, dim=-1)})


class ToSoftLabelFitter(Fitter):
    def __init__(self):
        super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['y'])

    def forward_tensor_infos(self, tensor_infos):
        if 'y' not in tensor_infos:
            return tensor_infos
        new_y_shape = sum([max(1, cs) for cs in tensor_infos['y'].get_cat_sizes().numpy()])
        return utils.update_dict(tensor_infos, {'y': TensorInfo(feat_shape=[new_y_shape])})

    def _fit(self, ds: DictDataset) -> Layer:
        return ToSoftLabelLayer(y_tensor_info=ds.tensor_infos['y'], fitter=self)


class LabelSmoothingLayer(Layer):
    # assumes soft labels as inputs
    def __init__(self, ls_dist: Variable):
        super().__init__()
        self.hyper_getter = self.context.hp_manager.register_hyper('ls_eps', self.context.scope)
        self.ls_dist = ls_dist

    def forward_tensors(self, tensors):
        # print(f'{self.training=}, {list(tensors.keys())=}')
        # if not self.training or 'y' not in tensors:
        if 'y' not in tensors:
            return tensors
        ls_eps = self.hyper_getter()
        # print(f'{ls_eps=:g}')
        y = tensors['y']
        y = (1.0 - ls_eps) * y + ls_eps * self.ls_dist
        return utils.update_dict(tensors, {'y': y})

    def _stack(self, layers):
        return LabelSmoothingLayer(Variable.stack([l.ls_dist for l in layers]))


class LabelSmoothingFitter(Fitter):
    def __init__(self, use_ls_prior=False, **config):
        # todo: we set needs_tensors=True and is_individual=True here
        #  because the transformation can depend on the hyperparameter ls_eps, which can be scheduled.
        #  If needs_tensors=True, this fitter is not fitted for one-time preprocessing,
        #  where the schedules are not yet available.
        #  ideally, super().__init__() would use another parameter is_dynamic or so which could be set to true instead
        #  formerly, we used needs_tensors=use_ls_prior
        super().__init__(needs_tensors=True, is_individual=True, modified_tensors=['y'])
        self.use_ls_prior = use_ls_prior

    def _fit(self, ds: DictDataset) -> Layer:
        # consistency check since y must be soft labels and not hard labels
        assert ds.tensor_infos['y'].is_cont()  # y is assumed to already be converted to one-hot
        if self.use_ls_prior:
            y = ds.tensors['y']
            ls_dist = y.mean(dim=-2, keepdim=True)
        else:
            n_classes = ds.tensor_infos['y'].get_n_features()
            ls_dist = torch.ones(1, n_classes, device=ds.device) / n_classes
        return LabelSmoothingLayer(Variable(ls_dist, trainable=False))


class LabelSmoothingFactory(FitterFactory):
    def __init__(self, **config):
        super().__init__()
        self.config = config

    def _create(self, tensor_infos) -> Fitter:
        if tensor_infos['y'].get_cat_sizes()[0].item() > 0:
            # labels are still in categorical form
            return SequentialFitter([ToSoftLabelFitter(), LabelSmoothingFitter(**self.config)])
        return LabelSmoothingFitter(**self.config)


class StochasticLabelNoiseLayer(Layer):
    def __init__(self):
        super().__init__()
        self.sigma_getter = self.context.hp_manager.register_hyper('sln_sigma', self.context.scope)

    def forward_tensors(self, tensors):
        if 'y' not in tensors:
            return tensors
        y = tensors['y']
        return utils.join_dicts(tensors, {'y': y + self.sigma_getter() * torch.randn_like(y)})


class StochasticLabelNoiseFitter(Fitter):
    def __init__(self):
        super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['y'])

    def _fit(self, ds: DictDataset) -> Layer:
        # todo: could do a consistency check since y must be soft labels and not hard labels
        return StochasticLabelNoiseLayer()


class StochasticLabelNoiseFactory(FitterFactory):
    def
_create(self, tensor_infos) -> Fitter: if tensor_infos['y'].get_cat_sizes()[0].item() > 0: # labels are still in categorical form return SequentialFitter([ToSoftLabelFitter(), StochasticLabelNoiseFitter()]) return StochasticLabelNoiseFitter() # implementing "Feature Selection using Stochastic Gates" class StochasticGateLayer(Layer): def __init__(self, mu: Variable): super().__init__() self.sigma_getter = self.context.hp_manager.register_hyper('sg_sigma', self.context.scope) self.lambda_getter = self.context.hp_manager.register_hyper('sg_lambda', self.context.scope) self.mu = mu def forward_cont(self, x): mu = self.mu if self.training: sigma = self.sigma_getter() mu = mu + sigma * torch.randn_like(x) reg = gauss_cdf(self.mu / sigma).mean(dim=-1).mean(dim=-1).sum() self.context.hp_manager.add_reg_term(self.lambda_getter() * reg) z = mu.clamp(0.0, 1.0) # z = z / (z.mean(dim=-1, keepdim=True) + 1e-8) return x * z def _stack(self, layers): return StochasticGateLayer(Variable.stack([l.mu for l in layers])) class StochasticGateFactory(Fitter, FitterFactory): def __init__(self): super().__init__(needs_tensors=False, modified_tensors=['x_cont']) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]) -> int: # rough upper bound return 15 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: # see https://github.com/runopti/stg/blob/9f630968c4f14cff6da4e54421c497f24ac1e08e/python/stg/layers.py#L10 n_cont = ds.tensor_infos['x_cont'].get_n_features() return StochasticGateLayer(Variable(0.5 * torch.ones(1, n_cont, device=ds.device), hyper_factors={'wd': 0.0})) class AntisymmetricInitializationFactory(FitterFactory): def __init__(self, factory, **config): super().__init__() self.factory = factory self.config = config def _create(self, tensor_infos) -> Fitter: fitter = self.factory.create(tensor_infos) # return AntisymmetricInitializationFitter(fitter, **self.config) # only duplicate the part of the fitter that is actually learnable common, individual = fitter.split_off_individual() return SequentialFitter([common, AntisymmetricInitializationFitter(individual, **self.config)]) class AntisymmetricInitializationFitter(Fitter): """ Implements the antisymmetric initialization trick from http://proceedings.mlr.press/v107/zhang20a/zhang20a.pdf """ def __init__(self, fitter: Fitter, **config): super().__init__(needs_tensors=fitter.needs_tensors, is_individual=fitter.is_individual, scope_names=fitter.scope_names, modified_tensors=fitter.modified_tensors) self.fitter = fitter self.asi_factor = config.get('asi_factor', 1 / np.sqrt(2)) def forward_tensor_infos(self, tensor_infos: Dict[str, TensorInfo]): return self.fitter.forward_tensor_infos(tensor_infos) def get_n_params(self, tensor_infos: Dict[str, TensorInfo]): return 2 * self.fitter.get_n_params(tensor_infos) def get_n_forward(self, tensor_infos: Dict[str, TensorInfo]): return 2 * self.fitter.get_n_forward(tensor_infos) # maybe not entirely accurate but almost def _fit(self, ds: DictDataset) -> Layer: tfm1 = self.fitter.fit(ds) tfm2 = self.fitter.fit(ds) with torch.no_grad(): for p1, p2 in zip(tfm1.parameters(), tfm2.parameters()): p2.data = p1.data for b1, b2 in zip(tfm1.buffers(), tfm2.buffers()): b2.data = b1.data # multiply by 1/sqrt(2) at the end to preserve the learning speed for SGD, # however, would need to multiply by 0.5 for adam return SequentialLayer([SubtractionLayer(tfm1, tfm2), 
FunctionLayer(lambda x, a=self.asi_factor: a * x)]) def __str__(self): sub_strings = [' ' + line for line in str(self.fitter).split('\n')] return f'{self.__class__.__name__} (\n' + '\n'.join(sub_strings) + '\n)\n' class SubtractionLayer(Layer): def __init__(self, layer1: Layer, layer2: Layer): super().__init__() self.layer1 = layer1 self.layer2 = layer2 def forward_tensor_infos(self, tensor_infos): return utils.join_dicts(self.layer1.forward_tensor_infos(tensor_infos), self.layer2.forward_tensor_infos(tensor_infos)) def forward_tensors(self, tensors): out1 = self.layer1.forward_tensors(tensors) out2 = self.layer2.forward_tensors(tensors) if 'x_cont' not in out2: return utils.join_dicts(out1, out2) return utils.join_dicts(out1, out2, {'x_cont': out1['x_cont'] - out2['x_cont']}) def _stack(self, layers): return SubtractionLayer(layers[0].layer1.stack([l.layer1 for l in layers]), layers[0].layer2.stack([l.layer2 for l in layers])) ================================================ FILE: pytabkit/models/nn_models/pipeline.py ================================================ from typing import List, Dict, Union import sklearn import torch from sklearn.base import BaseEstimator, TransformerMixin from pytabkit.models import utils from pytabkit.models.data.data import TensorInfo, DictDataset from pytabkit.models.nn_models.base import Layer, Variable, Fitter, FitterFactory, IdentityLayer, BiasLayer, ScaleLayer, \ SequentialLayer from pytabkit.models.torch_utils import torch_np_quantile # todo: add factories class ReplaceMissingContLayer(Layer): def __init__(self, means: Variable): super().__init__() if not isinstance(means, Variable): raise ValueError('means is not a Variable') self.means = means def forward_cont(self, x): return torch.where(torch.isnan(x), self.means, x) def _stack(self, layers: List['ReplaceMissingContLayer']): return ReplaceMissingContLayer(Variable.stack([layer.means for layer in layers])) class MeanReplaceMissingContFactory(Fitter, FitterFactory): def __init__(self, trainable=False, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] is_nan = torch.isnan(x_cont) x_cont_replaced = torch.where(is_nan, torch.zeros_like(x_cont), x_cont) means = x_cont_replaced.sum(dim=-2, keepdim=True) \ / (x_cont.shape[-2] - is_nan.float().sum(dim=-2, keepdim=True) + 1e-30) return ReplaceMissingContLayer(Variable(means, trainable=self.trainable)) class MeanCenterFactory(Fitter, FitterFactory): def __init__(self, trainable=False, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() return BiasLayer(Variable(-ds.tensors['x_cont'].mean(dim=-2, keepdim=True), trainable=self.trainable)) class MedianCenterFactory(Fitter, FitterFactory): def __init__(self, median_center_trainable=False, **config): super().__init__(is_individual=median_center_trainable, modified_tensors=['x_cont']) self.trainable = median_center_trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 
self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: # quantile requires PyTorch >= 1.7.0 if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() # use quantile function from numpy since the torch one can use large amounts of RAM for some reason return BiasLayer(Variable(-torch_np_quantile(ds.tensors['x_cont'], 0.5, dim=-2, keepdim=True), trainable=self.trainable)) class L2NormalizeFactory(Fitter, FitterFactory): def __init__(self, trainable=False, l2_normalize_eps=1e-8, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable self.eps = l2_normalize_eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() scale = 1.0 / (ds.tensors['x_cont'] ** 2 + self.eps).mean(dim=-2, keepdim=True).sqrt() scale[:, (ds.tensors['x_cont'] ** 2).mean(dim=-2) == 0.0] = 0.0 return ScaleLayer(Variable(scale, trainable=self.trainable)) class L1NormalizeFactory(Fitter, FitterFactory): def __init__(self, trainable=False, eps=1e-8, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable self.eps = eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() scale = 1.0 / (ds.tensors['x_cont'].abs() + self.eps).mean(dim=-2, keepdim=True) return ScaleLayer(Variable(scale, trainable=self.trainable)) class MinMaxScaleFactory(Fitter, FitterFactory): def __init__(self, trainable=False, eps=1e-8, **config): super().__init__(is_individual=trainable, modified_tensors=['x_cont']) self.trainable = trainable self.eps = eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return 2 * self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] x_min = x_cont.min(dim=-2, keepdim=True)[0] x_max = x_cont.max(dim=-2, keepdim=True)[0] scale = 2.0 / (x_max - x_min + self.eps) bias = -0.5 * (x_max + x_min) return SequentialLayer([BiasLayer(Variable(bias, trainable=self.trainable)), ScaleLayer(Variable(scale, trainable=self.trainable))]) class RobustScaleFactory(Fitter, FitterFactory): def __init__(self, robust_scale_trainable=False, robust_scale_eps=1e-30, **config): super().__init__(is_individual=robust_scale_trainable, modified_tensors=['x_cont']) self.trainable = robust_scale_trainable self.robust_scale_eps = robust_scale_eps def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] quant_diff = torch_np_quantile(x_cont, 0.75, dim=-2) - torch_np_quantile(x_cont, 0.25, dim=-2) max, _ = x_cont.max(dim=-2) min, _ = x_cont.min(dim=-2) idxs = quant_diff == 0.0 quant_diff[idxs] = 0.5 * (max[idxs] - min[idxs]) factors = 1.0 / (quant_diff + self.robust_scale_eps) factors[quant_diff == 0.0] = 0.0 return ScaleLayer(Variable(factors[None, :], trainable=self.trainable)) class RobustScaleV2Factory(Fitter, FitterFactory): def __init__(self, robust_scale_trainable=False, **config): super().__init__(is_individual=robust_scale_trainable, 
modified_tensors=['x_cont']) self.trainable = robust_scale_trainable def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] x_cont_sorted, _ = torch.sort(x_cont, dim=-2) quantiles = torch.linspace(0.0, 1.0, x_cont.shape[-2], device=x_cont.device) opposite_dists = x_cont_sorted.flip(dims=[-2]) - x_cont_sorted opposite_quantile_dists = quantiles.flip(dims=[0]) - quantiles quarter_idx = x_cont.shape[-2] // 4 + 1 possible_factors = 2.0 * opposite_quantile_dists[:quarter_idx, None] / \ (1e-30 + opposite_dists[..., :quarter_idx, :]) factors = possible_factors.min(dim=-2, keepdim=True)[0] return ScaleLayer(Variable(factors, trainable=self.trainable)) class GlobalScaleNormalizeFactory(Fitter, FitterFactory): def __init__(self, global_scale_factor=1.0, **config): super().__init__(is_individual=False, modified_tensors=['x_cont']) self.global_scale_factor = global_scale_factor def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self._get_n_values(tensor_infos, ['x_cont']) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() x_cont = ds.tensors['x_cont'] scale = self.global_scale_factor / (x_cont ** 2 + 1e-30).mean().sqrt().item() return ScaleLayer(Variable(scale * torch.ones(1, 1, device=x_cont.device), trainable=False)) class ThermometerCodingLayer(Layer): def __init__(self, centers: Variable, scale: float, fitter: Fitter): super().__init__(fitter=fitter) self.centers = centers self.scale = scale # todo: could make scale a variable and allow for different scales per center def forward_cont(self, x): shifted = self.scale * (x.unsqueeze(-1) - self.centers) return torch.tanh(shifted.reshape(list(x.shape[:-1]) + [-1])) def _stack(self, layers): return ThermometerCodingLayer(Variable.stack([l.centers for l in layers]), layers[0].scale, layers[0].fitter) class ThermometerCodingFactory(Fitter, FitterFactory): def __init__(self, tc_low=-1.0, tc_high=1.0, tc_num=3, tc_scale=1.0, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont']) self.tc_low = tc_low self.tc_high = tc_high self.tc_num = tc_num self.tc_scale = tc_scale def get_n_params(self, tensor_infos: Dict[str, TensorInfo]) -> int: return self.tc_num def forward_tensor_infos(self, tensor_infos): n_cont = tensor_infos['x_cont'].get_n_features() return utils.join_dicts(tensor_infos, {'x_cont': TensorInfo(feat_shape=[n_cont * self.tc_num])}) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() centers = torch.linspace(self.tc_low, self.tc_high, self.tc_num, device=ds.device)[None, None, :] centers = Variable(centers, trainable=False) return ThermometerCodingLayer(centers=centers, scale=self.tc_scale, fitter=self) class CircleCodingLayer(Layer): def __init__(self, scale: float, fitter: Fitter): super().__init__(fitter=fitter) self.scale = scale def forward_cont(self, x): x = (1.0 / self.scale) * x factor = 1.0 / torch.sqrt(1.0 + x ** 2) return torch.cat([x * factor, torch.ones_like(x) * factor], dim=-1) def _stack(self, layers): return CircleCodingLayer(layers[0].scale, layers[0].fitter) class CircleCodingFactory(Fitter, FitterFactory): def __init__(self, circle_coding_scale=1.0, **config): super().__init__(needs_tensors=False, is_individual=False, modified_tensors=['x_cont']) self.scale = 
circle_coding_scale def forward_tensor_infos(self, tensor_infos): n_cont = tensor_infos['x_cont'].get_n_features() return utils.join_dicts(tensor_infos, {'x_cont': TensorInfo(feat_shape=[n_cont * 2])}) def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() return CircleCodingLayer(scale=self.scale, fitter=self) def apply_tfms_rec(tfms: Union[BaseEstimator, List], x: torch.Tensor): if isinstance(tfms, list): return torch.stack([apply_tfms_rec(tfm, x[i]) for i, tfm in enumerate(tfms)], dim=0) else: return torch.as_tensor(tfms.transform(x.detach().cpu().numpy()), dtype=x.dtype, device=x.device) class SklearnTransformLayer(Layer): def __init__(self, tfms: Union[BaseEstimator, List], fitter: Fitter): super().__init__(fitter=fitter) self.tfms = tfms def forward_cont(self, x): return apply_tfms_rec(self.tfms, x) def _stack(self, layers): return SklearnTransformLayer(tfms=[l.tfms for l in layers], fitter=layers[0].fitter) class SklearnTransformFactory(Fitter, FitterFactory): def __init__(self, tfm: BaseEstimator, **config): super().__init__(needs_tensors=True, is_individual=False, modified_tensors=['x_cont']) self.tfm = tfm def _fit(self, ds: DictDataset) -> Layer: if ds.tensor_infos['x_cont'].is_empty(): return IdentityLayer() tfm = sklearn.base.clone(self.tfm) tfm.fit(ds.tensors['x_cont'].detach().cpu().numpy()) return SklearnTransformLayer(tfm, fitter=self) ================================================ FILE: pytabkit/models/nn_models/rtdl_num_embeddings.py ================================================ # taken from https://github.com/yandex-research/rtdl-num-embeddings/blob/main/package/rtdl_num_embeddings.py """On Embeddings for Numerical Features in Tabular Deep Learning.""" __version__ = '0.0.11' __all__ = [ 'LinearEmbeddings', 'LinearReLUEmbeddings', 'PeriodicEmbeddings', 'compute_bins', 'PiecewiseLinearEncoding', 'PiecewiseLinearEmbeddings', ] import math import warnings from collections import OrderedDict from typing import Any, Literal, Optional, Union try: import sklearn.tree as sklearn_tree except ImportError: sklearn_tree = None import torch import torch.nn as nn from torch import Tensor from torch.nn.parameter import Parameter try: from tqdm import tqdm except ImportError: tqdm = None def _check_input_shape(x: Tensor, expected_n_features: int) -> None: if x.ndim < 1: raise ValueError( f'The input must have at least one dimension, however: {x.ndim=}' ) if x.shape[-1] != expected_n_features: raise ValueError( 'The last dimension of the input was expected to be' f' {expected_n_features}, however, {x.shape[-1]=}' ) class LinearEmbeddings(nn.Module): """Linear embeddings for continuous features. **Shape** - Input: `(*, n_features)` - Output: `(*, n_features, d_embedding)` **Examples** >>> batch_size = 2 >>> n_cont_features = 3 >>> x = torch.randn(batch_size, n_cont_features) >>> d_embedding = 4 >>> m = LinearEmbeddings(n_cont_features, d_embedding) >>> m(x).shape torch.Size([2, 3, 4]) """ def __init__(self, n_features: int, d_embedding: int) -> None: """ Args: n_features: the number of continuous features. d_embedding: the embedding size. 
""" if n_features <= 0: raise ValueError(f'n_features must be positive, however: {n_features=}') if d_embedding <= 0: raise ValueError(f'd_embedding must be positive, however: {d_embedding=}') super().__init__() self.weight = Parameter(torch.empty(n_features, d_embedding)) self.bias = Parameter(torch.empty(n_features, d_embedding)) self.reset_parameters() def reset_parameters(self) -> None: d_rqsrt = self.weight.shape[1] ** -0.5 nn.init.uniform_(self.weight, -d_rqsrt, d_rqsrt) nn.init.uniform_(self.bias, -d_rqsrt, d_rqsrt) def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" _check_input_shape(x, self.weight.shape[0]) return torch.addcmul(self.bias, self.weight, x[..., None]) class LinearReLUEmbeddings(nn.Sequential): """Simple non-linear embeddings for continuous features. **Shape** - Input: `(*, n_features)` - Output: `(*, n_features, d_embedding)` **Examples** >>> batch_size = 2 >>> n_cont_features = 3 >>> x = torch.randn(batch_size, n_cont_features) >>> >>> d_embedding = 32 >>> m = LinearReLUEmbeddings(n_cont_features, d_embedding) >>> m(x).shape torch.Size([2, 3, 32]) """ def __init__(self, n_features: int, d_embedding: int = 32) -> None: super().__init__( OrderedDict( [ ( 'linear', LinearEmbeddings(n_features, d_embedding), ), ('activation', nn.ReLU()), ] ) ) class _Periodic(nn.Module): """ NOTE: THIS MODULE SHOULD NOT BE USED DIRECTLY. Technically, this is a linear embedding without bias followed by the periodic activations. The scale of the initialization (defined by the `sigma` argument) plays an important role. """ def __init__(self, n_features: int, k: int, sigma: float) -> None: if sigma <= 0.0: raise ValueError(f'sigma must be positive, however: {sigma=}') super().__init__() self._sigma = sigma self.weight = Parameter(torch.empty(n_features, k)) self.reset_parameters() def reset_parameters(self): """Reset the parameters.""" # NOTE[DIFF] # Here, extreme values (~0.3% probability) are explicitly avoided just in case. # In the paper, there was no protection from extreme values. bound = self._sigma * 3 nn.init.trunc_normal_(self.weight, 0.0, self._sigma, a=-bound, b=bound) def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" _check_input_shape(x, self.weight.shape[0]) x = 2 * math.pi * self.weight * x[..., None] x = torch.cat([torch.cos(x), torch.sin(x)], -1) return x # _NLinear is a simplified copy of delu.nn.NLinear: # https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html class _NLinear(nn.Module): """N *separate* linear layers for N feature embeddings. In other words, each feature embedding is transformed by its own dedicated linear layer. """ def __init__( self, n: int, in_features: int, out_features: int, bias: bool = True ) -> None: super().__init__() self.weight = Parameter(torch.empty(n, in_features, out_features)) self.bias = Parameter(torch.empty(n, out_features)) if bias else None self.reset_parameters() def reset_parameters(self): """Reset the parameters.""" d_in_rsqrt = self.weight.shape[-2] ** -0.5 nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt) if self.bias is not None: nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt) def forward(self, x: torch.Tensor) -> torch.Tensor: """Do the forward pass.""" if x.ndim != 3: raise ValueError( '_NLinear supports only inputs with exactly one batch dimension,' ' so `x` must have a shape like (BATCH_SIZE, N_FEATURES, D_EMBEDDING).' 
) assert x.shape[-(self.weight.ndim - 1) :] == self.weight.shape[:-1] x = x.transpose(0, 1) x = x @ self.weight x = x.transpose(0, 1) if self.bias is not None: x = x + self.bias return x class PeriodicEmbeddings(nn.Module): """Embeddings for continuous features based on periodic activations. See README for details. **Shape** - Input: `(*, n_features)` - Output: `(*, n_features, d_embedding)` **Examples** >>> batch_size = 2 >>> n_cont_features = 3 >>> x = torch.randn(batch_size, n_cont_features) >>> >>> d_embedding = 24 >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=False) >>> m(x).shape torch.Size([2, 3, 24]) >>> >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=True) >>> m(x).shape torch.Size([2, 3, 24]) >>> >>> # PL embeddings. >>> m = PeriodicEmbeddings(n_cont_features, d_embedding=8, activation=False, lite=False) >>> m(x).shape torch.Size([2, 3, 8]) """ # noqa: E501 def __init__( self, n_features: int, d_embedding: int = 24, *, n_frequencies: int = 48, frequency_init_scale: float = 0.01, activation: bool = True, lite: bool, ) -> None: """ Args: n_features: the number of features. d_embedding: the embedding size. n_frequencies: the number of frequencies for each feature. (denoted as "k" in Section 3.3 in the paper). frequency_init_scale: the initialization scale for the first linear layer (denoted as "sigma" in Section 3.3 in the paper). **This is an important hyperparameter**, see README for details. activation: if `False`, the ReLU activation is not applied. Must be `True` if ``lite=True``. lite: if True, the outer linear layer is shared between all features. See README for details. """ super().__init__() self.periodic = _Periodic(n_features, n_frequencies, frequency_init_scale) self.linear: Union[nn.Linear, _NLinear] if lite: # NOTE[DIFF] # The lite variation was introduced in a different paper # (about the TabR model). if not activation: raise ValueError('lite=True is allowed only when activation=True') self.linear = nn.Linear(2 * n_frequencies, d_embedding) else: self.linear = _NLinear(n_features, 2 * n_frequencies, d_embedding) self.activation = nn.ReLU() if activation else None def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" x = self.periodic(x) x = self.linear(x) if self.activation is not None: x = self.activation(x) return x def _check_bins(bins: list[Tensor]) -> None: if not bins: raise ValueError('The list of bins must not be empty') for i, feature_bins in enumerate(bins): if not isinstance(feature_bins, Tensor): raise ValueError( 'bins must be a list of PyTorch tensors. ' f'However, for {i=}: {type(bins[i])=}' ) if feature_bins.ndim != 1: raise ValueError( 'Each item of the bin list must have exactly one dimension.' f' However, for {i=}: {bins[i].ndim=}' ) if len(feature_bins) < 2: raise ValueError( 'All features must have at least two bin edges.' f' However, for {i=}: {len(bins[i])=}' ) if not feature_bins.isfinite().all(): raise ValueError( 'Bin edges must not contain nan/inf/-inf.' f' However, this is not true for the {i}-th feature' ) if (feature_bins[:-1] >= feature_bins[1:]).any(): raise ValueError( 'Bin edges must be sorted.' f' However, the for the {i}-th feature, the bin edges are not sorted' ) if len(feature_bins) == 2: warnings.warn( f'The {i}-th feature has just two bin edges, which means only one bin.' 
                ' Strictly speaking, using a single bin for the'
                ' piecewise-linear encoding should not break anything,'
                ' but it is the same as using sklearn.preprocessing.MinMaxScaler'
            )


def compute_bins(
    X: torch.Tensor,
    n_bins: int = 48,
    *,
    tree_kwargs: Optional[dict[str, Any]] = None,
    y: Optional[Tensor] = None,
    regression: Optional[bool] = None,
    verbose: bool = False,
) -> list[Tensor]:
    """Compute the bin boundaries for `PiecewiseLinearEncoding` and `PiecewiseLinearEmbeddings`.

    **Usage**

    Compute bins using quantiles (Section 3.2.1 in the paper):

    >>> X_train = torch.randn(10000, 2)
    >>> bins = compute_bins(X_train)

    Compute bins using decision trees (Section 3.2.2 in the paper):

    >>> X_train = torch.randn(10000, 2)
    >>> y_train = torch.randn(len(X_train))
    >>> bins = compute_bins(
    ...     X_train,
    ...     y=y_train,
    ...     regression=True,
    ...     tree_kwargs={'min_samples_leaf': 64, 'min_impurity_decrease': 1e-4},
    ... )

    Args:
        X: the training features.
        n_bins: the number of bins.
        tree_kwargs: keyword arguments for `sklearn.tree.DecisionTreeRegressor`
            (if ``regression=True``) or `sklearn.tree.DecisionTreeClassifier`
            (if ``regression=False``).
            NOTE: requires ``scikit-learn>=1.0,<2`` to be installed.
        y: the training labels (must be provided if ``tree_kwargs`` is not None).
        regression: whether the labels are regression labels
            (must be provided if ``tree_kwargs`` is not None).
        verbose: if True and ``tree_kwargs`` is not None, then ``tqdm``
            (must be installed) will report the progress while fitting trees.

    Returns:
        A list of bin edges for all features. For one feature:

        - the maximum possible number of bin edges is ``n_bins + 1``.
        - the minimum possible number of bin edges is ``1``.
    """  # noqa: E501
    if not isinstance(X, Tensor):
        raise ValueError(f'X must be a PyTorch tensor, however: {type(X)=}')
    if X.ndim != 2:
        raise ValueError(f'X must have exactly two dimensions, however: {X.ndim=}')
    if X.shape[0] < 2:
        raise ValueError(f'X must have at least two rows, however: {X.shape[0]=}')
    if X.shape[1] < 1:
        raise ValueError(f'X must have at least one column, however: {X.shape[1]=}')
    if not X.isfinite().all():
        raise ValueError('X must not contain nan/inf/-inf.')
    if (X == X[0]).all(dim=0).any():
        raise ValueError(
            'All columns of X must have at least two distinct values.'
            ' However, X contains columns with just one distinct value.'
        )
    if n_bins <= 1 or n_bins >= len(X):
        raise ValueError(
            'n_bins must be more than 1, but less than len(X), however:'
            f' {n_bins=}, {len(X)=}'
        )

    if tree_kwargs is None:
        if y is not None or regression is not None or verbose:
            raise ValueError(
                'If tree_kwargs is None, then y must be None, regression must be None'
                ' and verbose must be False'
            )

        _upper = 2**24  # 16_777_216
        if len(X) > _upper:
            warnings.warn(
                f'Computing quantile-based bins for more than {_upper} objects'
                ' may not be possible due to the limitation of PyTorch'
                ' (for details, see https://github.com/pytorch/pytorch/issues/64947;'
                ' if that issue is successfully resolved, this warning may be irrelevant).'  # noqa
                ' As a workaround, subsample the data, i.e. instead of'
                '\ncompute_bins(X, ...)'
                '\ndo'
                '\ncompute_bins(X[torch.randperm(len(X), device=X.device)[:16_777_216]], ...)'  # noqa
                '\nOn CUDA, the computation can still fail with OOM even after'
                ' subsampling. If this is the case, try passing features by groups:'
                '\nbins = sum('
                '\n    compute_bins(X[:, idx], ...)'
                '\n    for idx in torch.arange(len(X), device=X.device).split(group_size),'  # noqa
                '\n    start=[]'
                '\n)'
                '\nAnother option is to perform the computation on CPU:'
                '\ncompute_bins(X.cpu(), ...)'
            )
        del _upper
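        # Illustrative example of the deduplication below: for a discrete feature,
        # many of the n_bins + 1 quantiles coincide, and `.unique()` merges them
        # into fewer bin edges, e.g. (hypothetical input):
        #     >>> x = torch.tensor([0.0, 0.0, 0.0, 1.0])
        #     >>> torch.quantile(x, torch.linspace(0.0, 1.0, 5))
        #     tensor([0.0000, 0.0000, 0.0000, 0.2500, 1.0000])
        # which `.unique()` collapses to the edges [0.0, 0.25, 1.0].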
        # NOTE[DIFF]
        # The code below is more correct than the original implementation,
        # because the original implementation contains an unintentional divergence
        # from what is written in the paper. That divergence affected only the
        # quantile-based embeddings, but not the tree-based embeddings.
        # For historical reference, here is the original, less correct, implementation:
        # https://github.com/yandex-research/tabular-dl-num-embeddings/blob/c1d9eb63c0685b51d7e1bc081cdce6ffdb8886a8/bin/train4.py#L612C30-L612C30
        # (explanation: limiting the number of quantiles by the number of distinct
        # values is NOT the same as removing identical quantiles after computing them).
        bins = [
            q.unique()
            for q in torch.quantile(
                X, torch.linspace(0.0, 1.0, n_bins + 1).to(X), dim=0
            ).T
        ]
        _check_bins(bins)
        return bins

    else:
        if sklearn_tree is None:
            raise RuntimeError(
                'The scikit-learn package is missing.'
                ' See README.md for installation instructions'
            )
        if y is None or regression is None:
            raise ValueError(
                'If tree_kwargs is not None, then y and regression must not be None'
            )
        if y.ndim != 1:
            raise ValueError(f'y must have exactly one dimension, however: {y.ndim=}')
        if len(y) != len(X):
            raise ValueError(
                f'len(y) must be equal to len(X), however: {len(y)=}, {len(X)=}'
            )
        if 'max_leaf_nodes' in tree_kwargs:
            raise ValueError(
                'tree_kwargs must not contain the key "max_leaf_nodes"'
                ' (it will be set to n_bins automatically).'
            )
        if verbose:
            if tqdm is None:
                raise ImportError('If verbose is True, tqdm must be installed')
            tqdm_ = tqdm
        else:
            tqdm_ = lambda x: x  # noqa: E731

        if X.device.type != 'cpu' or y.device.type != 'cpu':
            warnings.warn(
                'Computing tree-based bins involves the conversion of the input PyTorch'
                ' tensors to NumPy arrays. The provided PyTorch tensors are not'
                ' located on CPU, so the conversion has some overhead.',
                UserWarning,
            )
        X_numpy = X.cpu().numpy()
        y_numpy = y.cpu().numpy()
        bins = []
        for column in tqdm_(X_numpy.T):
            feature_bin_edges = [float(column.min()), float(column.max())]
            tree = (
                (
                    sklearn_tree.DecisionTreeRegressor
                    if regression
                    else sklearn_tree.DecisionTreeClassifier
                )(max_leaf_nodes=n_bins, **tree_kwargs)
                .fit(column.reshape(-1, 1), y_numpy)
                .tree_
            )
            for node_id in range(tree.node_count):
                # The following condition is True only for split nodes. Source:
                # https://scikit-learn.org/1.0/auto_examples/tree/plot_unveil_tree_structure.html#tree-structure
                if tree.children_left[node_id] != tree.children_right[node_id]:
                    feature_bin_edges.append(float(tree.threshold[node_id]))
            bins.append(torch.as_tensor(feature_bin_edges).unique())
        _check_bins(bins)
        return [x.to(device=X.device, dtype=X.dtype) for x in bins]


class _PiecewiseLinearEncodingImpl(nn.Module):
    """Piecewise-linear encoding.

    NOTE: THIS CLASS SHOULD NOT BE USED DIRECTLY.
    In particular, this class does *not* add any positional information
    to feature encodings. Thus, for Transformer-like models,
    `PiecewiseLinearEmbeddings` is the only valid option.

    Note:
        This is the *encoding* module, not the *embedding* module,
        so it only implements Equation 1 (Figure 1) from the paper,
        and does not have trainable parameters.
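
    For intuition, an illustrative example (values chosen here for exposition):
    with bin edges ``[0.0, 1.0, 3.0]`` (two bins), the value ``x = 2.0`` lies in
    the second bin, so its piecewise-linear encoding (see the formula below) is
    ``[1.0, (2.0 - 1.0) / (3.0 - 1.0)] = [1.0, 0.5]``.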
**Shape** * Input: ``(*, n_features)`` * Output: ``(*, n_features, max_n_bins)``, where ``max_n_bins`` is the maximum number of bins over all features: ``max_n_bins = max(len(b) - 1 for b in bins)``. To understand the output structure, consider a feature with the number of bins ``n_bins``. Formally, its piecewise-linear encoding is a vector of the size ``n_bins`` that looks as follows:: x_ple = [1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0] However, this class will instead produce a vector of the size ``max_n_bins``:: x_ple_actual = [*x_ple[:-1], *zeros(max_n_bins - n_bins), x_ple[-1]] In other words: * The last encoding component is **always** located in the end, even if ``n_bins == 1`` (i.e. even if it is the only component). * The leading ``n_bins - 1`` components are located in the beginning. * Everything in-between is always set to zeros (like "padding", but in the middle). This implementation is *significantly* faster than the original one. It relies on two key observations: * The piecewise-linear encoding is just a non-trainable linear transformation followed by a clamp-based activation. Pseudocode: `PiecewiseLinearEncoding(x) = Activation(Linear(x))`. The parameters of the linear transformation are defined by the bin edges. * Aligning the *last* encoding channel across all features allows applying the aforementioned activation simultaneously to all features without the loop over features. """ weight: Tensor """The weight of the linear transformation mentioned in the class docstring.""" bias: Tensor """The bias of the linear transformation mentioned in the class docstring.""" single_bin_mask: Optional[Tensor] """The indicators of the features with only one bin.""" mask: Optional[Tensor] """The indicators of the "valid" (i.e. "non-padding") part of the encoding.""" def __init__(self, bins: list[Tensor]) -> None: """ Args: bins: the bins computed by `compute_bins`. """ assert len(bins) > 0 super().__init__() n_features = len(bins) n_bins = [len(x) - 1 for x in bins] max_n_bins = max(n_bins) self.register_buffer('weight', torch.zeros(n_features, max_n_bins)) self.register_buffer('bias', torch.zeros(n_features, max_n_bins)) single_bin_mask = torch.tensor(n_bins) == 1 self.register_buffer( 'single_bin_mask', single_bin_mask if single_bin_mask.any() else None ) self.register_buffer( 'mask', # The mask is needed if features have different number of bins. None if all(len(x) == len(bins[0]) for x in bins) else torch.row_stack( [ torch.cat( [ # The number of bins for this feature, minus 1: torch.ones((len(x) - 1) - 1, dtype=torch.bool), # Unused components (always zeros): torch.zeros(max_n_bins - (len(x) - 1), dtype=torch.bool), # The last bin: torch.ones(1, dtype=torch.bool), ] ) # x is a tensor containing the bin bounds for a given feature. for x in bins ] ), ) for i, bin_edges in enumerate(bins): # Formally, the piecewise-linear encoding of one feature looks as follows: # `[1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0]` # The linear transformation based on the weight and bias defined below # implements the expression in the middle before the clipping to [0, 1]. # Note that the actual encoding layout produced by this class # is slightly different. See the docstring of this class for details. 
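            # Worked example with hypothetical bins: for bin_edges = [0.0, 1.0, 3.0],
            # bin_width = [1.0, 2.0], hence w = [1.0, 0.5] and b = [0.0, -0.5];
            # for x = 2.0 this gives w * x + b = [2.0, 0.5], which the clamping in
            # `forward` turns into the encoding [1.0, 0.5].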
bin_width = bin_edges.diff()
            w = 1.0 / bin_width
            b = -bin_edges[:-1] / bin_width
            # The last encoding component:
            self.weight[i, -1] = w[-1]
            self.bias[i, -1] = b[-1]
            # The leading encoding components:
            self.weight[i, : n_bins[i] - 1] = w[:-1]
            self.bias[i, : n_bins[i] - 1] = b[:-1]
            # All in-between components will always be zeros,
            # because the weight and bias are initialized with zeros.

    def get_max_n_bins(self) -> int:
        return self.weight.shape[-1]

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = torch.addcmul(self.bias, self.weight, x[..., None])
        if x.shape[-1] > 1:
            x = torch.cat(
                [
                    x[..., :1].clamp_max(1.0),
                    x[..., 1:-1].clamp(0.0, 1.0),
                    (
                        x[..., -1:].clamp_min(0.0)
                        if self.single_bin_mask is None
                        else torch.where(
                            # For features with only one bin,
                            # the whole "piecewise-linear" encoding effectively behaves
                            # like min-max scaling
                            # (assuming that the edges of the single bin
                            # are the minimum and maximum feature values).
                            self.single_bin_mask[..., None],
                            x[..., -1:],
                            x[..., -1:].clamp_min(0.0),
                        )
                    ),
                ],
                dim=-1,
            )
        return x


class PiecewiseLinearEncoding(nn.Module):
    """Piecewise-linear encoding.

    See README for detailed explanation.

    **Shape**

    - Input: ``(*, n_features)``
    - Output: ``(*, total_n_bins)``, where ``total_n_bins`` is the total number of bins
      for all features: ``total_n_bins = sum(len(b) - 1 for b in bins)``.

    Technically, the output of this module is the flattened output
    of `_PiecewiseLinearEncodingImpl` with all "padding" values removed.
    """

    def __init__(self, bins: list[Tensor]) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
        """
        super().__init__()
        self.impl = _PiecewiseLinearEncodingImpl(bins)

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = self.impl(x)
        return x.flatten(-2) if self.impl.mask is None else x[:, self.impl.mask]


class PiecewiseLinearEmbeddings(nn.Module):
    """Piecewise-linear embeddings.

    **Shape**

    - Input: ``(batch_size, n_features)``
    - Output: ``(batch_size, n_features, d_embedding)``
    """

    def __init__(
        self,
        bins: list[Tensor],
        d_embedding: int,
        *,
        activation: bool,
        version: Literal[None, 'A', 'B'] = None,
    ) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
            d_embedding: the embedding size.
            activation: if True, the ReLU activation is additionally applied in the end.
            version: the preset for various implementation details, such as
                parametrization and initialization. See README for details.
        """
        if d_embedding <= 0:
            raise ValueError(
                f'd_embedding must be a positive integer, however: {d_embedding=}'
            )
        _check_bins(bins)
        if version is None:
            warnings.warn(
                'The `version` argument is not provided, so version="A" will be used'
                ' for backward compatibility.'
                ' See README for recommendations regarding `version`.'
                ' In future, omitting this argument will result in an exception.'
            )
            version = 'A'

        super().__init__()
        n_features = len(bins)
        # NOTE[DIFF]
        # version="B" was introduced in a different paper (about the TabM model).
        is_version_B = version == 'B'
        self.linear0 = (
            LinearEmbeddings(n_features, d_embedding) if is_version_B else None
        )
        self.impl = _PiecewiseLinearEncodingImpl(bins)
        self.linear = _NLinear(
            len(bins),
            self.impl.get_max_n_bins(),
            d_embedding,
            # For the version "B", the bias is already present in self.linear0.
            bias=not is_version_B,
        )
        if is_version_B:
            # Because of the following line, at initialization,
            # the whole embedding behaves like a linear embedding.
            # The piecewise-linear component is incrementally learnt during training.
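            # In other words: with self.linear.weight zeroed and bias=False, the
            # forward pass computes linear0(x) + 0, so at initialization the module
            # coincides with `LinearEmbeddings`, and training gradually activates
            # the piecewise-linear term.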
nn.init.zeros_(self.linear.weight) self.activation = nn.ReLU() if activation else None def forward(self, x: Tensor) -> Tensor: """Do the forward pass.""" if x.ndim != 2: raise ValueError( 'For now, only inputs with exactly one batch dimension are supported.' ) x_linear = None if self.linear0 is None else self.linear0(x) x_ple = self.impl(x) x_ple = self.linear(x_ple) if self.activation is not None: x_ple = self.activation(x_ple) return x_ple if x_linear is None else x_linear + x_ple ================================================ FILE: pytabkit/models/nn_models/rtdl_resnet.py ================================================ import math import numbers import typing as ty import torch import torch.nn as nn import torch.nn.functional as F import torch.nn.init as nn_init from torch import Tensor import numpy as np import pandas as pd import torch.nn as nn import skorch from skorch.callbacks import EarlyStopping, LRScheduler, PrintLog from skorch import NeuralNetRegressor, NeuralNetClassifier from skorch.dataset import Dataset from skorch.callbacks import EpochScoring from skorch.callbacks import WandbLogger from skorch.callbacks import Callback, Checkpoint from torch.optim.lr_scheduler import ReduceLROnPlateau from torch.optim import AdamW, Adam, SGD # import sys # sys.path.append("") import numpy as np import os from functools import partial from copy import deepcopy from .rtdl_num_embeddings import PeriodicEmbeddings # code adapted from https://github.com/yandex-research/rtdl/tree/e5dac7f1bb33078699f5079ce301dc907c5b512a/bin def reglu(x: Tensor) -> Tensor: a, b = x.chunk(2, dim=-1) return a * F.relu(b) def geglu(x: Tensor) -> Tensor: a, b = x.chunk(2, dim=-1) return a * F.gelu(b) def get_activation_fn(name: str) -> ty.Callable[[Tensor], Tensor]: return ( reglu if name == 'reglu' else geglu if name == 'geglu' else torch.sigmoid if name == 'sigmoid' else getattr(F, name) ) def get_nonglu_activation_fn(name: str) -> ty.Callable[[Tensor], Tensor]: return ( F.relu if name == 'reglu' else F.gelu if name == 'geglu' else get_activation_fn(name) ) def print_but_serializable(*args, **kwargs): # this is a dummy function to prevent an obscure error in pickling skorch objects # containing callbacks with sink=print # The error occurs when ray.init() and FunctionProcess() are both used. 
Error message: # _pickle.PicklingError: Can't pickle : it's not the same object as builtins.print print(*args, **kwargs) class RTDL_MLP(nn.Module): # baseline MLP def __init__( self, *, d_in: int, n_layers: int, d_layers: ty.Union[int, ty.List[int]], d_first_layer: int, d_last_layer: int, dropout: float, d_out: int, categories: ty.Optional[ty.List[int]], d_embedding: int, regression: bool, categorical_indicator, num_emb_type: str = 'none', num_emb_dim: int = 24, num_emb_hidden_dim: int = 48, num_emb_sigma: float = 0.01, num_emb_lite: bool = False ) -> None: super().__init__() self.regression = regression self.categorical_indicator = categorical_indicator # Added self.categories = categories # Added if num_emb_type == 'none': self.num_emb_layer = nn.Identity() elif num_emb_type == 'plr': self.num_emb_layer = nn.Sequential(PeriodicEmbeddings(d_in, num_emb_dim, n_frequencies=num_emb_hidden_dim, frequency_init_scale=num_emb_sigma, lite=num_emb_lite), nn.Flatten()) d_in = d_in * num_emb_dim elif num_emb_type == 'pl': self.num_emb_layer = nn.Sequential(PeriodicEmbeddings(d_in, num_emb_dim, n_frequencies=num_emb_hidden_dim, frequency_init_scale=num_emb_sigma, activation=False, lite=num_emb_lite), nn.Flatten()) d_in = d_in * num_emb_dim else: raise ValueError(f'Unknown numerical embedding type "{num_emb_type}"') if categories is not None and len(categories) > 0: d_in += len(categories) * d_embedding category_offsets = torch.tensor( np.concatenate([[0], np.array(categories[:-1], dtype=np.int64) ]) ).cumsum(0) self.register_buffer("category_offsets", category_offsets) self.category_embeddings = nn.Embedding(int(sum(categories)), d_embedding) nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) # set the embedding of the last category of each feature to zero # it represents the "missing" category, i.e. 
the categories that is not present # in the training set for i, c in enumerate(categories): self.category_embeddings.weight.data[ category_offsets[i] + c - 1 ].zero_() if isinstance(d_layers, numbers.Number): d_layers = [d_first_layer] + [d_layers for _ in range(n_layers)] + [d_last_layer] # CHANGED else: assert len(d_layers) == n_layers self.layers = nn.ModuleList( [ nn.Linear(d_layers[i - 1] if i else d_in, x) for i, x in enumerate(d_layers) ] ) self.dropout = dropout self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) def forward(self, x): if not self.categorical_indicator is None: x_num = x[:, ~self.categorical_indicator].float() x_cat = x[:, self.categorical_indicator].long() else: x_num = x x_cat = None # Added: Numerical embeddings x_num = self.num_emb_layer(x_num) x = [] if x_num is not None: x.append(x_num) if x_cat is not None: # replace -1 by the last category for i in range(x_cat.shape[1]): x_cat[:, i][x_cat[:, i] == -1] = self.categories[i] - 1 x.append( self.category_embeddings(x_cat + self.category_offsets[None]).view( x_cat.size(0), -1 ) ) x = torch.cat(x, dim=-1) for layer in self.layers: x = layer(x) x = F.relu(x) if self.dropout: x = F.dropout(x, self.dropout, self.training) x = self.head(x) if not self.regression: x = x.squeeze(-1) return x class ResNet(nn.Module): def __init__( self, *, d_in: int, categories: ty.Optional[ty.List[int]], d_embedding: int, d: int, d_hidden_factor: float, n_layers: int, activation: str, normalization: str, hidden_dropout: float, residual_dropout: float, d_out: int, regression: bool, categorical_indicator ) -> None: super().__init__() def make_normalization(): return {"batchnorm": nn.BatchNorm1d, "layernorm": nn.LayerNorm}[ normalization ](d) self.categorical_indicator = categorical_indicator # Added self.regression = regression self.main_activation = get_activation_fn(activation) self.last_activation = get_nonglu_activation_fn(activation) self.residual_dropout = residual_dropout self.hidden_dropout = hidden_dropout d_hidden = int(d * d_hidden_factor) self.categories = categories if categories is not None and len(categories) > 0: d_in += len(categories) * d_embedding category_offsets = torch.tensor( np.concatenate([[0], np.array(categories[:-1], dtype=np.int64) ]) ).cumsum(0) self.register_buffer("category_offsets", category_offsets) self.category_embeddings = nn.Embedding(int(sum(categories)), d_embedding) nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) # set the embedding of the last category of each feature to zero # it represents the "missing" category, i.e. 
the categories that is not present # in the training set for i, c in enumerate(categories): self.category_embeddings.weight.data[ category_offsets[i] + c - 1 ].zero_() self.first_layer = nn.Linear(d_in, d) self.layers = nn.ModuleList( [ nn.ModuleDict( { "norm": make_normalization(), "linear0": nn.Linear( d, d_hidden * (2 if activation.endswith("glu") else 1) ), "linear1": nn.Linear(d_hidden, d), } ) for _ in range(n_layers) ] ) self.last_normalization = make_normalization() self.head = nn.Linear(d, d_out) def forward(self, x) -> Tensor: if not self.categorical_indicator is None: x_num = x[:, ~self.categorical_indicator].float() x_cat = x[:, self.categorical_indicator].long() else: x_num = x x_cat = None x = [] if x_num is not None and x_num.numel() > 0: x.append(x_num) if x_cat is not None and x_cat.numel() > 0: # replace -1 by the last category for i in range(x_cat.shape[1]): x_cat[:, i][x_cat[:, i] == -1] = self.categories[i] - 1 x.append( self.category_embeddings(x_cat + self.category_offsets[None]).view( x_cat.size(0), -1 ) ) x = torch.cat(x, dim=-1) x = self.first_layer(x) for layer in self.layers: layer = ty.cast(ty.Dict[str, nn.Module], layer) z = x z = layer["norm"](z) z = layer["linear0"](z) z = self.main_activation(z) if self.hidden_dropout: z = F.dropout(z, self.hidden_dropout, self.training) z = layer["linear1"](z) if self.residual_dropout: z = F.dropout(z, self.residual_dropout, self.training) x = x + z x = self.last_normalization(x) x = self.last_activation(x) x = self.head(x) if not self.regression: x = x.squeeze(-1) return x class Tokenizer(nn.Module): category_offsets: ty.Optional[Tensor] def __init__( self, d_numerical: int, categories: ty.Optional[ty.List[int]], d_token: int, bias: bool, ) -> None: #categories = None super().__init__() if categories is None: d_bias = d_numerical self.category_offsets = None self.category_embeddings = None else: d_bias = d_numerical + len(categories) category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0) self.register_buffer('category_offsets', category_offsets) self.category_embeddings = nn.Embedding(sum(categories), d_token) nn_init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) print(f'{self.category_embeddings.weight.shape=}') # set the embedding of the last category of each feature to zero # it represents the "missing" category, i.e. 
the categories that is not present # in the training set for i, c in enumerate(categories): self.category_embeddings.weight.data[ category_offsets[i] + c - 1 ].zero_() # take [CLS] token into account self.weight = nn.Parameter(Tensor(d_numerical + 1, d_token)) self.bias = nn.Parameter(Tensor(d_bias, d_token)) if bias else None # The initialization is inspired by nn.Linear nn_init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: nn_init.kaiming_uniform_(self.bias, a=math.sqrt(5)) self.categories = categories @property def n_tokens(self) -> int: return len(self.weight) + ( 0 if self.category_offsets is None else len(self.category_offsets) ) def forward(self, x_num: Tensor, x_cat: ty.Optional[Tensor]) -> Tensor: x_some = x_num if x_cat is None else x_cat assert x_some is not None x_num = torch.cat( [torch.ones(len(x_some), 1, device=x_some.device)] # [CLS] + ([] if x_num is None else [x_num]), dim=1, ) x = self.weight[None] * x_num[:, :, None] if x_cat is not None: # replace -1 by the last category for i in range(x_cat.shape[1]): x_cat[:, i][x_cat[:, i] == -1] = self.categories[i] - 1 x = torch.cat( [x, self.category_embeddings(x_cat + self.category_offsets[None])], dim=1, ) if self.bias is not None: bias = torch.cat( [ torch.zeros(1, self.bias.shape[1], device=x.device), self.bias, ] ) x = x + bias[None] return x class MultiheadAttention(nn.Module): def __init__( self, d: int, n_heads: int, dropout: float, initialization: str ) -> None: if n_heads > 1: assert d % n_heads == 0 assert initialization in ['xavier', 'kaiming'] super().__init__() self.W_q = nn.Linear(d, d) self.W_k = nn.Linear(d, d) self.W_v = nn.Linear(d, d) self.W_out = nn.Linear(d, d) if n_heads > 1 else None self.n_heads = n_heads self.dropout = nn.Dropout(dropout) if dropout else None for m in [self.W_q, self.W_k, self.W_v]: if initialization == 'xavier' and (n_heads > 1 or m is not self.W_v): # gain is needed since W_qkv is represented with 3 separate layers nn_init.xavier_uniform_(m.weight, gain=1 / math.sqrt(2)) nn_init.zeros_(m.bias) if self.W_out is not None: nn_init.zeros_(self.W_out.bias) def _reshape(self, x: Tensor) -> Tensor: batch_size, n_tokens, d = x.shape d_head = d // self.n_heads return ( x.reshape(batch_size, n_tokens, self.n_heads, d_head) .transpose(1, 2) .reshape(batch_size * self.n_heads, n_tokens, d_head) ) def forward( self, x_q: Tensor, x_kv: Tensor, key_compression: ty.Optional[nn.Linear], value_compression: ty.Optional[nn.Linear], ) -> Tensor: q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv) for tensor in [q, k, v]: assert tensor.shape[-1] % self.n_heads == 0 if key_compression is not None: assert value_compression is not None k = key_compression(k.transpose(1, 2)).transpose(1, 2) v = value_compression(v.transpose(1, 2)).transpose(1, 2) else: assert value_compression is None batch_size = len(q) d_head_key = k.shape[-1] // self.n_heads d_head_value = v.shape[-1] // self.n_heads n_q_tokens = q.shape[1] q = self._reshape(q) k = self._reshape(k) attention = F.softmax(q @ k.transpose(1, 2) / math.sqrt(d_head_key), dim=-1) if self.dropout is not None: attention = self.dropout(attention) x = attention @ self._reshape(v) x = ( x.reshape(batch_size, self.n_heads, n_q_tokens, d_head_value) .transpose(1, 2) .reshape(batch_size, n_q_tokens, self.n_heads * d_head_value) ) if self.W_out is not None: x = self.W_out(x) return x class FT_Transformer(nn.Module): """Transformer. 
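
    In short: `Tokenizer` turns every numerical and categorical feature into a
    token and prepends a [CLS] token, the attention/FFN layers mix the tokens,
    and the prediction head is applied to the final [CLS] representation.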
References: - https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html - https://github.com/facebookresearch/pytext/tree/master/pytext/models/representations/transformer - https://github.com/pytorch/fairseq/blob/1bba712622b8ae4efb3eb793a8a40da386fe11d0/examples/linformer/linformer_src/modules/multihead_linear_attention.py#L19 """ def __init__( self, *, # tokenizer d_in: int, #changed name categories: ty.Optional[ty.List[int]], token_bias: bool, # transformer n_layers: int, d_token: int, n_heads: int, d_ffn_factor: float, attention_dropout: float, ffn_dropout: float, residual_dropout: float, activation: str, prenormalization: bool, initialization: str, # linformer kv_compression: ty.Optional[float], kv_compression_sharing: ty.Optional[str], # d_out: int, regression: bool, categorical_indicator ) -> None: assert (kv_compression is None) ^ (kv_compression_sharing is not None) super().__init__() self.tokenizer = Tokenizer(d_in, categories, d_token, token_bias) n_tokens = self.tokenizer.n_tokens # print("d_token {}".format(d_token)) self.categorical_indicator = categorical_indicator self.regression = regression def make_kv_compression(): assert kv_compression compression = nn.Linear( n_tokens, int(n_tokens * kv_compression), bias=False ) if initialization == 'xavier': nn_init.xavier_uniform_(compression.weight) return compression self.shared_kv_compression = ( make_kv_compression() if kv_compression and kv_compression_sharing == 'layerwise' else None ) def make_normalization(): return nn.LayerNorm(d_token) d_hidden = int(d_token * d_ffn_factor) self.layers = nn.ModuleList([]) for layer_idx in range(n_layers): layer = nn.ModuleDict( { 'attention': MultiheadAttention( d_token, n_heads, attention_dropout, initialization ), 'linear0': nn.Linear( d_token, d_hidden * (2 if activation.endswith('glu') else 1) ), 'linear1': nn.Linear(d_hidden, d_token), 'norm1': make_normalization(), } ) if not prenormalization or layer_idx: layer['norm0'] = make_normalization() if kv_compression and self.shared_kv_compression is None: layer['key_compression'] = make_kv_compression() if kv_compression_sharing == 'headwise': layer['value_compression'] = make_kv_compression() else: assert kv_compression_sharing == 'key-value' self.layers.append(layer) self.activation = get_activation_fn(activation) self.last_activation = get_nonglu_activation_fn(activation) self.prenormalization = prenormalization self.last_normalization = make_normalization() if prenormalization else None self.ffn_dropout = ffn_dropout self.residual_dropout = residual_dropout self.head = nn.Linear(d_token, d_out) def _get_kv_compressions(self, layer): return ( (self.shared_kv_compression, self.shared_kv_compression) if self.shared_kv_compression is not None else (layer['key_compression'], layer['value_compression']) if 'key_compression' in layer and 'value_compression' in layer else (layer['key_compression'], layer['key_compression']) if 'key_compression' in layer else (None, None) ) def _start_residual(self, x, layer, norm_idx): x_residual = x if self.prenormalization: norm_key = f'norm{norm_idx}' if norm_key in layer: x_residual = layer[norm_key](x_residual) return x_residual def _end_residual(self, x, x_residual, layer, norm_idx): if self.residual_dropout: x_residual = F.dropout(x_residual, self.residual_dropout, self.training) x = x + x_residual if not self.prenormalization: x = layer[f'norm{norm_idx}'](x) return x def forward(self, x) -> Tensor: if not self.categorical_indicator is None: x_num = x[:, ~self.categorical_indicator].float() 
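            # (clarifying note: `x` packs numerical and categorical columns into a
            # single float tensor; `categorical_indicator` is a boolean column mask.
            # The categorical columns hold integer codes, with -1 for missing values,
            # which `Tokenizer.forward` remaps to the last, "missing" category.)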
x_cat = x[:, self.categorical_indicator].long() #TODO else: x_num = x x_cat = None #x_cat = None #FIXME x = self.tokenizer(x_num, x_cat) for layer_idx, layer in enumerate(self.layers): is_last_layer = layer_idx + 1 == len(self.layers) layer = ty.cast(ty.Dict[str, nn.Module], layer) x_residual = self._start_residual(x, layer, 0) x_residual = layer['attention']( # for the last attention, it is enough to process only [CLS] (x_residual[:, :1] if is_last_layer else x_residual), x_residual, *self._get_kv_compressions(layer), ) if is_last_layer: x = x[:, : x_residual.shape[1]] x = self._end_residual(x, x_residual, layer, 0) x_residual = self._start_residual(x, layer, 1) x_residual = layer['linear0'](x_residual) x_residual = self.activation(x_residual) if self.ffn_dropout: x_residual = F.dropout(x_residual, self.ffn_dropout, self.training) x_residual = layer['linear1'](x_residual) x = self._end_residual(x, x_residual, layer, 1) assert x.shape[1] == 1 x = x[:, 0] if self.last_normalization is not None: x = self.last_normalization(x) x = self.last_activation(x) x = self.head(x) if not self.regression: x = x.squeeze(-1) return x class InputShapeSetterResnet(skorch.callbacks.Callback): def __init__( self, regression=False, batch_size=None, cat_features=None, categories=None ): self.cat_features = cat_features self.regression = regression self.batch_size = batch_size self.categories = categories def on_train_begin(self, net, X, y): if net.categorical_indicator is None: if self.cat_features is not None: # TODO: it's redundant net.set_categorical_indicator( np.array([i in self.cat_features for i in range(X.shape[1])]) ) else: d_in = X.shape[1] categories = None else: d_in = X.shape[1] - sum(net.categorical_indicator) if self.categories is None: categories = [ # +1 for the unknown category len(set(X[:, i])) + 1 for i in np.where(net.categorical_indicator)[0] ] else: categories = self.categories if self.regression: d_out = 1 else: if hasattr(net, "n_classes"): d_out = net.n_classes else: assert y.max() + 1 == len(set(y)) d_out = int(y.max() + 1) net.set_params( module__d_in=d_in, module__categories=categories, # FIXME #lib.get_categories(X_cat), module__categorical_indicator=torch.BoolTensor(net.categorical_indicator) if net.categorical_indicator is not None else None, module__d_out=d_out, ) class LearningRateLogger(Callback): def on_epoch_begin(self, net, dataset_train=None, dataset_valid=None, **kwargs): callbacks = net.callbacks for callback in callbacks: if isinstance(callback, WandbLogger): callback.wandb_run.log( {"log_lr": np.log10(net.optimizer_.param_groups[0]["lr"])} ) class UniquePrefixCheckpoint(Checkpoint): """ This class has two purposes: - add a unique prefix to the checkpoint file to avoid conflicts between different runs in parallel - remove the checkpoint file when training is finished to avoid having too many files """ def initialize(self): print("Initializing UniquePrefixCheckpoint") self.fn_prefix = str(id(self)) print("fn_prefix is {}".format(self.fn_prefix)) return super(UniquePrefixCheckpoint, self).initialize() # override method to delete the checkpoint file def on_train_end(self, net, **kwargs): print("train end") if not self.load_best or self.monitor is None: return self._sink("Loading best checkpoint after training.", net.verbose) is_regression = isinstance(net, NeuralNetRegressorWrapped) try: net.load_params(checkpoint=self, use_safetensors=self.use_safetensors) # addition print(f"removing {self.dirname}/{self.fn_prefix}params.pt") 
os.remove(f"{self.dirname}/{self.fn_prefix}params.pt") # if doing regression check if constant_val_mse is better than valid_loss_best # if so, replace the model prediction with constant prediction if is_regression: constant_val_mse = net.history[:, "constant_val_mse"][0] # all the same all_val_mse = net.history[:, "valid_loss"] # remove nan and inf all_val_mse = np.array(all_val_mse)[~np.isnan(all_val_mse)] if not len(all_val_mse) or np.all(all_val_mse > constant_val_mse): print("All valid loss are worse than constant prediction") print("Replacing model prediction with constant prediction") net.set_predict_mean(True) except FileNotFoundError: print("COULD NOT FIND CHECKPOINT FILE") if not is_regression: # this should only happen for regression raise # check that valid loss is always nan or inf valid_loss = net.history[:, "valid_loss"] assert np.all(np.isnan(valid_loss) | np.isinf(valid_loss)) print("valid loss is always nan or inf") print("Replacing model prediction with constant prediction") net.set_predict_mean(True) class MyCustomError(Exception): pass class EarlyStoppingCustomError(EarlyStopping): def on_epoch_end(self, net, **kwargs): current_score = net.history[-1, self.monitor] if not self._is_score_improved(current_score): self.misses_ += 1 else: self.misses_ = 0 self.dynamic_threshold_ = self._calc_new_threshold(current_score) self.best_epoch_ = net.history[-1, "epoch"] if self.load_best: self.best_model_weights_ = deepcopy(net.module_.state_dict()) if self.misses_ == self.patience: if net.verbose: self._sink("Stopping since {} has not improved in the last " "{} epochs.".format(self.monitor, self.patience), verbose=net.verbose) raise MyCustomError class NeuralNetRegressorWrapped(NeuralNetRegressor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.categorical_indicator = None self.predict_mean = False # whether to predict y_train mean if # the network predictions are nan or too bad self.y_train_mean = None def set_categorical_indicator(self, categorical_indicator): self.categorical_indicator = categorical_indicator def set_predict_mean(self, predict_mean): self.predict_mean = predict_mean def set_y_train_mean(self, y_train_mean): self.y_train_mean = y_train_mean def get_default_callbacks(self): callbacks = [cb for cb in super().get_default_callbacks() if not isinstance(cb[1], PrintLog)] callbacks.append(('print_log', PrintLog(sink=print_but_serializable))) print(callbacks) return callbacks def fit(self, X, y): if y.ndim == 1: y = y.reshape(-1, 1) self.set_y_train_mean(np.mean(y)) return super().fit(X, y) def predict(self, X): if self.predict_mean: return np.ones((X.shape[0], 1)) * self.y_train_mean else: return super().predict(X) # adapted from skorch code # to remove ignoring keyboard interrupt # as it can be dangerous for benchmarking # pylint: disable=unused-argument def partial_fit(self, X, y=None, classes=None, **fit_params): if not self.initialized_: self.initialize() self.notify('on_train_begin', X=X, y=y) try: self.fit_loop(X, y, **fit_params) # except KeyboardInterrupt: except MyCustomError: pass self.notify('on_train_end', X=X, y=y) return self class NeuralNetClassifierWrapped(NeuralNetClassifier): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.categorical_indicator = None self.n_classes = None # automatically inferred from train if not set def set_categorical_indicator(self, categorical_indicator): self.categorical_indicator = categorical_indicator def set_n_classes(self, n_classes): self.n_classes = n_classes def 
fit(self, X, y): y = y.astype(np.int64) return super().fit(X, y) def get_default_callbacks(self): callbacks = [cb for cb in super().get_default_callbacks() if not isinstance(cb[1], PrintLog)] callbacks.append(('print_log', PrintLog(sink=print_but_serializable))) print(callbacks) return callbacks # adapted from skorch code # to remove ignoring keyboard interrupt # as it can be dangerous for benchmarking # pylint: disable=unused-argument def partial_fit(self, X, y=None, classes=None, **fit_params): if not self.initialized_: self.initialize() self.notify('on_train_begin', X=X, y=y) try: self.fit_loop(X, y, **fit_params) # except KeyboardInterrupt: except MyCustomError: pass self.notify('on_train_end', X=X, y=y) return self # for FT-Transformer, we extend the NeuralNet class to allow different weight decay for different # parts of the network def initialize_optimizer_ft_transformer(self, triggered_directly=None): """Initialize the model optimizer. If ``self.optimizer__lr`` is not set, use ``self.lr`` instead. Parameters ---------- triggered_directly Deprecated, don't use it anymore. """ # handle deprecated parameter # if triggered_directly is not None: # warnings.warn( # "The 'triggered_directly' argument to 'initialize_optimizer' is " # "deprecated, please don't use it anymore.", DeprecationWarning) named_parameters = list(self.get_all_learnable_params()) # print no_wd_names = ['tokenizer', '.norm', '.bias'] for x in ['tokenizer', '.norm', '.bias']: assert any(x in a for a in (b[0] for b in named_parameters)) #TODO improve this def needs_wd(name): return all(x not in name for x in no_wd_names) named_parameters_grouped = [ {'params': [v for k, v in named_parameters if needs_wd(k)]}, { 'params': [v for k, v in named_parameters if not needs_wd(k)], 'weight_decay': 0.0, }] args, kwargs = self.get_params_for_optimizer( 'optimizer', named_parameters) # pylint: disable=attribute-defined-outside-init self.optimizer_ = self.optimizer(named_parameters_grouped, **kwargs) return self class NeuralNetClassifierCustomOptim(NeuralNetClassifierWrapped): def initialize_optimizer(self, triggered_directly=None): return initialize_optimizer_ft_transformer(self, triggered_directly) class NeuralNetRegressorCustomOptim(NeuralNetRegressorWrapped): def initialize_optimizer(self, triggered_directly=None): return initialize_optimizer_ft_transformer(self, triggered_directly) def mse_constant_predictor(model, X, y): return np.mean((y - model.y_train_mean) ** 2) def create_regressor_skorch( id=None, wandb_run=None, use_checkpoints=True, cat_features=None, model_name="resnet", checkpoint_dir="skorch_cp", **kwargs ): print("RTDL regressor") if "lr_scheduler" not in kwargs: lr_scheduler = False else: lr_scheduler = kwargs.pop("lr_scheduler") if "es_patience" not in kwargs.keys(): es_patience = 40 else: es_patience = kwargs.pop("es_patience") if "lr_patience" not in kwargs.keys(): lr_patience = 30 else: lr_patience = kwargs.pop("lr_patience") if "optimizer" not in kwargs.keys(): optimizer = "adamw" else: optimizer = kwargs.pop("optimizer") if optimizer == "adam": optimizer = Adam elif optimizer == "adamw": optimizer = AdamW elif optimizer == "sgd": optimizer = SGD if "batch_size" not in kwargs.keys(): batch_size = 128 else: batch_size = kwargs.pop("batch_size") if "categories" not in kwargs.keys(): categories = None else: categories = kwargs.pop("categories") callbacks = [ InputShapeSetterResnet( regression=True, cat_features=cat_features, categories=categories, batch_size=batch_size ), 
EpochScoring(scoring=mse_constant_predictor, name="constant_val_mse", on_train=False), EarlyStoppingCustomError(monitor="valid_loss", patience=es_patience, sink=print_but_serializable), ] if lr_scheduler: callbacks.append( LRScheduler( policy=ReduceLROnPlateau, patience=lr_patience, min_lr=2e-5, factor=0.2 ) ) # FIXME make customizable if use_checkpoints: callbacks.append( UniquePrefixCheckpoint( dirname=checkpoint_dir, f_params=r"params.pt", f_optimizer=None, f_criterion=None, f_history=None, load_best=True, monitor="valid_loss_best", sink=print_but_serializable, ) ) if not wandb_run is None: callbacks.append(WandbLogger(wandb_run, save_model=False)) callbacks.append(LearningRateLogger()) nn_class = NeuralNetRegressorCustomOptim if model_name == "ft_transformer" else NeuralNetRegressorWrapped if model_name == "ft_transformer": model_class = FT_Transformer elif model_name == "resnet": model_class = ResNet elif model_name == "mlp": model_class = RTDL_MLP else: raise ValueError(f'Model {model_name} not implemented here! Choose from "ft_transformer", "resnet", "mlp"') new_kwargs = dict(optimizer=optimizer, batch_size=max( batch_size, 1 ), # if batch size is float, it will be reset during fit iterator_train__shuffle=True, module__d_in=1, # will be change when fitted module__categories=None, # will be change when fitted module__d_out=1, # idem module__regression=True, module__categorical_indicator=None, # will be change when fitted callbacks=callbacks, **kwargs) # cannot do the try/catch here because params are validated in fit() # try: # # try the torch_load_kwargs but it's only available in newer versions of skorch # model = nn_class( # model_class, # # Shuffle training data on each epoch # **new_kwargs, # torch_load_kwargs={'weights_only': False}, # quick-fix for pickling errors in torch>=2.6 # ) # except ValueError: # model = nn_class( # model_class, # # Shuffle training data on each epoch # **new_kwargs, # ) model = nn_class( model_class, # Shuffle training data on each epoch **new_kwargs, ) return model def create_classifier_skorch( id=None, wandb_run=None, use_checkpoints=True, cat_features=None, model_name="resnet", checkpoint_dir="skorch_cp", val_metric_name: str = 'class_error', **kwargs ): print("RTDL classifier") if "lr_scheduler" not in kwargs: lr_scheduler = False else: lr_scheduler = kwargs.pop("lr_scheduler") if "es_patience" not in kwargs.keys(): es_patience = 40 else: es_patience = kwargs.pop("es_patience") if "lr_patience" not in kwargs.keys(): lr_patience = 30 else: lr_patience = kwargs.pop("lr_patience") if "optimizer" not in kwargs.keys(): optimizer = "adamw" else: optimizer = kwargs.pop("optimizer") if optimizer == "adam": optimizer = Adam elif optimizer == "adamw": optimizer = AdamW elif optimizer == "sgd": optimizer = SGD if "batch_size" not in kwargs.keys(): batch_size = 128 else: batch_size = kwargs.pop("batch_size") if "categories" not in kwargs.keys(): categories = None else: categories = kwargs.pop("categories") callbacks = [ InputShapeSetterResnet( regression=False, cat_features=cat_features, categories=categories, batch_size=batch_size ), EpochScoring(scoring="accuracy", name="train_accuracy", on_train=True), ] if val_metric_name == 'class_error': callbacks.append(EarlyStoppingCustomError(monitor="valid_acc", patience=es_patience, lower_is_better=False, sink=print_but_serializable)) elif val_metric_name == 'cross_entropy': print(f'Using early stopping on cross-entropy loss') callbacks.append(EarlyStoppingCustomError(monitor='valid_loss', patience=es_patience, 
lower_is_better=True, sink=print_but_serializable)) else: raise ValueError(f'Validation metric {val_metric_name} not implemented here!') if lr_scheduler: callbacks.append( LRScheduler( policy=ReduceLROnPlateau, patience=lr_patience, min_lr=2e-5, factor=0.2 ) ) # FIXME make customizable if use_checkpoints: callbacks.append( UniquePrefixCheckpoint( dirname=checkpoint_dir, f_params=r"params.pt", f_optimizer=None, f_criterion=None, f_history=None, load_best=True, monitor="valid_acc_best" if val_metric_name == 'class_error' else 'valid_loss_best', sink=print_but_serializable, ) ) if not wandb_run is None: callbacks.append(WandbLogger(wandb_run, save_model=False)) callbacks.append(LearningRateLogger()) nn_class = NeuralNetClassifierCustomOptim if model_name == "ft_transformer" else NeuralNetClassifierWrapped if model_name == "ft_transformer": model_class = FT_Transformer elif model_name == "resnet": model_class = ResNet elif model_name == "mlp": model_class = RTDL_MLP else: raise ValueError(f'Model {model_name} not implemented here! Choose from "ft_transformer", "resnet", "mlp"') model = nn_class( model_class, # Shuffle training data on each epoch criterion=nn.CrossEntropyLoss, optimizer=optimizer, batch_size=max( batch_size, 1 ), # if batch size is float, it will be reset during fit iterator_train__shuffle=True, module__d_in=1, # will be change when fitted module__categories=None, # will be change when fitted module__d_out=1, # idem module__regression=False, module__categorical_indicator=None, # will be change when fitted callbacks=callbacks, **kwargs, ) return model create_resnet_regressor_skorch = partial(create_regressor_skorch, model_name="resnet", use_checkpoints=True) create_resnet_classifier_skorch = partial(create_classifier_skorch, model_name="resnet", use_checkpoints=True) create_mlp_regressor_skorch = partial(create_regressor_skorch, model_name="mlp", use_checkpoints=True) create_mlp_classifier_skorch = partial(create_classifier_skorch, model_name="mlp", use_checkpoints=True) create_ft_transformer_regressor_skorch = partial(create_regressor_skorch, model_name="ft_transformer", use_checkpoints=True) create_ft_transformer_classifier_skorch = partial(create_classifier_skorch, model_name="ft_transformer", use_checkpoints=True) ================================================ FILE: pytabkit/models/nn_models/tabm.py ================================================ # License: https://github.com/yandex-research/tabm/blob/main/LICENSE # NOTE # The minimum required versions of the dependencies are specified in README.md. 
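# Editor's note: illustrative usage sketch, not part of the original file.
# The `Model` class defined below implements the MLP/TabM variants. Assuming a toy
# binary-classification task with 3 numerical features and one categorical feature
# of cardinality 4, a TabM-mini model with k=8 ensemble members could be built
# roughly as follows (all hyperparameter values here are made up for illustration):
#
#     import torch
#     model = Model(
#         n_num_features=3,
#         cat_cardinalities=[4],
#         n_classes=2,
#         backbone={'type': 'MLP', 'n_blocks': 2, 'd_block': 64, 'dropout': 0.1},
#         bins=None,
#         num_embeddings=None,
#         arch_type='tabm-mini',
#         k=8,
#     )
#     x_num = torch.randn(16, 3)
#     x_cat = torch.randint(0, 4, (16, 1))
#     out = model(x_num, x_cat)  # shape (16, 8, 2): one prediction per member
#
# The k per-member predictions are typically averaged (after softmax, for
# classification) to obtain the final ensemble prediction.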
from __future__ import annotations import itertools from typing import Any, Literal, Optional, Union, List, Dict from pytabkit.models.nn_models import rtdl_num_embeddings import pytabkit.models.nn_models.rtdl_num_embeddings import torch import torch.nn as nn from torch import Tensor # ====================================================================================== # Initialization # ====================================================================================== def init_rsqrt_uniform_(x: Tensor, d: int) -> Tensor: assert d > 0 d_rsqrt = d**-0.5 return nn.init.uniform_(x, -d_rsqrt, d_rsqrt) @torch.inference_mode() def init_random_signs_(x: Tensor) -> Tensor: return x.bernoulli_(0.5).mul_(2).add_(-1) # ====================================================================================== # Modules # ====================================================================================== class NLinear(nn.Module): """N linear layers applied in parallel to N disjoint parts of the input. **Shape** - Input: ``(B, N, in_features)`` - Output: ``(B, N, out_features)`` The i-th linear layer is applied to the i-th matrix of the shape (B, in_features). Technically, this is a simplified version of delu.nn.NLinear: https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html. The difference is that this layer supports only 3D inputs with exactly one batch dimension. By contrast, delu.nn.NLinear supports any number of batch dimensions. """ def __init__( self, n: int, in_features: int, out_features: int, bias: bool = True ) -> None: super().__init__() self.weight = nn.Parameter(torch.empty(n, in_features, out_features)) self.bias = nn.Parameter(torch.empty(n, out_features)) if bias else None self.reset_parameters() def reset_parameters(self): d = self.weight.shape[-2] init_rsqrt_uniform_(self.weight, d) if self.bias is not None: init_rsqrt_uniform_(self.bias, d) def forward(self, x: torch.Tensor) -> torch.Tensor: assert x.ndim == 3 assert x.shape[-(self.weight.ndim - 1) :] == self.weight.shape[:-1] x = x.transpose(0, 1) x = x @ self.weight x = x.transpose(0, 1) if self.bias is not None: x = x + self.bias return x class OneHotEncoding0d(nn.Module): # Input: (*, n_cat_features=len(cardinalities)) # Output: (*, sum(cardinalities)) def __init__(self, cardinalities: List[int]) -> None: super().__init__() self._cardinalities = cardinalities def forward(self, x: Tensor) -> Tensor: assert x.ndim >= 1 assert x.shape[-1] == len(self._cardinalities) return torch.cat( [ # NOTE # This is a quick hack to support out-of-vocabulary categories. # # Recall that lib.data.transform_cat encodes categorical features # as follows: # - In-vocabulary values receive indices from `range(cardinality)`. # - All out-of-vocabulary values (i.e. new categories in validation # and test data that are not presented in the training data) # receive the index `cardinality`. # # As such, the line below will produce the standard one-hot encoding for # known categories, and the all-zeros encoding for unknown categories. # This may not be the best approach to deal with unknown values, # but should be enough for our purposes. 
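# Editor's note (concrete example): with cardinality 3, the line below maps the
# in-vocabulary value 1 to one_hot(1, 4)[:-1] = [0, 1, 0], while the
# out-of-vocabulary index 3 becomes one_hot(3, 4)[:-1] = [0, 0, 0].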
nn.functional.one_hot(x[..., i], cardinality + 1)[..., :-1] for i, cardinality in enumerate(self._cardinalities) ], -1, ) class ScaleEnsemble(nn.Module): def __init__( self, k: int, d: int, *, init: Literal['ones', 'normal', 'random-signs'], ) -> None: super().__init__() self.weight = nn.Parameter(torch.empty(k, d)) self._weight_init = init self.reset_parameters() def reset_parameters(self) -> None: if self._weight_init == 'ones': nn.init.ones_(self.weight) elif self._weight_init == 'normal': nn.init.normal_(self.weight) elif self._weight_init == 'random-signs': init_random_signs_(self.weight) else: raise ValueError(f'Unknown weight_init: {self._weight_init}') def forward(self, x: Tensor) -> Tensor: assert x.ndim >= 2 return x * self.weight class LinearEfficientEnsemble(nn.Module): """ This layer is a more configurable version of the "BatchEnsemble" layer from the paper "BatchEnsemble: An Alternative Approach to Efficient Ensemble and Lifelong Learning" (link: https://arxiv.org/abs/2002.06715). First, this layer allows to select only some of the "ensembled" parts: - the input scaling (r_i in the BatchEnsemble paper) - the output scaling (s_i in the BatchEnsemble paper) - the output bias (not mentioned in the BatchEnsemble paper, but is presented in public implementations) Second, the initialization of the scaling weights is configurable through the `scaling_init` argument. NOTE The term "adapter" is used in the TabM paper only to tell the story. The original BatchEnsemble paper does NOT use this term. So this class also avoids the term "adapter". """ r: Optional[Tensor] s: Optional[Tensor] bias: Optional[Tensor] def __init__( self, in_features: int, out_features: int, bias: bool = True, *, k: int, ensemble_scaling_in: bool, ensemble_scaling_out: bool, ensemble_bias: bool, scaling_init: Literal['ones', 'random-signs'], ): assert k > 0 if ensemble_bias: assert bias super().__init__() self.weight = nn.Parameter(torch.empty(out_features, in_features)) self.register_parameter( 'r', ( nn.Parameter(torch.empty(k, in_features)) if ensemble_scaling_in else None ), # type: ignore[code] ) self.register_parameter( 's', ( nn.Parameter(torch.empty(k, out_features)) if ensemble_scaling_out else None ), # type: ignore[code] ) self.register_parameter( 'bias', ( nn.Parameter(torch.empty(out_features)) # type: ignore[code] if bias and not ensemble_bias else nn.Parameter(torch.empty(k, out_features)) if ensemble_bias else None ), ) self.in_features = in_features self.out_features = out_features self.k = k self.scaling_init = scaling_init self.reset_parameters() def reset_parameters(self): init_rsqrt_uniform_(self.weight, self.in_features) scaling_init_fn = {'ones': nn.init.ones_, 'random-signs': init_random_signs_}[ self.scaling_init ] if self.r is not None: scaling_init_fn(self.r) if self.s is not None: scaling_init_fn(self.s) if self.bias is not None: bias_init = torch.empty( # NOTE: the shape of bias_init is (out_features,) not (k, out_features). # It means that all biases have the same initialization. # This is similar to having one shared bias plus # k zero-initialized non-shared biases. self.out_features, dtype=self.weight.dtype, device=self.weight.device, ) bias_init = init_rsqrt_uniform_(bias_init, self.in_features) with torch.inference_mode(): self.bias.copy_(bias_init) def forward(self, x: Tensor) -> Tensor: # x.shape == (B, K, D) assert x.ndim == 3 # >>> The equation (5) from the BatchEnsemble paper (arXiv v2). 
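# Editor's note (illustrative): for ensemble member i, the code below computes
#     y_i = s_i * (W @ (x_i * r_i)) + b_i
# i.e. one weight matrix W shared across all k members, combined with cheap
# per-member input scalings r_i, output scalings s_i, and biases b_i.
# A quick shape check (hypothetical values; batch B=2, members k=3):
#
#     import torch
#     layer = LinearEfficientEnsemble(
#         8, 4, k=3, ensemble_scaling_in=True, ensemble_scaling_out=True,
#         ensemble_bias=True, scaling_init='ones')
#     layer(torch.randn(2, 3, 8)).shape  # -> torch.Size([2, 3, 4])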
if self.r is not None: x = x * self.r x = x @ self.weight.T if self.s is not None: x = x * self.s # <<< if self.bias is not None: x = x + self.bias return x class MLP(nn.Module): def __init__( self, *, d_in: Optional[int] = None, d_out: Optional[int] = None, n_blocks: int, d_block: int, dropout: float, activation: str = 'ReLU', ) -> None: super().__init__() d_first = d_block if d_in is None else d_in self.blocks = nn.ModuleList( [ nn.Sequential( nn.Linear(d_first if i == 0 else d_block, d_block), getattr(nn, activation)(), nn.Dropout(dropout), ) for i in range(n_blocks) ] ) self.output = None if d_out is None else nn.Linear(d_block, d_out) def forward(self, x: Tensor) -> Tensor: for block in self.blocks: x = block(x) if self.output is not None: x = self.output(x) return x def make_efficient_ensemble(module: nn.Module, EnsembleLayer, **kwargs) -> None: """Replace linear layers with efficient ensembles of linear layers. NOTE In the paper, there are no experiments with networks with normalization layers. Perhaps, their trainable weights (the affine transformations) also need "ensemblification" as in the paper about "FiLM-Ensemble". Additional experiments are required to make conclusions. """ for name, submodule in list(module.named_children()): if isinstance(submodule, nn.Linear): module.add_module( name, EnsembleLayer( in_features=submodule.in_features, out_features=submodule.out_features, bias=submodule.bias is not None, **kwargs, ), ) else: make_efficient_ensemble(submodule, EnsembleLayer, **kwargs) def _get_first_ensemble_layer(backbone: MLP) -> LinearEfficientEnsemble: if isinstance(backbone, MLP): return backbone.blocks[0][0] # type: ignore[code] else: raise RuntimeError(f'Unsupported backbone: {backbone}') @torch.inference_mode() def _init_first_adapter( weight: Tensor, distribution: Literal['normal', 'random-signs'], init_sections: List[int], ) -> None: """Initialize the first adapter. NOTE The `init_sections` argument is a historical artifact that accidentally leaked from irrelevant experiments to the final models. Perhaps, the code related to `init_sections` can be simply removed, but this was not tested. """ assert weight.ndim == 2 assert weight.shape[1] == sum(init_sections) if distribution == 'normal': init_fn_ = nn.init.normal_ elif distribution == 'random-signs': init_fn_ = init_random_signs_ else: raise ValueError(f'Unknown distribution: {distribution}') section_bounds = [0, *torch.tensor(init_sections).cumsum(0).tolist()] for i in range(len(init_sections)): # NOTE # As noted above, this section-based initialization is an arbitrary historical # artifact. Consider the first adapter of one ensemble member. # This adapter vector is implicitly split into "sections", # where one section corresponds to one feature. The code below ensures that # the adapter weights in one section are initialized with the same random value # from the given distribution. 
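# Editor's note (concrete example): with init_sections = [1, 1, 4] (two raw
# numerical features followed by a one-hot feature of cardinality 4),
# section_bounds = [0, 1, 2, 6], so columns [0:1], [1:2] and [2:6] of each
# adapter row are each filled with a single value drawn from the distribution.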
w = torch.empty((len(weight), 1), dtype=weight.dtype, device=weight.device) init_fn_(w) weight[:, section_bounds[i] : section_bounds[i + 1]] = w _CUSTOM_MODULES = { # https://docs.python.org/3/library/stdtypes.html#definition.__name__ CustomModule.__name__: CustomModule for CustomModule in [ rtdl_num_embeddings.LinearEmbeddings, rtdl_num_embeddings.LinearReLUEmbeddings, rtdl_num_embeddings.PeriodicEmbeddings, rtdl_num_embeddings.PiecewiseLinearEmbeddings, MLP, ] } def make_module(type: str, *args, **kwargs) -> nn.Module: Module = getattr(nn, type, None) if Module is None: Module = _CUSTOM_MODULES[type] return Module(*args, **kwargs) # ====================================================================================== # Optimization # ====================================================================================== def default_zero_weight_decay_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: nn.Parameter ): del module_name, parameter return parameter_name.endswith('bias') or isinstance( module, (nn.BatchNorm1d, nn.LayerNorm, nn.InstanceNorm1d, rtdl_num_embeddings.LinearEmbeddings, rtdl_num_embeddings.LinearReLUEmbeddings, rtdl_num_embeddings._Periodic), ) def make_parameter_groups( module: nn.Module, zero_weight_decay_condition=default_zero_weight_decay_condition, custom_groups: Optional[List[Dict[str, Any]]] = None, ) -> List[Dict[str, Any]]: if custom_groups is None: custom_groups = [] custom_params = frozenset( itertools.chain.from_iterable(group['params'] for group in custom_groups) ) assert len(custom_params) == sum( len(group['params']) for group in custom_groups ), 'Parameters in custom_groups must not intersect' zero_wd_params = frozenset( p for mn, m in module.named_modules() for pn, p in m.named_parameters() if p not in custom_params and zero_weight_decay_condition(mn, m, pn, p) ) default_group = { 'params': [ p for p in module.parameters() if p not in custom_params and p not in zero_wd_params ] } return [ default_group, {'params': list(zero_wd_params), 'weight_decay': 0.0}, *custom_groups, ] # ====================================================================================== # The model # ====================================================================================== class Model(nn.Module): """MLP & TabM.""" def __init__( self, *, n_num_features: int, cat_cardinalities: List[int], n_classes: Optional[int], backbone: dict, bins: Optional[List[Tensor]], # For piecewise-linear encoding/embeddings. num_embeddings: Optional[Dict] = None, arch_type: Literal[ # Plain feed-forward network without any kind of ensembling. 'plain', # # TabM 'tabm', # # TabM-mini 'tabm-mini', # # TabM-packed 'tabm-packed', # # TabM. The first adapter is initialized from the normal distribution. # This variant was not used in the paper, but it may be useful in practice. 'tabm-normal', # # TabM-mini. The adapter is initialized from the normal distribution. # This variant was not used in the paper. 'tabm-mini-normal', ], k: Optional[int] = None, share_training_batches: bool = True, ) -> None: # >>> Validate arguments. assert n_num_features >= 0 assert n_num_features or cat_cardinalities if arch_type == 'plain': assert k is None assert ( share_training_batches ), 'If `arch_type` is set to "plain", then `simple` must remain True' else: assert k is not None assert k > 0 super().__init__() # >>> Continuous (numerical) features first_adapter_sections = [] # See the comment in `_init_first_adapter`. 
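# Editor's note: one section per input feature, whose width equals the feature's
# width after encoding (1 for a raw numerical feature, d_embedding for an
# embedded one, and the cardinality for a one-hot-encoded categorical feature).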
if n_num_features == 0: assert bins is None self.num_module = None d_num = 0 elif num_embeddings is None: assert bins is None self.num_module = None d_num = n_num_features first_adapter_sections.extend(1 for _ in range(n_num_features)) else: if bins is None: self.num_module = make_module( **num_embeddings, n_features=n_num_features ) else: assert num_embeddings['type'].startswith('PiecewiseLinearEmbeddings') self.num_module = make_module(**num_embeddings, bins=bins) d_num = n_num_features * num_embeddings['d_embedding'] first_adapter_sections.extend( num_embeddings['d_embedding'] for _ in range(n_num_features) ) # >>> Categorical features self.cat_module = ( OneHotEncoding0d(cat_cardinalities) if cat_cardinalities else None ) first_adapter_sections.extend(cat_cardinalities) d_cat = sum(cat_cardinalities) # >>> Backbone d_flat = d_num + d_cat self.minimal_ensemble_adapter = None # Any backbone can be here but we provide only MLP self.backbone = make_module(d_in=d_flat, **backbone) if arch_type != 'plain': assert k is not None first_adapter_init = ( None if arch_type == 'tabm-packed' else 'normal' if arch_type in ('tabm-mini-normal', 'tabm-normal') # For other arch_types, the initialization depends # on the presence of num_embeddings. else 'random-signs' if num_embeddings is None else 'normal' ) if arch_type in ('tabm', 'tabm-normal'): # Like BatchEnsemble, but all multiplicative adapters, # except for the very first one, are initialized with ones. assert first_adapter_init is not None make_efficient_ensemble( self.backbone, LinearEfficientEnsemble, k=k, ensemble_scaling_in=True, ensemble_scaling_out=True, ensemble_bias=True, scaling_init='ones', ) _init_first_adapter( _get_first_ensemble_layer(self.backbone).r, # type: ignore[code] first_adapter_init, first_adapter_sections, ) elif arch_type in ('tabm-mini', 'tabm-mini-normal'): # MiniEnsemble assert first_adapter_init is not None self.minimal_ensemble_adapter = ScaleEnsemble( k, d_flat, init='random-signs' if num_embeddings is None else 'normal', ) _init_first_adapter( self.minimal_ensemble_adapter.weight, # type: ignore[code] first_adapter_init, first_adapter_sections, ) elif arch_type == 'tabm-packed': # Packed ensemble. # In terms of the Packed Ensembles paper by Laurent et al., # TabM-packed is PackedEnsemble(alpha=k, M=k, gamma=1). 
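# Editor's note (illustrative): 'tabm-packed' replaces every nn.Linear in the
# backbone with NLinear(n=k, ...), i.e. k fully independent weight matrices and
# biases, so unlike 'tabm'/'tabm-mini' nothing is shared between the k members
# (and the backbone parameter count grows roughly k-fold).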
assert first_adapter_init is None make_efficient_ensemble(self.backbone, NLinear, n=k) else: raise ValueError(f'Unknown arch_type: {arch_type}') # >>> Output d_block = backbone['d_block'] d_out = 1 if n_classes is None else n_classes self.output = ( nn.Linear(d_block, d_out) if arch_type == 'plain' else NLinear(k, d_block, d_out) # type: ignore[code] ) # >>> self.arch_type = arch_type self.k = k self.share_training_batches = share_training_batches def forward( self, x_num: Optional[Tensor] = None, x_cat: Optional[Tensor] = None ) -> Tensor: x = [] if x_num is not None: x.append(x_num if self.num_module is None else self.num_module(x_num)) if x_cat is None: assert self.cat_module is None else: assert self.cat_module is not None x.append(self.cat_module(x_cat).float()) x = torch.column_stack([x_.flatten(1, -1) for x_ in x]) if self.k is not None: if self.share_training_batches or not self.training: # (B, D) -> (B, K, D) x = x[:, None].expand(-1, self.k, -1) else: # (B * K, D) -> (B, K, D) x = x.reshape(len(x) // self.k, self.k, *x.shape[1:]) if self.minimal_ensemble_adapter is not None: x = self.minimal_ensemble_adapter(x) else: assert self.minimal_ensemble_adapter is None x = self.backbone(x) x = self.output(x) if self.k is None: # Adjust the output shape for plain networks to make them compatible # with the rest of the script (loss, metrics, predictions, ...). # (B, D_OUT) -> (B, 1, D_OUT) x = x[:, None] return x ================================================ FILE: pytabkit/models/nn_models/tabr.py ================================================ import os import inspect import warnings import math from functools import partial import numpy as np import torch from torch import Tensor import torch.optim as optim from torch.utils.data import DataLoader, Dataset import torch.nn.functional as F from pytabkit.models.nn_models import tabr_lib as lib import torch.nn as nn from torchmetrics import Accuracy, Precision, Recall, F1Score, MeanSquaredError, AUROC, MeanAbsoluteError from typing import Any, Optional, Union, Literal, Callable try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl class NTPLinearLayer(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool = True, bias_factor: float = 0.1, linear_init_type: str = 'default'): super().__init__() self.use_bias = bias if linear_init_type == 'default': self.weight = nn.Parameter(-1+2*torch.rand(in_features, out_features)) if self.use_bias: self.bias = nn.Parameter((-1+2*torch.rand(1, out_features)) / np.sqrt(in_features)) elif linear_init_type == 'normal': self.weight = nn.Parameter(torch.randn(in_features, out_features)) if self.use_bias: self.bias = nn.Parameter(torch.randn(1, out_features)) else: raise ValueError(f'Unknown linear_init_type "{linear_init_type}"') self.bias_factor = bias_factor self.weight_factor = 1./np.sqrt(in_features) def forward(self, x): x = self.weight_factor * x @ self.weight if self.use_bias: x = x + self.bias_factor * self.bias return x class ParametricMishActivationLayer(nn.Module): def __init__(self, n_features: int, lr_factor: float = 1.0): super().__init__() self.weight = nn.Parameter((1. 
/ lr_factor) * torch.ones(n_features)) self.lr_factor = lr_factor def f(self, x): return x.mul(torch.tanh(F.softplus(x))) def forward(self, x): # print(f'{self.weight.mean().item()=:g}') return x + self.lr_factor * (self.f(x) - x) * self.weight class ParametricReluActivationLayer(nn.Module): def __init__(self, n_features: int, lr_factor: float = 1.0): super().__init__() self.weight = nn.Parameter((1. / lr_factor) * torch.ones(n_features)) self.lr_factor = lr_factor def f(self, x): return torch.relu(x) def forward(self, x): # print(f'{self.weight.mean().item()=:g}') return x + self.lr_factor * (self.f(x) - x) * self.weight class ScalingLayer(nn.Module): def __init__(self, n_features: int, lr_factor: float = 6.0): super().__init__() self.weight = nn.Parameter((1. / lr_factor) * torch.ones(n_features)) self.lr_factor = lr_factor def forward(self, x): return self.lr_factor * x * self.weight[None, :] def bce_with_logits_and_label_smoothing(inputs, *args, ls_eps: float, **kwargs): return (1 - 0.5 * ls_eps) * F.binary_cross_entropy_with_logits(inputs, *args, **kwargs) \ + 0.5 * ls_eps * F.binary_cross_entropy_with_logits(-inputs, *args, **kwargs) # adapted from https://github.com/yandex-research/tabular-dl-tabr/tree/main/bin class TabrModel(nn.Module): def __init__( self, *, # n_num_features: int, n_bin_features: int, cat_cardinalities: list[int], n_classes: Optional[int], # num_embeddings: Optional[dict], # lib.deep.ModuleSpec d_main: int, d_multiplier: float, encoder_n_blocks: int, predictor_n_blocks: int, mixer_normalization: Union[bool, Literal['auto']], context_dropout: float, dropout0: float, dropout1: Union[float, Literal['dropout0']], normalization: str, activation: str, # # The following options should be used only when truly needed. memory_efficient: bool = False, candidate_encoding_batch_size: Optional[int] = None, # extra options not in the original tabr add_scaling_layer: bool = False, scale_lr_factor: float = 6.0, use_ntp_linear: bool = False, linear_init_type: str = 'default', # only relevant if use_ntp_linear=True use_ntp_encoder: bool = False, ) -> None: # import locally so importing this file doesn't cause problems if faiss is not installed # import in constructor as well to make model fail earlier if not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch if not memory_efficient: assert candidate_encoding_batch_size is None if mixer_normalization == 'auto': mixer_normalization = encoder_n_blocks > 0 if encoder_n_blocks == 0: assert not mixer_normalization super().__init__() if dropout1 == 'dropout0': dropout1 = dropout0 self.one_hot_encoder = ( lib.OneHotEncoder(cat_cardinalities) if cat_cardinalities else None ) self.num_embeddings = ( None if num_embeddings is None else lib.make_module(num_embeddings, n_features=n_num_features) ) print(f'{add_scaling_layer=}') print(f'{activation=}') print(f'{scale_lr_factor=}') # >>> E d_in = ( n_num_features * (1 if num_embeddings is None else num_embeddings['d_embedding']) + n_bin_features + sum(cat_cardinalities) ) d_block = int(d_main * d_multiplier) Normalization = getattr(nn, normalization) if activation == 'pmish': Activation = lambda n_features: ParametricMishActivationLayer(n_features=n_features) elif activation == 'prelu': Activation = lambda n_features: ParametricReluActivationLayer(n_features=n_features) else: Activation = lambda n_features: getattr(nn, activation)() if use_ntp_linear: print(f'Using NTP linear layer with init {linear_init_type}') Linear = lambda in_features, 
out_features, bias=True: NTPLinearLayer(in_features, out_features, bias=bias, bias_factor=0.1, linear_init_type=linear_init_type) else: Linear = nn.Linear def make_block(prenorm: bool) -> nn.Sequential: return nn.Sequential( *([Normalization(d_main)] if prenorm else []), Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), Linear(d_block, d_main), nn.Dropout(dropout1), ) self.scale = ScalingLayer(d_in, lr_factor=scale_lr_factor) if add_scaling_layer else nn.Identity() self.linear = Linear(d_in, d_main) self.blocks0 = nn.ModuleList( [make_block(i > 0) for i in range(encoder_n_blocks)] ) # >>> R self.normalization = Normalization(d_main) if mixer_normalization else None self.label_encoder = ( Linear(1, d_main) if use_ntp_encoder else nn.Linear(1, d_main) if n_classes is None else nn.Sequential( nn.Embedding(n_classes, d_main), lib.Lambda(lambda x: x.squeeze(-2)) ) ) self.K = Linear(d_main, d_main) self.T = nn.Sequential( Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), Linear(d_block, d_main, bias=False), ) self.dropout = nn.Dropout(context_dropout) # >>> P self.blocks1 = nn.ModuleList( [make_block(True) for _ in range(predictor_n_blocks)] ) self.head = nn.Sequential( Normalization(d_main), Activation(d_main), Linear(d_main, lib.get_d_out(n_classes)), ) # >>> self.search_index = None self.memory_efficient = memory_efficient self.candidate_encoding_batch_size = candidate_encoding_batch_size self.reset_parameters() def reset_parameters(self): if isinstance(self.label_encoder, nn.Linear) or isinstance(self.label_encoder, NTPLinearLayer): bound = 1 / math.sqrt(2.0) nn.init.uniform_(self.label_encoder.weight, -bound, bound) # type: ignore[code] # noqa: E501 nn.init.uniform_(self.label_encoder.bias, -bound, bound) # type: ignore[code] # noqa: E501 else: assert isinstance(self.label_encoder[0], nn.Embedding) nn.init.uniform_(self.label_encoder[0].weight, -1.0, 1.0) # type: ignore[code] # noqa: E501 def _encode(self, x_: dict[str, Tensor]) -> tuple[Tensor, Tensor]: x_num = x_.get('num') x_bin = x_.get('bin') x_cat = x_.get('cat') del x_ x = [] if x_num is None: # assert self.num_embeddings is None pass # changed to make it easier to use with all-categorical datasets else: x.append( x_num if self.num_embeddings is None else self.num_embeddings(x_num).flatten(1) ) if x_bin is not None: x.append(x_bin) if x_cat is None: assert self.one_hot_encoder is None else: assert self.one_hot_encoder is not None x.append(self.one_hot_encoder(x_cat)) assert x x = torch.cat(x, dim=1).float() x = self.scale(x) x = self.linear(x) for block in self.blocks0: x = x + block(x) k = self.K(x if self.normalization is None else self.normalization(x)) return x, k def forward( self, *, x_: dict[str, Tensor], y: Optional[Tensor], candidate_x_: dict[str, Tensor], candidate_y: Tensor, context_size: int, is_train: bool, ) -> Tensor: # print('forward()') # import locally so importing this file doesn't cause problems if faiss is not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch # >>> with torch.set_grad_enabled( torch.is_grad_enabled() and not self.memory_efficient ): # NOTE: during evaluation, candidate keys can be computed just once, which # looks like an easy opportunity for optimization. However: # - if your dataset is small or/and the encoder is just a linear layer # (no embeddings and encoder_n_blocks=0), then encoding candidates # is not a bottleneck. 
# - implementing this optimization makes the code complex and/or unobvious, # because there are many things that should be taken into account: # - is the input coming from the "train" part? # - is self.training True or False? # - is PyTorch autograd enabled? # - is saving and loading checkpoints handled correctly? # This is why we do not implement this optimization. # When memory_efficient is True, this potentially heavy computation is # performed without gradients. # Later, it is recomputed with gradients only for the context objects. candidate_k = ( self._encode(candidate_x_)[1] if self.candidate_encoding_batch_size is None else torch.cat( [ self._encode(x)[1] for x in lib.iter_batches( candidate_x_, self.candidate_encoding_batch_size ) ] ) ) x, k = self._encode(x_) if is_train: # NOTE: here, we add the training batch back to the candidates after the # function `apply_model` removed them. The further code relies # on the fact that the first batch_size candidates come from the # training batch. assert y is not None candidate_k = torch.cat([k, candidate_k]) candidate_y = torch.cat([y, candidate_y]) else: assert y is None # >>> # The search below is optimized for larger datasets and is significantly faster # than the naive solution (keep autograd on + manually compute all pairwise # squared L2 distances + torch.topk). # For smaller datasets, however, the naive solution can actually be faster. batch_size, d_main = k.shape device = k.device with torch.no_grad(): if self.search_index is None: # self.search_index = ( # faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main) # if device.type == 'cuda' # else faiss.IndexFlatL2(d_main) # ) if device.type == 'cpu': self.search_index = faiss.IndexFlatL2(d_main) elif device.type == 'cuda': gpu_index = 0 if device.index is None else device.index cfg = faiss.GpuIndexFlatConfig() cfg.device = gpu_index self.search_index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main, cfg) else: raise ValueError() # Updating the index is much faster than creating a new one. self.search_index.reset() self.search_index.add(candidate_k) # type: ignore[code] distances: Tensor context_idx: Tensor distances, context_idx = self.search_index.search( # type: ignore[code] k, context_size + (1 if is_train else 0) ) if is_train: # NOTE: to avoid leakage, the index i must be removed from the i-th row, # (because of how candidate_k is constructed). distances[ context_idx == torch.arange(batch_size, device=device)[:, None] ] = torch.inf # Not the most elegant solution to remove the argmax, but anyway. context_idx = context_idx.gather(-1, distances.argsort()[:, :-1]) if self.memory_efficient and torch.is_grad_enabled(): assert is_train # Repeating the same computation, # but now only for the context objects and with autograd on. context_k = self._encode( { ftype: torch.cat([x_[ftype], candidate_x_[ftype]])[ context_idx ].flatten(0, 1) for ftype in x_ } )[1].reshape(batch_size, context_size, -1) else: context_k = candidate_k[context_idx] # In theory, when autograd is off, the distances obtained during the search # can be reused. However, this is not a bottleneck, so let's keep it simple # and use the same code to compute `similarities` during both # training and evaluation. 
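# Editor's note: the expression below is the batched expansion of the negative
# squared L2 distance between the query key and each context key,
#     -||k - c||^2 = -||k||^2 + 2 <k, c> - ||c||^2,
# so softmax(similarities) weights context objects by proximity in key space.
# An equivalent (but less memory-friendly) reference computation would be:
#
#     similarities_ref = -torch.cdist(k[:, None], context_k).squeeze(1) ** 2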
similarities = ( -k.square().sum(-1, keepdim=True) + (2 * (k[..., None, :] @ context_k.transpose(-1, -2))).squeeze(-2) - context_k.square().sum(-1) ) probs = F.softmax(similarities, dim=-1) probs = self.dropout(probs) context_y_emb = self.label_encoder(candidate_y[context_idx][..., None]) values = context_y_emb + self.T(k[:, None] - context_k) context_x = (probs[:, None] @ values).squeeze(1) x = x + context_x # >>> for block in self.blocks1: x = x + block(x) x = self.head(x) return x def zero_wd_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: nn.parameter.Parameter, ): return ( 'label_encoder' in module_name or 'label_encoder' in parameter_name or lib.default_zero_weight_decay_condition( module_name, module, parameter_name, parameter ) ) class TabrLightning(pl.LightningModule): def __init__(self, model, train_dataset, val_dataset, C, n_classes): super().__init__() self.model = model self.dataset = train_dataset self.val_dataset = val_dataset self.C = C if n_classes == 2: self.task_type = "binary" elif n_classes > 2: self.task_type = "multiclass" else: self.task_type = "regression" ls_eps = self.C.get('ls_eps', 0.0) print(f'{ls_eps=}') self.loss_fn = ( partial(bce_with_logits_and_label_smoothing, ls_eps=ls_eps) if self.task_type == "binary" else partial(F.cross_entropy, label_smoothing=ls_eps) if self.task_type == "multiclass" else F.mse_loss ) # Define metrics for binary and multiclass classification if self.task_type in ["binary", "multiclass"]: self.train_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.train_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.train_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.train_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) self.val_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.val_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.val_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.val_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) # Define metrics for regression elif self.task_type == "regression": self.train_mse = MeanSquaredError() self.val_mse = MeanSquaredError() self.train_mae = MeanAbsoluteError() self.val_mae = MeanAbsoluteError() def setup(self, stage=None): self.train_size = len(self.dataset) self.train_indices = torch.arange(self.train_size, device=self.device) # move the dataset to the device # I think that's what tabr does, but # we could also keep it on the cpu for key in self.dataset.data: if self.dataset.data[key] is not None: self.dataset.data[key] = self.dataset.data[key].to(self.device) for key in self.val_dataset.data: if self.val_dataset.data[key] is not None: self.val_dataset.data[key] = self.val_dataset.data[key].to(self.device) def get_Xy(self, part: str, idx) -> tuple[dict[str, Tensor], Tensor]: if self.val_dataset.data['Y'].get_device() == -1: # is still on CPU self.setup() if part == "train": dataset = self.dataset elif part == "val": dataset = self.val_dataset batch = ( { key[2:]: dataset.data[key] for key in dataset.data if key.startswith('X_') }, dataset.data["Y"], ) return ( batch if idx is None else ({k: v[idx] for k, v in batch[0].items()}, batch[1][idx]) ) def training_step(self, batch, batch_idx): # batch should contain dictionaries with keys # "x_num", "x_bin", "x_cat", "y" and "indices" batch_indices = batch["indices"] # batch_idx 
is the id of the batch itself # batch_indices contains the ids of the samples in the batch x, y = self.get_Xy('train', batch_indices) # we're in training mode # Remove the training batch from the candidates candidate_indices = self.train_indices[~torch.isin(self.train_indices, batch_indices)] candidate_x, candidate_y = self.get_Xy('train', candidate_indices) # Call the model's forward method output = self.model( x_=x, y=y, candidate_x_=candidate_x, candidate_y=candidate_y, context_size=self.C["context_size"], is_train=True ).squeeze(-1) y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) # Log the loss and return it self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True) if self.task_type in ["binary", "multiclass"]: self.train_accuracy.update(output, y) self.train_precision.update(output, y) self.train_recall.update(output, y) self.train_f1_score.update(output, y) self.log('train_accuracy', self.train_accuracy, on_epoch=True, prog_bar=True) self.log('train_precision', self.train_precision, on_epoch=True) self.log('train_recall', self.train_recall, on_epoch=True) self.log('train_f1_score', self.train_f1_score, on_epoch=True) elif self.task_type == "regression": self.train_mse.update(output, y) self.train_mae.update(output, y) self.log('train_mse', self.train_mse, on_epoch=True) self.log('train_mae', self.train_mae, on_epoch=True, prog_bar=True) return loss def validation_step(self, batch, batch_idx): if batch_idx == 0: print(f'Validation in epoch {self.current_epoch}', flush=True) # print(f'Validation step', flush=True) # TODO: do like test to save gpu memory? batch_indices = batch["indices"] # batch_idx is the idxs of the batch samples x, y = self.get_Xy("val", batch_indices) candidate_indices = self.train_indices candidate_x, candidate_y = self.get_Xy('train', candidate_indices) output = self.model( x_=x, y=None, candidate_x_=candidate_x, candidate_y=candidate_y, context_size=self.C["context_size"], is_train=False, ).squeeze(-1) y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) self.log('val_loss', loss, on_epoch=True, prog_bar=True) # Log validation loss if self.task_type in ["binary", "multiclass"]: self.val_accuracy.update(output, y) self.val_precision.update(output, y) self.val_recall.update(output, y) self.val_f1_score(output, y) self.log('val_accuracy', self.val_accuracy, on_epoch=True, prog_bar=True) self.log('val_precision', self.val_precision, on_epoch=True) self.log('val_recall', self.val_recall, on_epoch=True) self.log('val_f1_score', self.val_f1_score, on_epoch=True) elif self.task_type == "regression": self.val_mse.update(output, y) self.log('val_mse', self.val_mse, on_epoch=True) self.val_mae.update(output, y) self.log('val_mae', self.val_mae, on_epoch=True, prog_bar=True) return loss def predict_step(self, batch, batch_idx, dataloader_idx=None): # here batch shouldn't contain indices nor y x = { key[2:]: batch[key] for key in batch if key.startswith('X_') } candidate_indices = self.train_indices candidate_x, candidate_y = self.get_Xy('train', candidate_indices) output = self.model( x_=x, y=None, candidate_x_=candidate_x, candidate_y=candidate_y, context_size=self.C["context_size"], is_train=False, ).squeeze(-1) # in binary case, we need to convert it to 2-class logits if self.task_type 
== "binary": # it will be passed to a softmax, so we need to add a 0 # to make the probabilities right output = torch.stack([torch.zeros_like(output), output], dim=1) elif self.task_type == "regression": output = output.unsqueeze(1) return output def configure_optimizers(self): optimizer_config = self.C["optimizer"].copy() optimizer = lib.make_optimizer( self.model, **optimizer_config, zero_weight_decay_condition=zero_wd_condition ) return optimizer def train_dataloader(self): return DataLoader(self.dataset, batch_size=self.C["batch_size"], shuffle=True, num_workers=0, #max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=False) def val_dataloader(self): return DataLoader(self.val_dataset, batch_size=self.C["eval_batch_size"], shuffle=False, num_workers=0, #max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=False) ================================================ FILE: pytabkit/models/nn_models/tabr_context_freeze.py ================================================ import os import inspect import warnings import math from functools import partial import torch from torch import Tensor import torch.optim as optim from torch.utils.data import DataLoader, Dataset import torch.nn.functional as F from pytabkit.models.nn_models import tabr_lib as lib import torch.nn as nn from torchmetrics import Accuracy, Precision, Recall, F1Score, MeanSquaredError, AUROC, MeanAbsoluteError from typing import Any, Optional, Union, Literal, Callable, NamedTuple from tqdm import tqdm try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl from pytabkit.models.nn_models.tabr import ParametricMishActivationLayer, ParametricReluActivationLayer, ScalingLayer, \ bce_with_logits_and_label_smoothing # taken from https://github.com/yandex-research/tabular-dl-tabr/tree/main/bin # and https://github.com/yandex-research/tabular-dl-tabr/blob/main/bin/tabr_scaling.py class TabrModelContextFreeze(nn.Module): class ForwardOutput(NamedTuple): y_pred: Tensor context_idx: Tensor context_probs: Tensor def __init__( self, *, # n_num_features: int, n_bin_features: int, cat_cardinalities: list[int], n_classes: Optional[int], # num_embeddings: Optional[dict], # lib.deep.ModuleSpec d_main: int, d_multiplier: float, encoder_n_blocks: int, predictor_n_blocks: int, mixer_normalization: Union[bool, Literal['auto']], context_dropout: float, dropout0: float, dropout1: Union[float, Literal['dropout0']], normalization: str, activation: str, # # The following options should be used only when truly needed. 
memory_efficient: bool = False, candidate_encoding_batch_size: Optional[int] = None, add_scaling_layer: bool = False, scale_lr_factor: float = 6.0, ) -> None: # import locally so importing this file doesn't cause problems if faiss is not installed # import in constructor as well to make model fail earlier if not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch if not memory_efficient: assert candidate_encoding_batch_size is None if mixer_normalization == 'auto': mixer_normalization = encoder_n_blocks > 0 if encoder_n_blocks == 0: assert not mixer_normalization super().__init__() if dropout1 == 'dropout0': dropout1 = dropout0 self.one_hot_encoder = ( lib.OneHotEncoder(cat_cardinalities) if cat_cardinalities else None ) self.num_embeddings = ( None if num_embeddings is None else lib.make_module(num_embeddings, n_features=n_num_features) ) print(f'{add_scaling_layer=}') print(f'{activation=}') print(f'{scale_lr_factor=}') # >>> E d_in = ( n_num_features * (1 if num_embeddings is None else num_embeddings['d_embedding']) + n_bin_features + sum(cat_cardinalities) ) d_block = int(d_main * d_multiplier) Normalization = getattr(nn, normalization) if activation == 'pmish': Activation = lambda n_features: ParametricMishActivationLayer(n_features=n_features) elif activation == 'prelu': Activation = lambda n_features: ParametricReluActivationLayer(n_features=n_features) else: Activation = lambda n_features: getattr(nn, activation)() def make_block(prenorm: bool) -> nn.Sequential: return nn.Sequential( *([Normalization(d_main)] if prenorm else []), nn.Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), nn.Linear(d_block, d_main), nn.Dropout(dropout1), ) self.scale = ScalingLayer(d_in, lr_factor=scale_lr_factor) if add_scaling_layer else nn.Identity() self.linear = nn.Linear(d_in, d_main) self.blocks0 = nn.ModuleList( [make_block(i > 0) for i in range(encoder_n_blocks)] ) # >>> R self.normalization = Normalization(d_main) if mixer_normalization else None self.label_encoder = ( nn.Linear(1, d_main) if n_classes is None else nn.Sequential( nn.Embedding(n_classes, d_main), lib.Lambda(lambda x: x.squeeze(-2)) ) ) self.K = nn.Linear(d_main, d_main) self.T = nn.Sequential( nn.Linear(d_main, d_block), Activation(d_block), nn.Dropout(dropout0), nn.Linear(d_block, d_main, bias=False), ) self.dropout = nn.Dropout(context_dropout) # >>> P self.blocks1 = nn.ModuleList( [make_block(True) for _ in range(predictor_n_blocks)] ) self.head = nn.Sequential( Normalization(d_main), Activation(d_main), nn.Linear(d_main, lib.get_d_out(n_classes)), ) # >>> self.search_index = None self.memory_efficient = memory_efficient self.candidate_encoding_batch_size = candidate_encoding_batch_size self.reset_parameters() def reset_parameters(self): if isinstance(self.label_encoder, nn.Linear): bound = 1 / math.sqrt(2.0) nn.init.uniform_(self.label_encoder.weight, -bound, bound) # type: ignore[code] # noqa: E501 nn.init.uniform_(self.label_encoder.bias, -bound, bound) # type: ignore[code] # noqa: E501 else: assert isinstance(self.label_encoder[0], nn.Embedding) nn.init.uniform_(self.label_encoder[0].weight, -1.0, 1.0) # type: ignore[code] # noqa: E501 def _encode(self, x_: dict[str, Tensor]) -> tuple[Tensor, Tensor]: x_num = x_.get('num') x_bin = x_.get('bin') x_cat = x_.get('cat') del x_ x = [] if x_num is None: # assert self.num_embeddings is None pass # changed to make it easier to use with all-categorical datasets else: x.append( x_num if self.num_embeddings is 
None else self.num_embeddings(x_num).flatten(1) ) if x_bin is not None: x.append(x_bin) if x_cat is None: assert self.one_hot_encoder is None else: assert self.one_hot_encoder is not None x.append(self.one_hot_encoder(x_cat)) assert x x = torch.cat(x, dim=1).float() x = self.scale(x) x = self.linear(x) for block in self.blocks0: x = x + block(x) k = self.K(x if self.normalization is None else self.normalization(x)) return x, k def forward( self, *, x_: dict[str, Tensor], y: Optional[Tensor], idx: Optional[Tensor], candidate_x_: dict[str, Tensor], candidate_y: Tensor, candidate_idx: Tensor, context_size: int, context_idx: Optional[Tensor], is_train: bool, ): # import locally so importing this file doesn't cause problems if faiss is not installed import faiss import faiss.contrib.torch_utils # noqa << this line makes faiss work with PyTorch # >>> E with torch.set_grad_enabled( torch.is_grad_enabled() and not self.memory_efficient ): candidate_k = ( self._encode(candidate_x_)[1] if self.candidate_encoding_batch_size is None else torch.cat( [ self._encode(x)[1] for x in lib.iter_batches( candidate_x_, self.candidate_encoding_batch_size ) ] ) ) x, k = self._encode(x_) if is_train: assert y is not None assert idx is not None if context_idx is None: candidate_k = torch.cat([k, candidate_k]) candidate_y = torch.cat([y, candidate_y]) candidate_idx = torch.cat([idx, candidate_idx]) else: assert y is None assert idx is None # >>> batch_size, d_main = k.shape device = k.device if context_idx is None: with torch.no_grad(): if self.search_index is None: # self.search_index = ( # faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main) # if device.type == 'cuda' # else faiss.IndexFlatL2(d_main) # ) if device.type == 'cpu': self.search_index = faiss.IndexFlatL2(d_main) elif device.type == 'cuda': gpu_index = 0 if device.index is None else device.index cfg = faiss.GpuIndexFlatConfig() cfg.device = gpu_index self.search_index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d_main, cfg) else: raise ValueError() self.search_index.reset() self.search_index.add(candidate_k) # type: ignore[code] distances: Tensor distances, context_idx = self.search_index.search( # type: ignore[code] k, context_size + (1 if is_train else 0) ) assert isinstance(context_idx, Tensor) if is_train: distances[ context_idx == torch.arange(batch_size, device=device)[:, None] ] = torch.inf context_idx = context_idx.gather(-1, distances.argsort()[:, :-1]) # print("context_idx", context_idx) # "absolute" means "not relative", i.e. the original indices in the train set. 
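# A minimal sketch of this index translation (values are made up):
#   candidate_idx = torch.tensor([7, 2, 9, 4])  # original train-set rows of the candidates
#   context_idx = torch.tensor([[1, 3]])        # positions within the candidate pool
#   candidate_idx[context_idx]                  # -> tensor([[2, 4]]), the original train-set rows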
absolute_context_idx = candidate_idx[context_idx] if self.memory_efficient and torch.is_grad_enabled(): assert is_train context_k = self._encode( { ftype: torch.cat([x_[ftype], candidate_x_[ftype]])[ context_idx ].flatten(0, 1) for ftype in x_ } )[1].reshape(batch_size, context_size, -1) else: context_k = candidate_k[context_idx] similarities = ( -k.square().sum(-1, keepdim=True) + (2 * (k[..., None, :] @ context_k.transpose(-1, -2))).squeeze(-2) - context_k.square().sum(-1) ) raw_probs = F.softmax(similarities, dim=-1) probs = self.dropout(raw_probs) context_y_emb: Tensor = self.label_encoder(candidate_y[context_idx][..., None]) values: Tensor = context_y_emb + self.T(k[:, None] - context_k) context_x = (probs[:, None] @ values).squeeze(1) x = x + context_x # >>> for block in self.blocks1: x: Tensor = x + block(x) x: Tensor = self.head(x) return TabrModelContextFreeze.ForwardOutput(x, absolute_context_idx, raw_probs) def zero_wd_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: nn.parameter.Parameter, ): return ( 'label_encoder' in module_name or 'label_encoder' in parameter_name or lib.default_zero_weight_decay_condition( module_name, module, parameter_name, parameter ) ) class TabrLightningContextFreeze(pl.LightningModule): def __init__(self, model, train_dataset, val_dataset, C, n_classes): super().__init__() self.model = model self.dataset = train_dataset self.val_dataset = val_dataset self.C = C if n_classes == 2: self.task_type = "binary" elif n_classes > 2: self.task_type = "multiclass" else: self.task_type = "regression" ls_eps = self.C.get('ls_eps', 0.0) print(f'{ls_eps=}') self.loss_fn = ( partial(bce_with_logits_and_label_smoothing, ls_eps=ls_eps) if self.task_type == "binary" else partial(F.cross_entropy, label_smoothing=ls_eps) if self.task_type == "multiclass" else F.mse_loss ) # Define metrics for binary and multiclass classification if self.task_type in ["binary", "multiclass"]: self.train_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.train_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.train_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.train_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) self.val_accuracy = Accuracy(task=self.task_type, num_classes=n_classes) self.val_precision = Precision(average='macro', num_classes=n_classes, task=self.task_type) self.val_recall = Recall(average='macro', num_classes=n_classes, task=self.task_type) self.val_f1_score = F1Score(average='macro', num_classes=n_classes, task=self.task_type) # Define metrics for regression elif self.task_type == "regression": self.train_mse = MeanSquaredError() self.val_mse = MeanSquaredError() self.train_mae = MeanAbsoluteError() self.val_mae = MeanAbsoluteError() self.frozen_contexts = None def setup(self, stage=None): self.train_size = len(self.dataset) self.train_indices = torch.arange(self.train_size, device=self.device) # move the dataset to the device # I think that's what tabr does, but # we could also keep it on the cpu for key in self.dataset.data: if self.dataset.data[key] is not None: self.dataset.data[key] = self.dataset.data[key].to(self.device) for key in self.val_dataset.data: if self.val_dataset.data[key] is not None: self.val_dataset.data[key] = self.val_dataset.data[key].to(self.device) def get_Xy(self, part: str, idx) -> tuple[dict[str, Tensor], Tensor]: if self.val_dataset.data['Y'].get_device() == -1: # is still on CPU 
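# Tensor.get_device() returns -1 for CPU tensors, so this lazily (re-)runs setup() to move the datasets onto the current device before indexing them.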
self.setup() if part == "train": dataset = self.dataset elif part == "val": dataset = self.val_dataset batch = ( { key[2:]: dataset.data[key] for key in dataset.data if key.startswith('X_') }, dataset.data["Y"], ) return ( batch if idx is None else ({k: v[idx] for k, v in batch[0].items()}, batch[1][idx]) ) def apply_model(self, part, batch, batch_idx, training): # batch should contain dictionaries with keys # "x_num", "x_bin", "x_cat", "y" and "indices" batch_indices = batch["indices"].to(self.device) # batch_idx is the id of the batch itself # batch_indices contains the ids of the samples in the batch x, y = self.get_Xy(part, batch_indices) is_train = part == 'train' if training and self.frozen_contexts is not None: candidate_indices, context_idx = self.frozen_contexts[batch_indices].unique( return_inverse=True ) else: # Importantly, `training`, not `is_train`, should be used to choose the queue candidate_indices = self.train_indices context_idx = None if is_train: # This is not done when there are frozen contexts, because they are # already valid. candidate_indices = candidate_indices[ ~torch.isin(candidate_indices, batch_indices) ] candidate_x, candidate_y = self.get_Xy( 'train', candidate_indices, # TODO check ) fwd_out = self.model( x_=x, y=y if is_train else None, idx=batch_indices if is_train else None, candidate_x_=candidate_x, candidate_y=candidate_y, candidate_idx=candidate_indices, context_idx=context_idx, context_size=self.C["context_size"], is_train=is_train, ) return fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)), y def training_step(self, batch, batch_idx): if batch_idx == 0 and self.current_epoch == self.C["freeze_contexts_after_n_epochs"]: # freeze the contexts print(f'Freezing contexts after {self.current_epoch} epochs', flush=True) # Get context_ids using evaluate? _, _, context_idx, _, _ = self.evaluate(self.C["eval_batch_size"], progress_bar=True # TODO ) self.frozen_contexts = torch.tensor(context_idx['train'], device=self.device) # # batch should contain dictionaries with keys # # "x_num", "x_bin", "x_cat", "y" and "indices" # batch_indices = batch["indices"] # batch_idx is the id of the batch itself # # batch_indices contains the ids of the samples in the batch # x, y = self.get_Xy('train', batch_indices) # if self.frozen_contexts is not None: # candidate_indices, context_idx = self.frozen_contexts[batch_indices].unique( # return_inverse=True # ) # else: # context_idx = None # # we're in training mode # # Remove the training batch from the candidates # # This is not done when there are frozen contexts, because they are # # already valid.
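# (On the frozen-context path in apply_model above: `frozen_contexts[batch_indices]` holds absolute train-set rows, and unique(return_inverse=True) splits them into a deduplicated candidate list plus per-sample positions into it. A sketch with made-up values:
#   frozen = torch.tensor([[7, 2], [2, 9]])
#   cand, ctx = frozen.unique(return_inverse=True)  # cand=[2, 7, 9], ctx=[[1, 0], [0, 2]]
# `cand` is then passed as the candidates and `ctx` as context_idx.)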
# candidate_indices = self.train_indices[~torch.isin(self.train_indices, batch_indices)] # candidate_x, candidate_y = self.get_Xy('train', candidate_indices) #TODO check # fwd_out = self.model( # x_=x, # y=y, # idx=batch_indices, # candidate_x_=candidate_x, # candidate_y=candidate_y, # candidate_idx=candidate_indices, # context_idx=context_idx, # context_size=self.C["context_size"], # is_train=True # ) # fwd_out = fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)) fwd_out, y = self.apply_model("train", batch, batch_idx, training=True) output, _, _ = fwd_out y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) # Log the loss and return it self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True) if self.task_type in ["binary", "multiclass"]: self.train_accuracy.update(output, y) self.train_precision.update(output, y) self.train_recall.update(output, y) self.train_f1_score.update(output, y) self.log('train_accuracy', self.train_accuracy, on_epoch=True, prog_bar=True) self.log('train_precision', self.train_precision, on_epoch=True) self.log('train_recall', self.train_recall, on_epoch=True) self.log('train_f1_score', self.train_f1_score, on_epoch=True) elif self.task_type == "regression": self.train_mse.update(output, y) self.train_mae.update(output, y) self.log('train_mse', self.train_mse, on_epoch=True) self.log('train_mae', self.train_mae, on_epoch=True, prog_bar=True) return loss def validation_step(self, batch, batch_idx): if batch_idx == 0: print(f'Validation in epoch {self.current_epoch}', flush=True) # print(f'Validation step', flush=True) # TODO: do like test to save gpu memory? # batch_indices = batch["indices"] # batch_idx is the idxs of the batch samples # x, y = self.get_Xy("val", batch_indices) # if self.frozen_contexts is not None: # candidate_indices, context_idx = self.frozen_contexts[batch_indices].unique( # return_inverse=True # ) # else: # context_idx = None # candidate_indices = self.train_indices # candidate_x, candidate_y = self.get_Xy('train', candidate_indices) # fwd_out = self.model( # x_=x, # y=None, # idx=None, # candidate_x_=candidate_x, # candidate_y=candidate_y, # candidate_idx=candidate_indices, # context_idx=context_idx, # context_size=self.C["context_size"], # is_train=False # ) # fwd_out = fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)) fwd_out, y = self.apply_model("val", batch, batch_idx, training=False) output, _, _ = fwd_out y = y.float() if self.task_type == "regression" else y.long() # binary cross entropy with logits needs float loss = self.loss_fn(output, y.float() \ if self.task_type == "binary" \ else y) self.log('val_loss', loss, on_epoch=True, prog_bar=True) # Log validation loss if self.task_type in ["binary", "multiclass"]: self.val_accuracy.update(output, y) self.val_precision.update(output, y) self.val_recall.update(output, y) self.val_f1_score.update(output, y) self.log('val_accuracy', self.val_accuracy, on_epoch=True, prog_bar=True) self.log('val_precision', self.val_precision, on_epoch=True) self.log('val_recall', self.val_recall, on_epoch=True) self.log('val_f1_score', self.val_f1_score, on_epoch=True) elif self.task_type == "regression": self.val_mse.update(output, y) self.log('val_mse', self.val_mse, on_epoch=True) self.val_mae.update(output, y) self.log('val_mae', self.val_mae, on_epoch=True, prog_bar=True) return loss def predict_step(self, batch, batch_idx, dataloader_idx=None): # here
batch shouldn't contain indices nor y # TODO: use apply_model x = { key[2:]: batch[key] for key in batch if key.startswith('X_') } context_idx = None candidate_indices = self.train_indices candidate_x, candidate_y = self.get_Xy('train', candidate_indices) fwd_out = self.model( x_=x, y=None, idx=None, candidate_x_=candidate_x, candidate_y=candidate_y, candidate_idx=candidate_indices, context_idx=context_idx, context_size=self.C["context_size"], is_train=False ) fwd_out = fwd_out._replace(y_pred=fwd_out.y_pred.squeeze(-1)) # fwd_out, y = self.apply_model("test", batch, batch_idx, training=False) output, _, _ = fwd_out # in binary case, we need to convert it to 2-class logits if self.task_type == "binary": # it will be passed to a softmax, so we need to add a 0 # to make the probabilities right output = torch.stack([torch.zeros_like(output), output], dim=1) elif self.task_type == "regression": output = output.unsqueeze(1) return output # here we only use it to get context_idx for the frozen contexts # so we only need to do it on train @torch.inference_mode() def evaluate(self, eval_batch_size: int, *, progress_bar: bool = False): self.eval() predictions = {} context_idx = {} context_probs = {} while eval_batch_size: try: # fwd_out = [] # for idx in tqdm( # torch.arange(len(self.dataset), device=self.device).split( # eval_batch_size # ), # desc=f'Evaluation ("train"))', # disable=not progress_bar, # ): # batch = { # key: self.dataset.data[key][idx] # for key in self.dataset.data # } # x = { # key[2:]: batch[key] # for key in batch # if key.startswith('X_') # } # #TODO check # fwd_out.append( # self.model( # x_=x, # y=None, # idx=None, # candidate_x_=x, # candidate_y=batch['Y'], # candidate_idx=idx, # context_idx=None, # context_size=self.C["context_size"], # is_train=False # ) # ) fwd_out = lib.cat( [ self.apply_model("train", batch, batch_idx, training=False)[0] for batch_idx, batch in enumerate( DataLoader( self.dataset, batch_size=eval_batch_size, shuffle=False ) ) ] ) # fwd_out = lib.cat(fwd_out) predictions["train"], context_idx["train"], context_probs["train"] = ( e.cpu().numpy() for e in fwd_out ) except RuntimeError as err: if not lib.is_oom_exception(err): raise eval_batch_size //= 2 print(f'eval_batch_size = {eval_batch_size}') else: break if not eval_batch_size: raise RuntimeError('Not enough memory even for eval_batch_size=1') metrics = None self.train() return metrics, predictions, context_idx, context_probs, eval_batch_size def configure_optimizers(self): optimizer_config = self.C["optimizer"].copy() optimizer = lib.make_optimizer( self.model, **optimizer_config, zero_weight_decay_condition=zero_wd_condition ) return optimizer def train_dataloader(self): return DataLoader(self.dataset, batch_size=self.C["batch_size"], shuffle=True, num_workers=max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=True) def val_dataloader(self): return DataLoader(self.val_dataset, batch_size=self.C["eval_batch_size"], shuffle=False, num_workers=max(1, min(self.C["n_threads"] - 1, 8)), persistent_workers=True) ================================================ FILE: pytabkit/models/nn_models/tabr_lib.py ================================================ import math import inspect import warnings import dataclasses from typing import Any, Callable, Optional, Union, cast, Iterator, Iterable, List, TypeVar import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch import Tensor from torch.nn.parameter import Parameter # we copied this file
from https://github.com/yandex-research/tabular-dl-tabr/blob/main/lib/deep.py # to limit the number of dependencies # ====================================================================================== # >>> modules <<< # ====================================================================================== # When an instance of ModuleSpec is a dict, # it must contain the key "type" with a string value ModuleSpec = Union[str, dict[str, Any], Callable[..., nn.Module]] T = TypeVar('T') def _initialize_embeddings(weight: Tensor, d: Optional[int]) -> None: if d is None: d = weight.shape[-1] d_sqrt_inv = 1 / math.sqrt(d) nn.init.uniform_(weight, a=-d_sqrt_inv, b=d_sqrt_inv) def make_trainable_vector(d: int) -> Parameter: x = torch.empty(d) _initialize_embeddings(x, None) return Parameter(x) # class OneHotEncoder(nn.Module): # cardinalities: Tensor # def __init__(self, cardinalities: list[int]) -> None: # # cardinalities[i]`` is the number of unique values for the i-th categorical feature. # super().__init__() # self.register_buffer('cardinalities', torch.tensor(cardinalities)) # def forward(self, x: Tensor) -> Tensor: # encoded_columns = [ # F.one_hot(x[..., column], cardinality) # for column, cardinality in zip(range(x.shape[-1]), self.cardinalities) # ] # return torch.cat(encoded_columns, -1) # This is modified to allow to encode unknown categories with zeros class OneHotEncoder(nn.Module): cardinalities: torch.Tensor def __init__(self, cardinalities: list[int]) -> None: super().__init__() self.register_buffer('cardinalities', torch.tensor(cardinalities)) def forward(self, x: torch.Tensor) -> torch.Tensor: encoded_columns = [] for column, cardinality in enumerate(self.cardinalities): column_values = x[..., column] # Replace -1 with a temporary valid index (e.g., 0) temp_index = torch.where(column_values == -1, 0, column_values) # Perform one-hot encoding one_hot = F.one_hot(temp_index, cardinality) # Zero out the vectors where original value was -1 mask = column_values == -1 one_hot[mask] = 0 encoded_columns.append(one_hot) return torch.cat(encoded_columns, -1) class CLSEmbedding(nn.Module): def __init__(self, d_embedding: int) -> None: super().__init__() self.weight = make_trainable_vector(d_embedding) def forward(self, x: Tensor) -> Tensor: assert x.ndim == 3 assert x.shape[-1] == len(self.weight) return torch.cat([self.weight.expand(len(x), 1, -1), x], dim=1) class CatEmbeddings(nn.Module): def __init__( self, _cardinalities_and_maybe_dimensions: Union[list[int], list[tuple[int, int]]], d_embedding: Optional[int] = None, *, stack: bool = False, ) -> None: assert _cardinalities_and_maybe_dimensions spec = _cardinalities_and_maybe_dimensions if not ( (isinstance(spec[0], tuple) and d_embedding is None) or (isinstance(spec[0], int) and d_embedding is not None) ): raise ValueError( 'Invalid arguments. 
Valid combinations are:' ' (1) the first argument is a list of (cardinality, embedding)-tuples AND d_embedding is None' ' (2) the first argument is a list of cardinalities AND d_embedding is an integer' ) if stack and d_embedding is None: raise ValueError('stack can be True only when d_embedding is not None') super().__init__() spec_ = cast( list[tuple[int, int]], spec if d_embedding is None else [(x, d_embedding) for x in spec], ) self._embeddings = nn.ModuleList() for cardinality, d_embedding in spec_: self._embeddings.append(nn.Embedding(cardinality, d_embedding)) self.stack = stack self.reset_parameters() def reset_parameters(self) -> None: for module in self._embeddings: _initialize_embeddings(module.weight, None) # type: ignore[code] def forward(self, x: Tensor) -> Tensor: assert x.ndim == 2 assert x.shape[1] == len(self._embeddings) out = [module(column) for module, column in zip(self._embeddings, x.T)] return torch.stack(out, dim=1) if self.stack else torch.cat(out, dim=1) class LinearEmbeddings(nn.Module): def __init__(self, n_features: int, d_embedding: int, bias: bool = True): super().__init__() self.weight = Parameter(Tensor(n_features, d_embedding)) self.bias = Parameter(Tensor(n_features, d_embedding)) if bias else None self.reset_parameters() def reset_parameters(self) -> None: for parameter in [self.weight, self.bias]: if parameter is not None: _initialize_embeddings(parameter, parameter.shape[-1]) def forward(self, x: Tensor) -> Tensor: assert x.ndim == 2 x = self.weight[None] * x[..., None] if self.bias is not None: x = x + self.bias[None] return x class PeriodicEmbeddings(nn.Module): def __init__( self, n_features: int, n_frequencies: int, frequency_scale: float ) -> None: super().__init__() self.frequencies = Parameter( torch.normal(0.0, frequency_scale, (n_features, n_frequencies)) ) def forward(self, x: Tensor) -> Tensor: assert x.ndim == 2 x = 2 * torch.pi * self.frequencies[None] * x[..., None] x = torch.cat([torch.cos(x), torch.sin(x)], -1) return x class NLinear(nn.Module): def __init__( self, n_features: int, d_in: int, d_out: int, bias: bool = True ) -> None: super().__init__() self.weight = Parameter(Tensor(n_features, d_in, d_out)) self.bias = Parameter(Tensor(n_features, d_out)) if bias else None with torch.no_grad(): for i in range(n_features): layer = nn.Linear(d_in, d_out) self.weight[i] = layer.weight.T if self.bias is not None: self.bias[i] = layer.bias def forward(self, x): assert x.ndim == 3 x = x[..., None] * self.weight[None] x = x.sum(-2) if self.bias is not None: x = x + self.bias[None] return x class LREmbeddings(nn.Sequential): """The LR embeddings from the paper 'On Embeddings for Numerical Features in Tabular Deep Learning'.""" # noqa: E501 def __init__(self, n_features: int, d_embedding: int) -> None: super().__init__(LinearEmbeddings(n_features, d_embedding), nn.ReLU()) class PLREmbeddings(nn.Sequential): """The PLR embeddings from the paper 'On Embeddings for Numerical Features in Tabular Deep Learning'. Additionally, the 'lite' option is added. Setting it to `False` gives you the original PLR embedding from the above paper. We noticed that `lite=True` makes the embeddings noticeably more lightweight without critical performance loss, and we used that for our model. 
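A shape sketch (illustrative, not part of the original code):

    emb = PLREmbeddings(n_features=3, n_frequencies=16, frequency_scale=0.1,
                        d_embedding=8, lite=True)
    x = torch.randn(32, 3)  # (batch, n_features)
    emb(x).shape            # torch.Size([32, 3, 8]) = (batch, n_features, d_embedding)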
""" # noqa: E501 def __init__( self, n_features: int, n_frequencies: int, frequency_scale: float, d_embedding: int, lite: bool, ) -> None: super().__init__( PeriodicEmbeddings(n_features, n_frequencies, frequency_scale), ( nn.Linear(2 * n_frequencies, d_embedding) if lite else NLinear(n_features, 2 * n_frequencies, d_embedding) ), nn.ReLU(), ) class PBLDEmbeddings(nn.Module): def __init__(self, n_features: int, n_frequencies: int, frequency_scale: float, d_embedding: int, plr_act_name: str = 'linear', plr_use_densenet: bool = True): super().__init__() print(f'Constructing PBLD embeddings') hidden_2 = d_embedding-1 if plr_use_densenet else d_embedding self.weight_1 = nn.Parameter(frequency_scale * torch.randn(n_features, 1, n_frequencies)) self.weight_2 = nn.Parameter((-1 + 2 * torch.rand(n_features, n_frequencies, hidden_2)) / np.sqrt(n_frequencies)) self.bias_1 = nn.Parameter(np.pi * (-1 + 2 * torch.rand(n_features, 1, n_frequencies))) self.bias_2 = nn.Parameter((-1 + 2 * torch.rand(n_features, 1, hidden_2)) / np.sqrt(n_frequencies)) self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet def forward(self, x): # transpose to treat the continuous feature dimension like a batched dimension # then add a new channel dimension # shape will be (vectorized..., n_cont, batch, 1) x_orig = x x = x.transpose(-1, -2).unsqueeze(-1) x = 2 * torch.pi * x.matmul(self.weight_1) # matmul is automatically batched x = x + self.bias_1 # x = torch.sin(x) x = torch.cos(x) x = x.matmul(self.weight_2) # matmul is automatically batched x = x + self.bias_2 if self.plr_act_name == 'relu': x = torch.relu(x) elif self.plr_act_name == 'linear': pass else: raise ValueError(f'Unknown plr_act_name "{self.plr_act_name}"') # bring back n_cont dimension after n_batch # then flatten the last two dimensions x = x.transpose(-2, -3) x = x.reshape(*x.shape[:-2], x.shape[-2] * x.shape[-1]) if self.plr_use_densenet: x = torch.cat([x, x_orig], dim=-1) return x class MLP(nn.Module): class Block(nn.Module): def __init__( self, *, d_in: int, d_out: int, bias: bool, activation: str, dropout: float, ) -> None: super().__init__() self.linear = nn.Linear(d_in, d_out, bias) self.activation = make_module(activation) self.dropout = nn.Dropout(dropout) def forward(self, x: Tensor) -> Tensor: return self.dropout(self.activation(self.linear(x))) Head = nn.Linear def __init__( self, *, d_in: int, d_out: Optional[int], n_blocks: int, d_layer: int, activation: str, dropout: float, ) -> None: assert n_blocks > 0 super().__init__() self.blocks = nn.Sequential( *[ MLP.Block( d_in=d_layer if block_i else d_in, d_out=d_layer, bias=True, activation=activation, dropout=dropout, ) for block_i in range(n_blocks) ] ) self.head = None if d_out is None else MLP.Head(d_layer, d_out) @property def d_out(self) -> int: return ( self.blocks[-1].linear.out_features # type: ignore[code] if self.head is None else self.head.out_features ) def forward(self, x: Tensor) -> Tensor: x = self.blocks(x) if self.head is not None: x = self.head(x) return x _CUSTOM_MODULES = { x.__name__: x for x in [ LinearEmbeddings, LREmbeddings, PLREmbeddings, PBLDEmbeddings, MLP, ] } def register_module(key: str, f: Callable[..., nn.Module]) -> None: assert key not in _CUSTOM_MODULES _CUSTOM_MODULES[key] = f def make_module(spec: ModuleSpec, *args, **kwargs) -> nn.Module: """ >>> make_module('ReLU') >>> make_module(nn.ReLU) >>> make_module('Linear', 1, out_features=2) >>> make_module((lambda *args: nn.Linear(*args)), 1, out_features=2) >>> make_module({'type': 'Linear', 
'in_features': 1}, out_features=2) """ if isinstance(spec, str): Module = getattr(nn, spec, None) if Module is None: Module = _CUSTOM_MODULES[spec] else: assert spec not in _CUSTOM_MODULES return make_module(Module, *args, **kwargs) elif isinstance(spec, dict): assert not (set(spec) & set(kwargs)) spec = spec.copy() return make_module(spec.pop('type'), *args, **spec, **kwargs) elif callable(spec): return spec(*args, **kwargs) else: raise ValueError() def get_n_parameters(m: nn.Module): return sum(x.numel() for x in m.parameters() if x.requires_grad) def get_d_out(n_classes: Optional[int]) -> int: return 1 if n_classes is None or n_classes == 2 else n_classes # ====================================================================================== # >>> optimization <<< # ====================================================================================== def default_zero_weight_decay_condition( module_name: str, module: nn.Module, parameter_name: str, parameter: Parameter ): del module_name, parameter return parameter_name.endswith('bias') or isinstance( module, ( nn.BatchNorm1d, nn.LayerNorm, nn.InstanceNorm1d, LinearEmbeddings, PeriodicEmbeddings, ), ) def make_parameter_groups( model: nn.Module, zero_weight_decay_condition, custom_groups: dict[tuple[str], dict], # [(fullnames, options), ...] ) -> list[dict[str, Any]]: custom_fullnames = set() custom_fullnames.update(*custom_groups) assert sum(map(len, custom_groups)) == len( custom_fullnames ), 'Custom parameter groups must not intersect' parameters_info = {} # fullname -> (parameter, needs_wd) for module_name, module in model.named_modules(): for name, parameter in module.named_parameters(): fullname = f'{module_name}.{name}' if module_name else name parameters_info.setdefault(fullname, (parameter, []))[1].append( not zero_weight_decay_condition(module_name, module, name, parameter) ) parameters_info = {k: (v[0], all(v[1])) for k, v in parameters_info.items()} params_with_wd = {'params': []} params_without_wd = {'params': [], 'weight_decay': 0.0} custom_params = {k: {'params': []} | v for k, v in custom_groups.items()} for fullname, (parameter, needs_wd) in parameters_info.items(): for fullnames, group in custom_params.items(): if fullname in fullnames: custom_fullnames.remove(fullname) group['params'].append(parameter) break else: (params_with_wd if needs_wd else params_without_wd)['params'].append(parameter) assert ( not custom_fullnames ), f'Some of the custom parameters were not found in the model: {custom_fullnames}' return [params_with_wd, params_without_wd] + list(custom_params.values()) def make_optimizer( module: nn.Module, type: str, *, zero_weight_decay_condition=default_zero_weight_decay_condition, custom_parameter_groups: Optional[dict[tuple[str], dict]] = None, **optimizer_kwargs, ) -> torch.optim.Optimizer: if custom_parameter_groups is None: custom_parameter_groups = {} Optimizer = getattr(optim, type) parameter_groups = make_parameter_groups( module, zero_weight_decay_condition, custom_parameter_groups ) print(f'{optimizer_kwargs=}') return Optimizer(parameter_groups, **optimizer_kwargs) def get_lr(optimizer: optim.Optimizer) -> float: return next(iter(optimizer.param_groups))['lr'] def set_lr(optimizer: optim.Optimizer, lr: float) -> None: for group in optimizer.param_groups: group['lr'] = lr ## We also package useful delu functions to limit the number of dependencies # copied from https://github.com/Yura52/delu/blob/5f0015cbdff86f64aff8199123012a9663538fcf/delu/nn.py class Lambda(torch.nn.Module): """A wrapper for
functions from `torch` and methods of `torch.Tensor`. An important "feature" of this module is that it is intentionally limited: - Only the functions from the `torch` module and the methods of `torch.Tensor` are allowed. - The passed callable must accept a single `torch.Tensor` and return a single `torch.Tensor`. - The allowed keyword arguments must be of simple types (see the docstring). **Usage** >>> m = delu.nn.Lambda(torch.squeeze) >>> m(torch.randn(2, 1, 3, 1)).shape torch.Size([2, 3]) >>> m = delu.nn.Lambda(torch.squeeze, dim=1) >>> m(torch.randn(2, 1, 3, 1)).shape torch.Size([2, 3, 1]) >>> m = delu.nn.Lambda(torch.Tensor.abs_) >>> m(torch.tensor(-1.0)) tensor(1.) Custom functions are not allowed (technically, they are **temporarily** allowed, but this functionality is deprecated and will be removed in future releases): >>> # xdoctest: +SKIP >>> m = delu.nn.Lambda(lambda x: torch.abs(x)) Traceback (most recent call last): ... ValueError: fn must be a function from `torch` or a method of `torch.Tensor`, but ... Non-trivial keyword arguments are not allowed: >>> m = delu.nn.Lambda(torch.mul, other=torch.tensor(2.0)) Traceback (most recent call last): ... ValueError: For kwargs, the allowed value types include: ... """ # noqa: E501 def __init__(self, fn: Callable[..., torch.Tensor], /, **kwargs) -> None: """ Args: fn: the callable. kwargs: the keyword arguments for ``fn``. The allowed values types include: None, bool, int, float, bytes, str and (nested) tuples of these simple types. """ super().__init__() if not callable(fn) or ( fn not in vars(torch).values() and ( fn not in (member for _, member in inspect.getmembers(torch.Tensor)) or inspect.ismethod(fn) # Check if fn is a @classmethod ) ): warnings.warn( 'Passing custom functions to delu.nn.Lambda is deprecated' ' and will be removed in future releases.' ' Only functions from the `torch` module and methods of `torch.Tensor`' ' are allowed', DeprecationWarning, ) # NOTE: in future releases, replace the above warning with this exception: # raise ValueError( # 'fn must be a function from `torch` or a method of `torch.Tensor`,' # f' but this is not true for the passed {fn=}' # ) def is_valid_value(x): return ( x is None or isinstance(x, (bool, int, float, bytes, str)) or isinstance(x, tuple) and all(map(is_valid_value, x)) ) for k, v in kwargs.items(): if not is_valid_value(v): raise ValueError( 'For kwargs, the allowed value types include:' ' None, bool, int, float, bytes, str and (nested) tuples containing' ' values of these simple types. 
This is not true for the passed' f' argument {k} with the value {v}' ) self._function = fn self._function_kwargs = kwargs def forward(self, x: torch.Tensor) -> torch.Tensor: """Do the forward pass.""" return self._function(x, **self._function_kwargs) # copied from https://github.com/Yura52/delu/blob/5f0015cbdff86f64aff8199123012a9663538fcf/delu/_tensor_ops.py#L339 def _make_index_batches( x: torch.Tensor, batch_size: int, shuffle: bool, generator: Optional[torch.Generator], drop_last: bool, ) -> Iterable[torch.Tensor]: size = len(x) if not size: raise ValueError('data must not contain empty tensors') batch_indices = ( torch.randperm(size, generator=generator, device=x.device) if shuffle else torch.arange(size, device=x.device) ).split(batch_size) return ( batch_indices[:-1] if batch_indices and drop_last and len(batch_indices[-1]) < batch_size else batch_indices ) def iter_batches( data: T, /, batch_size: int, *, shuffle: bool = False, generator: Optional[torch.Generator] = None, drop_last: bool = False, ) -> Iterator[T]: """Iterate over a tensor or a collection of tensors by (random) batches. The function makes batches along the first dimension of the tensors in ``data``. TL;DR (assuming that ``X`` and ``Y`` denote full tensors and ``xi`` and ``yi`` denote batches): - ``delu.iter_batches: X -> [x1, x2, ..., xN]`` - ``delu.iter_batches: (X, Y) -> [(x1, y1), (x2, y2), ..., (xN, yN)]`` - ``delu.iter_batches: {'x': X, 'y': Y} -> [{'x': x1, 'y': y1}, ...]`` - Same for named tuples. - Same for dataclasses. .. note:: `delu.iter_batches` is significantly faster for in-memory tensors than `torch.utils.data.DataLoader`, because, when building batches, it uses batched indexing instead of one-by-one indexing. **Usage** >>> X = torch.randn(12, 32) >>> Y = torch.randn(12) `delu.iter_batches` can be applied to tensors: >>> for x in delu.iter_batches(X, batch_size=5): ... print(len(x)) 5 5 2 `delu.iter_batches` can be applied to tuples: >>> # shuffle=True can be useful for training. >>> dataset = (X, Y) >>> for x, y in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(len(x), len(y)) 5 5 5 5 2 2 >>> # Drop the last incomplete batch. >>> for x, y in delu.iter_batches( ... dataset, batch_size=5, shuffle=True, drop_last=True ... ): ... print(len(x), len(y)) 5 5 5 5 >>> # The last batch is complete, so drop_last=True does not have any effect. >>> batches = [] >>> for x, y in delu.iter_batches(dataset, batch_size=6, drop_last=True): ... print(len(x), len(y)) ... batches.append((x, y)) 6 6 6 6 By default, ``shuffle`` is set to `False`, i.e. the order of items is preserved: >>> X2, Y2 = delu.cat(list(delu.iter_batches((X, Y), batch_size=5))) >>> print((X == X2).all().item(), (Y == Y2).all().item()) True True `delu.iter_batches` can be applied to dictionaries: >>> dataset = {'x': X, 'y': Y} >>> for batch in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(isinstance(batch, dict), len(batch['x']), len(batch['y'])) True 5 5 True 5 5 True 2 2 `delu.iter_batches` can be applied to named tuples: >>> from typing import NamedTuple >>> class Data(NamedTuple): ... x: torch.Tensor ... y: torch.Tensor >>> dataset = Data(X, Y) >>> for batch in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(isinstance(batch, Data), len(batch.x), len(batch.y)) True 5 5 True 5 5 True 2 2 `delu.iter_batches` can be applied to dataclasses: >>> from dataclasses import dataclass >>> @dataclass ... class Data: ... x: torch.Tensor ... 
y: torch.Tensor >>> dataset = Data(X, Y) >>> for batch in delu.iter_batches(dataset, batch_size=5, shuffle=True): ... print(isinstance(batch, Data), len(batch.x), len(batch.y)) True 5 5 True 5 5 True 2 2 Args: data: the tensor or the non-empty collection of tensors. If data is a collection, then the tensors must be of the same size along the first dimension. batch_size: the batch size. If ``drop_last`` is False, then the last batch can be smaller than ``batch_size``. shuffle: if True, iterate over random batches (without replacement), not sequentially. generator: when ``shuffle`` is True, passing ``generator`` makes the function reproducible. drop_last: when ``True`` and the last batch is smaller then ``batch_size``, then this last batch is not returned (in other words, same as the ``drop_last`` argument for `torch.utils.data.DataLoader`). Returns: the iterator over batches. """ if not shuffle and generator is not None: raise ValueError('When shuffle is False, generator must be None.') constructor: Callable[[Any], T] args = (batch_size, shuffle, generator, drop_last) if isinstance(data, torch.Tensor): item = data for idx in _make_index_batches(item, *args): yield data[idx] # type: ignore elif isinstance(data, tuple): if not data: raise ValueError('data must be non-empty') item = data[0] for x in data: if not isinstance(x, torch.Tensor) or len(x) != len(item): raise ValueError( 'If data is a tuple, it must contain only tensors,' ' and they must have the same first dimension' ) constructor = type(data) # type: ignore constructor = getattr(constructor, '_make', constructor) # Handle named tuples. for idx in _make_index_batches(item, *args): yield constructor(x[idx] for x in data) elif isinstance(data, dict): if not data: raise ValueError('data must be non-empty') item = next(iter(data.values())) for x in data.values(): if not isinstance(x, torch.Tensor) or len(x) != len(item): raise ValueError( 'If data is a dict, it must contain only tensors,' ' and they must have the same first dimension' ) constructor = type(data) # type: ignore for idx in _make_index_batches(item, *args): yield constructor((k, v[idx]) for k, v in data.items()) elif dataclasses.is_dataclass(data): fields = list(dataclasses.fields(data)) if not fields: raise ValueError('data must be non-empty') item = getattr(data, fields[0].name) for field in fields: if field.type is not torch.Tensor: raise ValueError('All dataclass fields must be tensors.') if len(getattr(data, field.name)) != len(item): raise ValueError( 'All dataclass tensors must have the same first dimension.' ) constructor = type(data) # type: ignore for idx in _make_index_batches(item, *args): yield constructor( **{field.name: getattr(data, field.name)[idx] for field in fields} # type: ignore ) else: raise ValueError(f'The collection {type(data)} is not supported.') def cat(data: List[T], /, dim: int = 0) -> T: """Concatenate a sequence of collections of tensors. `delu.cat` is a generalized version of `torch.cat` for concatenating not only tensors, but also (nested) collections of tensors. **Usage** Let's see how a sequence of model outputs for batches can be concatenated into a output tuple for the whole dataset: >>> from torch.utils.data import DataLoader, TensorDataset >>> dataset = TensorDataset(torch.randn(320, 24)) >>> batch_size = 32 >>> >>> # The model returns not only predictions, but also embeddings. >>> def model(x_batch): ... # A dummy forward pass. ... embeddings_batch = torch.randn(batch_size, 16) ... y_pred_batch = torch.randn(batch_size) ... 
return (y_pred_batch, embeddings_batch) ... >>> y_pred, embeddings = delu.cat( ... [model(batch) for batch in DataLoader(dataset, batch_size, shuffle=True)] ... ) >>> len(y_pred) == len(dataset) True >>> len(embeddings) == len(dataset) True The same works for dictionaries: >>> def model(x_batch): ... return { ... 'y_pred': torch.randn(batch_size), ... 'embeddings': torch.randn(batch_size, 16) ... } ... >>> outputs = delu.cat( ... [model(batch) for batch in DataLoader(dataset, batch_size, shuffle=True)] ... ) >>> len(outputs['y_pred']) == len(dataset) True >>> len(outputs['embeddings']) == len(dataset) True The same works for sequences of named tuples, dataclasses, tensors and nested combinations of all mentioned collection types. *Below, additional technical examples are provided.* The common setup: >>> # First batch. >>> x1 = torch.randn(64, 10) >>> y1 = torch.randn(64) >>> # Second batch. >>> x2 = torch.randn(64, 10) >>> y2 = torch.randn(64) >>> # The last (incomplete) batch. >>> x3 = torch.randn(7, 10) >>> y3 = torch.randn(7) >>> total_size = len(x1) + len(x2) + len(x3) `delu.cat` can be applied to tuples: >>> batches = [(x1, y1), (x2, y2), (x3, y3)] >>> X, Y = delu.cat(batches) >>> len(X) == total_size and len(Y) == total_size True `delu.cat` can be applied to dictionaries: >>> batches = [ ... {'x': x1, 'y': y1}, ... {'x': x2, 'y': y2}, ... {'x': x3, 'y': y3}, ... ] >>> result = delu.cat(batches) >>> isinstance(result, dict) True >>> len(result['x']) == total_size and len(result['y']) == total_size True `delu.cat` can be applied to named tuples: >>> from typing import NamedTuple >>> class Data(NamedTuple): ... x: torch.Tensor ... y: torch.Tensor ... >>> batches = [Data(x1, y1), Data(x2, y2), Data(x3, y3)] >>> result = delu.cat(batches) >>> isinstance(result, Data) True >>> len(result.x) == total_size and len(result.y) == total_size True `delu.cat` can be applied to dataclasses: >>> from dataclasses import dataclass >>> @dataclass ... class Data: ... x: torch.Tensor ... y: torch.Tensor ... >>> batches = [Data(x1, y1), Data(x2, y2), Data(x3, y3)] >>> result = delu.cat(batches) >>> isinstance(result, Data) True >>> len(result.x) == total_size and len(result.y) == total_size True `delu.cat` can be applied to nested collections: >>> batches = [ ... (x1, {'a': {'b': y1}}), ... (x2, {'a': {'b': y2}}), ... (x3, {'a': {'b': y3}}), ... ] >>> X, Y_nested = delu.cat(batches) >>> len(X) == total_size and len(Y_nested['a']['b']) == total_size True **Lists are not supported:** >>> # This does not work. Instead, use tuples. >>> # batches = [[x1, y1], [x2, y2], [x3, y3]] >>> # delu.cat(batches) # Error Args: data: the list of collections of tensors. All items of the list must be of the same type, structure and layout, only the ``dim`` dimension can vary (same as for `torch.cat`). All the "leaf" values must be of the type `torch.Tensor`. dim: the dimension along which the tensors are concatenated. Returns: The concatenated items of the list. """ if not isinstance(data, list): raise ValueError('The input must be a list') if not data: raise ValueError('The input must be non-empty') first = data[0] if isinstance(first, torch.Tensor): return torch.cat(data, dim=dim) # type: ignore elif isinstance(first, tuple): constructor = type(first) constructor = getattr(constructor, '_make', constructor) # Handle named tuples. 
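# For a NamedTuple subclass, type(first)(<iterable>) would fail because its __new__ expects one positional argument per field; the _make classmethod builds an instance from a single iterable instead. Plain tuples have no _make, so tuple(<iterable>) is used directly.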
return constructor( cat([x[i] for x in data], dim=dim) for i in range(len(first)) # type: ignore ) elif isinstance(first, dict): return type(first)((key, cat([x[key] for x in data], dim=dim)) for key in first) # type: ignore elif dataclasses.is_dataclass(first): return type(first)( **{ field.name: cat([getattr(x, field.name) for x in data], dim=dim) for field in dataclasses.fields(first) } ) # type: ignore else: raise ValueError(f'The collection type {type(first)} is not supported.') def is_oom_exception(err: RuntimeError) -> bool: return isinstance(err, torch.cuda.OutOfMemoryError) or any( x in str(err) for x in [ 'CUDA out of memory', 'CUBLAS_STATUS_ALLOC_FAILED', 'CUDA error: out of memory', ] ) ================================================ FILE: pytabkit/models/optim/__init__.py ================================================ ================================================ FILE: pytabkit/models/optim/adopt.py ================================================ # taken from https://github.com/iShohei220/adopt/blob/main/adopt.py # Apache 2.0 license # requires torch >= 2.4 # mypy: allow-untyped-decorators # mypy: allow-untyped-defs from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor from torch.optim.optimizer import ( _capturable_doc, _default_to_fused_or_foreach, _device_dtype_check_for_fused, _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, _fused_doc, _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, DeviceDict, Optimizer, ParamsT, ) __all__ = ["ADOPT", "adopt"] class ADOPT(Optimizer): def __init__( self, params: ParamsT, lr: Union[float, Tensor] = 1e-3, betas: Tuple[float, float] = (0.9, 0.9999), eps: float = 1e-6, weight_decay: float = 0.0, decoupled: bool = False, *, foreach: Optional[bool] = None, maximize: bool = False, capturable: bool = False, differentiable: bool = False, fused: Optional[bool] = None, ): if isinstance(lr, Tensor): if foreach and not capturable: raise ValueError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) if lr.numel() != 1: raise ValueError("Tensor lr must be 1-element") if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") if not 0.0 <= weight_decay: raise ValueError(f"Invalid weight_decay value: {weight_decay}") defaults = dict( lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, decoupled=decoupled, maximize=maximize, foreach=foreach, capturable=capturable, differentiable=differentiable, fused=fused, ) super().__init__(params, defaults) if fused: # TODO: support fused raise RuntimeError("`fused` is not currently supported") if differentiable: raise RuntimeError("`fused` does not support `differentiable`") self._step_supports_amp_scaling = True # TODO(crcrpar): [low prec params & their higher prec copy] # Support AMP with FP16/BF16 model params which would need # higher prec copy of params to do update math in higher prec to # alleviate the loss of information. 
if foreach: raise RuntimeError("`fused` and `foreach` cannot be `True` together.") def __setstate__(self, state): super().__setstate__(state) for group in self.param_groups: group.setdefault("maximize", False) group.setdefault("foreach", None) group.setdefault("capturable", False) group.setdefault("differentiable", False) fused = group.setdefault("fused", None) for p in group["params"]: p_state = self.state.get(p, []) if len(p_state) != 0 and not torch.is_tensor(p_state["step"]): step_val = float(p_state["step"]) p_state["step"] = ( torch.tensor( step_val, dtype=_get_scalar_dtype(is_fused=fused), device=p.device, ) if group["capturable"] or group["fused"] else torch.tensor(step_val, dtype=_get_scalar_dtype()) ) def _init_group( self, group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, ): has_complex = False for p in group["params"]: if p.grad is not None: has_complex |= torch.is_complex(p) params_with_grad.append(p) if p.grad.is_sparse: raise RuntimeError( "ADOPT does not support sparse gradients" ) grads.append(p.grad) state = self.state[p] # Lazy state initialization if len(state) == 0: if group["fused"]: _device_dtype_check_for_fused(p) # note(crcrpar): [special device hosting for step] # Deliberately host `step` on CPU if both capturable and fused are off. # This is because kernel launches are costly on CUDA and XLA. state["step"] = ( torch.zeros( (), dtype=_get_scalar_dtype(is_fused=group["fused"]), device=p.device, ) if group["capturable"] or group["fused"] else torch.tensor(0.0, dtype=_get_scalar_dtype()) ) # Exponential moving average of gradient values state["exp_avg"] = torch.zeros_like( p, memory_format=torch.preserve_format ) # Exponential moving average of squared gradient values state["exp_avg_sq"] = torch.zeros_like( p, memory_format=torch.preserve_format ) exp_avgs.append(state["exp_avg"]) exp_avg_sqs.append(state["exp_avg_sq"]) if group["differentiable"] and state["step"].requires_grad: raise RuntimeError( "`requires_grad` is not supported for `step` in differentiable mode" ) # Foreach without capturable does not support a tensor lr if ( group["foreach"] and torch.is_tensor(group["lr"]) and not group["capturable"] ): raise RuntimeError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) state_steps.append(state["step"]) return has_complex @_use_grad_for_differentiable def step(self, closure=None): """Perform a single optimization step. Args: closure (Callable, optional): A closure that reevaluates the model and returns the loss. 
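Illustrative usage (a sketch, not from the original repo):

    model = torch.nn.Linear(10, 1)
    opt = ADOPT(model.parameters(), lr=1e-3, decoupled=True)
    loss = model(torch.randn(8, 10)).pow(2).mean()
    loss.backward()
    opt.step()
    opt.zero_grad()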
""" self._cuda_graph_capture_health_check() loss = None if closure is not None: with torch.enable_grad(): loss = closure() for group in self.param_groups: params_with_grad: List[Tensor] = [] grads: List[Tensor] = [] exp_avgs: List[Tensor] = [] exp_avg_sqs: List[Tensor] = [] state_steps: List[Tensor] = [] beta1, beta2 = group["betas"] has_complex = self._init_group( group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, ) adopt( params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps, has_complex=has_complex, beta1=beta1, beta2=beta2, lr=group["lr"], weight_decay=group["weight_decay"], decoupled=group["decoupled"], eps=group["eps"], maximize=group["maximize"], foreach=group["foreach"], capturable=group["capturable"], differentiable=group["differentiable"], fused=group["fused"], grad_scale=getattr(self, "grad_scale", None), found_inf=getattr(self, "found_inf", None), ) return loss def _single_tensor_adopt( params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], state_steps: List[Tensor], grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *, has_complex: bool, beta1: float, beta2: float, lr: Union[float, Tensor], weight_decay: float, decoupled: bool, eps: float, maximize: bool, capturable: bool, differentiable: bool, ): assert grad_scale is None and found_inf is None if torch.jit.is_scripting(): # this assert is due to JIT being dumb and not realizing that the ops below # have overloads to handle both float and Tensor lrs, so we just assert it's # a float since most people using JIT are using floats assert isinstance(lr, float) for i, param in enumerate(params): grad = grads[i] if not maximize else -grads[i] exp_avg = exp_avgs[i] exp_avg_sq = exp_avg_sqs[i] step_t = state_steps[i] # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: capturable_supported_devices = _get_capturable_supported_devices() assert ( param.device.type == step_t.device.type and param.device.type in capturable_supported_devices ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
# update step step_t += 1 if weight_decay != 0: if decoupled: param.add_(param, alpha=-lr*weight_decay) else: grad = grad.add(param, alpha=weight_decay) if torch.is_complex(param): grad = torch.view_as_real(grad) if exp_avg is not None: exp_avg = torch.view_as_real(exp_avg) if exp_avg_sq is not None: exp_avg_sq = torch.view_as_real(exp_avg_sq) param = torch.view_as_real(param) step = step_t if capturable or differentiable else _get_value(step_t) if step == 1: exp_avg_sq.addcmul_(grad, grad.conj()) continue denom = torch.clamp(exp_avg_sq.sqrt(), eps) if step == 2: exp_avg.addcdiv_(grad, denom) else: exp_avg.mul_(beta1).addcdiv_(grad, denom, value=1 - beta1) param.add_(exp_avg, alpha=-lr) exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2) def _multi_tensor_adopt( params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], state_steps: List[Tensor], grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *, has_complex: bool, beta1: float, beta2: float, lr: Union[float, Tensor], weight_decay: float, decoupled: bool, eps: float, maximize: bool, capturable: bool, differentiable: bool, ): if len(params) == 0: return if isinstance(lr, Tensor) and not capturable: raise RuntimeError( "lr as a Tensor is not supported for capturable=False and foreach=True" ) # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: capturable_supported_devices = _get_capturable_supported_devices( supports_xla=False ) assert all( p.device.type == step.device.type and p.device.type in capturable_supported_devices for p, step in zip(params, state_steps) ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." assert grad_scale is None and found_inf is None assert not differentiable, "_foreach ops don't support autograd" grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, state_steps] # type: ignore[list-item] ) for ( device_params_, device_grads_, device_exp_avgs_, device_exp_avg_sqs_, device_state_steps_, ), _ in grouped_tensors.values(): device_params = cast(List[Tensor], device_params_) device_grads = cast(List[Tensor], device_grads_) device_exp_avgs = cast(List[Tensor], device_exp_avgs_) device_exp_avg_sqs = cast(List[Tensor], device_exp_avg_sqs_) device_state_steps = cast(List[Tensor], device_state_steps_) # Handle complex parameters if has_complex: _view_as_real( device_params, device_grads, device_exp_avgs, device_exp_avg_sqs ) if maximize: device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just # wrapped it once now. The alpha is required to assure we go to the right overload. 
if not torch._utils.is_compiling() and device_state_steps[0].is_cpu: torch._foreach_add_( device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0 ) else: torch._foreach_add_(device_state_steps, 1) if weight_decay != 0: if decoupled: torch._foreach_add_(device_params, device_params, alpha=-lr*weight_decay) else: # Re-use the intermediate memory (device_grads) already allocated for maximize if maximize: torch._foreach_add_(device_grads, device_params, alpha=weight_decay) else: device_grads = torch._foreach_add( # type: ignore[assignment] device_grads, device_params, alpha=weight_decay ) if device_state_steps[0] == 1: torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads) continue exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs) exp_avg_sq_sqrt = torch._foreach_maximum(exp_avg_sq_sqrt, eps) if device_state_steps[0] == 2: torch._foreach_addcdiv_(device_exp_avgs, device_grads, exp_avg_sq_sqrt) else: torch._foreach_mul_(device_exp_avgs, beta1) torch._foreach_addcdiv_( device_exp_avgs, device_grads, exp_avg_sq_sqrt, value=1 - beta1 ) torch._foreach_add_(device_params, device_exp_avgs, alpha=-lr) torch._foreach_mul_(device_exp_avg_sqs, beta2) torch._foreach_addcmul_( device_exp_avg_sqs, device_grads, device_grads, value=1 - beta2 ) @_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adopt) def adopt( params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], state_steps: List[Tensor], # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting this as kwarg for now as functional API is compiled by torch/distributed/optim foreach: Optional[bool] = None, capturable: bool = False, differentiable: bool = False, fused: Optional[bool] = None, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None, has_complex: bool = False, *, beta1: float, beta2: float, lr: Union[float, Tensor], weight_decay: float, decoupled: bool, eps: float, maximize: bool, ): r"""Functional API that performs ADOPT algorithm computation. """ # Respect when the user inputs False/True for foreach or fused. We only want to change # the default when neither have been user-specified. Note that we default to foreach # and pass False to use_fused. This is not a mistake--we want to give the fused impl # bake-in time before making it the default, even if it is typically faster. if fused is None and foreach is None: _, foreach = _default_to_fused_or_foreach( params, differentiable, use_fused=False ) # Do not flip on foreach for the unsupported case where lr is a Tensor and capturable=False. 
if foreach and isinstance(lr, Tensor) and not capturable: foreach = False if fused is None: fused = False if foreach is None: foreach = False # this check is slow during compilation, so we skip it # if it's strictly needed we can add this check back in dynamo if not torch._utils.is_compiling() and not all( isinstance(t, torch.Tensor) for t in state_steps ): raise RuntimeError( "API has changed, `state_steps` argument must contain a list of singleton tensors" ) if foreach and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with foreach optimizers") if fused and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with fused optimizers") if fused and not torch.jit.is_scripting(): func = _fused_adopt elif foreach and not torch.jit.is_scripting(): func = _multi_tensor_adopt else: func = _single_tensor_adopt func( params, grads, exp_avgs, exp_avg_sqs, state_steps, has_complex=has_complex, beta1=beta1, beta2=beta2, lr=lr, weight_decay=weight_decay, decoupled=decoupled, eps=eps, maximize=maximize, capturable=capturable, differentiable=differentiable, grad_scale=grad_scale, found_inf=found_inf, ) ================================================ FILE: pytabkit/models/optim/optimizers.py ================================================ import warnings from collections import defaultdict from copy import deepcopy from itertools import chain from typing import Optional, Dict, Any, Set, DefaultDict, Iterable import torch import torch.optim as optim from torch.optim.optimizer import required, StateDict from pytabkit.models.training.coord import HyperparamManager from pytabkit.models.optim.scheduling_adam import SchedulingAdam class OptimizerBase(torch.optim.Optimizer): def __init__(self, opt, hyper_mappings, hp_manager: HyperparamManager): self.hp_manager = hp_manager self.hyper_getters = {} self.n_groups = len(opt.param_groups) for names, opt_name, defaults in hyper_mappings: if isinstance(names, str): names = (names,) defaults = (defaults,) for name, default in zip(names, defaults): self.hyper_getters[name] = [self.hp_manager.register_hyper(name, group['params'][0].context.scope, default=default) for group in opt.param_groups] super().__init__(opt.param_groups, defaults={}) self.hyper_mappings = hyper_mappings self.opt = opt def get_hyper_values(self, name, i, use_hyper_factor=True): value = self.hyper_getters[name][i]() param = self.opt.param_groups[i]['params'][0] # should only be one param if use_hyper_factor and name in param.hyper_factors: value *= param.hyper_factors[name] return value def step(self, closure=None, loss: Optional[torch.Tensor] = None): unhandled_mappings = [] for names, opt_name, defaults in self.hyper_mappings: if opt_name is None: unhandled_mappings.append((names, opt_name, defaults)) continue if isinstance(names, tuple): for i, group in enumerate(self.opt.param_groups): group[opt_name] = tuple(self.get_hyper_values(name, i) for name in names) elif isinstance(names, str): for i, group in enumerate(self.opt.param_groups): group[opt_name] = self.get_hyper_values(names, i) else: raise RuntimeError('Could not understand mapping key {}'.format(names)) for names, opt_name, defaults in unhandled_mappings: if names == 'wd': with torch.no_grad(): for i, group in enumerate(self.opt.param_groups): wd = self.get_hyper_values('wd', i) lr = self.get_hyper_values('lr', i) if wd != 0.0: for p in group['params']: p.mul_(1.0 - wd * lr * p.hyper_factors.get('wd', 1.0) * p.hyper_factors.get('lr', 1.0)) else: raise RuntimeError('Could not 
understand mapping {}'.format((names, opt_name, defaults))) self._opt_step_with_loss(loss) def train(self): if hasattr(self.opt, 'train') and callable(self.opt.train): # print('opt train') self.opt.train() def eval(self): if hasattr(self.opt, 'eval') and callable(self.opt.eval): # print('opt eval') self.opt.eval() def _opt_step_with_loss(self, loss: Optional[torch.Tensor]): self.opt.step() def __getstate__(self) -> Dict[str, Any]: # override the pickling method since otherwise self.opt is not restored return {'__dict__': self.__dict__} def __setstate__(self, state: Dict[str, Any]) -> None: # override the pickling method since otherwise self.opt is not restored self.__dict__ = state['__dict__'] class AdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.Adam(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class SchedulingAdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(SchedulingAdam(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class AMSGradOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.Adam(param_groups, amsgrad=True), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class AdamaxOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.Adamax(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) class SGDOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager): super().__init__(optim.SGD(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), ('mom', 'momentum', 0.0), ('wd', None, 0.0)], hp_manager=hp_manager) class SFAdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager: HyperparamManager): from schedulefree import AdamWScheduleFree super().__init__(AdamWScheduleFree(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0), ('weight_decay', 'weight_decay', 0.0), ('warmup_steps', 'warmup_steps', 0)], hp_manager=hp_manager) class MoMoAdamOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager: HyperparamManager): from momo import MomoAdam super().__init__(MomoAdam(param_groups), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) def _opt_step_with_loss(self, loss: Optional[torch.Tensor]): self.opt.step(loss=loss) class AdoptOptimizer(OptimizerBase): def __init__(self, param_groups, hp_manager: HyperparamManager): from .adopt import ADOPT super().__init__(ADOPT(param_groups, decoupled=True), hyper_mappings=[('lr', 'lr', 1e-3), (('mom', 'sq_mom'), 'betas', (0.9, 0.999)), ('opt_eps', 'eps', 1e-8), ('wd', None, 0.0)], hp_manager=hp_manager) def get_opt_class(opt_name): if opt_name == 'adam': return AdamOptimizer elif opt_name == 'adamax': return AdamaxOptimizer elif opt_name == 'sgd': return SGDOptimizer elif opt_name == 'amsgrad': return AMSGradOptimizer elif opt_name == 'sched_adam': return SchedulingAdamOptimizer elif opt_name == 'sfadam': return SFAdamOptimizer 
elif opt_name == 'momoadam': return MoMoAdamOptimizer elif opt_name == 'adopt': return AdoptOptimizer else: raise ValueError(f'Unknown optimizer "{opt_name}"') ================================================ FILE: pytabkit/models/optim/scheduling_adam.py ================================================ import torch from torch.optim import Optimizer import math # modification of normal adam to properly handle varying betas class SchedulingAdam(Optimizer): r"""Implements Adam algorithm. It has been proposed in `Adam: A Method for Stochastic Optimization`_. The implementation of the L2 penalty follows changes proposed in `Decoupled Weight Decay Regularization`_. Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101 .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) super(SchedulingAdam, self).__init__(params, defaults) def __setstate__(self, state): super(SchedulingAdam, self).__setstate__(state) for group in self.param_groups: group.setdefault('amsgrad', False) @torch.no_grad() def step(self, closure=None): """Performs a single optimization step. Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: with torch.enable_grad(): loss = closure() for group in self.param_groups: params_with_grad = [] grads = [] exp_avgs = [] exp_avg_sqs = [] state_sums = [] max_exp_avg_sqs = [] state_steps = [] beta1, beta2 = group['betas'] for p in group['params']: if p.grad is not None: params_with_grad.append(p) if p.grad.is_sparse: raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') grads.append(p.grad) state = self.state[p] # Lazy state initialization if len(state) == 0: state['step'] = 0 state['beta1_prod'] = 1.0 state['beta2_prod'] = 1.0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) if group['amsgrad']: # Maintains max of all exp. moving avg. of sq. grad. 
values state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) exp_avgs.append(state['exp_avg']) exp_avg_sqs.append(state['exp_avg_sq']) if group['amsgrad']: max_exp_avg_sqs.append(state['max_exp_avg_sq']) # update the steps for each param group update state['step'] += 1 state['beta1_prod'] *= beta1 state['beta2_prod'] *= beta2 # record the step after step update state_steps.append(state['step']) lr = group['lr'] weight_decay = group['weight_decay'] eps = group['eps'] amsgrad = group['amsgrad'] for i, param in enumerate(params_with_grad): grad = grads[i] exp_avg = exp_avgs[i] exp_avg_sq = exp_avg_sqs[i] bias_correction1 = 1 - self.state[param]['beta1_prod'] bias_correction2 = 1 - self.state[param]['beta2_prod'] if weight_decay != 0: grad = grad.add(param, alpha=weight_decay) # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i]) # Use the max. for normalizing running avg. of gradient denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps) else: denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) step_size = lr / bias_correction1 param.addcdiv_(exp_avg, denom, value=-step_size) return loss ================================================ FILE: pytabkit/models/sklearn/__init__.py ================================================ ================================================ FILE: pytabkit/models/sklearn/default_params.py ================================================ import numpy as np from pytabkit.models import utils class DefaultParams: RealMLP_TD_CLASS = dict( hidden_sizes=[256] * 3, max_one_hot_cat_size=9, embedding_size=8, weight_param='ntk', bias_lr_factor=0.1, act='selu', use_parametric_act=True, act_lr_factor=0.1, block_str='w-b-a-d', p_drop=0.15, p_drop_sched='flat_cos', add_front_scale=True, scale_lr_factor=6.0, bias_init_mode='he+5', weight_init_mode='std', wd=2e-2, wd_sched='flat_cos', bias_wd_factor=0.0, use_ls=True, ls_eps=0.1, num_emb_type='pbld', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, lr=4e-2, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], n_epochs=256, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) RealMLP_TD_S_CLASS = dict( hidden_sizes=[256] * 3, weight_param='ntk', bias_lr_factor=0.1, act='selu', block_str='w-b-a', add_front_scale=True, scale_lr_factor=6.0, bias_init_mode='normal', weight_init_mode='normal', last_layer_config=dict(bias_init_mode='zeros', weight_init_mode='zeros'), use_ls=True, ls_eps=0.1, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip'], n_epochs=256, lr=4e-2, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) RealMLP_TD_REG = dict( hidden_sizes=[256] * 3, max_one_hot_cat_size=9, embedding_size=8, weight_param='ntk', weight_init_mode='std', bias_init_mode='he+5', bias_lr_factor=0.1, act='mish', use_parametric_act=True, act_lr_factor=0.1, wd=2e-2, wd_sched='flat_cos', bias_wd_factor=0.0, block_str='w-b-a-d', p_drop=0.15, p_drop_sched='flat_cos', add_front_scale=True, scale_lr_factor=6.0, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], num_emb_type='pbld', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, clamp_output=True, normalize_output=True, lr=0.2, n_epochs=256, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) 
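    # Note (sketch, not part of the original file): these dicts are not applied directly; the
    # scikit-learn wrappers return them from _get_default_params(), and get_config() in
    # sklearn_base.py only falls back to a default where the corresponding constructor argument
    # is None or missing. A hypothetical usage:
    #   RealMLP_TD_Classifier(lr=1e-2)  # keeps lr=1e-2, takes the remaining values from RealMLP_TD_CLASS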
RealMLP_TD_S_REG = dict( hidden_sizes=[256] * 3, weight_param='ntk', bias_lr_factor=0.1, bias_init_mode='normal', weight_init_mode='normal', last_layer_config=dict(bias_init_mode='zeros', weight_init_mode='zeros'), act='mish', normalize_output=True, block_str='w-b-a', add_front_scale=True, scale_lr_factor=6.0, tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip'], n_epochs=256, lr=7e-2, lr_sched='coslog4', opt='adam', sq_mom=0.95 ) # -------- GBDTs ------------ LGBM_TD_CLASS = dict( n_estimators=1000, lr=4e-2, subsample=0.75, colsample_bytree=1.0, num_leaves=50, bagging_freq=1, min_data_in_leaf=40, min_sum_hessian_in_leaf=1e-7, max_bin=255, early_stopping_rounds=300, ) LGBM_TD_REG = dict( n_estimators=1000, lr=5e-2, subsample=0.7, colsample_bytree=1.0, num_leaves=100, max_bin=255, bagging_freq=1, min_data_in_leaf=3, min_sum_hessian_in_leaf=1e-7, early_stopping_rounds=300, ) XGB_TD_CLASS = dict( n_estimators=1000, lr=8e-2, min_child_weight=5e-6, reg_lambda=0.0, max_depth=6, colsample_bylevel=0.9, subsample=0.65, tree_method='hist', max_bin=256, early_stopping_rounds=300, ) XGB_TD_REG = dict( n_estimators=1000, max_depth=9, tree_method='hist', max_bin=256, lr=5e-2, min_child_weight=2.0, reg_lambda=0.0, subsample=0.7, early_stopping_rounds=300, ) # from Probst, Boulestix, and Bischl, "Tunability: Importance of ..." XGB_PBB_CLASS = dict( n_estimators=4168, lr=0.018, min_child_weight=2.06, max_depth=13, reg_lambda=0.982, reg_alpha=1.113, subsample=0.839, colsample_bytree=0.752, colsample_bylevel=0.585, tree_method='hist', max_n_threads=64, tfms=['one_hot'], max_one_hot_cat_size=20 ) CB_TD_CLASS = dict( n_estimators=1000, lr=8e-2, l2_leaf_reg=1e-5, boosting_type='Plain', bootstrap_type='Bernoulli', subsample=0.9, max_depth=7, random_strength=0.8, one_hot_max_size=15, leaf_estimation_iterations=1, max_bin=254, early_stopping_rounds=300, ) CB_TD_REG = dict( n_estimators=1000, lr=9e-2, l2_leaf_reg=1e-5, boosting_type='Plain', bootstrap_type='Bernoulli', subsample=0.9, max_depth=9, random_strength=0.0, max_bin=254, one_hot_max_size=20, leaf_estimation_iterations=20, early_stopping_rounds=300, ) # RTDL params RESNET_RTDL_D_CLASS_Grinsztajn = { "lr_scheduler": False, "module_activation": "reglu", "module_normalization": "batchnorm", "module_n_layers": 8, "module_d": 256, "module_d_hidden_factor": 2, "module_hidden_dropout": 0.2, "module_residual_dropout": 0.2, "lr": 1e-3, "optimizer_weight_decay": 1e-7, "optimizer": "adamw", "module_d_embedding": 128, "batch_size": 256, "max_epochs": 300, "use_checkpoints": True, "es_patience": 40, "lr_patience": 30, "verbose": 0, 'tfms': ['quantile'], } RESNET_RTDL_D_REG_Grinsztajn = {**RESNET_RTDL_D_CLASS_Grinsztajn, "transformed_target": True} MLP_RTDL_D_CLASS_Grinsztajn = { "lr_scheduler": False, "module_n_layers": 8, "module_d_layers": 256, "module_d_first_layer": 128, "module_d_last_layer": 128, "module_dropout": 0.2, "lr": 1e-3, "optimizer": "adamw", "module_d_embedding": 128, "batch_size": 256, "max_epochs": 300, "use_checkpoints": True, "es_patience": 40, "lr_patience": 30, "verbose": 0, 'tfms': ['quantile'], } MLP_RTDL_D_REG_Grinsztajn = {**MLP_RTDL_D_CLASS_Grinsztajn, "transformed_target": True} FTT_D_CLASS = { "lr_scheduler": False, "module_d_token": 192, "module_d_ffn_factor": 4. 
/ 3., "module_n_layers": 3, "module_n_heads": 8, "module_activation": "reglu", "module_token_bias": True, "module_attention_dropout": 0.2, "module_initialization": "kaiming", "module_ffn_dropout": 0.1, "module_residual_dropout": 0.0, "module_prenormalization": True, "module_kv_compression": None, "module_kv_compression_sharing": None, "lr": 1e-4, "optimizer": "adamw", "optimizer_weight_decay": 1e-5, "batch_size": 256, # default in Grinsztajn is 512? "max_epochs": 300, # todo: keep it? "use_checkpoints": True, "es_patience": 16, # value from Gorishniy et al. "lr_patience": 30, "verbose": 0, "tfms": ['quantile_tabr'], } FTT_D_REG = {**FTT_D_CLASS, "transformed_target": True} # Default parameters for rtdl models based on https://github.com/naszilla/tabzilla/blob/main/TabZilla/models/rtdl.py RESNET_RTDL_D_CLASS_TabZilla = { "lr_scheduler": False, "module_activation": "relu", "module_normalization": "batchnorm", "module_n_layers": 2, "module_d": 128, "module_d_hidden_factor": 2, "module_hidden_dropout": 0.25, # DROPOUT_FIRST "module_residual_dropout": 0.1, # DROPOUT_SECOND "lr": 1e-3, "optimizer_weight_decay": 0.01, # for tabzilla they don't set it which means 0.01 (which seems high compared # to rtdl hp space?) "optimizer": "adamw", "module_d_embedding": 8, "batch_size": 128, # default param in https://github.com/naszilla/tabzilla/blob/4949a1dea3255c1a794d89aa2422ef1f8c9ae265/README.md?plain=1#L129 "max_epochs": 1000, # same "use_checkpoints": True, "es_patience": 20, # same "lr_patience": 30, "verbose": 0, 'tfms': ['quantile_tabr'], } RESNET_RTDL_D_REG_TabZilla = {**RESNET_RTDL_D_CLASS_TabZilla, "transformed_target": True} MLP_RTDL_D_CLASS_TabZilla = { "lr_scheduler": False, "module_n_layers": 3, "module_d_first_layer": 128, # ignored by the code since d_layers is a list "module_d_last_layer": 128, # ignored by the code since d_layers is a list "module_d_layers": [128, 256, 128], "module_dropout": 0.1, # module_activation # module_dropout # optimizer_weight_decay "lr": 1e-3, "optimizer": "adamw", "module_d_embedding": 8, "batch_size": 128, # default param in https://github.com/naszilla/tabzilla/blob/4949a1dea3255c1a794d89aa2422ef1f8c9ae265/README.md?plain=1#L129 "max_epochs": 1000, # same "use_checkpoints": True, "es_patience": 20, # same "lr_patience": 30, "verbose": 0, 'tfms': ['quantile_tabr'], } MLP_RTDL_D_REG_TabZilla = {**MLP_RTDL_D_CLASS_TabZilla, "transformed_target": True} MLP_PLR_D_CLASS = { # adapted from TabZilla version of MLP_RTDL_D and the defaults of the rtdl_num_embeddings library "lr_scheduler": False, "module_n_layers": 3, "module_d_first_layer": 128, # ignored by the code since d_layers is a list "module_d_last_layer": 128, # ignored by the code since d_layers is a list "module_d_layers": [128, 256, 128], "module_dropout": 0.1, "lr": 1e-3, "optimizer": "adamw", "module_d_embedding": 8, "batch_size": 128, # default param in https://github.com/naszilla/tabzilla/blob/4949a1dea3255c1a794d89aa2422ef1f8c9ae265/README.md?plain=1#L129 "max_epochs": 1000, # same "use_checkpoints": True, "es_patience": 20, # same "lr_patience": 30, "verbose": 0, 'tfms': ['quantile_tabr'], "module_num_emb_type": 'plr', "module_num_emb_dim": 24, "module_num_emb_hidden_dim": 48, "module_num_emb_sigma": 0.01, "module_num_emb_lite": False } MLP_PLR_D_REG = {**MLP_PLR_D_CLASS, "transformed_target": True} TABR_S_D_CLASS = { "num_embeddings": None, "d_main": 265, "context_dropout": 0.38920071545944357, # named mixer_dropout sometimes I think "d_multiplier": 2.0, "encoder_n_blocks": 0, "predictor_n_blocks": 
1, "mixer_normalization": "auto", "dropout0": 0.38852797479169876, "dropout1": 0.0, "normalization": "LayerNorm", "activation": "ReLU", "batch_size": "auto", # adapt given the dataset size "eval_batch_size": 4096, # TODO: automatically infer given memory "patience": 16, "n_epochs": 100_000, # inf in paper "context_size": 96, "freeze_contexts_after_n_epochs": None, "optimizer": { "type": "AdamW", "lr": 0.0003121273641315169, "weight_decay": 1.2260352006404615e-06 }, 'tfms': ['quantile_tabr'], } TABR_S_D_REG = {**TABR_S_D_CLASS, "transformed_target": True} TABR_S_D_CLASS_FREEZE = { **TABR_S_D_CLASS, "freeze_contexts_after_n_epochs": 4, } TABR_S_D_REG_FREEZE = { **TABR_S_D_REG, "freeze_contexts_after_n_epochs": 4, } RealTABR_D_CLASS = { "d_main": 265, "context_dropout": 0.38920071545944357, # named mixer_dropout sometimes I think "d_multiplier": 2.0, "encoder_n_blocks": 0, "predictor_n_blocks": 1, "mixer_normalization": "auto", "dropout0": 0.38852797479169876, "dropout1": 0.0, "normalization": "LayerNorm", "activation": "ReLU", "batch_size": "auto", # adapt given the dataset size "eval_batch_size": 4096, "patience": 16, "n_epochs": 100_000, # inf in paper "context_size": 96, "freeze_contexts_after_n_epochs": None, 'num_embeddings': { 'type': "PBLDEmbeddings", 'n_frequencies': 8, # not 16 because of RAM issues on meta-test 'd_embedding': 4, 'frequency_scale': 0.1, }, 'tfms': ['median_center', 'robust_scale', 'smooth_clip'], 'optimizer': { "type": "AdamW", "lr": 0.0003121273641315169, "weight_decay": 1.2260352006404615e-06, "betas": (0.9, 0.95), }, 'add_scaling_layer': True, 'scale_lr_factor': 96, 'ls_eps': 0.1, } RealTABR_D_REG = {**RealTABR_D_CLASS, "transformed_target": True} TABM_D_CLASS = { # from https://github.com/yandex-research/tabm/blob/main/example.ipynb 'arch_type': 'tabm', 'tabm_k': 32, 'num_emb_type': 'none', 'num_emb_n_bins': 48, 'batch_size': 256, 'lr': 2e-3, 'weight_decay': 0.0, 'n_epochs': 1_000_000_000, 'patience': 16, 'd_embedding': 16, 'd_block': 512, 'n_blocks': 'auto', 'dropout': 0.1, 'compile_model': False, 'allow_amp': False, 'tfms': ['quantile_tabr'], 'gradient_clipping_norm': None, # set to 1.0 in TabR paper experiments } TABM_D_REG = TABM_D_CLASS VANILLA_MLP_CLASS = dict( hidden_sizes=[256] * 3, p_drop=0.0, wd=0.0, block_str='w-b-a-d', opt='adam', tfms=['quantile', 'one_hot'], batch_size=256, n_epochs=256, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. 
/ np.sqrt(3.), bias_init_mode='pytorch-default', lr=1e-3, lr_sched='constant', max_n_vectorized=1, # this is because of the preprocessing use_last_best_epoch=False, ) VANILLA_MLP_REG = utils.join_dicts(VANILLA_MLP_CLASS, dict(normalize_output=True)) XRFM_D_CLASS = dict( bandwidth=10.0, p_interp=1.0, exponent=1.0, reg=1e-3, iters=5, diag=True, bandwidth_mode='constant', kernel_type='l2', max_leaf_samples=60_000, early_stop_rfm=True, early_stop_multiplier=1.1, classification_mode='prevalence', M_batch_size=8000, ) XRFM_D_REG = XRFM_D_CLASS # ----- sklearn versions ------ LGBM_D = dict( n_estimators=100, ) XGB_D = dict( n_estimators=100, tree_method='hist', ) CB_D = dict( n_estimators=1000, ) RF_SKL_D = dict( tfms=['ordinal_encoding'], permute_ordinal_encoding=True, ) MLP_SKL_D = dict( tfms=['mean_center', 'l2_normalize', 'one_hot'] ) ================================================ FILE: pytabkit/models/sklearn/sklearn_base.py ================================================ import copy from pathlib import Path from typing import Dict, Any, Optional, Union, List from warnings import warn from packaging.version import Version import numpy as np import pandas as pd import scipy.sparse import sklearn import torch import multiprocessing as mp from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.exceptions import DataConversionWarning from sklearn.metrics._dist_metrics import check_array from sklearn.preprocessing import OrdinalEncoder from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_is_fitted, check_X_y from pytabkit.models import utils from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources from pytabkit.models.data.data import DictDataset, TensorInfo from pytabkit.models.data.splits import RandomSplitter, KFoldSplitter from pytabkit.models.data.conversion import ToDictDatasetConverter from pytabkit.models.torch_utils import get_available_device_names from pytabkit.models.training.logging import StdoutLogger def to_df(x) -> pd.DataFrame: try: return pd.DataFrame(x) except: pass return pd.DataFrame(np.array(x)) def to_normal_type(x) -> Any: if isinstance(x, pd.DataFrame) or isinstance(x, list) or isinstance(x, np.ndarray) or isinstance(x, pd.Series): return x return np.asarray(x) def concat_arrays(x1, x2) -> Any: if type(x1) != type(x2): raise ValueError(f'Arrays must have the same type, but got {type(x1)=} and {type(x2)=}') if isinstance(x1, pd.DataFrame) or isinstance(x1, pd.Series): return pd.concat([x1, x2], axis=0, ignore_index=True) return np.concatenate([x1, x2], axis=0) def check_X_y_wrapper(*args, **kwargs): if Version(sklearn.__version__) >= Version("1.8.0"): if 'force_all_finite' in kwargs: kwargs['ensure_all_finite'] = kwargs['force_all_finite'] del kwargs['force_all_finite'] else: if 'ensure_all_finite' in kwargs: kwargs['force_all_finite'] = kwargs['ensure_all_finite'] del kwargs['ensure_all_finite'] check_X_y(*args, **kwargs) def check_array_wrapper(*args, **kwargs): if Version(sklearn.__version__) >= Version("1.8.0"): if 'force_all_finite' in kwargs: kwargs['ensure_all_finite'] = kwargs['force_all_finite'] del kwargs['force_all_finite'] else: if 'ensure_all_finite' in kwargs: kwargs['force_all_finite'] = kwargs['ensure_all_finite'] del kwargs['ensure_all_finite'] check_array(*args, **kwargs) class AlgInterfaceEstimator(BaseEstimator): """ Base class for wrapping AlgInterface subclasses with a scikit-learn 
compatible interface. """ def _create_alg_interface(self, n_cv: int) -> AlgInterface: # override this raise NotImplementedError() def _supports_multioutput(self) -> bool: # only relevant for regression, override this if multioutput is not supported return True def _supports_single_class(self) -> bool: # only relevant for classification, # override this if training with only a single class in the training set is not supported return True def _supports_single_sample(self) -> bool: return True def _non_deterministic_tag(self) -> bool: return False def _is_classification(self) -> bool: raise NotImplementedError() def _get_default_params(self) -> Dict[str, Any]: # override this in subclasses to handle default parameters that should not be treated in the constructor # e.g. because their default values are mutable (list/dict/...) return dict() def _allowed_device_names(self) -> List[str]: # override in subclasses that allow to run on a GPU or mps return ['cpu'] def _more_tags(self): return dict(non_deterministic=self._non_deterministic_tag()) def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.non_deterministic = self._non_deterministic_tag() return tags def get_config(self) -> Dict[str, Any]: """ Augments the result from self.get_params() with the parameters from self._get_default_params(). Uses _preprocess_config_key() to change the names from self.get_params() if implemented. Default parameters are used if the value in get_params() is either None or not present. :return: Dictionary of parameters augmented with default parameters. """ params = {key: value for key, value in self.get_params(deep=False).items()} default_params = self._get_default_params() for key, value in default_params.items(): if key not in params or params[key] is None: params[key] = value # print(f'{params=}') # return params # remove None values return {key: value for key, value in params.items() if value is not None} def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Optional[np.ndarray] = None, cat_indicator: Optional[Union[List[bool], np.ndarray]] = None, cat_col_names: Optional[List[str]] = None, time_to_fit_in_seconds: Optional[int] = None) -> BaseEstimator: """ Fit the estimator. :param X: Inputs (covariates). pandas DataFrame, numpy array, or similar array-like. :param y: Labels (targets, variates). pandas DataFrame/Series, numpy array, or similar array-like. :param X_val: Inputs for validation set. Can only be used if n_cv is not set to a value other than 1, and if val_idxs is not used. If X_val is used, X will be used for the training set only, instead of getting validation data from X. :param y_val: Labels for the validation set. :param val_idxs: Indices of validation set elements within X and y (optional). Can be an array of shape (n_val_samples,) or (n_val_splits,n_val_samples_per_split). In the latter case, the results of the models on the validation splits will be ensembled. :param cat_indicator: Which features/columns are categorical, specified as a list or array of booleans. If this is not specified, all columns with category/string/object dtypes are interpreted as categorical and all others as numerical. :param cat_col_names: List of column names that should be treated as categorical (if X is a pd.DataFrame). Can be specified instead of cat_indicator. :param time_to_fit_in_seconds: Time limit in seconds for fitting. Currently only implemented for RealMLP (default=None). If None, no time limit will be applied. :return: Returns self. 
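        Example (a minimal sketch; ``X_train``, ``y_train``, ``X_val``, ``y_val`` are hypothetical
        array-likes)::

            est = est.fit(X_train, y_train, X_val=X_val, y_val=y_val)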
""" # do a first check, this includes to check if X or y are not None before other things are done to them check_X_y_wrapper(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None) # if X is None: # raise ValueError(f'This estimator requires X to be passed, but X is None') # if y is None: # # this message has to include the special text # # "requires y to be passed, but the target y is None" # # or one of the other particular messages # # for the estimator test "check_requires_y_none" to not fail # # it doesn't work automatically because of the to_normal_type(y) before the check_X_y # raise ValueError(f'This estimator requires y to be passed, but the target y is None') for arr in [X, y, X_val, y_val]: if scipy.sparse.issparse(arr): raise ValueError(f'Sparse arrays are not supported!') # print(f'{X=}') # print(f'{y=}') X = to_normal_type(X) y = to_normal_type(y) # need to convert array-like objects to arrays for self.is_y_1d_ params = self.get_config() n_cv = params.get('n_cv', 1) n_repeats = params.get('n_repeats', 1) # val_fraction is only relevant for n_cv == 1 val_fraction = params.get('val_fraction', 0.2) n_refit = params.get('n_refit', 0) if X_val is not None and y_val is None: raise ValueError(f'X_val is not None but y_val is None') elif X_val is None and y_val is not None: raise ValueError(f'X_val is None but y_val is not None') if X_val is not None and y_val is not None: if val_idxs is not None: raise ValueError(f'both val_idxs and X_val, y_val were provided') X_val = to_normal_type(X_val) y_val = to_normal_type(y_val) val_idxs = np.arange(len(X), len(X) + len(X_val)) X = concat_arrays(X, X_val) y = concat_arrays(y, y_val) # check again with the validation set concatenated check_X_y_wrapper(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None) if self._is_classification(): # classes_ is overridden later, but this raises an error when y is a regression target, so it is useful self.classes_ = unique_labels(y) self.is_y_1d_ = isinstance(y, pd.Series) or (isinstance(y, np.ndarray) and len(y.shape) == 1) if isinstance(y, list): if len(np.asarray(y).shape) == 1: self.is_y_1d_ = True # if not (isinstance(y, np.ndarray) or isinstance(y, list) # or isinstance(y, pd.DataFrame) or isinstance(y, pd.Series)): # raise ValueError(f'y has type {type(y)}, but should be one of np.ndarray, list, pd.DataFrame, or pd.Series') # y_df = pd.DataFrame(y) X_df = to_df(X).copy() y_df = to_df(y).copy() # self.y_encoder_.fit_transform(y) if cat_col_names is not None: if cat_indicator is not None: raise ValueError(f'Specified both cat_col_names and cat_indicator') cat_indicator = [col_name in cat_col_names for col_name in X_df.columns] self.x_converter_ = ToDictDatasetConverter(cat_features=cat_indicator, verbosity=params.get('verbosity', 0)) self.y_encoder_ = OrdinalEncoder(dtype=np.int64) # only used for classification if not self._supports_single_sample() and len(X_df) == 1: raise ValueError('Training with one sample is not supported!') x_ds = self.x_converter_.fit_transform(X_df) if torch.any(torch.isnan(x_ds.tensors['x_cont'])): raise ValueError('NaN values in continuous columns are currently not allowed!') self.is_y_float64_ = False # checked later in the regression case # convert y if self._is_classification(): self.y_encoder_ = OrdinalEncoder(dtype=np.int64) y_tfmd = self.y_encoder_.fit_transform(y_df) if len(y_tfmd.shape) == 1: y_tfmd = y_tfmd[:, None] if len(y_tfmd.shape) != 2: raise ValueError('len(y.shape) != 2') if y_tfmd.shape[1] != 1: raise ValueError('Multilabel 
classification is not supported!') if not self.is_y_1d_: warn( ( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel()." ), DataConversionWarning, stacklevel=2, ) y_ds = DictDataset(tensors={'y': torch.as_tensor(y_tfmd, dtype=torch.long)}, tensor_infos={'y': TensorInfo(cat_sizes=[int(np.max(y_tfmd) + 1)])}) self.classes_ = self.y_encoder_.categories_[0] if not self._supports_single_class() and len(self.classes_) == 1: raise ValueError(f'Training with only one class in the training set is not supported!') else: # regression if y_df[y_df.columns[0]].dtype == np.float64: self.is_y_float64_ = True y_tfmd = y_df.to_numpy(dtype=np.float32) if len(y_tfmd.shape) == 1: y_tfmd = y_tfmd[:, None] if len(y_tfmd.shape) != 2: raise ValueError('len(y.shape) != 2') y_ds = DictDataset(tensors={'y': torch.as_tensor(y_tfmd, dtype=torch.float32)}, tensor_infos={'y': TensorInfo(feat_shape=[y_tfmd.shape[1]])}) if not self._supports_multioutput() and not self.is_y_1d_: warn( ( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel()." ), DataConversionWarning, stacklevel=2, ) if not self._supports_multioutput() and y_ds.tensor_infos['y'].get_n_features() > 1: raise ValueError('Multioutput regression is not supported, ' 'please wrap this estimator with the MultiOutputRegressor ' 'from scikit-learn.') ds = DictDataset.join(x_ds, y_ds) # set n_features_in_ as required by https://scikit-learn.org/stable/developers/develop.html self.n_features_in_ = ds.tensor_infos['x_cont'].get_n_features() + ds.tensor_infos['x_cat'].get_n_features() self.cv_alg_interface_ = self._create_alg_interface(n_cv=n_cv) # ----- get random seeds ----- random_state = params.get('random_state', None) if isinstance(random_state, int): seed = random_state elif random_state is None: seed = int(np.random.randint(0, 2 ** 31 - 1)) elif isinstance(random_state, np.random.RandomState): seed = int(random_state.randint(0, 2 ** 31 - 1)) else: raise ValueError(f'random_state type {type(random_state)} ' f'is not one of [NoneType, int, np.random.RandomState]') split_seed = seed refit_split_seed = seed + 1 sub_split_seeds = list(np.random.RandomState(split_seed).randint(0, 2 ** 31 - 1, size=n_cv * n_repeats)) sub_split_seeds = [int(seed) for seed in sub_split_seeds] refit_sub_split_seeds = list( np.random.RandomState(refit_split_seed).randint(0, 2 ** 31 - 1, size=n_refit)) refit_sub_split_seeds = [int(seed) for seed in refit_sub_split_seeds] # ----- get train/val split ----- if not isinstance(n_cv, int) or n_cv <= 0: raise ValueError(f'Expected n_cv to be an int >= 1, but got {n_cv=}') if val_idxs is not None: if n_repeats != 1: raise ValueError(f'Providing a validation split requires n_repeats=1, but got {n_repeats=}') # provided split val_idxs = torch.as_tensor(val_idxs, dtype=torch.long) if len(val_idxs.shape) == 1: val_idxs = val_idxs[None, :] train_idxs_list = [] for i in range(val_idxs.shape[0]): is_val_idx = torch.zeros(ds.n_samples, dtype=torch.bool) is_val_idx[val_idxs[i]] = True train_idxs_list.append(torch.argwhere(~is_val_idx).squeeze(-1)) train_idxs = torch.stack(train_idxs_list, dim=0) if val_idxs.shape[0] == 1 and n_cv > 1: # replicate according to n_cv, such that an ensemble can be created train_idxs = train_idxs.expand(n_cv, -1) val_idxs = val_idxs.expand(n_cv, -1) elif n_cv != val_idxs.shape[0]: raise ValueError(f'Value provided for {n_cv=} is not equal to {val_idxs.shape[0]=}') 
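        # else below (added note): no explicit validation split was provided, so we fall back to a
        # random holdout split (n_cv == 1) or to (stratified) k-fold cross-validation, repeated
        # n_repeats times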
else: train_idxs_list = [] val_idxs_list = [] for i in range(n_repeats): if n_cv == 1: # random split splitter = RandomSplitter(seed=split_seed + i, first_fraction=1.0 - val_fraction) train_idxs, val_idxs = splitter.get_idxs(ds) train_idxs_list.append(train_idxs[None, :]) val_idxs_list.append(val_idxs[None, :]) else: splitter = KFoldSplitter(k=n_cv, seed=split_seed + i, stratified=self._is_classification()) idxs_tuples = splitter.get_idxs(ds) train_idxs_list.append(torch.stack([t[0] for t in idxs_tuples], dim=0)) val_idxs_list.append(torch.stack([t[1] for t in idxs_tuples], dim=0)) train_idxs = torch.cat(train_idxs_list, dim=0) val_idxs = torch.cat(val_idxs_list, dim=0) if val_idxs.shape[1] == 0: val_idxs = None # no validation set # print(f'{val_idxs=}') # print(f'{np.mean(X / (1e-8 + np.linalg.norm(X, axis=0, keepdims=True)))=}') idxs_list = [SplitIdxs(train_idxs=train_idxs, val_idxs=val_idxs, test_idxs=None, split_seed=split_seed, sub_split_seeds=sub_split_seeds, split_id=0)] # ----- resources ----- try: import psutil n_physical_threads = psutil.cpu_count(logical=False) except ImportError: # this assumes that there are 2 logical threads per physical thread n_logical_threads = mp.cpu_count() n_physical_threads = max(1, n_logical_threads // 2) device = params.get('device', None) if device == 'cuda': device = 'cuda:0' # 'cuda' doesn't work with some of the code n_threads = params.get('n_threads', n_physical_threads) self.n_threads_ = n_threads old_torch_n_threads = torch.get_num_threads() torch.set_num_threads(n_threads) gpu_devices = [] device_names = get_available_device_names() if device is None: allowed_device_names = [name for name in device_names if name.split(':')[0] in self._allowed_device_names()] if 'cuda:0' in allowed_device_names: gpu_devices.append('cuda:0') elif 'mps' in allowed_device_names: gpu_devices.append('mps') # print(f'{gpu_devices=}') # print(f'{self._allowed_device_names()=}') # print(f'{allowed_device_names=}') # print(f'{device_names=}') elif device != 'cpu': if device not in device_names: raise ValueError(f'Unknown device name "{device}", known device names are {device_names}') gpu_devices.append(device) tmp_folder: Optional[str] = params.get('tmp_folder', None) if tmp_folder is None: tmp_folders = [None] refit_tmp_folders = [None] else: tmp_path = Path(tmp_folder) # make sure that the refit stage doesn't load the models from the cv stage tmp_folders = [tmp_path / 'cv'] refit_tmp_folders = [tmp_path / 'refit'] logger = StdoutLogger(verbosity_level=params.get('verbosity', 0)) interface_resources = InterfaceResources(n_threads=n_threads, gpu_devices=gpu_devices, time_in_seconds=time_to_fit_in_seconds) self.cv_alg_interface_.fit(ds=ds, idxs_list=idxs_list, interface_resources=interface_resources, logger=logger, tmp_folders=tmp_folders, name=self.__class__.__name__) # todo: put alg_interface on the CPU after fit() (for saving)? How to do it? 
        # todo: currently, there is only one alg_interface which may fit in parallel (for the NNs),
        # but we could add an option to make them fit sequentially for RAM reasons or so
        # (maybe this is best done via a MultiSplitWrapper or so)
        if n_refit > 0:
            self.refit_alg_interface_ = self.cv_alg_interface_.get_refit_interface(n_refit=n_refit)
            train_idxs = torch.arange(ds.n_samples, dtype=torch.long)[None, :].expand(n_refit, -1)
            refit_idxs_list = [SplitIdxs(train_idxs=train_idxs, val_idxs=None, test_idxs=None,
                                         split_seed=refit_split_seed, sub_split_seeds=refit_sub_split_seeds,
                                         split_id=0)]
            self.refit_alg_interface_.fit(ds=ds, idxs_list=refit_idxs_list,
                                          interface_resources=interface_resources, logger=logger,
                                          tmp_folders=refit_tmp_folders, name=self.__class__.__name__ + ' [refit]')
            self.alg_interface_ = self.refit_alg_interface_
        else:
            self.alg_interface_ = self.cv_alg_interface_
        if hasattr(self.alg_interface_, 'fit_params') and len(self.alg_interface_.fit_params) > 0:
            self.fit_params_ = self.alg_interface_.fit_params[0]
        torch.set_num_threads(old_torch_n_threads)
        return self

    def _predict_raw(self, X) -> torch.Tensor:
        """
        Predicts logits (for classification) or mean outputs (for regression).

        :param X: Input data.
        :return: Returns a tensor of shape [n_ensemble, n_samples, output_dim].
        """
        # Check that fit has been called
        check_is_fitted(self, ['alg_interface_', 'x_converter_'])
        old_torch_n_threads = torch.get_num_threads()
        torch.set_num_threads(self.n_threads_)
        # Input validation
        # if isinstance(X, np.ndarray):
        check_array_wrapper(X, force_all_finite='allow-nan', dtype=None)
        x_ds = self.x_converter_.transform(to_df(X))
        if torch.any(torch.isnan(x_ds.tensors['x_cont'])):
            raise ValueError('NaN values in continuous columns are currently not allowed!')
        y_preds = self.alg_interface_.predict(x_ds).detach().cpu()
        torch.set_num_threads(old_torch_n_threads)
        return y_preds

    def to(self, device: str) -> None:
        """
        Move the model (only implemented for RealMLP at the moment) to the specified device.

        :param device: PyTorch-compatible device name.
        """
        self.cv_alg_interface_.to(device)
        if hasattr(self, 'refit_alg_interface_'):
            self.refit_alg_interface_.to(device)


class AlgInterfaceClassifier(ClassifierMixin, AlgInterfaceEstimator):
    # inheritance order is important in scikit-learn 1.6
    # otherwise sklearn.base.is_classifier(...) returns False
    def _is_classification(self) -> bool:
        return True

    def predict_proba(self, X) -> np.ndarray:
        y_preds = self._predict_raw(X)
        # y_preds are logits, so take the softmax and then the mean over the ensemble dimension
        y_probs = torch.softmax(y_preds, dim=-1).mean(dim=0)
        return y_probs.numpy()

    def predict_proba_ensemble(self, X) -> np.ndarray:
        # same as predict_proba but does not average over ensemble members
        y_preds = self._predict_raw(X)
        # y_preds are logits, so take the softmax but keep the ensemble dimension
        y_probs = torch.softmax(y_preds, dim=-1)
        return y_probs.numpy()

    def predict(self, X):
        """ Predict labels.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The predicted class label for each sample.
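        Examples
        --------
        A minimal sketch (``clf`` is a fitted classifier, ``X_test`` is a hypothetical array-like)::

            y_pred = clf.predict(X_test)       # shape (n_samples,)
            proba = clf.predict_proba(X_test)  # shape (n_samples, n_classes)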
""" y_probs = self.predict_proba(X) class_idxs = np.argmax(y_probs, axis=-1) return np.asarray(self.classes_)[class_idxs] def predict_ensemble(self, X): y_probs = self.predict_proba_ensemble(X) class_idxs = np.argmax(y_probs, axis=-1) return np.asarray(self.classes_)[class_idxs] class AlgInterfaceRegressor(RegressorMixin, AlgInterfaceEstimator): # inheritance order is important in scikit-learn 1.6 # otherwise sklearn.base.regressor(...) or is_regressor(...) returns False def _is_classification(self) -> bool: return False def _more_tags(self): return utils.join_dicts(super()._more_tags(), {'multioutput': self._supports_multioutput()}) def __sklearn_tags__(self): # from sklearn version 1.6+ tags = super().__sklearn_tags__() tags.target_tags.multi_output = self._supports_multioutput() return tags def predict(self, X): y_preds = self._predict_raw(X) y_np = y_preds.mean(dim=0).numpy() # print(f'{self.is_y_1d_=}') if self.is_y_1d_ and y_np.shape[1] == 1: y_np = y_np[:, 0] if self.is_y_float64_: y_np = y_np.astype(np.float64) return y_np def predict_ensemble(self, X): y_preds = self._predict_raw(X) y_np = y_preds.numpy() # print(f'{self.is_y_1d_=}') if self.is_y_1d_: y_np = y_np[:, :, 0] if self.is_y_float64_: y_np = y_np.astype(np.float64) return y_np ================================================ FILE: pytabkit/models/sklearn/sklearn_interfaces.py ================================================ import pathlib from typing import Optional, Any, Union, List, Dict, Literal import numpy as np from pytabkit.models import utils from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.sklearn.sklearn_base import AlgInterfaceRegressor, AlgInterfaceClassifier from pytabkit.models.alg_interfaces.sub_split_interfaces import SingleSplitWrapperAlgInterface from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface, RandomParamsNNAlgInterface from pytabkit.models.alg_interfaces.ensemble_interfaces import AlgorithmSelectionAlgInterface, \ CaruanaEnsembleAlgInterface # the list of methods can be auto-generated using scripts/get_sklearn_names.py __all__ = ["CatBoost_D_Classifier", "CatBoost_D_Regressor", "CatBoost_HPO_Classifier", "CatBoost_HPO_Regressor", "CatBoost_HPO_TPE_Classifier", "CatBoost_HPO_TPE_Regressor", "CatBoost_TD_Classifier", "CatBoost_TD_Regressor", "Ensemble_TD_Classifier", "Ensemble_TD_Regressor", "Ensemble_HPO_Classifier", "Ensemble_HPO_Regressor", "FTT_D_Classifier", "FTT_D_Regressor", "FTT_HPO_Classifier", "FTT_HPO_Regressor", "LGBM_D_Classifier", "LGBM_D_Regressor", "LGBM_HPO_Classifier", "LGBM_HPO_Regressor", "LGBM_HPO_TPE_Classifier", "LGBM_HPO_TPE_Regressor", "LGBM_TD_Classifier", "LGBM_TD_Regressor", "MLP_PLR_D_Classifier", "MLP_PLR_D_Regressor", "MLP_PLR_HPO_Classifier", "MLP_PLR_HPO_Regressor", "MLP_RTDL_D_Classifier", "MLP_RTDL_D_Regressor", "MLP_RTDL_HPO_Classifier", "MLP_RTDL_HPO_Regressor", "MLP_SKL_D_Classifier", "MLP_SKL_D_Regressor", "RF_HPO_Classifier", "RF_HPO_Regressor", "RF_SKL_D_Classifier", "RF_SKL_D_Regressor", "RealMLP_HPO_Classifier", "RealMLP_HPO_Regressor", "RealMLP_TD_Classifier", "RealMLP_TD_Regressor", "RealMLP_TD_S_Classifier", "RealMLP_TD_S_Regressor", "RealTabR_D_Classifier", "RealTabR_D_Regressor", "Resnet_RTDL_D_Classifier", "Resnet_RTDL_D_Regressor", "Resnet_RTDL_HPO_Classifier", "Resnet_RTDL_HPO_Regressor", "TabR_HPO_Classifier", "TabR_HPO_Regressor", "TabR_S_D_Classifier", "TabR_S_D_Regressor", "TabM_D_Classifier", "TabM_D_Regressor", 
"TabM_HPO_Classifier", "TabM_HPO_Regressor", "XRFM_D_Classifier", "XRFM_D_Regressor", "XRFM_HPO_Classifier", "XRFM_HPO_Regressor", "XGB_D_Classifier", "XGB_D_Regressor", "XGB_HPO_Classifier", "XGB_HPO_Regressor", "XGB_HPO_TPE_Classifier", "XGB_HPO_TPE_Regressor", "XGB_PBB_D_Classifier", "XGB_TD_Classifier", "XGB_TD_Regressor"] class RealMLPConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, train_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, n_epochs: Optional[int] = None, batch_size: Optional[int] = None, predict_batch_size: Optional[int] = None, hidden_sizes: Optional[Union[List[int], Literal['rectangular']]] = None, n_hidden_layers: Optional[int] = None, hidden_width: Optional[int] = None, tfms: Optional[List[str]] = None, num_emb_type: Optional[str] = None, use_plr_embeddings: Optional[bool] = None, plr_sigma: Optional[float] = None, plr_hidden_1: Optional[int] = None, plr_hidden_2: Optional[int] = None, plr_act_name: Optional[str] = None, plr_use_densenet: Optional[bool] = None, plr_use_cos_bias: Optional[bool] = None, plr_lr_factor: Optional[float] = None, max_one_hot_cat_size: Optional[int] = None, embedding_size: Optional[int] = None, act: Optional[str] = None, use_parametric_act: Optional[bool] = None, act_lr_factor: Optional[float] = None, weight_param: Optional[str] = None, weight_init_mode: Optional[str] = None, weight_init_gain: Optional[str] = None, weight_lr_factor: Optional[float] = None, bias_init_mode: Optional[str] = None, bias_lr_factor: Optional[float] = None, bias_wd_factor: Optional[float] = None, add_front_scale: Optional[bool] = None, scale_lr_factor: Optional[float] = None, first_layer_lr_factor: Optional[float] = None, block_str: Optional[str] = None, first_layer_config: Optional[Dict[str, Any]] = None, last_layer_config: Optional[Dict[str, Any]] = None, middle_layer_config: Optional[Dict[str, Any]] = None, p_drop: Optional[float] = None, p_drop_sched: Optional[str] = None, wd: Optional[float] = None, wd_sched: Optional[str] = None, opt: Optional[str] = None, lr: Optional[Union[float, Dict[str, float]]] = None, lr_sched: Optional[str] = None, mom: Optional[float] = None, mom_sched: Optional[str] = None, sq_mom: Optional[float] = None, sq_mom_sched: Optional[str] = None, opt_eps: Optional[float] = None, opt_eps_sched: Optional[str] = None, normalize_output: Optional[bool] = None, clamp_output: Optional[bool] = None, use_ls: Optional[bool] = None, ls_eps: Optional[float] = None, ls_eps_sched: Optional[str] = None, use_early_stopping: Optional[bool] = None, early_stopping_additive_patience: Optional[int] = None, early_stopping_multiplicative_patience: Optional[float] = None, calibration_method: Optional[str] = None, sort_quantile_predictions: Optional[bool] = None, stop_epoch: Optional[int] = None, use_best_mean_epoch_for_cv: Optional[bool] = None, n_ens: Optional[int] = None, ens_av_before_softmax: Optional[int] = None, ): """ Constructor for RealMLP, using the default parameters from RealMLP-TD. For lists of default parameters, we refer to pytabkit.models.sklearn.default_params.DefaultParams. RealMLP-TD does automatic preprocessing, so no manual preprocessing is necessary except for imputing missing numerical values. 
Tips for modifications: * For faster training: For large datasets (say >50K samples), especially on GPUs, increase batch_size. It can also help to decrease n_epochs, set use_plr_embeddings=False (in case of many numerical features), increase max_one_hot_cat_size (in case of large-cardinality categories), or set use_parametric_act=False * For more accuracy: You can try increasing n_epochs or hidden_sizes while also decreasing lr. * For classification, if you care about metrics like cross-entropy or AUC instead of accuracy, we recommend setting val_metric_name='cross_entropy' and use_ls=False. :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices or an explicit validation set are given in fit(), `n_cv` models will be fitted using different random seeds. Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). For n_cv=1, a single train-validation split will be used, where `val_fraction` controls the fraction of validation samples. If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger number of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Temporary folder in which data can be stored during fit(). (Currently unused for RealMLP-TD and variants.) If None, methods generally try to not store intermediate data. Note that HPO and ensemble methods can use this to reduce RAM usage by storing fitted models, and will need this folder to be available whenever they are used. :param verbosity: Verbosity level (default=0, higher means more verbose). Set to 2 to see logs from intermediate epochs. :param train_metric_name: Name of the training metric (default='cross_entropy' for classification and 'mse' for regression). Currently most other metrics are not available for training. :param val_metric_name: Name of the validation metric (used for selecting the best epoch). Defaults are 'class_error' for classification and 'rmse' for regression. Main available classification metrics (all to be minimized): 'class_error', 'cross_entropy', '1-auc_ovo', '1-auc_ovr', '1-auc_mu', 'brier', '1-balanced_accuracy', '1-mcc', 'ece'. 
            Main available regression metrics: 'rmse', 'mae', 'max_error', 'pinball(0.95)'
            (also works with other quantiles specified directly in the string).
            For more metrics, we refer to `models.training.metrics.Metrics.apply()`.
        :param n_epochs: Number of epochs to train the model for (default=256).
        :param batch_size: Batch size to be used for fit() (default=256).
        :param predict_batch_size: Batch size to be used for predict() (default=1024).
        :param hidden_sizes: List of numbers of neurons for each hidden layer (default=[256, 256, 256]).
            If this is set to 'rectangular', then [hidden_width] * n_hidden_layers will be used instead.
        :param n_hidden_layers: Number of hidden layers (default=3). Only used if hidden_sizes=='rectangular'.
        :param hidden_width: Width of each hidden layer (default=256). Only used if hidden_sizes=='rectangular'.
        :param tfms: List of preprocessing transformations,
            default=`['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding']`.
            Other possible transformations include: 'median_center', 'l2_normalize', 'l1_normalize',
            'quantile', 'kdi'.
        :param num_emb_type: Type of numerical embeddings used (default='pbld').
            If not set to 'ignore', it overrides the parameters `use_plr_embeddings`, `plr_act_name`,
            `plr_use_densenet`, `plr_use_cos_bias`.
            Possible values: 'ignore', 'none' (no numerical embeddings), 'pl', 'plr', 'pbld', 'pblrd'.
        :param use_plr_embeddings: Whether PLR (or PL) numerical embeddings should be used (default=True).
        :param plr_sigma: Initialization standard deviation for the first PLR embedding layer (default=0.1).
        :param plr_hidden_1: (Half of the) number of hidden neurons in the first PLR hidden layer (default=8).
            This number will be doubled since there are sin() and cos() versions for each hidden neuron.
        :param plr_hidden_2: Number of output neurons of the PLR hidden layer,
            excluding the optional densenet connection (default=7).
        :param plr_act_name: Name of the PLR activation function (default='linear').
            Use 'relu' for the PLR version and 'linear' for the PL version.
        :param plr_use_densenet: Whether to append the original feature to the numerical embeddings (default=True).
        :param plr_use_cos_bias: Whether to use the cos(wx+b) version for the periodic embeddings
            instead of the (sin(wx), cos(wx)) version (default=True).
        :param plr_lr_factor: Learning rate factor for PLR embeddings (default=0.1).
            It gets multiplied with lr and with the value of the schedule.
        :param max_one_hot_cat_size: Maximum category size that one-hot encoding should be applied to,
            including the category for missing/unknown values (default=9).
        :param embedding_size: Number of output features of categorical embedding layers (default=8).
        :param act: Activation function (default='selu' for classification and 'mish' for regression).
            Can also be 'relu' or 'silu'.
        :param use_parametric_act: Whether to use a parametric activation as described in the paper (default=True).
        :param act_lr_factor: Learning rate factor for the parametric activation (default=0.1).
        :param weight_param: Weight parametrization (default='ntk'). See models.nn.WeightFitter() for more options.
        :param weight_init_mode: Weight initialization mode (default='std').
            See models.nn.WeightFitter() for more options.
        :param weight_init_gain: Multiplier for the weight initialization standard deviation.
            (Does not apply to the 'std' initialization mode.)
        :param weight_lr_factor: Learning rate factor for weights.
        :param bias_init_mode: Bias initialization mode (default='he+5'). See models.nn.BiasFitter() for more options.
        :param bias_lr_factor: Bias learning rate factor.
        :param bias_wd_factor: Bias weight decay factor.
        :param add_front_scale: Whether to add a scaling layer (diagonal weight matrix)
            before the linear layers (default=True). If set to True and a scaling layer is already configured
            in the block_str, this will create an additional scaling layer.
        :param scale_lr_factor: Scaling layer learning rate factor
            (default=1.0, but overridden by default for the first layer in first_layer_config).
        :param first_layer_lr_factor: First layer learning rate factor (default=1.0).
        :param block_str: String describing the default hidden layer components.
            The default is 'w-b-a-d' for weight, bias, activation, dropout.
            By default, the last layer config will override it with 'w-b' and the first layer config
            will override it with 's-w-b-a-d', where the 's' stands for the scaling layer.
        :param first_layer_config: Dictionary with more options that can override the other options
            for the construction of the first MLP layer specifically.
            The default is dict(block_str='s-w-b-a-d', scale_lr_factor=6.0),
            using a scaling layer at the beginning of the first layer with lr factor 6.0.
        :param last_layer_config: Dictionary with more options that can override the other options
            for the construction of the last MLP layer specifically.
            The default is an empty dict, in which case the block_str will still be overridden by 'w-b'.
        :param middle_layer_config: Dictionary with more options that can override the other options
            for the construction of all layers except the first and last MLP layer.
            The default is an empty dict.
        :param p_drop: Dropout probability (default=0.15). Needs to be in [0, 1).
        :param p_drop_sched: Dropout schedule (default='flat_cos').
        :param wd: Weight decay, implemented as in PyTorch AdamW but working with all optimizers
            (default=0.0 for regression and 1e-2 for classification). Weight decay is implemented as
            param -= current_lr_value * current_wd_value * param,
            where the current lr and wd values are determined using the base values (lr and wd),
            factors for the given parameter if available, and the respective schedule.
            Note that this is not identical to the original AdamW paper,
            where the lr base value is not included in the update equation.
        :param wd_sched: Weight decay schedule.
        :param opt: Optimizer (default='adam'). See optim.optimizers.get_opt_class().
        :param lr: Learning rate base value (default=0.04 for classification and 0.14 for regression).
        :param lr_sched: Learning rate schedule (default='coslog4'). See training.scheduling.get_schedule().
        :param mom: Momentum parameter, aka :math:`\\beta_1` for Adam (default=0.9).
        :param mom_sched: Momentum schedule (default='constant').
        :param sq_mom: Momentum of squared gradients, aka :math:`\\beta_2` for Adam (default=0.95).
        :param sq_mom_sched: Schedule for sq_mom (default='constant').
        :param opt_eps: Epsilon parameter of the optimizer (default=1e-8 for Adam).
        :param opt_eps_sched: Schedule for opt_eps (default='constant').
        :param normalize_output: Whether to standardize the target for regression (default=True for regression).
        :param clamp_output: Whether to clamp the output of predict() for regression
            to the min/max range seen during training (default=True for regression).
        :param use_ls: Whether to use label smoothing for classification (default=True for classification).
        :param ls_eps: Epsilon parameter for label smoothing (default=0.1 for classification).
        :param ls_eps_sched: Schedule for ls_eps (default='constant').
        :param use_early_stopping: Whether to use early stopping (default=False).
            Note that even without early stopping, the best epoch on the validation set is selected
            if there is a validation set. Training is stopped if the epoch exceeds
            early_stopping_multiplicative_patience * best_epoch + early_stopping_additive_patience.
        :param early_stopping_additive_patience: See use_early_stopping (default=20).
        :param early_stopping_multiplicative_patience: See use_early_stopping (default=2).
            We recommend setting it to 1 for monotone learning rate schedules
            but keeping it at 2 for the default schedule.
        :param calibration_method: Post-hoc calibration method (only for classification).
            We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods,
            see the get_calibrator method in https://github.com/dholzmueller/probmetrics.
        :param sort_quantile_predictions: If val_metric_name=='multi_pinball(...)', decides whether
            the predicted quantiles will be sorted to avoid quantile crossover (default=True).
        :param stop_epoch: Epoch at which training should be stopped (for refitting).
            The total length of training used for the schedules will be determined by n_epochs,
            but the stopping epoch will be min(stop_epoch, n_epochs).
        :param use_best_mean_epoch_for_cv: When training an ensemble, whether all members should use
            a checkpoint from the same epoch (the one with the best average loss)
            instead of their individually best epochs (default=False).
        :param n_ens: Number of ensemble members that should be used per train-validation split (default=1).
            For best-epoch selection, the validation scores of the averaged predictions will be used.
        :param ens_av_before_softmax: When using classification with n_ens>1, whether to average
            the ensemble predictions on each train-val split before taking the softmax (default=False).
            We recommend using False, as it is representative of the averaging of models across train-val splits.
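
        Example: a minimal usage sketch (the top-level import path follows the README;
        the data variables X_train, y_train, X_test are assumed placeholders)::

            from pytabkit import RealMLP_TD_Classifier

            clf = RealMLP_TD_Classifier(n_cv=1, device='cpu', random_state=0)
            clf.fit(X_train, y_train)
            proba = clf.predict_proba(X_test)  # averaged over CV models if n_cv > 1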
""" super().__init__() # call the constructor of the other superclass for multiple inheritance self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.train_metric_name = train_metric_name self.val_metric_name = val_metric_name self.n_epochs = n_epochs self.batch_size = batch_size self.predict_batch_size = predict_batch_size self.hidden_sizes = hidden_sizes self.n_hidden_layers = n_hidden_layers self.hidden_width = hidden_width self.tfms = tfms self.max_one_hot_cat_size = max_one_hot_cat_size self.embedding_size = embedding_size self.num_emb_type = num_emb_type self.use_plr_embeddings = use_plr_embeddings self.plr_sigma = plr_sigma self.plr_hidden_1 = plr_hidden_1 self.plr_hidden_2 = plr_hidden_2 self.plr_act_name = plr_act_name self.plr_use_densenet = plr_use_densenet self.plr_use_cos_bias = plr_use_cos_bias self.plr_lr_factor = plr_lr_factor self.act = act self.use_parametric_act = use_parametric_act self.act_lr_factor = act_lr_factor self.weight_param = weight_param self.weight_init_mode = weight_init_mode self.weight_init_gain = weight_init_gain self.weight_lr_factor = weight_lr_factor self.bias_init_mode = bias_init_mode self.bias_lr_factor = bias_lr_factor self.bias_wd_factor = bias_wd_factor self.add_front_scale = add_front_scale self.scale_lr_factor = scale_lr_factor self.first_layer_lr_factor = first_layer_lr_factor self.block_str = block_str self.first_layer_config = first_layer_config self.last_layer_config = last_layer_config self.middle_layer_config = middle_layer_config self.p_drop = p_drop self.p_drop_sched = p_drop_sched self.wd = wd self.wd_sched = wd_sched self.opt = opt self.lr = lr self.lr_sched = lr_sched self.mom = mom self.mom_sched = mom_sched self.sq_mom = sq_mom self.sq_mom_sched = sq_mom_sched self.opt_eps = opt_eps self.opt_eps_sched = opt_eps_sched self.normalize_output = normalize_output self.clamp_output = clamp_output self.use_ls = use_ls self.ls_eps = ls_eps self.ls_eps_sched = ls_eps_sched self.use_early_stopping = use_early_stopping self.early_stopping_additive_patience = early_stopping_additive_patience self.early_stopping_multiplicative_patience = early_stopping_multiplicative_patience self.calibration_method = calibration_method self.sort_quantile_predictions = sort_quantile_predictions self.stop_epoch = stop_epoch self.use_best_mean_epoch_for_cv = use_best_mean_epoch_for_cv self.n_ens = n_ens self.ens_av_before_softmax = ens_av_before_softmax class RealMLP_TD_Classifier(RealMLPConstructorMixin, AlgInterfaceClassifier): """ MLP-TD classifier. For constructor parameters, see `MLPConstructorMixin`. """ def _get_default_params(self): return DefaultParams.RealMLP_TD_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: return NNAlgInterface(**self.get_config()) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealMLP_TD_S_Classifier(RealMLPConstructorMixin, AlgInterfaceClassifier): """ MLP-TD-S classifier. For constructor parameters, see `MLPConstructorMixin`. """ def _get_default_params(self): return DefaultParams.RealMLP_TD_S_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: return NNAlgInterface(**self.get_config()) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealMLP_TD_Regressor(RealMLPConstructorMixin, AlgInterfaceRegressor): """ MLP-TD regressor. 

class RealMLP_TD_Regressor(RealMLPConstructorMixin, AlgInterfaceRegressor):
    """
    MLP-TD regressor. For constructor parameters, see `RealMLPConstructorMixin`.
    """

    def _get_default_params(self):
        return DefaultParams.RealMLP_TD_REG

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        return NNAlgInterface(**self.get_config())

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']


class RealMLP_TD_S_Regressor(RealMLPConstructorMixin, AlgInterfaceRegressor):
    """
    MLP-TD-S regressor. For constructor parameters, see `RealMLPConstructorMixin`.
    """

    def _get_default_params(self):
        return DefaultParams.RealMLP_TD_S_REG

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        return NNAlgInterface(**self.get_config())

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']


# --------------------------------- GBDTs -----------------------------------

class LGBMConstructorMixin:
    def __init__(self,
                 device: Optional[str] = None,
                 random_state: Optional[Union[int, np.random.RandomState]] = None,
                 n_cv: int = 1,
                 n_refit: int = 0,
                 n_repeats: int = 1,
                 val_fraction: float = 0.2,
                 n_threads: Optional[int] = None,
                 tmp_folder: Optional[Union[str, pathlib.Path]] = None,
                 verbosity: int = 0,
                 n_estimators: Optional[int] = None,
                 max_depth: Optional[int] = None,
                 num_leaves: Optional[int] = None,
                 lr: Optional[float] = None,
                 subsample: Optional[float] = None,
                 colsample_bytree: Optional[float] = None,
                 bagging_freq: Optional[float] = None,
                 min_data_in_leaf: Optional[int] = None,
                 min_sum_hessian_in_leaf: Optional[int] = None,
                 lambda_l1: Optional[float] = None,
                 lambda_l2: Optional[float] = None,
                 boosting: Optional[str] = None,
                 max_bin: Optional[int] = None,
                 cat_smooth: Optional[float] = None,
                 cat_l2: Optional[float] = None,
                 val_metric_name: Optional[str] = None,
                 calibration_method: Optional[str] = None,
                 ):
        self.device = device
        self.random_state = random_state
        self.n_cv = n_cv
        self.n_refit = n_refit
        self.n_repeats = n_repeats
        self.val_fraction = val_fraction
        self.n_threads = n_threads
        self.tmp_folder = tmp_folder
        self.verbosity = verbosity
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.num_leaves = num_leaves
        self.lr = lr
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.bagging_freq = bagging_freq
        self.min_data_in_leaf = min_data_in_leaf
        self.min_sum_hessian_in_leaf = min_sum_hessian_in_leaf
        self.lambda_l1 = lambda_l1
        self.lambda_l2 = lambda_l2
        self.boosting = boosting
        self.max_bin = max_bin
        self.cat_smooth = cat_smooth
        self.cat_l2 = cat_l2
        self.val_metric_name = val_metric_name
        self.calibration_method = calibration_method


class LGBM_TD_Classifier(LGBMConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return DefaultParams.LGBM_TD_CLASS

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface
        return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)])


class LGBM_D_Classifier(LGBMConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return DefaultParams.LGBM_D

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface
        return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)])


class LGBM_TD_Regressor(LGBMConstructorMixin, AlgInterfaceRegressor):
    def _get_default_params(self):
        return DefaultParams.LGBM_TD_REG

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.lightgbm_interfaces import
LGBMSubSplitInterface return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False class LGBM_D_Regressor(LGBMConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.LGBM_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface return SingleSplitWrapperAlgInterface([LGBMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False class XGBConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, train_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None, n_estimators: Optional[int] = None, max_depth: Optional[int] = None, lr: Optional[float] = None, subsample: Optional[float] = None, colsample_bytree: Optional[float] = None, colsample_bylevel: Optional[float] = None, colsample_bynode: Optional[float] = None, min_child_weight: Optional[float] = None, alpha: Optional[float] = None, reg_lambda: Optional[float] = None, gamma: Optional[float] = None, tree_method: Optional[str] = None, max_delta_step: Optional[float] = None, max_cat_to_onehot: Optional[int] = None, num_parallel_tree: Optional[int] = None, max_bin: Optional[int] = None, multi_strategy: Optional[str] = None, calibration_method: Optional[str] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.train_metric_name = train_metric_name self.val_metric_name = val_metric_name self.n_estimators = n_estimators self.max_depth = max_depth self.lr = lr self.subsample = subsample self.colsample_bytree = colsample_bytree self.colsample_bylevel = colsample_bylevel self.colsample_bynode = colsample_bynode self.min_child_weight = min_child_weight self.alpha = alpha self.reg_lambda = reg_lambda self.gamma = gamma self.tree_method = tree_method self.max_delta_step = max_delta_step self.max_cat_to_onehot = max_cat_to_onehot self.num_parallel_tree = num_parallel_tree self.max_bin = max_bin self.multi_strategy = multi_strategy self.calibration_method = calibration_method class XGB_TD_Classifier(XGBConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XGB_TD_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_D_Classifier(XGBConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XGB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_PBB_D_Classifier(XGBConstructorMixin, 
AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XGB_PBB_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_TD_Regressor(XGBConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.XGB_TD_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _supports_multioutput(self) -> bool: return False class XGB_D_Regressor(XGBConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.XGB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface return SingleSplitWrapperAlgInterface([XGBSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _supports_multioutput(self) -> bool: return False class CatBoostConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_estimators: Optional[int] = None, max_depth: Optional[int] = None, lr: Optional[float] = None, subsample: Optional[float] = None, colsample_bylevel: Optional[float] = None, random_strength: Optional[float] = None, bagging_temperature: Optional[float] = None, leaf_estimation_iterations: Optional[int] = None, bootstrap_type: Optional[str] = None, boosting_type: Optional[str] = None, min_data_in_leaf: Optional[int] = None, grow_policy: Optional[str] = None, num_leaves: Optional[int] = None, max_bin: Optional[int] = None, # renamed from border_count since it is named max_bin in the default parameters l2_leaf_reg: Optional[float] = None, one_hot_max_size: Optional[int] = None, val_metric_name: Optional[str] = None, train_metric_name: Optional[str] = None, calibration_method: Optional[str] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_estimators = n_estimators self.max_depth = max_depth self.lr = lr self.subsample = subsample self.colsample_bylevel = colsample_bylevel self.random_strength = random_strength self.bagging_temperature = bagging_temperature self.leaf_estimation_iterations = leaf_estimation_iterations self.bootstrap_type = bootstrap_type self.boosting_type = boosting_type self.min_data_in_leaf = min_data_in_leaf self.grow_policy = grow_policy self.num_leaves = num_leaves self.max_bin = max_bin self.l2_leaf_reg = l2_leaf_reg self.one_hot_max_size = one_hot_max_size self.val_metric_name = val_metric_name self.train_metric_name = train_metric_name self.calibration_method = calibration_method class CatBoost_TD_Classifier(CatBoostConstructorMixin, AlgInterfaceClassifier): def 
_get_default_params(self): return DefaultParams.CB_TD_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_D_Classifier(CatBoostConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.CB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_TD_Regressor(CatBoostConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.CB_TD_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_D_Regressor(CatBoostConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.CB_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface return SingleSplitWrapperAlgInterface([CatBoostSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class RFConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_estimators: Optional[int] = None, calibration_method: Optional[str] = None, ): """ Validation set is not used. :param device: :param random_state: :param n_cv: :param n_refit: :param n_repeats: :param val_fraction: :param n_threads: :param tmp_folder: :param verbosity: :param n_estimators: :param calibration_method: Post-hoc calibration method (only for classification). We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. 
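
        Example: an illustrative sketch (data variables are assumed placeholders;
        the import path matches this module)::

            from pytabkit.models.sklearn.sklearn_interfaces import RF_SKL_D_Classifier

            clf = RF_SKL_D_Classifier(n_estimators=500, calibration_method='ts-mix')
            clf.fit(X_train, y_train)
            proba = clf.predict_proba(X_test)  # probabilities after 'ts-mix' calibration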
""" self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_estimators = n_estimators self.calibration_method = calibration_method class RF_SKL_D_Classifier(RFConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.RF_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface return SingleSplitWrapperAlgInterface([RFSubSplitInterface(**self.get_config()) for i in range(n_cv)]) class RF_SKL_D_Regressor(RFConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.RF_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface return SingleSplitWrapperAlgInterface([RFSubSplitInterface(**self.get_config()) for i in range(n_cv)]) class MLPSKLConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = calibration_method class MLP_SKL_D_Classifier(MLPSKLConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.MLP_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import SklearnMLPSubSplitInterface return SingleSplitWrapperAlgInterface([SklearnMLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) class MLP_SKL_D_Regressor(MLPSKLConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.MLP_SKL_D def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import SklearnMLPSubSplitInterface return SingleSplitWrapperAlgInterface([SklearnMLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) # HPO methods class GBDTHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_estimators: Optional[int] = None, hpo_space_name: Optional[str] = None, n_hyperopt_steps: Optional[int] = None, calibration_method: Optional[str] = None, use_caruana_ensembling: Optional[bool] = None, time_limit_s: Optional[float] = None, ): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_estimators = n_estimators self.hpo_space_name = hpo_space_name self.n_hyperopt_steps = n_hyperopt_steps self.calibration_method = calibration_method self.use_caruana_ensembling = 
use_caruana_ensembling self.time_limit_s = time_limit_s class XGB_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsXGBAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_HPO_TPE_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, tree_method='hist', hpo_space_name='grinsztajn') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBHyperoptAlgInterface return XGBHyperoptAlgInterface(**self.get_config()) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class XGB_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsXGBAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False class XGB_HPO_TPE_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, tree_method='hist', hpo_space_name='grinsztajn') def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBHyperoptAlgInterface return XGBHyperoptAlgInterface(**self.get_config()) def _supports_multioutput(self) -> bool: return False class LGBM_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsLGBMAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) class LGBM_HPO_TPE_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return 
dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='catboost_quality_benchmarks') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMHyperoptAlgInterface return LGBMHyperoptAlgInterface(**self.get_config()) class LGBM_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface([RandomParamsLGBMAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False class LGBM_HPO_TPE_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='catboost_quality_benchmarks') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMHyperoptAlgInterface return LGBMHyperoptAlgInterface(**self.get_config()) def _supports_multioutput(self) -> bool: return False class CatBoost_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsCatBoostAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_HPO_TPE_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='shwartz-ziv') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostHyperoptAlgInterface return CatBoostHyperoptAlgInterface(**self.get_config()) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', 
False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsCatBoostAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class CatBoost_HPO_TPE_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_estimators=1000, n_hyperopt_steps=50, early_stopping_rounds=300, hpo_space_name='shwartz-ziv') def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostHyperoptAlgInterface return CatBoostHyperoptAlgInterface(**self.get_config()) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda'] class RF_HPO_Classifier(GBDTHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RandomParamsRFAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsRFAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class RF_HPO_Regressor(GBDTHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self) -> Dict[str, Any]: return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.other_interfaces import RandomParamsRFAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [SingleSplitWrapperAlgInterface( [RandomParamsRFAlgInterface(model_idx=i, **config) for j in range(n_cv)]) for i in range(n_hyperopt_steps)], **config) def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class RealMLPHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_hyperopt_steps: Optional[int] = None, val_metric_name: Optional[str] = None, calibration_method: Optional[str] = None, hpo_space_name: Optional[str] = None, n_caruana_steps: Optional[int] = None, n_epochs: Optional[int] = None, use_caruana_ensembling: Optional[bool] = None, train_metric_name: Optional[str] = None, time_limit_s: Optional[float] = None, ): """ :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. 
        :param random_state: Random state to use for random number generation
            (splitting, initialization, batch shuffling). If None, the behavior is not deterministic.
        :param n_cv: Number of cross-validation splits to use (default=1).
            If validation set indices or an explicit validation set are given in fit(),
            `n_cv` models will be fitted using different random seeds.
            Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification).
            For n_cv=1, a single train-validation split will be used,
            where `val_fraction` controls the fraction of validation samples.
            If `n_refit=0` is set, the prediction will use the average of the models fitted
            during cross-validation. (Averaging is over probabilities for classification,
            and over outputs for regression.) Otherwise, refitted models will be used.
        :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0).
            If zero, only the models from the cross-validation stage are used.
            If positive, `n_refit` models will be fitted on the training+validation dataset
            (all data given in fit()) and their predictions will be averaged during predict().
        :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1).
            Values != 1 are only allowed when no custom validation split is provided.
            A larger number of repeats makes things slower but reduces the potential
            for validation set overfitting, especially on smaller datasets.
        :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1).
            Only used if `n_cv==1` and no validation split is provided in fit().
        :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores).
        :param tmp_folder: Folder in which models can be stored. Setting this allows reducing RAM/VRAM usage
            by not having all models in RAM at the same time. In this case, the folder needs to be preserved
            as long as the model exists (including when the model is pickled to disk).
        :param verbosity: Verbosity level (default=0, higher means more verbose).
            Set to 2 to see logs from intermediate epochs.
        :param n_hyperopt_steps: Number of random hyperparameter configs
            that should be used to train models (default=50).
        :param val_metric_name: Name of the validation metric (used for selecting the best epoch).
            Not used by all models, but used at least by RealMLP and probably TabM.
            Defaults are 'class_error' for classification and 'rmse' for regression.
            Main available classification metrics (all to be minimized): 'class_error', 'cross_entropy',
            '1-auc_ovo', '1-auc_ovr', '1-auc_mu', 'brier', '1-balanced_accuracy', '1-mcc', 'ece'.
            Main available regression metrics: 'rmse', 'mae', 'max_error', 'pinball(0.95)'
            (also works with other quantiles specified directly in the string).
            For more metrics, we refer to `models.training.metrics.Metrics.apply()`.
        :param calibration_method: Post-hoc calibration method (only for classification) (default=None).
            We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods,
            see the get_calibrator method in https://github.com/dholzmueller/probmetrics.
        :param hpo_space_name: Name of the HPO space (default='default').
            The search space used in the paper for RealMLP is 'default'.
            However, we recommend using 'tabarena' for the best results.
        :param n_caruana_steps: Number of weight update iterations for Caruana et al.
            weighted ensembling (default=40). Only used when use_caruana_ensembling=True.
        :param n_epochs: Number of epochs to train for each NN (default=None).
            If set, it will override the values from the search space.
            (Might be ignored for non-RealMLP methods.)
        :param use_caruana_ensembling: Whether to use the algorithm by Caruana et al. (2004)
            to select a weighted ensemble of models instead of only selecting the best model (default=False).
            A usage sketch follows the HPO classes below.
        :param train_metric_name: Name of the training metric
            (default is 'cross_entropy' for classification and 'mse' for regression).
            For regression, pinball/multi_pinball can be used instead.
            (Might be ignored for non-RealMLP methods.)
        :param time_limit_s: Time limit in seconds (default=None).
        """
        self.device = device
        self.random_state = random_state
        self.n_cv = n_cv
        self.n_refit = n_refit
        self.n_repeats = n_repeats
        self.val_fraction = val_fraction
        self.n_threads = n_threads
        self.tmp_folder = tmp_folder
        self.verbosity = verbosity
        self.n_hyperopt_steps = n_hyperopt_steps
        self.val_metric_name = val_metric_name
        self.calibration_method = calibration_method
        self.hpo_space_name = hpo_space_name
        self.n_caruana_steps = n_caruana_steps
        self.n_epochs = n_epochs
        self.use_caruana_ensembling = use_caruana_ensembling
        self.train_metric_name = train_metric_name
        self.time_limit_s = time_limit_s


class RealMLP_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return dict(n_hyperopt_steps=50)

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        config = self.get_config()
        n_hyperopt_steps = config['n_hyperopt_steps']
        interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) \
            else AlgorithmSelectionAlgInterface
        return interface_type([RandomParamsNNAlgInterface(model_idx=i, **config)
                               for i in range(n_hyperopt_steps)], **config)

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']


class RealMLP_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor):
    def _get_default_params(self):
        return dict(n_hyperopt_steps=50)

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        config = self.get_config()
        n_hyperopt_steps = config['n_hyperopt_steps']
        interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) \
            else AlgorithmSelectionAlgInterface
        return interface_type([RandomParamsNNAlgInterface(model_idx=i, **config)
                               for i in range(n_hyperopt_steps)], **config)

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']
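
# Illustrative sketch (comment only; values are assumptions, not recommendations): random-search
# HPO with Caruana et al. weighted ensembling and a wall-clock budget, using the parameters
# documented in `RealMLPHPOConstructorMixin` above.
#
#     clf = RealMLP_HPO_Classifier(n_hyperopt_steps=30, use_caruana_ensembling=True,
#                                  time_limit_s=3600, val_metric_name='cross_entropy')
#     clf.fit(X_train, y_train)
#     proba = clf.predict_proba(X_test)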

class ResnetConstructorMixin:
    def __init__(self,
                 module_d_embedding: Optional[int] = None,
                 module_d: Optional[int] = None,
                 module_d_hidden_factor: Optional[float] = None,
                 module_n_layers: Optional[int] = None,
                 module_activation: Optional[str] = None,
                 module_normalization: Optional[str] = None,
                 module_hidden_dropout: Optional[float] = None,
                 module_residual_dropout: Optional[float] = None,
                 verbose: Optional[int] = None,
                 max_epochs: Optional[int] = None,
                 batch_size: Optional[int] = None,
                 optimizer: Optional[str] = None,
                 es_patience: Optional[int] = None,
                 lr: Optional[float] = None,
                 lr_scheduler: Optional[bool] = None,
                 lr_patience: Optional[int] = None,
                 optimizer_weight_decay: Optional[float] = None,
                 use_checkpoints: Optional[bool] = None,
                 transformed_target: Optional[bool] = None,
                 tfms: Optional[List[str]] = None,
                 quantile_output_distribution: Optional[str] = None,
                 val_metric_name: Optional[str] = None,
                 device: Optional[str] = None,
                 random_state: Optional[Union[int, np.random.RandomState]] = None,
                 n_cv: int = 1,
                 n_refit: int = 0,
                 n_repeats: int = 1,
                 val_fraction: float = 0.2,
                 n_threads: Optional[int] = None,
                 tmp_folder: Optional[Union[str, pathlib.Path]] = None,
                 verbosity: int = 0,
                 calibration_method: Optional[str] = None,
                 ):
        self.module_d_embedding = module_d_embedding
        self.module_d = module_d
        self.module_d_hidden_factor = module_d_hidden_factor
        self.module_n_layers = module_n_layers
        self.module_activation = module_activation
        self.module_normalization = module_normalization
        self.module_hidden_dropout = module_hidden_dropout
        self.module_residual_dropout = module_residual_dropout
        self.verbose = verbose
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.es_patience = es_patience
        self.lr_scheduler = lr_scheduler
        self.lr_patience = lr_patience
        self.lr = lr
        self.optimizer_weight_decay = optimizer_weight_decay
        self.use_checkpoints = use_checkpoints
        self.transformed_target = transformed_target
        self.tfms = tfms
        self.quantile_output_distribution = quantile_output_distribution
        self.val_metric_name = val_metric_name
        self.device = device
        self.random_state = random_state
        self.n_cv = n_cv
        self.n_refit = n_refit
        self.n_repeats = n_repeats
        self.val_fraction = val_fraction
        self.n_threads = n_threads
        self.tmp_folder = tmp_folder
        self.verbosity = verbosity
        self.calibration_method = calibration_method


class Resnet_RTDL_D_Classifier(ResnetConstructorMixin, AlgInterfaceClassifier):
    def _get_default_params(self):
        return DefaultParams.RESNET_RTDL_D_CLASS_TabZilla

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.rtdl_interfaces import ResnetSubSplitInterface
        return SingleSplitWrapperAlgInterface([ResnetSubSplitInterface(**self.get_config()) for i in range(n_cv)])

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']

    def _supports_single_class(self) -> bool:
        return False

    def _supports_single_sample(self) -> bool:
        return False

    def _non_deterministic_tag(self) -> bool:
        # set non-deterministic since this class can otherwise fail the
        # check_methods_subset_invariance test due to low precision (?)
        # only on windows, only recently?? probably a skorch problem?
        return True


class Resnet_RTDL_D_Regressor(ResnetConstructorMixin, AlgInterfaceRegressor):
    def _get_default_params(self):
        return DefaultParams.RESNET_RTDL_D_REG_TabZilla

    def _create_alg_interface(self, n_cv: int) -> AlgInterface:
        from pytabkit.models.alg_interfaces.rtdl_interfaces import ResnetSubSplitInterface
        return SingleSplitWrapperAlgInterface([ResnetSubSplitInterface(**self.get_config()) for i in range(n_cv)])

    def _allowed_device_names(self) -> List[str]:
        return ['cpu', 'cuda', 'mps']

    def _supports_single_sample(self) -> bool:
        return False

    def _supports_multioutput(self) -> bool:
        return False

    def _non_deterministic_tag(self) -> bool:
        # set non-deterministic since this class can otherwise fail the
        # check_methods_subset_invariance test due to low precision (?)
return True class FTTransformerConstructorMixin: def __init__(self, module_d_token: Optional[int] = None, module_d_ffn_factor: Optional[float] = None, module_n_layers: Optional[int] = None, module_n_heads: Optional[int] = None, module_token_bias: Optional[bool] = None, module_attention_dropout: Optional[float] = None, module_ffn_dropout: Optional[float] = None, module_residual_dropout: Optional[float] = None, module_activation: Optional[str] = None, module_prenormalization: Optional[bool] = None, module_initialization: Optional[str] = None, module_kv_compression: Optional[str] = None, module_kv_compression_sharing: Optional[str] = None, verbose: Optional[int] = None, max_epochs: Optional[int] = None, batch_size: Optional[int] = None, optimizer: Optional[str] = None, es_patience: Optional[int] = None, lr: Optional[float] = None, lr_scheduler: Optional[bool] = None, lr_patience: Optional[int] = None, optimizer_weight_decay: Optional[float] = None, use_checkpoints: Optional[bool] = None, transformed_target: Optional[bool] = None, tfms: Optional[List[str]] = None, quantile_output_distribution: Optional[str] = None, val_metric_name: Optional[str] = None, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.module_d_token = module_d_token self.module_d_ffn_factor = module_d_ffn_factor self.module_n_layers = module_n_layers self.module_n_heads = module_n_heads self.module_token_bias = module_token_bias self.module_attention_dropout = module_attention_dropout self.module_ffn_dropout = module_ffn_dropout self.module_residual_dropout = module_residual_dropout self.module_activation = module_activation self.module_prenormalization = module_prenormalization self.module_initialization = module_initialization self.module_kv_compression = module_kv_compression self.module_kv_compression_sharing = module_kv_compression_sharing self.verbose = verbose self.max_epochs = max_epochs self.batch_size = batch_size self.optimizer = optimizer self.es_patience = es_patience self.lr_scheduler = lr_scheduler self.lr_patience = lr_patience self.lr = lr self.optimizer_weight_decay = optimizer_weight_decay self.use_checkpoints = use_checkpoints self.transformed_target = transformed_target self.tfms = tfms self.quantile_output_distribution = quantile_output_distribution self.val_metric_name = val_metric_name self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = calibration_method class FTT_D_Classifier(FTTransformerConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.FTT_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import FTTransformerSubSplitInterface return SingleSplitWrapperAlgInterface( [FTTransformerSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this 
class can otherwise fail the check_methods_subset_invariance test due to low precision (?) # only on windows, only recently?? probably a skorch problem? return True class FTT_D_Regressor(FTTransformerConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.FTT_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import FTTransformerSubSplitInterface return SingleSplitWrapperAlgInterface( [FTTransformerSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True class RTDL_MLPConstructorMixin: def __init__(self, module_d_embedding: Optional[int] = None, module_d_layers: Optional[int] = None, module_d_first_layer: Optional[int] = None, module_d_last_layer: Optional[int] = None, module_n_layers: Optional[int] = None, module_dropout: Optional[float] = None, verbose: Optional[int] = None, max_epochs: Optional[int] = None, batch_size: Optional[int] = None, optimizer: Optional[str] = None, es_patience: Optional[int] = None, lr: Optional[float] = None, lr_scheduler: Optional[bool] = None, lr_patience: Optional[int] = None, optimizer_weight_decay: Optional[float] = None, use_checkpoints: Optional[bool] = None, transformed_target: Optional[bool] = None, tfms: Optional[List[str]] = None, quantile_output_distribution: Optional[str] = None, val_metric_name: Optional[str] = None, module_num_emb_type: Optional[str] = None, module_num_emb_dim: Optional[int] = None, module_num_emb_hidden_dim: Optional[int] = None, module_num_emb_sigma: Optional[float] = None, module_num_emb_lite: Optional[bool] = None, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.module_d_embedding = module_d_embedding self.module_d_layers = module_d_layers self.module_d_first_layer = module_d_first_layer self.module_d_last_layer = module_d_last_layer self.module_n_layers = module_n_layers self.module_dropout = module_dropout self.verbose = verbose self.max_epochs = max_epochs self.batch_size = batch_size self.optimizer = optimizer self.es_patience = es_patience self.lr_scheduler = lr_scheduler self.lr_patience = lr_patience self.lr = lr self.optimizer_weight_decay = optimizer_weight_decay self.use_checkpoints = use_checkpoints self.transformed_target = transformed_target self.tfms = tfms self.quantile_output_distribution = quantile_output_distribution self.module_num_emb_type = module_num_emb_type self.module_num_emb_dim = module_num_emb_dim self.module_num_emb_hidden_dim = module_num_emb_hidden_dim self.module_num_emb_sigma = module_num_emb_sigma self.module_num_emb_lite = module_num_emb_lite self.val_metric_name = val_metric_name self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = 
calibration_method class MLP_RTDL_D_Classifier(RTDL_MLPConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.MLP_RTDL_D_CLASS_TabZilla def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) # only on windows, only recently?? probably a skorch problem? return True class MLP_RTDL_D_Regressor(RTDL_MLPConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.MLP_RTDL_D_REG_TabZilla def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True class MLP_PLR_D_Classifier(RTDL_MLPConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.MLP_PLR_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) # only on windows, only recently?? probably a skorch problem? return True class MLP_PLR_D_Regressor(RTDL_MLPConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.MLP_PLR_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RTDL_MLPSubSplitInterface return SingleSplitWrapperAlgInterface([RTDL_MLPSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) 
return True class TabrConstructorMixin: def __init__(self, num_embeddings: Optional[int] = None, d_main: Optional[int] = None, d_multiplier: Optional[int] = None, encoder_n_blocks: Optional[int] = None, predictor_n_blocks: Optional[int] = None, mixer_normalization: Optional[Union[bool, Literal['auto']]] = None, context_dropout: Optional[float] = None, dropout0: Optional[float] = None, dropout1: Optional[float] = None, normalization: Optional[str] = None, activation: Optional[str] = None, memory_efficient: Optional[bool] = None, candidate_encoding_batch_size: Optional[int] = None, n_epochs: Optional[int] = None, batch_size: Optional[int] = None, eval_batch_size: Optional[int] = None, context_size: Optional[int] = None, freeze_contexts_after_n_epochs: Optional[int] = None, optimizer: Optional[Dict] = None, patience: Optional[int] = None, transformed_target: Optional[bool] = None, tfms: Optional[List[str]] = None, quantile_output_distribution: Optional[str] = None, val_metric_name: Optional[str] = None, add_scaling_layer: Optional[bool] = None, scale_lr_factor: Optional[float] = None, use_ntp_linear: Optional[bool] = None, linear_init_type: Optional[str] = None, # only relevant if use_ntp_linear=True use_ntp_encoder: Optional[bool] = None, ls_eps: Optional[float] = None, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, calibration_method: Optional[str] = None, ): self.num_embeddings = num_embeddings self.d_main = d_main self.d_multiplier = d_multiplier self.encoder_n_blocks = encoder_n_blocks self.predictor_n_blocks = predictor_n_blocks self.mixer_normalization = mixer_normalization self.context_dropout = context_dropout self.dropout0 = dropout0 self.dropout1 = dropout1 self.normalization = normalization self.activation = activation self.memory_efficient = memory_efficient self.candidate_encoding_batch_size = candidate_encoding_batch_size self.n_epochs = n_epochs self.batch_size = batch_size self.eval_batch_size = eval_batch_size self.context_size = context_size self.freeze_contexts_after_n_epochs = freeze_contexts_after_n_epochs self.optimizer = optimizer self.patience = patience self.transformed_target = transformed_target self.tfms = tfms self.quantile_output_distribution = quantile_output_distribution self.val_metric_name = val_metric_name self.add_scaling_layer = add_scaling_layer self.scale_lr_factor = scale_lr_factor self.use_ntp_linear = use_ntp_linear self.linear_init_type = linear_init_type self.use_ntp_encoder = use_ntp_encoder self.ls_eps = ls_eps self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.calibration_method = calibration_method class TabR_S_D_Classifier(TabrConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.TABR_S_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabR_S_D_Regressor(TabrConstructorMixin, AlgInterfaceRegressor): def 
_get_default_params(self): return DefaultParams.TABR_S_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealTabR_D_Classifier(TabrConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.RealTABR_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class RealTabR_D_Regressor(TabrConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.RealTABR_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import TabRSubSplitInterface return SingleSplitWrapperAlgInterface([TabRSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabMConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, arch_type: Optional[str] = None, tabm_k: Optional[int] = None, num_emb_type: Optional[str] = None, num_emb_n_bins: Optional[int] = None, batch_size: Optional[int] = None, lr: Optional[float] = None, weight_decay: Optional[float] = None, n_epochs: Optional[int] = None, patience: Optional[int] = None, d_embedding: Optional[int] = None, d_block: Optional[int] = None, n_blocks: Optional[Union[str, int]] = None, dropout: Optional[float] = None, compile_model: Optional[bool] = None, allow_amp: Optional[bool] = None, tfms: Optional[List[str]] = None, gradient_clipping_norm: Optional[Union[float, Literal['none']]] = None, calibration_method: Optional[str] = None, share_training_batches: Optional[bool] = None, val_metric_name: Optional[str] = None, train_metric_name: Optional[str] = None, ): """ :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices are given in fit(), `n_cv` models will be fitted using different random seeds. Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. 
If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger numbers of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Temporary folder in which data can be stored during fit(). (Currently unused for TabM and variants.) If None, methods generally try to not store intermediate data. :param verbosity: Verbosity level (default=0, higher means more verbose). Set to 2 to see logs from intermediate epochs. :param arch_type: Architecture type for TabM, one of ['tabm', 'tabm-mini', 'tabm-normal', 'tabm-mini-normal', 'plain']. :param tabm_k: Value of $k$ (number of memory-efficient ensemble members). Default is 32. :param num_emb_type: Type of numerical embedding, one of ['none', 'pwl']. Default is 'none'. 'pwl' stands for piecewise linear embeddings. :param num_emb_n_bins: Number of bins for piecewise linear embeddings (default=48). Only used when piecewise linear numerical embeddings are used. Must be at most the number of training samples, but >1. :param batch_size: Batch size, default is 256. :param lr: Learning rate, default is 2e-3. :param weight_decay: Weight decay, default is 0. :param n_epochs: Maximum number of epochs (if early stopping doesn't apply). Default is 1 billion. :param patience: Patience for early stopping. Default is 16. :param d_embedding: Embedding dimension for numerical embeddings. :param d_block: Hidden layer size. :param n_blocks: Number of linear layers, or 'auto'. Default is 'auto', which will use 3 when num_emb_type=='none' and 2 otherwise. :param dropout: Dropout probability. Default is 0.1. :param compile_model: Whether torch.compile should be applied to the model (default=False). :param allow_amp: Whether automatic mixed precision should be used if the device is a GPU (default=False). :param tfms: Preprocessing transformations, see models.nn_models.models.PreprocessingFactory. Default is ['quantile_tabr']. Categorical values will be one-hot encoded by the model. Note that in the original TabM experiments, when cat_policy='ordinal' is used, the ordinal-encoded categorical values appear to be one-hot encoded by the model afterwards. :param gradient_clipping_norm: Norm for gradient clipping. Default is None from the example code (no gradient clipping), but the experiments from the paper use 1.0. :param calibration_method: Post-hoc calibration method (only for classification). We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. :param share_training_batches: New in v1.4.1: Whether TabM should use the same training samples for each model in the batch (default=False). We adopt the default value False from the newer version of TabM, while the old code (prior to 1.4.1) was equivalent to share_training_batches=True, except that the new code also excludes certain parameters from weight decay.
:param val_metric_name: Name of the validation metric used for early stopping. For classification, the default is 'class_error' but could be 'cross_entropy', 'brier', '1-auc_ovr' etc. For regression, the default is 'rmse' but could be 'mae'. :param train_metric_name: Name of the metric (loss) used for training. For classification, the default is 'cross_entropy'. For regression, it is 'mse' but could be set to something like 'multi_pinball(0.05,0.95)'. """ self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.arch_type = arch_type self.num_emb_type = num_emb_type self.num_emb_n_bins = num_emb_n_bins self.n_epochs = n_epochs self.patience = patience self.batch_size = batch_size self.compile_model = compile_model self.lr = lr self.weight_decay = weight_decay self.d_embedding = d_embedding self.d_block = d_block self.n_blocks = n_blocks self.dropout = dropout self.tabm_k = tabm_k self.allow_amp = allow_amp self.tfms = tfms self.gradient_clipping_norm = gradient_clipping_norm self.calibration_method = calibration_method self.share_training_batches = share_training_batches self.val_metric_name = val_metric_name self.train_metric_name = train_metric_name class TabM_D_Classifier(TabMConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.TABM_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabm_interface import TabMSubSplitInterface return SingleSplitWrapperAlgInterface([TabMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_class(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class TabM_D_Regressor(TabMConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.TABM_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabm_interface import TabMSubSplitInterface return SingleSplitWrapperAlgInterface([TabMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_multioutput(self) -> bool: return False def _supports_single_sample(self) -> bool: return False class TabM_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): """ HPO spaces ('default', 'tabarena') use TabM-mini with numerical embeddings """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabm_interface import RandomParamsTabMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabM_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): """ HPO spaces ('default', 'tabarena') use TabM-mini with numerical embeddings """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from 
pytabkit.models.alg_interfaces.tabm_interface import RandomParamsTabMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] # ------------------------------ class XRFMConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, bandwidth: Optional[float] = None, p_interp: Optional[float] = None, exponent: Optional[float] = None, reg: Optional[float] = None, iters: Optional[int] = None, diag: Optional[bool] = None, bandwidth_mode: Optional[str] = None, kernel_type: Optional[str] = None, max_leaf_samples: Optional[int] = None, val_metric_name: Optional[str] = None, early_stop_rfm: Optional[bool] = None, early_stop_multiplier: Optional[float] = None, classification_mode: Optional[str] = None, calibration_method: Optional[str] = None, time_limit_s: Optional[float] = None, M_batch_size: Optional[int] = None, ): """ xRFM. In case of out-of-memory, try reducing M_batch_size and/or max_leaf_samples. Some parameters generally benefit a lot from tuning, such as the regularization (reg). :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices are given in fit(), `n_cv` models will be fitted using different random seeds. Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger numbers of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Temporary folder in which data can be stored during fit(). (Currently unused for xRFM and variants.) If None, methods generally try to not store intermediate data.
:param verbosity: Verbosity level (default=0, higher means more verbose). :param bandwidth: Bandwidth of the kernel, i.e., how wide the kernel is (default=10). :param p_interp: For kernel_type='lpq', this parameter controls the parameter p of the L_p norm in the exponent of the kernel. Specifically, we set p = 2 * p_interp + exponent * (1 - p_interp). Should be in [0, 1]. :param exponent: Exponent of the norm inside the kernel (default=1). Should be in (0, 2]. Recommended values are in [0.7, 1.4]. :param reg: Regularization parameter lambda in the kernel ridge regression (default=1e-3). :param iters: How many iterations (fitting the regressor, updating the AGOP matrix) should be done (default=5). The default should be good for most cases. :param diag: Whether to only fit a diagonal AGOP matrix (default=True). :param bandwidth_mode: How to set the bandwidth (default='constant'). For 'constant', the specified bandwidth will be used directly. For 'adaptive', it will be scaled relative to the median distance between samples. We recommend 'constant' for smaller datasets (< max_leaf_samples) where only a single RFM is fit. For larger datasets, 'adaptive' may be more suited since it can adapt the bandwidth to the data in the leaf. :param kernel_type: Type of kernel (default='l2'). For 'l2', the L_2-norm will be used in the generalized Laplace kernel exp(-||x - x'||_2^q), where q is the exponent. This is the fastest kernel and a good default. For 'lpq', the slower exp(-||x - x'||_p^q) will be used, where p is determined from q and p_interp. It will use the kermac implementation if kermac is installed. :param max_leaf_samples: Maximum number of samples in a leaf of xRFM (default=60_000). For datasets with more than max_leaf_samples samples, the memory usage is O(max_leaf_samples**2) and the time complexity is roughly O(n_samples * max_leaf_samples**2). The default is optimized for GPUs with ~40 GB of VRAM. Reduce this number to reduce the RAM usage. On GPUs with less VRAM, this number can be automatically lowered to avoid exceeding the maximum RAM. :param val_metric_name: Name of the validation metric (used for selecting the best iteration). Defaults are 'class_error' for classification and 'rmse' for regression. Available classification metrics (all to be minimized): 'class_error', 'cross_entropy', '1-auc_ovr', 'brier'. Available regression metrics: 'rmse'. :param early_stop_rfm: Whether to stop the iterations early if the error stops decreasing (default=False). :param early_stop_multiplier: Tolerance for early stopping, should be larger than one (default=1.1). Larger values will early-stop less aggressively. :param classification_mode: How to convert classification problems to regression problems internally (default='zero_one'). 'zero_one' uses a one-hot encoding, while 'prevalence' uses a simplex encoding with zero corresponding to the marginal class ratio. :param calibration_method: Post-hoc calibration method (only for classification) (default=None). We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. :param time_limit_s: Time limit in seconds (default=None). :param M_batch_size: Batch size used to construct the AGOP matrix M (default=8000). Higher values can speed up the computation but may lead to out-of-memory (esp. for the 'lpq' kernel).
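Example (an illustrative sketch, not an official snippet from the xRFM docs; it assumes the optional xRFM dependency is installed and uses a small synthetic scikit-learn dataset):

>>> from sklearn.datasets import make_classification
>>> X, y = make_classification(n_samples=200, n_features=10, random_state=0)
>>> clf = XRFM_D_Classifier(device='cpu', random_state=0)
>>> clf.fit(X, y)  # doctest: +SKIP
>>> proba = clf.predict_proba(X)  # doctest: +SKIP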
""" self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.bandwidth = bandwidth self.p_interp = p_interp self.exponent = exponent self.reg = reg self.iters = iters self.diag = diag self.bandwidth_mode = bandwidth_mode self.kernel_type = kernel_type self.max_leaf_samples = max_leaf_samples self.val_metric_name = val_metric_name self.early_stop_rfm = early_stop_rfm self.early_stop_multiplier = early_stop_multiplier self.classification_mode = classification_mode self.calibration_method = calibration_method self.time_limit_s = time_limit_s self.M_batch_size = M_batch_size class XRFM_D_Classifier(XRFMConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return DefaultParams.XRFM_D_CLASS def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import xRFMSubSplitInterface return SingleSplitWrapperAlgInterface([xRFMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False class XRFM_D_Regressor(XRFMConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return DefaultParams.XRFM_D_REG def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import xRFMSubSplitInterface return SingleSplitWrapperAlgInterface([xRFMSubSplitInterface(**self.get_config()) for i in range(n_cv)]) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _non_deterministic_tag(self) -> bool: # set non-deterministic # since this class can otherwise fail the check_methods_subset_invariance test due to low precision (?) return True def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False class XRFMHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, n_hyperopt_steps: Optional[int] = None, val_metric_name: Optional[str] = None, max_leaf_samples: Optional[int] = None, M_batch_size: Optional[int] = None, bandwidth_mode: Optional[str] = None, calibration_method: Optional[str] = None, hpo_space_name: Optional[str] = None, n_caruana_steps: Optional[int] = None, use_caruana_ensembling: Optional[bool] = None, time_limit_s: Optional[float] = None, ): """ :param device: PyTorch device name like 'cpu', 'cuda', 'cuda:0', 'mps' (default=None). If None, 'cuda' will be used if available, otherwise 'cpu'. :param random_state: Random state to use for random number generation (splitting, initialization, batch shuffling). If None, the behavior is not deterministic. :param n_cv: Number of cross-validation splits to use (default=1). If validation set indices or an explicit validation set are given in fit(), `n_cv` models will be fitted using different random seeds. 
Otherwise, `n_cv`-fold cross-validation will be used (stratified for classification). For n_cv=1, a single train-validation split will be used, where `val_fraction` controls the fraction of validation samples. If `n_refit=0` is set, the prediction will use the average of the models fitted during cross-validation. (Averaging is over probabilities for classification, and over outputs for regression.) Otherwise, refitted models will be used. :param n_refit: Number of models that should be refitted on the training+validation dataset (default=0). If zero, only the models from the cross-validation stage are used. If positive, `n_refit` models will be fitted on the training+validation dataset (all data given in fit()) and their predictions will be averaged during predict(). :param n_repeats: Number of times that the (cross-)validation split should be repeated (default=1). Values != 1 are only allowed when no custom validation split is provided. Larger numbers of repeats make things slower but reduce the potential for validation set overfitting, especially on smaller datasets. :param val_fraction: Fraction of samples used for validation (default=0.2). Has to be in [0, 1). Only used if `n_cv==1` and no validation split is provided in fit(). :param n_threads: Number of threads that the method is allowed to use (default=number of physical cores). :param tmp_folder: Folder in which models can be stored. Setting this allows reducing RAM/VRAM usage by not having all models in RAM at the same time. In this case, the folder needs to be preserved as long as the model exists (including when the model is pickled to disk). :param verbosity: Verbosity level (default=0, higher means more verbose). Set to 2 to see logs from intermediate epochs. :param n_hyperopt_steps: Number of random hyperparameter configs that should be used to train models (default=50). :param val_metric_name: Name of the validation metric (used for selecting the best epoch). Defaults are 'class_error' for classification and 'rmse' for regression. Main available classification metrics (all to be minimized): 'class_error', 'cross_entropy', '1-auc_ovo', '1-auc_ovr', '1-auc_mu', 'brier', '1-balanced_accuracy', '1-mcc', 'ece'. Main available regression metrics: 'rmse', 'mae', 'max_error', 'pinball(0.95)' (also works with other quantiles specified directly in the string). For more metrics, we refer to `models.training.metrics.Metrics.apply()`. :param max_leaf_samples: Maximum number of samples in a leaf of xRFM. For datasets with more than max_leaf_samples samples, the memory usage is O(max_leaf_samples**2) and the time complexity is roughly O(n_samples * max_leaf_samples**2). The default is around 60000, which is optimized for GPUs with ~40 GB of VRAM. Reduce this number to reduce the RAM usage. :param M_batch_size: Batch size used to construct the AGOP matrix M (default=8000). Higher values can speed up the computation but may lead to out-of-memory (esp. for the 'lpq' kernel). :param bandwidth_mode: How to set the bandwidth (default='constant'). For 'constant', the specified bandwidth will be used directly. For 'adaptive', it will be scaled relative to the median distance between samples. We recommend 'constant' for smaller datasets (< max_leaf_samples) where only a single RFM is fit. For larger datasets, 'adaptive' may be more suited since it can adapt the bandwidth to the data in the leaf. :param calibration_method: Post-hoc calibration method (only for classification) (default=None).
We recommend 'ts-mix' for fast temperature scaling with Laplace smoothing. For other methods, see the get_calibrator method in https://github.com/dholzmueller/probmetrics. :param hpo_space_name: Name of the HPO space. We recommend using 'tabarena' (the default) for the best results. :param n_caruana_steps: Number of weight update iterations for Caruana et al. weighted ensembling (default=40). This parameter is only used when use_caruana_ensembling=True. :param use_caruana_ensembling: Whether to use the algorithm by Caruana et al. (2004) to select a weighted ensemble of models instead of only selecting the best model (default=False). :param time_limit_s: Time limit in seconds (default=None). """ self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.n_hyperopt_steps = n_hyperopt_steps self.val_metric_name = val_metric_name self.max_leaf_samples = max_leaf_samples self.M_batch_size = M_batch_size self.bandwidth_mode = bandwidth_mode self.calibration_method = calibration_method self.hpo_space_name = hpo_space_name self.n_caruana_steps = n_caruana_steps self.use_caruana_ensembling = use_caruana_ensembling self.time_limit_s = time_limit_s class XRFM_HPO_Classifier(XRFMHPOConstructorMixin, AlgInterfaceClassifier): """ HPO spaces ('default') use xRFM """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import RandomParamsxRFMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsxRFMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False class XRFM_HPO_Regressor(XRFMHPOConstructorMixin, AlgInterfaceRegressor): """ HPO spaces ('default') use xRFM """ def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.xrfm_interfaces import RandomParamsxRFMAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsxRFMAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] def _supports_single_sample(self) -> bool: return False def _supports_multioutput(self) -> bool: return False # ------------------------------ class MLP_RTDL_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else
AlgorithmSelectionAlgInterface return interface_type([RandomParamsRTDLMLPAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class MLP_RTDL_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsRTDLMLPAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class MLP_PLR_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsRTDLMLPAlgInterface(model_idx=i, num_emb_type='plr', **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class MLP_PLR_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsRTDLMLPAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type( [RandomParamsRTDLMLPAlgInterface(model_idx=i, num_emb_type='plr', **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Resnet_RTDL_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsResnetAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsResnetAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Resnet_RTDL_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsResnetAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else 
AlgorithmSelectionAlgInterface return interface_type([RandomParamsResnetAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class FTT_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsFTTransformerAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsFTTransformerAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class FTT_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.rtdl_interfaces import RandomParamsFTTransformerAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsFTTransformerAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabR_HPO_Classifier(RealMLPHPOConstructorMixin, AlgInterfaceClassifier): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import RandomParamsTabRAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabRAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class TabR_HPO_Regressor(RealMLPHPOConstructorMixin, AlgInterfaceRegressor): def _get_default_params(self): return dict(n_hyperopt_steps=50) def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.tabr_interface import RandomParamsTabRAlgInterface config = self.get_config() n_hyperopt_steps = config['n_hyperopt_steps'] interface_type = CaruanaEnsembleAlgInterface if config.get('use_caruana_ensembling', False) else AlgorithmSelectionAlgInterface return interface_type([RandomParamsTabRAlgInterface(model_idx=i, **config) for i in range(n_hyperopt_steps)], **config) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] # Ensemble-TD class Ensemble_TD_Classifier(AlgInterfaceClassifier): def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, val_metric_name: Optional[str] = None, use_ls: Optional[bool] = None, calibration_method: Optional[str] = None): self.device = device self.random_state = random_state 
self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.val_metric_name = val_metric_name self.use_ls = use_ls self.calibration_method = calibration_method def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name if self.use_ls is not None: extra_params['use_ls'] = self.use_ls if self.calibration_method is not None: extra_params['calibration_method'] = self.calibration_method td_interfaces = [ SingleSplitWrapperAlgInterface( [LGBMSubSplitInterface(**DefaultParams.LGBM_TD_CLASS, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [XGBSubSplitInterface(**DefaultParams.XGB_TD_CLASS, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [CatBoostSubSplitInterface(**DefaultParams.CB_TD_CLASS, **extra_params, allow_gpu=False) for i in range(n_cv)]), NNAlgInterface(**utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, extra_params)), ] return CaruanaEnsembleAlgInterface(td_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Ensemble_TD_Regressor(AlgInterfaceRegressor): def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, val_metric_name: Optional[str] = None): self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.val_metric_name = val_metric_name def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import CatBoostSubSplitInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import LGBMSubSplitInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import XGBSubSplitInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name td_interfaces = [ SingleSplitWrapperAlgInterface( [LGBMSubSplitInterface(**DefaultParams.LGBM_TD_REG, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [XGBSubSplitInterface(**DefaultParams.XGB_TD_REG, **extra_params, allow_gpu=False) for i in range(n_cv)]), SingleSplitWrapperAlgInterface( [CatBoostSubSplitInterface(**DefaultParams.CB_TD_REG, **extra_params, allow_gpu=False) for i in range(n_cv)]), NNAlgInterface(**DefaultParams.RealMLP_TD_REG, **extra_params), ] return CaruanaEnsembleAlgInterface(td_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class EnsembleHPOConstructorMixin: def __init__(self, device: Optional[str] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, n_cv: int = 1, n_refit: int = 0, n_repeats: int = 1, val_fraction: float = 0.2, n_threads: 
Optional[int] = None, tmp_folder: Optional[Union[str, pathlib.Path]] = None, verbosity: int = 0, val_metric_name: Optional[str] = None, n_hpo_steps: int = 50, calibration_method: Optional[str] = None, use_full_caruana_ensembling: bool = False, n_caruana_steps: int = 40, use_tabarena_spaces: bool = False, time_limit_s: Optional[float] = None, ): """ :param device: :param random_state: :param n_cv: :param n_refit: :param n_repeats: :param val_fraction: :param n_threads: :param tmp_folder: :param verbosity: :param val_metric_name: :param n_hpo_steps: Number of HPO configs per method. :param calibration_method: Calibration method (only for classification). :param use_full_caruana_ensembling: Whether to also ensemble different hyperparameter configs of the same method (default=False). False corresponds to the method used in the paper, True should give better results (with larger inference time). :param n_caruana_steps: How many iterations to use for Caruana et al. (2004) weighted ensembling. :param use_tabarena_spaces: Whether to use search spaces from TabArena instead of from the RealMLP paper. :param time_limit_s: Time limit in seconds (default=None). """ self.device = device self.random_state = random_state self.n_cv = n_cv self.n_refit = n_refit self.n_repeats = n_repeats self.val_fraction = val_fraction self.n_threads = n_threads self.tmp_folder = tmp_folder self.verbosity = verbosity self.val_metric_name = val_metric_name self.n_hpo_steps = n_hpo_steps self.calibration_method = calibration_method self.use_full_caruana_ensembling = use_full_caruana_ensembling self.n_caruana_steps = n_caruana_steps self.use_tabarena_spaces = use_tabarena_spaces self.time_limit_s = time_limit_s class Ensemble_HPO_Classifier(EnsembleHPOConstructorMixin, AlgInterfaceClassifier): def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface from pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name if self.calibration_method is not None: extra_params['calibration_method'] = self.calibration_method if self.use_tabarena_spaces: extra_params['hpo_space_name'] = 'tabarena' extra_params['n_caruana_steps'] = self.n_caruana_steps extra_params['time_limit_s'] = self.time_limit_s n_hpo_steps = self.n_hpo_steps or 50 hpo_configs = [ [RandomParamsLGBMAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsXGBAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsCatBoostAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsNNAlgInterface(model_idx=i, **extra_params) for i in range(n_hpo_steps)], ] if self.use_full_caruana_ensembling: hpo_interfaces = sum(hpo_configs, []) else: hpo_interfaces = [AlgorithmSelectionAlgInterface(lst, **extra_params) for lst in hpo_configs] return CaruanaEnsembleAlgInterface(hpo_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] class Ensemble_HPO_Regressor(EnsembleHPOConstructorMixin, AlgInterfaceRegressor): def _create_alg_interface(self, n_cv: int) -> AlgInterface: from pytabkit.models.alg_interfaces.catboost_interfaces import RandomParamsCatBoostAlgInterface from 
pytabkit.models.alg_interfaces.lightgbm_interfaces import RandomParamsLGBMAlgInterface from pytabkit.models.alg_interfaces.xgboost_interfaces import RandomParamsXGBAlgInterface extra_params = dict() if self.val_metric_name is not None: extra_params['val_metric_name'] = self.val_metric_name if self.use_tabarena_spaces: extra_params['hpo_space_name'] = 'tabarena' extra_params['n_caruana_steps'] = self.n_caruana_steps extra_params['time_limit_s'] = self.time_limit_s n_hpo_steps = self.n_hpo_steps or 50 hpo_configs = [ [RandomParamsLGBMAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsXGBAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsCatBoostAlgInterface(model_idx=i, **extra_params, allow_gpu=False) for i in range(n_hpo_steps)], [RandomParamsNNAlgInterface(model_idx=i, **extra_params) for i in range(n_hpo_steps)], ] if self.use_full_caruana_ensembling: hpo_interfaces = sum(hpo_configs, []) else: hpo_interfaces = [AlgorithmSelectionAlgInterface(lst, **extra_params) for lst in hpo_configs] return CaruanaEnsembleAlgInterface(hpo_interfaces, **extra_params) def _allowed_device_names(self) -> List[str]: return ['cpu', 'cuda', 'mps'] ================================================ FILE: pytabkit/models/torch_utils.py ================================================ from typing import List, Union, Optional import torch import numpy as np def get_available_device_names() -> List['str']: device_names = ['cpu'] + [f'cuda:{i}' for i in range(torch.cuda.device_count())] if torch.backends.mps.is_available(): device_names.append('mps') return device_names def seeded_randperm(n, device, seed): generator = torch.Generator() generator.manual_seed(seed) # todo: can this not be generated directly on the device? return torch.randperm(n, generator=generator).to(device) def permute_idxs(idxs, seed): return idxs[seeded_randperm(idxs.shape[0], idxs.device, seed)] def batch_randperm(n_batch, n, device='cpu'): # batched randperm: # https://discuss.pytorch.org/t/batched-shuffling-of-feature-vectors/30188/4 # https://github.com/pytorch/pytorch/issues/42502 return torch.stack([torch.randperm(n, device=device) for i in range(n_batch)], dim=0) # from https://github.com/runopti/stg/blob/9f630968c4f14cff6da4e54421c497f24ac1e08e/python/stg/layers.py#L10 def gauss_cdf(x): return 0.5 * (1 + torch.erf(x / np.sqrt(2))) class ClampWithIdentityGradientFunc(torch.autograd.Function): @staticmethod def forward(ctx, input: torch.Tensor, low: torch.Tensor, high: torch.Tensor): return torch.minimum(torch.maximum(input, low), high) @staticmethod def backward(ctx, grad_output: torch.Tensor): return grad_output, None, None def clamp_with_identity_gradient_func(x, low, high): return ClampWithIdentityGradientFunc.apply(x, low, high) def cat_if_necessary(tensors: List[torch.Tensor], dim: int): """ Implements torch.cat() but doesn't copy if only one tensor is provided. This can make it faster if no copying behavior is needed. :param tensors: Tensors to be concatenated. :param dim: Dimension in which the tensor should be concatenated. :return: The concatenated tensor. 
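Example (illustrative; the single-tensor case returns the input tensor object itself, so no copy is made):

>>> import torch
>>> a = torch.zeros(2, 3)
>>> cat_if_necessary([a], dim=0) is a
True
>>> cat_if_necessary([a, a], dim=0).shape
torch.Size([4, 3])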
""" if len(tensors) == 1: return tensors[0] return torch.cat(tensors, dim=dim) def hash_tensor(tensor: torch.Tensor) -> int: # for debugging purposes, to print two tensor's hashes to see if they are equal # from https://discuss.pytorch.org/t/defining-hash-function-for-multi-dimensional-tensor/107531 import pickle # the .numpy() appears to be necessary for equal tensors to have equal hashes return hash(pickle.dumps(tensor.detach().cpu().numpy())) def torch_np_quantile(tensor: torch.Tensor, q: float, dim: int, keepdim: bool = False) -> torch.Tensor: """ Alternative implementation for torch.quantile() using np.quantile() since the implementation of torch.quantile() uses too much RAM (extreme for Airlines_DepDelay_10M) and can fail for too large tensors. See also https://github.com/pytorch/pytorch/issues/64947 :param tensor: tensor :param q: Quantile value. :param dim: As in torch.quantile() :param keepdim: As in torch.quantile() :return: Tensor with quantiles. """ x_np = tensor.detach().cpu().numpy() q_np = np.quantile(x_np, q=q, axis=dim, keepdims=keepdim) return torch.as_tensor(q_np, device=tensor.device, dtype=tensor.dtype) from time import perf_counter import torch def _cuda_in_use() -> bool: """Return True if CUDA is available and initialized.""" if not torch.cuda.is_available(): return False # is_initialized exists in recent PyTorch; fall back to True if missing is_initialized = getattr(torch.cuda, "is_initialized", None) if is_initialized is None: return True return is_initialized() class TorchTimer: """ Timer for measuring code blocks, with optional CUDA synchronization. Usage: with TorchTimer() as t: y = model(x) print(t.elapsed) # Or manual start/stop: t = TorchTimer() t.start() y = model(x) t.stop() print(t.elapsed) """ def __init__(self, use_cuda: Optional[bool] = None, record_history: bool = False): """ Args: use_cuda: - None (default): auto-detect; sync only if CUDA is in use. - True: force CUDA sync (if available). - False: never sync CUDA. record_history: If True, every measurement is appended to `self.history`. """ self._user_use_cuda = use_cuda self.record_history = record_history self.elapsed = None self.history = [] if record_history else None self._start = None @property def _do_cuda_sync(self) -> bool: if self._user_use_cuda is False: return False if self._user_use_cuda is True: return torch.cuda.is_available() # Auto mode: only if CUDA is available *and* initialized return _cuda_in_use() # ------- context manager API ------- def __enter__(self): self.start() return self def __exit__(self, exc_type, exc_val, exc_tb): self.stop() # ------- manual API ------- def start(self): if self._do_cuda_sync: torch.cuda.synchronize() self._start = perf_counter() def stop(self): if self._start is None: raise RuntimeError("TorchTimer.stop() called before start().") if self._do_cuda_sync: torch.cuda.synchronize() self.elapsed = perf_counter() - self._start if self.record_history: self.history.append(self.elapsed) return self.elapsed def get_available_memory_gb(device: Union[str, torch.device]) -> float: """ Return the available memory (in GB) on the given device. Parameters ---------- device : str or torch.device Device identifier, e.g. "cuda", "cuda:0", or torch.device("cuda:0"). Returns ------- float Available memory in gigabytes. Notes ----- - For CUDA devices, this uses torch.cuda.mem_get_info if available. - For CPU, it uses psutil.virtual_memory().available. - For other device types, NotImplementedError is raised. 
""" dev = torch.device(device) if dev.type == "cuda": if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available, but a CUDA device was requested.") # Ensure we are querying the correct device torch.cuda.synchronize(dev) if hasattr(torch.cuda, "mem_get_info"): free_bytes, total_bytes = torch.cuda.mem_get_info(dev) else: # Fallback: approximate using total_memory - reserved_by_pytorch props = torch.cuda.get_device_properties(dev) total_bytes = props.total_memory reserved_bytes = torch.cuda.memory_reserved(dev) free_bytes = max(total_bytes - reserved_bytes, 0) return free_bytes / (1024 ** 3) # bytes -> GiB elif dev.type == "cpu": try: import psutil except ImportError as e: raise ImportError( "psutil is required to query CPU memory. Install via `pip install psutil`." ) from e mem = psutil.virtual_memory() return mem.available / (1024 ** 3) else: raise NotImplementedError(f"Memory query not implemented for device type '{dev.type}'") ================================================ FILE: pytabkit/models/training/__init__.py ================================================ ================================================ FILE: pytabkit/models/training/auc_mu.py ================================================ # taken from https://github.com/kleimanr/auc_mu/blob/master/auc_mu.py """ Computation of the measure 'AUC Mu'. This measure requires installation of the numpy and sklearn libraries. This code corresponds to the paper: Kleiman, R., Page, D. ``AUC Mu: A Performance Metric for Multi-Class Machine Learning Models``, Proceedings of the 2019 International Conference on Machine Learning (ICML). """ __author__ = "Ross Kleiman" __copyright__ = "Copyright 2019" __credits__ = ["Ross Kleiman"] __license__ = "MIT" __version__ = "1.0" __maintainer__ = "Ross Kleiman" __email__ = "rkleiman@cs.wisc.edu" __status__ = "Production" import numpy as np from sklearn.metrics import roc_auc_score # ---------------------------------------------------------------------- def auc_mu_impl(y_true, y_score, A=None, W=None): """ Compute the multi-class measure AUC Mu from prediction scores and labels. Parameters ---------- y_true : array, shape = [n_samples] The true class labels in the range [0, n_samples-1] y_score : array, shape = [n_samples, n_classes] Target scores, where each row is a categorical distribution over the n_classes. A : array, shape = [n_classes, n_classes], optional The partition (or misclassification cost) matrix. If ``None`` A is the argmax partition matrix. Entry A_{i,j} is the cost of classifying an instance as class i when the true class is j. It is expected that diagonal entries in A are zero and off-diagonal entries are positive. W : array, shape = [n_classes, n_classes], optional The weight matrix for incorporating class skew into AUC Mu. If ``None``, the standard AUC Mu is calculated. If W is specified, it is expected to be a lower triangular matrix where entrix W_{i,j} is a positive float from 0 to 1 for the partial score between classes i and j. Entries not in the lower triangular portion of W must be 0 and the sum of all entries in W must be 1. Returns ------- auc_mu : float References ---------- .. [1] Kleiman, R., Page, D. ``AUC Mu: A Performance Metric for Multi-Class Machine Learning Models``, Proceedings of the 2019 International Conference on Machine Learning (ICML). 
""" # Validate input arguments if not isinstance(y_score, np.ndarray): raise TypeError("Expected y_score to be np.ndarray, got: %s" % type(y_score)) if not y_score.ndim == 2: raise ValueError("Expected y_score to be 2 dimensional, got: %s" % y_score.ndim) n_samples, n_classes = y_score.shape if not isinstance(y_true, np.ndarray): raise TypeError("Expected y_true to be np.ndarray, got: %s" % type(y_true)) if not y_true.ndim == 1: raise ValueError("Expected y_true to be 1 dimensional, got: %s" % y_true.ndim) if not y_true.shape[0] == n_samples: raise ValueError("Expected y_true to be shape %s, got: %s" % (str(y_score.shape), str(y_true.shape))) unique_labels = np.unique(y_true) if not np.all(unique_labels == np.arange(n_classes)): raise ValueError("Expected y_true values in range 0..%i, got: %s" % (n_classes - 1, str(unique_labels))) if A is None: A = np.ones((n_classes, n_classes)) - np.eye(n_classes) if not isinstance(A, np.ndarray): raise TypeError("Expected A to be np.ndarray, got: %s" % type(A)) if not A.ndim == 2: raise ValueError("Expected A to be 2 dimensional, got: %s" % A.ndim) if not A.shape == (n_classes, n_classes): raise ValueError("Expected A to be shape (%i, %i), got: %s" % (n_classes, n_classes, str(A.shape))) if not np.all(A.diagonal() == np.zeros(n_classes)): raise ValueError("Expected A to be zero on the diagonals") if not np.all(A >= 0): raise ValueError("Expected A to be non-negative") if W is None: W = np.tri(n_classes, k=-1) W /= W.sum() if not isinstance(W, np.ndarray): raise TypeError("Expected W to be np.ndarray, got: %s" % type(W)) if not W.ndim == 2: raise ValueError("Expected W to be 2 dimensional, got: %s" % W.ndim) if not W.shape == (n_classes, n_classes): raise ValueError("Expected W to be shape (%i, %i), got: %s" % (n_classes, n_classes, str(W.shape))) auc_total = 0.0 for class_i in range(n_classes): preds_i = y_score[y_true == class_i] n_i = preds_i.shape[0] for class_j in range(class_i): preds_j = y_score[y_true == class_j] temp_preds = np.vstack((preds_i, preds_j)) n_j = preds_j.shape[0] n = n_i + n_j temp_labels = np.zeros(n, dtype=int) temp_labels[n_i:n] = 1 v = A[class_i, :] - A[class_j, :] scores = np.dot(temp_preds, v) score_i_j = roc_auc_score(temp_labels, scores) auc_total += W[class_i, class_j] * score_i_j return auc_total ================================================ FILE: pytabkit/models/training/coord.py ================================================ from typing import Dict from pytabkit.models.training.scheduling import ConstantSchedule, get_schedule # layers are created multiple times => either only register after stacking or allow to register multiple times class HyperparamManager: class HyperGetter: def __init__(self, tc: 'HyperparamManager', hyper_name: str, base_value_pattern: str, sched_pattern: str): self.tc = tc self.hyper_name = hyper_name self.base_value_pattern = base_value_pattern self.sched_pattern = sched_pattern def __call__(self): return self.tc.hyper_base_values[self.hyper_name][self.base_value_pattern] * \ self.tc.get_hyper_sched_values()[self.hyper_name][self.sched_pattern] def __init__(self, **config): self.config = config self.hyper_base_values = {} self.hyper_scheds = {} self.hyper_sched_values = None # regularization terms self.reg_terms = [] self.needs_update = True # indicates whether self.hyper_sched_values needs to be updated self.more_info_dict = {} # can be set from outside def get_more_info_dict(self) -> Dict: return self.more_info_dict def _find_pattern(self, d: dict, scope): pattern = None for key in d: if 
scope.matches(key): #print(d, scope, key) pattern = key if pattern is None: # no pattern was found raise ValueError(f'No key in dict {d} matches scope {str(scope)}') return pattern def register_hyper(self, name: str, scope, default=None, default_sched=lambda: ConstantSchedule(1.0)): if name not in self.hyper_scheds: base_dict = self.config.get(name, default) if not isinstance(base_dict, dict): base_dict = {'': base_dict} sched_dict = self.config.get(name + '_sched', default_sched) if not isinstance(sched_dict, dict): sched_dict = {'': sched_dict} sched_dict = {key: get_schedule(sched) if isinstance(sched, str) else sched() for key, sched in sched_dict.items()} self.hyper_scheds[name] = sched_dict self.hyper_base_values[name] = base_dict self.needs_update = True return HyperparamManager.HyperGetter(self, name, base_value_pattern=self._find_pattern(self.hyper_base_values[name], scope), sched_pattern=self._find_pattern(self.hyper_scheds[name], scope)) # def _to_array(self, value, name: str, length: int) -> torch.Tensor: # if hasattr(value, "__len__"): # # result is already a list or a numpy array # if len(value) != length: # raise ValueError(f'Hyperparameter {name} has {len(value)} values but should have {length} values') # return torch.as_tensor(value) # else: # return torch.as_tensor([value] * length) def get_hyper_sched_values(self): self.update_hyper_sched_values() return self.hyper_sched_values def update_hyper_sched_values(self): if self.needs_update: # print(f'update') self.hyper_sched_values = {name: {pattern: sched.get_value() for pattern, sched in sched_dict.items()} for name, sched_dict in self.hyper_scheds.items()} self.needs_update = False def add_reg_term(self, loss): self.reg_terms.append(loss) def update_hypers(self, learner): # reset regularization terms self.reg_terms = [] self.needs_update = True for name, sched_dict in self.hyper_scheds.items(): for pattern, sched in sched_dict.items(): sched.update(learner) self.update_hyper_sched_values() ================================================ FILE: pytabkit/models/training/lightning_callbacks.py ================================================ from typing import List, Any, Optional, Union, Dict import numpy as np import torch try: from lightning.pytorch.callbacks import Callback import lightning.pytorch as pl except ImportError: from pytorch_lightning.callbacks import Callback import pytorch_lightning as pl from torch import Tensor from pytabkit.models.nn_models.base import Variable, Layer from pytabkit.models.training.coord import HyperparamManager from pytabkit.models.training.logging import Logger class ParamCheckpointer: def __init__(self, n_tv_splits: int, n_tt_splits: int, n_ens: int): self.n_tv_splits = n_tv_splits self.n_tt_splits = n_tt_splits self.n_ens = n_ens self.ckpt_params = [None] * (self.n_tt_splits * self.n_tv_splits) self.ckpt_buffers = [None] * (self.n_tt_splits * self.n_tv_splits) def save(self, parallel_idx: int, model_idx: int, model: Layer): idx = self.n_tv_splits * parallel_idx + model_idx with torch.no_grad(): for ckpt, values in [(self.ckpt_params, model.parameters()), (self.ckpt_buffers, model.buffers())]: if ckpt[idx] is None: ckpt[idx] = [v[idx*self.n_ens:(idx+1)*self.n_ens].clone() for v in values] else: for c, v in zip(ckpt[idx], values): c.copy_(v[idx*self.n_ens:(idx+1)*self.n_ens]) def restore(self, parallel_idx: int, model_idx: int, model: Layer): idx = self.n_tv_splits * parallel_idx + model_idx with torch.no_grad(): for ckpt, values in [(self.ckpt_params, model.parameters()), 
(self.ckpt_buffers, model.buffers())]: if ckpt[idx] is not None: for c, v in zip(ckpt[idx], values): # print(f'Restore diff: {v[start:end]-c}') v[idx*self.n_ens:(idx+1)*self.n_ens] = c def save_all(self, model: Layer): for parallel_idx in range(self.n_tt_splits): for model_idx in range(self.n_tv_splits): self.save(parallel_idx, model_idx, model) def restore_all(self, model: Layer): for parallel_idx in range(self.n_tt_splits): for model_idx in range(self.n_tv_splits): self.restore(parallel_idx, model_idx, model) class HyperparamCallback(Callback): def __init__(self, hp_manager): self.hp_manager = hp_manager def on_train_batch_start( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int ) -> None: # print(list(pl_module.model.parameters())[-1][0, -1].item()) self.hp_manager.update_hypers(pl_module) def on_before_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", loss: Tensor) -> None: reg_terms = self.hp_manager.reg_terms if len(reg_terms) > 0: pl_module.loss += sum(reg_terms) def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: del self.hp_manager # todo: added class L1L2RegCallback(Callback): def __init__(self, hp_manager: HyperparamManager, model: Layer): self.hp_manager = hp_manager self.params: List[Variable] = list(model.parameters()) self.l1_getters = [self.hp_manager.register_hyper('l1_reg', p.context.scope, default=0.0) for p in self.params] self.l2_getters = [self.hp_manager.register_hyper('l2_reg', p.context.scope, default=0.0) for p in self.params] def on_after_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for l1_getter, l2_getter, p in zip(self.l1_getters, self.l2_getters, self.params): l1_reg = l1_getter() * p.hyper_factors.get('l1_reg', 1.0) l2_reg = l2_getter() * p.hyper_factors.get('l2_reg', 1.0) if l1_reg != 0.0: p.grad += l1_reg * torch.sign(p) if l2_reg != 0.0: p.grad += (2.0 * l2_reg) * p self.hp_manager.update_hypers(pl_module) class ModelCheckpointCallback(Callback): def __init__(self, n_tt_splits: int, n_tv_splits: int, n_ens: int, use_best_mean_epoch: bool, val_metric_name: str, restore_best: bool = False): self.n_tt_splits = n_tt_splits self.n_tv_splits = n_tv_splits self.n_ens = n_ens self.val_metric_name = val_metric_name self.restore_best = restore_best self.use_best_mean_epoch = use_best_mean_epoch self.ckpt = ParamCheckpointer(n_tv_splits=n_tv_splits, n_tt_splits=self.n_tt_splits, n_ens=n_ens) def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self.ckpt.save_all(pl_module.model) def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for tt_split_idx in range(self.n_tt_splits): for tv_split_idx in range(self.n_tv_splits): if self.use_best_mean_epoch: if pl_module.best_mean_val_epochs[self.val_metric_name][tt_split_idx] == pl_module.progress.epoch: # if this is the best epoch, save the model self.ckpt.save(tt_split_idx, tv_split_idx, pl_module.model) else: if pl_module.best_val_epochs[self.val_metric_name][tt_split_idx][tv_split_idx] == pl_module.progress.epoch: # print(f'found improvement') # if this is the best epoch, save the model self.ckpt.save(tt_split_idx, tv_split_idx, pl_module.model) def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: # restore at the end. In case of multiple val metrics, can use restore() separately to restore the desired one. 
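# (Illustration, inferred from NNCreator.create_callbacks(): one ModelCheckpointCallback is created per
# entry of val_metric_names, and TabNNModule.restore_ckpt_for_val_metric_name('cross_entropy') dispatches
# to the callback that was constructed with val_metric_name='cross_entropy'.)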
self.restore(pl_module) def restore(self, pl_module: "pl.LightningModule") -> None: # restore best params for key, state in pl_module.optimizers().opt.state.items(): # lightning automatically moves the model to the CPU after training, # so we have to do the same for the optimizer state # for sfadam if 'z' in state: state['z'] = state['z'].cpu() pl_module.optimizers().eval() # todo: bit of a hack because ideally the optimizer state should also be restored if not self.restore_best: raise RuntimeError('ValidationCallback: Cannot restore best params when using save_best_params=False') self.ckpt.restore_all(pl_module.model) class StopAtEpochsCallback(Callback): def __init__(self, stop_epochs: List[List[Union[Dict[str, int], int]]], n_models: int, n_ens: int, model: Layer, logger: Optional[Logger] = None): print(f'Refit: {stop_epochs=}') # stop_epochs now has a dict with {metric_name: stop_epoch}, so we need to extract just the stop_epoch def get_epoch(value: Union[Dict[str, int], int]): if isinstance(value, dict): values = list(value.values()) if len(values) != 1: raise ValueError(f'Got stop epochs for multiple metrics, which is not supported in refitting!') return values[0] return value self.stop_epochs = [[get_epoch(ep) for ep in lst] for lst in stop_epochs] self.final_stop_epoch = np.max(sum(self.stop_epochs, [])) self.model = model self.ckpt = ParamCheckpointer(n_tv_splits=n_models, n_tt_splits=len(stop_epochs), n_ens=n_ens) self.logger = logger self.n_models = n_models def _handle_epoch(self, trainer: "pl.Trainer", epoch: int) -> None: if self.logger: self.logger.log(2, f'Refit Epoch {epoch}/{self.final_stop_epoch}') if epoch == self.final_stop_epoch: # print(f'Stopping the training at epoch {epoch}') self.ckpt.restore_all(self.model) trainer.should_stop = True return for tt_split_idx, tv_stop_epochs in enumerate(self.stop_epochs): for tv_split_idx, ep in enumerate(tv_stop_epochs): if ep == epoch: # print(f'Saving checkpoint for model {i}') self.ckpt.save(tt_split_idx, tv_split_idx, self.model) # def on_train_batch_start( # self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int # ) -> None: # print('train batch') def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self._handle_epoch(trainer, epoch=0) def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self._handle_epoch(trainer, epoch=trainer.current_epoch + 1) ================================================ FILE: pytabkit/models/training/lightning_modules.py ================================================ from pytabkit.models.training.lightning_callbacks import ModelCheckpointCallback try: import lightning.pytorch as pl except ImportError: import pytorch_lightning as pl from typing import List, Optional, Dict, Any import numpy as np import torch from pytabkit.models.data.data import ParallelDictDataLoader, DictDataset from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources from pytabkit.models.nn_models.base import Layer from pytabkit.models.optim.optimizers import get_opt_class from pytabkit.models.training.nn_creator import NNCreator from pytabkit.models.training.logging import StdoutLogger, Logger from pytabkit.models.training.metrics import Metrics from pytabkit.models.training.scheduling import LearnerProgress def postprocess_multiquantile(y_pred: torch.Tensor, val_metric_name: Optional[str] = None, sort_quantile_predictions: bool = True, **config): if val_metric_name is None or not 
val_metric_name.startswith('multi_pinball(') or not sort_quantile_predictions: return y_pred quantiles = [float(q_str) for q_str in val_metric_name[len('multi_pinball('):-1].split(',')] if not all([a <= b for a, b in zip(quantiles[:-1], quantiles[1:])]): raise ValueError(f'Quantiles {quantiles} must be sorted') return y_pred.sort(dim=-1)[0] class TabNNModule(pl.LightningModule): def __init__(self, n_epochs: int = 256, logger: Optional[Logger] = None, fit_params: Optional[List[Dict[str, Any]]] = None, **config): """ Pytorch Lightning Module for building and training a pytorch NN for tabular data. The core of the module is the NNCreatorInterface, which is used to create the model, the callbacks, the hyperparameter manager and the dataloaders. The TabNNModule is responsible for the training loop, (optional) validation and inference. """ super().__init__() self.my_logger = logger or StdoutLogger(verbosity_level=config.get('verbosity', 0)) # todo: improve this self.creator = NNCreator( n_epochs=n_epochs, fit_params=fit_params, **config ) self.hp_manager = self.creator.hp_manager self.model: Optional[Layer] = None self.criterion = None self.train_dl = None self.progress = LearnerProgress() self.progress.max_epochs = n_epochs self.fit_params = fit_params # Validation self.val_preds = [] self.old_training = None self.val_dl = None self.save_best_params = True self.val_metric_names = None self.epoch_mean_val_errors = None self.best_mean_val_errors = None self.best_mean_val_epochs = None self.best_val_errors = None self.best_val_epochs = None self.has_stopped_list = None self.callbacks = None # will contain {val_metric_name: ModelCheckpointCallback(..., val_metric_name)} self.ckpt_callbacks = dict() # LightningModule self.automatic_optimization = False self.config = config def compile_model(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources): """ Method to create the model and all other training dependencies given the dataset and the assigned resources. Once this is called, the module is ready for training. 
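A hedged sketch of the intended call order (the pl.Trainer arguments are inferred from
create_callbacks() and the dataloader attributes below, not prescribed by this class):

    module = TabNNModule(n_epochs=256, **config)
    module.compile_model(ds, idxs_list, interface_resources)
    trainer = pl.Trainer(max_epochs=256, callbacks=module.create_callbacks())
    trainer.fit(module, module.train_dl, module.val_dl)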
""" self.creator.setup_from_dataset( ds, idxs_list=idxs_list, interface_resources=interface_resources ) self.is_classification_ = ds.tensor_infos['y'].is_cat() self.model = self.creator.create_model(ds, idxs_list=idxs_list) self.train_dl, self.val_dl = self.creator.create_dataloaders(ds) self.criterion, self.val_metric_names = self.creator.get_criterions() def create_callbacks(self): """ Helper method to return callbacks for the trainer.fit callback argument.""" assert self.val_metric_names is not None self.callbacks = self.creator.create_callbacks(self.model, self.my_logger, self.val_metric_names) self.ckpt_callbacks = {} for callback in self.callbacks: if isinstance(callback, ModelCheckpointCallback): self.ckpt_callbacks[callback.val_metric_name] = callback return self.callbacks def get_predict_dataloader(self, ds: DictDataset): """ Helper method to create a dataloader for inference.""" ds_x, _ = ds.split_xy() ds_x = self.creator.static_model.forward_ds(ds_x) # ds_x = self.static_model.forward_ds(ds_x) idxs_single = torch.arange(ds.n_samples, dtype=torch.long) n_ens = self.config.get('n_ens', 1) idxs = idxs_single[None, :].expand( self.creator.n_tt_splits * self.creator.n_tv_splits * n_ens, -1 ) return ParallelDictDataLoader(ds=ds_x, idxs=idxs, batch_size=self.creator.config.get("predict_batch_size", 1024)) # ----- Start LightningModule Methods ----- def on_fit_start(self): self.model.train() self.optimizers().train() # mean val errors will not be accurate if all epochs after this yield NaN self.best_mean_val_errors = {val_metric_name: [np.inf] * self.creator.n_tt_splits for val_metric_name in self.val_metric_names} # epoch 0 counts as before training, epoch 1 is first epoch self.best_mean_val_epochs = {val_metric_name: [0] * self.creator.n_tt_splits for val_metric_name in self.val_metric_names} # don't use simpler notation of the form [[]] * 2 because this will have two references to the same inner array! 
self.best_val_errors = { val_metric_name: [[np.inf] * self.creator.n_tv_splits for i in range(self.creator.n_tt_splits)] for val_metric_name in self.val_metric_names} self.best_val_epochs = { val_metric_name: [[0] * self.creator.n_tv_splits for i in range(self.creator.n_tt_splits)] for val_metric_name in self.val_metric_names} self.has_stopped_list = { val_metric_name: [[False] * self.creator.n_tv_splits for i in range(self.creator.n_tt_splits)] for val_metric_name in self.val_metric_names} def training_step(self, batch, batch_idx): # x = batch["x_cont"] # x = x / (1e-8 + x.std(dim=-2, keepdim=True)) # print(f'{x.mean().item()=}') # print(f'{list(self.model.parameters())[0].mean().item()=}') # print(f'{list(self.model.parameters())[-1].mean().item()=}') output = self.model(batch) opt = self.optimizers() # do sum() over models dimension loss = self.criterion(output["x_cont"], output["y"]).sum() # print(f'{loss.item()=}') # Callbacks for regularization are called before the backward pass self.manual_backward(loss) opt.step(loss=loss) opt.zero_grad() self.progress.total_samples += batch["y"].shape[-2] self.progress.epoch_float = ( self.progress.total_samples / self.train_dl.get_num_iterated_samples() ) return loss def on_validation_start(self): self.old_training = self.model.training self.val_preds = [] self.model.eval() def validation_step(self, batch, batch_idx): self.val_preds.append(self.model(batch)["x_cont"]) def on_validation_epoch_end(self): self.model.train(self.old_training) self.old_training = None y_pred = self._postprocess_ens_pred(torch.cat(self.val_preds, dim=-2)) y_pred = postprocess_multiquantile(y_pred, **self.config) n_ens = self.config.get('n_ens', 1) # y is duplicated by the dataloader as well in the ensemble case, deduplicate it y = self.val_dl.val_y[::n_ens] use_early_stopping = self.config.get('use_early_stopping', False) early_stopping_additive_patience = self.config.get('early_stopping_additive_patience', 20) early_stopping_multiplicative_patience = self.config.get('early_stopping_multiplicative_patience', 2) for val_metric_name in self.val_metric_names: val_errors = torch.as_tensor( [ Metrics.apply( y_pred[i, :, :], y[i, :, :], val_metric_name ) for i in range(y_pred.shape[0]) ] ) val_errors = val_errors.view( self.creator.n_tt_splits, self.creator.n_tv_splits ) mean_val_errors = val_errors.mean(dim=-1) # mean over cv/refit dimension mean_val_error = mean_val_errors.mean().item() self.my_logger.log( 2, f"Epoch {self.progress.epoch + 1}/{self.progress.max_epochs}: val {val_metric_name} = {mean_val_error:6.6f}", ) current_epoch = self.progress.epoch + 1 for tt_split_idx in range(self.creator.n_tt_splits): use_last_best_epoch = self.config.get('use_last_best_epoch', True) has_stopped = self.has_stopped_list[val_metric_name][tt_split_idx] # compute best single-split validation errors for tv_split_idx in range(self.creator.n_tv_splits): if use_early_stopping and not has_stopped[tv_split_idx]: if current_epoch > early_stopping_multiplicative_patience \ * self.best_val_epochs[val_metric_name][tt_split_idx][tv_split_idx] \ + early_stopping_additive_patience: has_stopped[tv_split_idx] = True if not has_stopped[tv_split_idx]: # compute best validation errors current_err = val_errors[tt_split_idx, tv_split_idx].item() best_err = self.best_val_errors[val_metric_name][tt_split_idx][tv_split_idx] # use <= on purpose such that latest epoch among tied best epochs is kept # this has been slightly beneficial for accuracy in previous experiments improved = current_err <= best_err if 
use_last_best_epoch \ else current_err < best_err if improved: self.best_val_errors[val_metric_name][tt_split_idx][tv_split_idx] = current_err self.best_val_epochs[val_metric_name][tt_split_idx][tv_split_idx] = ( self.progress.epoch + 1 ) if not any(has_stopped): # compute best mean validation errors (averaged over sub-splits (cv/refit)) # use <= on purpose such that latest epoch among tied best epochs is kept # this has been slightly beneficial for accuracy in previous experiments improved = mean_val_errors[tt_split_idx] <= self.best_mean_val_errors[val_metric_name][ tt_split_idx] if use_last_best_epoch \ else mean_val_errors[tt_split_idx] < self.best_mean_val_errors[val_metric_name][tt_split_idx] if improved: self.best_mean_val_errors[val_metric_name][tt_split_idx] = mean_val_errors[tt_split_idx] self.best_mean_val_epochs[val_metric_name][tt_split_idx] = ( self.progress.epoch + 1 ) self.progress.epoch += 1 if use_early_stopping and all(all([all(sub_lst) for sub_lst in lst]) for lst in self.has_stopped_list.values()): self.trainer.should_stop = True def on_fit_end(self): # if self.creator.config.get("use_best_epoch", True): # self.fit_params = [{'stop_epoch': mean_ep, 'best_indiv_stop_epochs': single_eps} # for mean_ep, single_eps in zip(self.best_mean_val_epochs, self.best_val_epochs)] # else: # self.fit_params = [ # {"stop_epoch": self.progress.max_epochs} # for i in range(self.creator.n_tt_splits) # ] if self.creator.config.get("use_best_epoch", True): self.fit_params = [{'stop_epoch': {val_metric_name: self.best_mean_val_epochs[val_metric_name][i] for val_metric_name in self.val_metric_names}, 'best_indiv_stop_epochs': {val_metric_name: self.best_val_epochs[val_metric_name][i] for val_metric_name in self.val_metric_names}} for i in range(self.creator.n_tt_splits)] else: self.fit_params = [ {"stop_epoch": {val_metric_name: self.progress.max_epochs for val_metric_name in self.val_metric_names}} for i in range(self.creator.n_tt_splits) ] # put in eval() mode for predict(), so we don't need to save the trainer and the optimizer state self.optimizers(use_pl_optimizer=False).eval() # delete stuff so we don't save the dataset when pickling RealMLP del self.creator.train_idxs del self.creator.val_idxs del self.train_dl del self.val_dl del self.val_preds del self.callbacks del self.ckpt_callbacks def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any: self.model.eval() with torch.no_grad(): return self._postprocess_ens_pred(self.model(batch)["x_cont"].to("cpu")) def _postprocess_ens_pred(self, y_pred: torch.Tensor) -> torch.Tensor: # if n_ens > 1, we need to average the predictions of the ensemble members n_ens = self.config.get('n_ens', 1) if n_ens == 1: return y_pred y_pred = y_pred.reshape(y_pred.shape[0] // n_ens, n_ens, *y_pred.shape[1:]) if self.is_classification_ and not self.config.get('ens_av_before_softmax', False): y_pred = torch.softmax(y_pred, dim=-1) y_pred = y_pred.mean(dim=1) y_pred = torch.log(y_pred + 1e-30) else: y_pred = y_pred.mean(dim=1) return y_pred def configure_optimizers(self): param_groups = [{"params": [p], "lr": 0.01} for p in self.model.parameters()] return get_opt_class(self.config.get('opt', 'adam'))(param_groups, self.hp_manager) def restore_ckpt_for_val_metric_name(self, val_metric_name: str): self.ckpt_callbacks[val_metric_name].restore(self) # from https://github.com/Lightning-AI/pytorch-lightning/discussions/19759 # def on_fit_start(self) -> None: # self.optimizers().train() # already above # def on_predict_start(self) -> None: 
# print(f'predict start') # self.optimizers(use_pl_optimizer=False).eval() def on_validation_model_eval(self) -> None: self.model.eval() self.optimizers(use_pl_optimizer=False).eval() def on_validation_model_train(self) -> None: self.model.train() self.optimizers(use_pl_optimizer=False).train() def on_test_model_eval(self) -> None: self.model.eval() self.optimizers(use_pl_optimizer=False).eval() def on_test_model_train(self) -> None: self.model.train() self.optimizers(use_pl_optimizer=False).train() def on_predict_model_eval(self) -> None: # redundant with on_predict_start() self.model.eval() # don't do it here in case we don't have the optimizers at predict time # self.optimizers(use_pl_optimizer=False).eval() def to(self, *args: Any, **kwargs: Any) -> 'TabNNModule': super().to(*args, **kwargs) # print(f'moving static model to {args} {kwargs}') self.creator.static_model.to(*args, **kwargs) ================================================ FILE: pytabkit/models/training/logging.py ================================================ class Logger: def __init__(self, verbosity_level): # higher verbosity level means more verbose self.verbosity_level = verbosity_level def get_verbosity_level(self): return self.verbosity_level def log(self, verbosity: int, content: str): if verbosity <= self.verbosity_level: self.force_log(content) def force_log(self, content: str): raise NotImplementedError() class StdoutLogger(Logger): def __init__(self, verbosity_level=0): super().__init__(verbosity_level) def force_log(self, content: str): print(content, flush=True) ================================================ FILE: pytabkit/models/training/metrics.py ================================================ import traceback from typing import Dict, Any, List, Optional, Tuple, Callable import numpy as np from sklearn.metrics import roc_auc_score, balanced_accuracy_score, matthews_corrcoef import torch.nn.functional as F import torch import copy from pytabkit.models.data.data import DictDataset, TaskType from pytabkit.models.data.nested_dict import NestedDict from pytabkit.models.torch_utils import cat_if_necessary, torch_np_quantile from pytabkit.models.training.auc_mu import auc_mu_impl # see also: https://scikit-learn.org/stable/modules/model_evaluation.html def to_one_hot(y, num_classes, label_smoothing_eps=0.0): one_hot = F.one_hot(y, num_classes).float() if label_smoothing_eps > 0.0: low = label_smoothing_eps / num_classes high = 1.0 - label_smoothing_eps + low return low + (high - low) * one_hot else: return one_hot def apply_reduction(res, reduction): if reduction == 'mean': return res.mean(dim=-1) elif reduction is None: return res elif reduction == 'sum': return res.sum(dim=-1) return None def cross_entropy(y_pred: torch.Tensor, y: torch.Tensor, reduction='mean'): if torch.is_floating_point(y): res = (-F.log_softmax(y_pred, dim=-1) * y).sum(dim=-1) else: res = -F.log_softmax(y_pred, dim=-1).gather(-1, y).squeeze(-1) return apply_reduction(res, reduction) def softmax_kldiv(y_pred: torch.Tensor, y: torch.Tensor, reduction='mean'): if torch.is_floating_point(y): # add 1e-30 to prevent taking the log of 0 -> it gets then multiplied by 0 anyway res = (((y + 1e-30).log() - F.log_softmax(y_pred, dim=-1)) * y).sum(dim=-1) else: res = -F.log_softmax(y_pred, dim=-1).gather(-1, y).squeeze(-1) return apply_reduction(res, reduction) def brier_loss(y_pred: torch.Tensor, y: torch.Tensor, reduction='mean'): if not torch.is_floating_point(y): y = F.one_hot(y.squeeze(-1), num_classes=y_pred.shape[-1]) res = (F.softmax(y_pred, 
dim=-1) - y).square().sum(dim=-1) result = apply_reduction(res, reduction) # print(f'{result.item()=}, {y_pred[4]=}') return result def cos_loss(y_pred, y, reduction='mean'): if not torch.is_floating_point(y): y = F.one_hot(y.squeeze(-1), num_classes=y_pred.shape[-1]) res = 1.0 - (y_pred * y).sum(dim=-1) / (y_pred.norm(dim=-1) + 1e-3) return apply_reduction(res, reduction) def mse(y_pred, y, reduction='mean'): if not torch.is_floating_point(y): # in case mse should be used for classification y = F.one_hot(y.squeeze(-1), num_classes=y_pred.shape[-1]) if y_pred.dim() != y.dim(): raise RuntimeError('MSE: y_pred.dim() != y.dim(): could lead to broadcasting errors') res = ((y_pred - y) ** 2).mean(dim=-1) return apply_reduction(res, reduction) def pinball_loss(y_pred: torch.Tensor, y: torch.Tensor, quantile: float, reduction='mean'): if y_pred.dim() != y.dim(): raise RuntimeError('Pinball loss: y_pred.dim() != y.dim(): could lead to broadcasting errors') err = y_pred - y # print(f'{quantile*err=}') res = torch.maximum((1 - quantile) * err, -quantile * err).mean(dim=-1) return apply_reduction(res, reduction) def multi_pinball_loss(y_pred: torch.Tensor, y: torch.Tensor, quantiles: List[float], reduction='mean'): if y_pred.dim() != y.dim(): raise RuntimeError('Multi-Pinball loss: y_pred.dim() != y.dim(): could lead to broadcasting errors') # print(f'{y_pred.shape=}, {y.shape=}') err = y_pred - y assert y.shape[-1] == 1 assert err.shape[-1] == len(quantiles) # print(f'{quantile*err=}') # print(f'{y_pred[:5]=}, {y[:5]=}') quantiles = torch.as_tensor(quantiles, dtype=torch.float32, device=err.device) res = torch.maximum((1 - quantiles) * err, -quantiles * err).mean(dim=-1) return apply_reduction(res, reduction) def mean_interleave(input, repeats, dim): assert input.shape[dim] % repeats == 0 new_shape = list(input.shape[:dim]) + [input.shape[dim] // repeats, repeats] + list(input.shape[dim + 1:]) # convert torch.Size to list before concatenating, since tuple + list raises a TypeError return input.view(new_shape).mean(dim=dim + 1) def get_y_probs(y: torch.Tensor, n_classes: int) -> torch.Tensor: """ Returns the empirical probabilities of all classes in y. :param y: Tensor of shape [..., n_batch, 1] and dtype torch.long or another integer dtype, containing class labels in {0, 1, ..., n_classes-1} :param n_classes: Total number of classes :return: returns a tensor of shape [..., n_classes] """ if y.shape[-1] != 1: raise ValueError(f'get_y_probs() only supports single-label classification') if torch.is_floating_point(y): raise ValueError(f'get_y_probs() expects y with non-floating dtype') if len(y.shape) > 2: # recursion return cat_if_necessary([get_y_probs(y[i], n_classes) for i in range(y.shape[0])], dim=0) return torch.bincount(y.squeeze(-1), minlength=n_classes).to(torch.float32) / y.shape[0] def insert_missing_class_columns(y_pred: torch.Tensor, train_ds: DictDataset) -> torch.Tensor: """ If train_ds.tensors['y'] does not contain some of the classes specified in train_ds.tensor_infos['y'] and if y_pred does not contain columns for these missing classes, add columns for the missing classes to y_pred, with small probabilities. :param y_pred: Tensor of logits, shape [n_batch, n_classes] :param train_ds: Dataset used for training the model that produced y_pred. :return: Returns y_pred with possibly some columns inserted.
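Example (sketch): with n_classes = 4 and class 2 absent from the training set, a y_pred of shape
[n_batch, 3] is expanded to shape [n_batch, 4]; the inserted column 2 receives logits chosen so that
its softmax probability is exactly 1 / (train_ds.n_samples + n_classes), the posterior probability
of an unseen class under a uniform Dirichlet prior (see the computation of posterior_prob below).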
""" n_classes = train_ds.tensor_infos['y'].get_cat_sizes()[0].item() if y_pred.shape[-1] >= n_classes: return y_pred # already all columns # assume that the missing classes/columns in y_pred are exactly those that are not represented in the training set train_class_counts = torch.bincount(train_ds.tensors['y'].squeeze(-1), minlength=n_classes).cpu() n_missing = n_classes - y_pred.shape[-1] pred_col_idx = 0 new_cols = [] logsumexp = torch.logsumexp(y_pred, dim=-1) # expected posterior probability of the class under uniform prior # (expected value of corresponding Dirichlet distribution, which is conjugate prior to "multinoulli" distribution) posterior_prob = 1 / (train_ds.n_samples + n_classes) # ensure that the probability of missing classes is posterior_prob if y_pred are the logits missing_values = logsumexp + np.log(posterior_prob / (1 - posterior_prob * n_missing)) for i in range(n_classes): if train_class_counts[i] > 0: # this column should be represented new_cols.append(y_pred[:, pred_col_idx]) pred_col_idx += 1 else: new_cols.append(missing_values) return torch.stack(new_cols, dim=-1) def remove_missing_classes(y_pred: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Removes missing classes from y_pred and y. For example, if y_pred.shape[-1] == 4 but y only contains the values 0 and 2, the columns y_pred[..., 1] and y_pred[..., 3] will be removed and the values (0, 2) will be mapped to (0, 1). :param y_pred: Predictions of shape (n_samples, n_classes) (should be logits because probabilities will not be normalized anymore after removing columns). :param y: classes of shape (n_samples,) :return: y_pred and y with missing classes removed """ # shapes: y_pred should be n_samples x n_classes, y should be n_samples n_classes = y_pred.shape[-1] counts = torch.bincount(y, minlength=n_classes) is_present = counts > 0 if torch.all(is_present).item(): # all classes are present, nothing needs to be removed return y_pred, y num_present = is_present.sum().item() reduced_y_pred = y_pred[..., is_present] class_mapping = torch.zeros(n_classes, dtype=torch.long, device=y.device) class_mapping[is_present] = torch.arange(num_present, dtype=torch.long, device=y.device) reduced_y = class_mapping[y] # print(f'{is_present=}, {reduced_y_pred.shape=}, {torch.unique(reduced_y)=}') return reduced_y_pred, reduced_y def expected_calibration_error(y_pred: torch.Tensor, y: torch.Tensor): if y.is_floating_point(): y = y.argmax(dim=-1) else: y = y.squeeze(-1) if len(y_pred.shape) == 3: # contains a n_models dimension y_pred_models = [y_pred[i] for i in range(y_pred.shape[0])] y_models = [y[i] for i in range(y.shape[0])] else: y_pred_models = [y_pred] y_models = [y] model_scores = [] # evaluate separately for each model for y_pred_indiv, y_indiv in zip(y_pred_models, y_models): # handle classes that don't occur in the test set y_pred_indiv, y_indiv = remove_missing_classes(y_pred_indiv, y_indiv) # convert logits to probabilities y_pred_indiv_probs = F.softmax(y_pred_indiv, dim=-1) # ensure that no probabilities are zero or one to circumvent some problems # https://github.com/Lightning-AI/torchmetrics/issues/1646 y_pred_indiv_probs = y_pred_indiv_probs.clamp(1e-7, 1 - 1e-7) y_pred_indiv_probs = y_pred_indiv_probs / y_pred_indiv_probs.sum(dim=-1, keepdim=True) num_classes = y_pred_indiv_probs.shape[-1] is_binary = num_classes == 2 if is_binary: # binary classification, torchmetrics expects only probabilities of the positive class y_pred_indiv_probs = y_pred_indiv_probs[..., 1] # 
print(f'{torch.unique(y_indiv)=}') # print(f'{torch.unique(y_pred_indiv_probs)=}') # print(f'{y_indiv.shape=}, {y_pred_indiv_probs.shape=}') # print(f'{torch.min(y_pred_indiv_probs)=}') # print(f'{torch.max(y_pred_indiv_probs)=}') import torchmetrics metric = torchmetrics.CalibrationError(task='binary' if is_binary else 'multiclass', num_classes=num_classes) model_scores.append(metric.forward(y_pred_indiv_probs, y_indiv)) if len(y_pred.shape) == 3: # input had n_models dimension, so output should have it, too return torch.as_tensor(model_scores, dtype=torch.float32) else: return torch.as_tensor(model_scores[0], dtype=torch.float32) def auc_ovr_torchmetrics(y_pred: torch.Tensor, y: torch.Tensor): if y.is_floating_point(): y = y.argmax(dim=-1) else: y = y.squeeze(-1) if len(y_pred.shape) == 3: # contains a n_models dimension y_pred_models = [y_pred[i] for i in range(y_pred.shape[0])] y_models = [y[i] for i in range(y.shape[0])] else: y_pred_models = [y_pred] y_models = [y] model_scores = [] # evaluate separately for each model for y_pred_indiv, y_indiv in zip(y_pred_models, y_models): # handle classes that don't occur in the test set y_pred_indiv, y_indiv = remove_missing_classes(y_pred_indiv, y_indiv) # convert logits to probabilities y_pred_indiv_probs = F.softmax(y_pred_indiv, dim=-1) # ensure that no probabilities are zero or one to circumvent some problems # https://github.com/Lightning-AI/torchmetrics/issues/1646 y_pred_indiv_probs = y_pred_indiv_probs.clamp(1e-7, 1 - 1e-7) y_pred_indiv_probs = y_pred_indiv_probs / y_pred_indiv_probs.sum(dim=-1, keepdim=True) num_classes = y_pred_indiv_probs.shape[-1] is_binary = num_classes == 2 if is_binary: # binary classification, torchmetrics expects only probabilities of the positive class y_pred_indiv_probs = y_pred_indiv_probs[..., 1] # print(f'{torch.unique(y_indiv)=}') # print(f'{torch.unique(y_pred_indiv_probs)=}') # print(f'{y_indiv.shape=}, {y_pred_indiv_probs.shape=}') # print(f'{torch.min(y_pred_indiv_probs)=}') # print(f'{torch.max(y_pred_indiv_probs)=}') import torchmetrics metric = torchmetrics.AUROC(task='binary' if is_binary else 'multiclass', num_classes=num_classes) model_scores.append(metric.forward(y_pred_indiv_probs, y_indiv)) if len(y_pred.shape) == 3: # input had n_models dimension, so output should have it, too return torch.as_tensor(model_scores, dtype=torch.float32) else: return torch.as_tensor(model_scores[0], dtype=torch.float32) class Metrics: def __init__(self, metric_names, val_metric_name, task_type): self.metric_names = metric_names self.val_metric_name = val_metric_name self.task_type = task_type if val_metric_name not in metric_names: self.metric_names.append(val_metric_name) def compute_metrics_dict(self, y_preds: List[torch.Tensor], y: torch.Tensor, use_ens: bool) -> NestedDict: """ :param y_preds: y predictions by (possibly multiple) ensemble members :param y: actual labels (one-hot encoded in case of classification) :param use_ens: Whether to also compute metrics for ensembled predictions :return: Returns a NestedDict indexed by [str(n_models), str(start_idx), metric_name] containing the respective metric values (float) for an ensemble using y_preds[start_idx:start_idx+n_models] In the ensembling case, n_models > 1 is also used, but only with start_idx = 0 """ if np.any([y_pred.dim() != 2 for y_pred in y_preds]): raise RuntimeError('Not all y_preds have dim 2') if y.dim() != 2: raise RuntimeError('y.dim() != 2') results_dict = NestedDict() # individual results for start_idx, y_pred in enumerate(y_preds): for 
metric_name in self.metric_names: result = Metrics.apply(y_pred, y, metric_name).item() results_dict[str(1), str(start_idx), metric_name] = float(result) # ensemble results if len(y_preds) > 1 and use_ens: for n_models in range(2, len(y_preds) + 1): y_pred = Metrics.avg_preds(y_preds[:n_models], self.task_type) for metric_name in self.metric_names: result = Metrics.apply(y_pred, y, metric_name).cpu().numpy() results_dict[str(n_models), str(0), metric_name] = float(result) return results_dict def compute_val_score(self, val_metrics_dict: NestedDict) -> float: # ['1'] refers to ensemble with 1 member # values() contains the results for the different individual models individual_val_scores = [indiv_dict[self.val_metric_name] for indiv_dict in val_metrics_dict['1'].values()] return float(np.mean(individual_val_scores)) @staticmethod def apply(y_pred: torch.Tensor, y: torch.Tensor, metric_name: str) -> torch.Tensor: # shapes in general: n_models x n_samples x output_dim # for some classification metrics, y should contain the class numbers, # be of type torch.long and have output_dim = 1 # for other classification metrics like cross_entropy, y can also be soft labels with output_dim = n_classes # in the classification case, y_pred are assumed to be logits invalid = torch.logical_or(torch.isnan(y_pred), torch.isinf(y_pred)) if torch.any(invalid): if y.is_floating_point(): # regression y_pred = torch.clone(y_pred) y_pred[torch.any(invalid, dim=-1), :] = 0.0 else: # classification # y_pred[invalid] = -np.inf # leads to NaN after softmax() y_pred = torch.clone(y_pred) not_invalid = y_pred[~invalid] if len(not_invalid) == 0: y_pred[invalid] = 0.0 else: y_pred[invalid] = torch.min(not_invalid) - 100 # a very small value, basically zero probability y_pred_probs = torch.softmax(y_pred, dim=-1) y_pred = torch.log(y_pred_probs + 1e-30) def get_y_categorical(): if y.is_floating_point(): return y.argmax(dim=-1) return y.squeeze(-1) if metric_name == 'class_error': return torch.count_nonzero(y_pred.argmax(dim=-1) != get_y_categorical(), dim=-1) / y_pred.shape[-2] elif metric_name == 'cos_loss': return cos_loss(y_pred, y) elif metric_name == 'cross_entropy': return cross_entropy(y_pred, y) elif metric_name == 'n_cross_entropy': n_classes = y_pred.shape[-1] y_avg_log = torch.log(get_y_probs(y, n_classes) + 1e-30) # insert batch dimension and expand along batch dimension y_avg_log = y_avg_log.unsqueeze(-2).expand(*y_pred.shape) return cross_entropy(y_pred, y) / cross_entropy(y_avg_log, y) elif metric_name == 'ce_unif': return (-F.softmax(y_pred, dim=-1).log()).mean(dim=-1).mean(dim=-1) elif metric_name == '1-auc_ovo': return 1.0 - Metrics.apply_sklearn_classification_metric( y_pred, y, lambda y1, y2: roc_auc_score(y1, y2, multi_class='ovo'), needs_pred_probs=True) elif metric_name == '1-auc_ovr': return 1.0 - Metrics.apply_sklearn_classification_metric( y_pred, y, lambda y1, y2: roc_auc_score(y1, y2, multi_class='ovr'), needs_pred_probs=True) elif metric_name == '1-auc_ovr_alt': return 1.0 - auc_ovr_torchmetrics(y_pred, y) elif metric_name == '1-auc_mu': return 1.0 - Metrics.apply_sklearn_classification_metric( y_pred, y, auc_mu_impl, needs_pred_probs=True, two_class_single_column=False) elif metric_name == 'brier': return brier_loss(y_pred, y) elif metric_name == 'n_brier': n_classes = y_pred.shape[-1] y_avg_log = torch.log(get_y_probs(y, n_classes) + 1e-30) # insert batch dimension and expand along batch dimension y_avg_log = y_avg_log.unsqueeze(-2).expand(*y_pred.shape) return brier_loss(y_pred, y) / 
brier_loss(y_avg_log, y) elif metric_name == '1-balanced_accuracy': return 1.0 - Metrics.apply_sklearn_classification_metric(y_pred, y, balanced_accuracy_score, needs_pred_probs=False) elif metric_name == '1-mcc': return 1.0 - Metrics.apply_sklearn_classification_metric(y_pred, y, matthews_corrcoef, needs_pred_probs=False) elif metric_name == 'ece': return expected_calibration_error(y_pred, y) elif metric_name == 'rmse': return mse(y_pred, y).sqrt() elif metric_name == 'nrmse': # rmse relative to rmse of the best constant predictor rmse = mse(y_pred, y).sqrt() den = y.std(correction=0) return rmse / den elif metric_name == 'mae': return (y_pred - y).abs().mean(dim=-1).mean(dim=-1) elif metric_name == 'nmae': # mae relative to mae of the best constant predictor median = torch.median(y) mae = (y_pred - y).abs().mean(dim=-1).mean(dim=-1) den = (median - y).abs().mean(dim=-1).mean(dim=-1) return mae / den elif metric_name == 'max_error': return (y_pred - y).abs().max(dim=-1)[0].max(dim=-1)[0] elif metric_name == 'n_max_error': # max error relative to the max error of the best constant predictor max_error = (y_pred - y).abs().max(dim=-1)[0].max(dim=-1)[0] max = y.max(dim=-1)[0].max(dim=-1)[0] min = y.min(dim=-1)[0].min(dim=-1)[0] ref_error = (0.5 * (max - min)) return max_error / (ref_error + 1e-30) elif metric_name.startswith('pinball('): # expected format: pinball(number), e.g. pinball(0.95) quantile = float(metric_name[len('pinball('):-1]) result = pinball_loss(y_pred, y, quantile) # print(f'pinball loss: {result:g}') return result elif metric_name.startswith('n_pinball('): # expected format: n_pinball(number), e.g. n_pinball(0.95) # compute loss divided by loss of the best constant predictor quantile = float(metric_name[len('n_pinball('):-1]) raw_loss = pinball_loss(y_pred, y, quantile) best_constant_y_pred = torch_np_quantile(y, quantile, dim=-2, keepdim=True).expand(*y_pred.shape) best_constant_loss = pinball_loss(best_constant_y_pred, y, quantile) return raw_loss / (best_constant_loss + 1e-30) elif metric_name.startswith('c_pinball('): # expected format: c_pinball(number), e.g. c_pinball(0.95) # compute pinball loss after post-hoc calibration quantile = float(metric_name[len('c_pinball('):-1]) err_quantile = torch_np_quantile(y - y_pred, quantile, dim=-2, keepdim=True) raw_loss = pinball_loss(y_pred + err_quantile, y, quantile) return raw_loss elif metric_name.startswith('multi_pinball('): # expected format: multi_pinball(number1, ..., numberk), e.g. 
multi_pinball(0.25, 0.5, 0.75) quantiles = [float(nbr) for nbr in metric_name[len('multi_pinball('):-1].split(',')] result = multi_pinball_loss(y_pred, y, quantiles) # print(f'pinball loss: {result:g}') return result else: try: import probmetrics.metrics except ImportError: raise ValueError(f'Unknown metric {metric_name}') try: y_cat = get_y_categorical() y_pred = y_pred calref_dict = { f'{cr_short}-{loss_short}-{posthoc_short}{cv_short}': f'{cr_long}_{loss_long}_{posthoc_long}_{cv_long}' for cr_short, cr_long in [('ref', 'refinement'), ('cal', 'calib-err')] for loss_short, loss_long in [('ll', 'logloss'), ('br', 'brier')] for posthoc_short, posthoc_long in [('ts', 'ts-mix'), ('is', 'isotonic-mix')] for cv_short, cv_long in [('', 'all'), ('-cv5', 'cv-5')] } prob_metric_name = metric_name if metric_name in calref_dict: prob_metric_name = calref_dict[metric_name] if 'ts-mix' in prob_metric_name: # run temperature scaling on CPU, it's more efficient (at least for smaller datasets) y_cat = y_cat.cpu() y_pred = y_pred.cpu() metric = probmetrics.metrics.Metric.from_name(prob_metric_name) # todo: doesn't work with soft target distributions for now if len(y_pred.shape) == 2: return metric.compute_from_labels_logits(y_cat, y_pred) elif len(y_pred.shape) == 3: return torch.stack( [metric.compute_from_labels_logits(y_cat[i], y_pred[i]) for i in range(y_pred.shape[0])], dim=0) else: raise AssertionError(f'{len(y_pred.shape)=}, but must be 2 or 3') except ImportError: pass except ValueError as e: # can be thrown if the name is unknown to Metric traceback.print_exc() raise ValueError(f'Unknown metric {metric_name}') @staticmethod def apply_sklearn_classification_metric(y_pred: torch.Tensor, y: torch.Tensor, metric_function: Callable, needs_pred_probs: bool, two_class_single_column: bool = True): if y.is_floating_point(): y = y.argmax(dim=-1) else: y = y.squeeze(-1) if len(y_pred.shape) == 3: # contains a n_models dimension y_pred_models = [y_pred[i] for i in range(y_pred.shape[0])] y_models = [y[i] for i in range(y.shape[0])] else: y_pred_models = [y_pred] y_models = [y] model_scores = [] # evaluate separately for each model for y_pred_indiv, y_indiv in zip(y_pred_models, y_models): # handle classes that don't occur in the test set y_pred_indiv, y_indiv = remove_missing_classes(y_pred_indiv, y_indiv) if needs_pred_probs: # convert logits to probabilities y_pred_np = F.softmax(y_pred_indiv, dim=-1).cpu().numpy() if y_pred_np.shape[-1] == 2 and two_class_single_column: # binary classification, scikit-learn expects only probabilities of the positive class y_pred_np = y_pred_np[..., 1] else: # convert logits to predicted class y_pred_np = torch.argmax(y_pred_indiv, dim=-1).cpu().numpy() y_np = y_indiv.cpu().numpy() model_scores.append(metric_function(y_np, y_pred_np)) if len(y_pred.shape) == 3: # input had n_models dimension, so output should have it, too return torch.as_tensor(model_scores, dtype=torch.float32) else: return torch.as_tensor(model_scores[0], dtype=torch.float32) @staticmethod def avg_preds(y_preds: List[torch.Tensor], task_type): if task_type == TaskType.CLASSIFICATION: # it should be logmeanexp, but doesn't matter because it is normalized by softmax # y_pred = torch.logsumexp(torch.stack(y_preds, dim=0), dim=0) probs = [F.softmax(y_pred, dim=-1) for y_pred in y_preds] avg_probs = sum(probs) / len(probs) y_pred = torch.log(avg_probs + 1e-30) else: y_pred = sum(y_preds) / len(y_preds) return y_pred @staticmethod def defaults(y_cat_sizes, val_metric_name: Optional[str] = None) -> 'Metrics': if 
val_metric_name is None: val_metric_name = 'class_error' if y_cat_sizes[0] > 0 else 'rmse' # removed cos_loss default_class_metrics = ['class_error', 'cross_entropy', 'ce_unif', 'brier', 'n_cross_entropy', 'n_brier', '1-balanced_accuracy', '1-mcc', 'ece', '1-auc_ovo', '1-auc_ovr'] if len(y_cat_sizes) == 1 and y_cat_sizes[0] == 2: # bin class return Metrics(default_class_metrics, val_metric_name, TaskType.CLASSIFICATION) elif y_cat_sizes[0] > 0: if y_cat_sizes[0] > 100: default_class_metrics = [m for m in default_class_metrics if m != '1-auc_ovo'] # multi-class (or multi-label classification) return Metrics(default_class_metrics, val_metric_name, TaskType.CLASSIFICATION) else: # regression return Metrics(['rmse', 'mae', 'max_error', 'nrmse', 'nmae', 'n_max_error', 'pinball(0.95)', 'n_pinball(0.95)'], val_metric_name, TaskType.REGRESSION) @staticmethod def default_val_metric_name(task_type): if task_type == TaskType.CLASSIFICATION: return 'class_error' elif task_type == TaskType.REGRESSION: return 'rmse' else: raise ValueError(f'Unknown task type {task_type}') @staticmethod def default_eval_metric_name(task_type): if task_type == TaskType.CLASSIFICATION: return 'class_error' elif task_type == TaskType.REGRESSION: return 'nrmse' else: raise ValueError(f'Unknown task type {task_type}') ================================================ FILE: pytabkit/models/training/nn_creator.py ================================================ import functools from typing import List, Optional, Tuple, Callable, Dict, Any import numpy as np import torch from pytabkit.models import utils from pytabkit.models.data.data import DictDataset, ParallelDictDataLoader, TaskType, ValDictDataLoader from pytabkit.models.nn_models.base import set_hp_context, SequentialLayer, Layer, Variable from pytabkit.models.nn_models.models import NNFactory from pytabkit.models.training.coord import HyperparamManager from pytabkit.models.training.logging import Logger from pytabkit.models.training.metrics import Metrics, mse, cross_entropy from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources def get_realmlp_auto_batch_size(n_train: int): # if n_train <= 2**6: # 64 # return 2**4 # 16 # elif n_train <= 2**8: # return 2**5 # elif n_train <= 2**10: # return 2**6 # elif n_train <= 2**12: # return 2**7 # elif n_train <= 2**15: # return 2**8 # elif n_train <= 2**17: # return 2**9 # # return 2**10 if n_train <= 1024: return 64 elif n_train <= 8192: return 128 elif n_train <= 30_000: return 256 elif n_train <= 100_000: return 512 return 1024 class NNCreator: def __init__(self, fit_params: Optional[List[Dict[str, Any]]] = None, **config): self.fit_params = fit_params self.config = config self.device_info = None # todo: allow better configurability, including mps? self.n_tt_splits = None self.n_tv_splits = None self.static_model = None self.factory = self.config.get('factory', None) if self.factory is None: self.factory = NNFactory(**self.config) self.hp_manager = HyperparamManager(**self.config) # Data Info self.is_cv = None self.train_idxs = None self.val_idxs = None self.n_classes = None def setup_from_dataset(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources): torch.backends.cuda.matmul.allow_tf32 = False # todo: should we do this? # todo: allow preprocessing on CPU and then only put batches on GPU in data loader? 
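# Illustration of the split bookkeeping below (numbers are made up): with 2 trainval-test splits and
# 5-fold cross-validation per split, idxs_list has length 2 and each split_idxs.train_idxs has shape
# [5, n_train], so n_tt_splits = 2 and n_tv_splits = 5.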
gpu_devices = interface_resources.gpu_devices self.device_info = gpu_devices[0] if len(gpu_devices) > 0 else 'cpu' # the code below requires all splits to have the same number of sub-splits assert np.all([idxs_list[i].train_idxs.shape[0] == idxs_list[0].train_idxs.shape[0] for i in range(len(idxs_list))]) # we can then decompose the overall number of sub-splits into the number of splits # and the number of sub-splits per split self.n_tt_splits = len(idxs_list) self.n_tv_splits = idxs_list[0].train_idxs.shape[0] self.is_cv = idxs_list[0].val_idxs is not None assert np.all([(split_idxs.val_idxs is not None) == self.is_cv for split_idxs in idxs_list]) y_cat_sizes = ds.tensor_infos['y'].get_cat_sizes().numpy() self.n_classes = y_cat_sizes[0] self.train_idxs = torch.cat([split_idxs.train_idxs for split_idxs in idxs_list], dim=0) self.val_idxs = torch.cat([split_idxs.val_idxs for split_idxs in idxs_list], dim=0) if self.is_cv else None def get_criterions(self) -> Tuple[Callable, List[str]]: task_type = TaskType.REGRESSION if self.n_classes == 0 else TaskType.CLASSIFICATION # train criterion # todo: add more options? train_metric_name = self.config.get('train_metric_name', None) if train_metric_name is None: train_criterion = mse if self.n_classes == 0 else cross_entropy # defaults elif train_metric_name == 'mse': train_criterion = mse elif train_metric_name == 'cross_entropy': train_criterion = cross_entropy else: train_criterion = functools.partial(Metrics.apply, metric_name=train_metric_name) # train_criterion = lambda y_pred, y, mn=train_metric_name: Metrics.apply(y_pred, y, mn) # else: # raise ValueError(f'{train_metric_name=} is currently not supported') val_metric_name = self.config.get('val_metric_name', Metrics.default_val_metric_name(task_type)) val_metric_names = self.config.get('val_metric_names', [val_metric_name]) return train_criterion, val_metric_names def create_model(self, ds: DictDataset, idxs_list: List[SplitIdxs]): ds = ds.to(self.device_info) # Create static model model_fitter = self.factory.create(ds.tensor_infos) static_fitter, dynamic_fitter = model_fitter.split_off_dynamic() self.static_model, ds = static_fitter.fit_transform(ds) # in the single split case, we can already apply static fitters to the dataset is_single_split = len(idxs_list) == 1 and idxs_list[0].n_trainval_splits == 1 n_ens = self.config.get('n_ens', 1) models = [] # Build non-static models for split_idx, split_idxs in enumerate(idxs_list): # loop over different trainval-test splits # fit initial values only on train model_idx = 0 with torch.no_grad(): # fit initial values on train_ds for sub_idx in range(split_idxs.n_trainval_splits): for ens_idx in range(n_ens): # loop over different train-val splits if 'feature_importances' in self.config: assert n_ens == 1 # don't know if model_idx is handled correctly otherwise self.hp_manager.get_more_info_dict()['feature_importances'] = \ self.config['feature_importances'][model_idx] if 'fixed_weight' in self.config: assert n_ens == 1 # don't know if model_idx is handled correctly otherwise self.hp_manager.get_more_info_dict()['fixed_weight'] = \ self.config['fixed_weight'][model_idx] train_ds = ds.get_sub_dataset(split_idxs.train_idxs[sub_idx, :]) # still call it 'trainval_ds' # because that's what the clipping and output standardization layers use self.hp_manager.get_more_info_dict()['trainval_ds'] = train_ds data_fitter, individual_fitter = dynamic_fitter.split_off_individual() ram_limit_gb = self.config.get('init_ram_limit_gb', 1.0) with 
set_hp_context(self.hp_manager): torch.manual_seed(utils.combine_seeds(split_idxs.split_seed, ens_idx)) # should not be necessary, but just in case # torch.manual_seed(split_idxs.split_seed + ens_idx) # should not be necessary, but just in case data_tfm, tfmd_ds = data_fitter.fit_transform_subsample( train_ds, ram_limit_gb, needs_tensors=individual_fitter.needs_tensors) torch.manual_seed(utils.combine_seeds(split_idxs.sub_split_seeds[sub_idx], ens_idx)) # torch.manual_seed(split_idxs.sub_split_seeds[sub_idx] + ens_idx) with set_hp_context(self.hp_manager): individual_tfm = individual_fitter.fit_transform_subsample( tfmd_ds, ram_limit_gb=ram_limit_gb, needs_tensors=False)[0] if is_single_split and self.config.get('allow_single_split_opt', True): self.static_model = SequentialLayer([self.static_model, data_tfm]) models.append(individual_tfm) else: models.append(SequentialLayer([data_tfm, individual_tfm])) self.hp_manager.get_more_info_dict()['trainval_ds'] = None model_idx += 1 # print(f'{models[0]=}') # for p in models[0].parameters(): # print(str(p.context.scope)) vectorized_model = models[0].stack(models).to(self.device_info) fixed_init_params: Optional[List[Variable]] = self.config.get('fixed_init_params', None) if fixed_init_params is not None: assert n_ens == 1 fixed_init_param_patterns = self.config['fixed_init_param_patterns'] reinit_lr_factor = self.config.get('reinit_lr_factor', 1.0) for param, fixed_init_param in zip(vectorized_model.parameters(), fixed_init_params): scope_str = str(param.context.scope) # print(scope_str) if any(pattern in scope_str for pattern in fixed_init_param_patterns): print(f'Initializing {scope_str} from fixed parameters') with torch.no_grad(): param.copy_(fixed_init_param) param: Variable = param param.hyper_factors['lr'] = reinit_lr_factor * fixed_init_param.hyper_factors.get('lr', 1.0) # param.hyper_factors['wd'] = 0.0 else: print(f'Initializing {scope_str} newly') return vectorized_model def create_callbacks(self, model: Layer, logger: Logger, val_metric_names: List[str]): from pytabkit.models.training.lightning_callbacks import StopAtEpochsCallback, HyperparamCallback, \ L1L2RegCallback, \ ModelCheckpointCallback callbacks = [HyperparamCallback(self.hp_manager), L1L2RegCallback(self.hp_manager, model)] n_ens = self.config.get('n_ens', 1) # if validation if self.is_cv and self.fit_params is None and self.config.get('use_best_epoch', True): for val_metric_name in val_metric_names: callbacks.append(ModelCheckpointCallback(n_tt_splits=self.n_tt_splits, n_tv_splits=self.n_tv_splits, n_ens=n_ens, use_best_mean_epoch=self.config.get('use_best_mean_epoch_for_cv', False), val_metric_name=val_metric_name, restore_best=self.config.get('use_best_epoch', True))) elif self.fit_params is not None: if self.config.get('use_best_mean_epoch_for_refit', True): stop_epochs = [[params['stop_epoch']] * self.n_tv_splits for params in self.fit_params] else: if 'best_indiv_stop_epochs' not in self.fit_params[0] \ or len(self.fit_params[0]['best_indiv_stop_epochs']) != self.n_tv_splits: raise ValueError(f'Setting use_best_mean_epoch_for_refit=False ' f'requires setting use_best_epoch=True and n_cv==n_refit') stop_epochs = [params['best_indiv_stop_epochs'] for params in self.fit_params] callbacks.append( StopAtEpochsCallback(stop_epochs=stop_epochs, n_models=self.n_tv_splits, n_ens=n_ens, model=model, logger=logger)) # only for debugging: # callbacks.append(ValidationCallback(ds=ds, val_idxs=test_idxs, # metric_name=Metrics.default_metric_name(task_type), # logger=logger, 
n_models=n_models, n_parallel=n_parallel, # save_best_params=False, # val_batch_size=self.config.get('predict_batch_size', 256))) return callbacks def create_dataloaders(self, ds: DictDataset): ds = ds.to(self.device_info) ds = self.static_model(ds) batch_size = self.config.get('batch_size', 256) n_ens = self.config.get('n_ens', 1) if batch_size == 'auto': batch_size = get_realmlp_auto_batch_size(self.train_idxs.shape[0]) train_dl = ParallelDictDataLoader(ds, self.train_idxs.repeat_interleave(n_ens, dim=0), batch_size=batch_size, shuffle=True, drop_last=True, adjust_bs=self.config.get('adjust_bs', False)) val_dl = None if self.is_cv and self.fit_params is None: val_dl = ValDictDataLoader(ds, self.val_idxs.repeat_interleave(n_ens, dim=0), val_batch_size=self.config.get('predict_batch_size', 1024)) return train_dl, val_dl ================================================ FILE: pytabkit/models/training/scheduling.py ================================================ import numpy as np import math class LearnerProgress: def __init__(self): self.epoch = 0 self.epoch_steps = 0 self.total_steps = 0 self.epoch_samples = 0 self.total_samples = 0 self.epoch_float = 0.0 self.max_epochs = 0 def get_fit_progress(self): return None if self.max_epochs is None else self.epoch_float / self.max_epochs def sched_prod(first, second): if not isinstance(first, Schedule): first = ConstantSchedule(first) if not isinstance(second, Schedule): second = ConstantSchedule(second) if isinstance(first, TimeSchedule) and isinstance(second, TimeSchedule): return ProductTimeSchedule_(first, second) return ProductSchedule_(first, second) def sched_sum(first, second): if not isinstance(first, Schedule): first = ConstantSchedule(first) if not isinstance(second, Schedule): second = ConstantSchedule(second) if isinstance(first, TimeSchedule) and isinstance(second, TimeSchedule): return SumTimeSchedule_(first, second) return SumSchedule_(first, second) class Schedule: def get_value(self): raise NotImplementedError() def update(self, learner): raise NotImplementedError() def __mul__(self, other): return sched_prod(self, other) def __rmul__(self, other): return sched_prod(other, self) def __add__(self, other): return sched_sum(self, other) def __radd__(self, other): return sched_sum(other, self) def __neg__(self): return -1.0 * self def __sub__(self, other): return self + (-other) def __rsub__(self, other): return other + (-self) class TimeSchedule(Schedule): def __init__(self): self.t = 0.0 def call_time_(self, t: float): raise NotImplementedError() def get_value(self): return self.call_time_(self.t) def update(self, learner): self.t = learner.progress.get_fit_progress() def scaled(self, ymin=0., ymax=1., tmin=0., tmax=1.): return ScaledSchedule(self, ymin, ymax, tmin, tmax) def reversed(self): return self.scaled(tmin=1., tmax=0.) 
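# Hedged usage sketch (not part of the original module; the helper name _schedule_composition_demo is
# illustrative only): shows how TimeSchedule.scaled() and the operator overloads on Schedule compose
# concrete schedules such as the ones defined below.
def _schedule_composition_demo():
    # piecewise schedule: cosine warmup from 0.04 to 1 over the first quarter of training,
    # then cosine annealing from 1 down to 1e-5; combine_scheds() and get_cos_sched() are
    # defined later in this module, which is fine since the names are resolved at call time
    warmup_anneal = combine_scheds([0.25, 0.75],
                                   [get_cos_sched().scaled(0.04, 1.0),
                                    get_cos_sched().scaled(1.0, 1e-5)])
    # 0.5 * schedule goes through __rmul__ -> sched_prod -> ProductTimeSchedule_
    halved = 0.5 * warmup_anneal
    for t in [0.0, 0.25, 0.5, 1.0]:
        print(f'{t=}: {halved.call_time_(t)=:.6g}')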
class ConstantSchedule(TimeSchedule): def __init__(self, val): super().__init__() self.val = val def call_time_(self, t: float): return self.val class FunctionSchedule(TimeSchedule): def __init__(self, f): super().__init__() self.f = f def call_time_(self, t: float): return self.f(t) class ScaledSchedule(TimeSchedule): def __init__(self, base_schedule: TimeSchedule, ymin=0., ymax=1., tmin=0., tmax=1.): super().__init__() self.base_schedule = base_schedule self.ymin = ymin self.ymax = ymax self.tmin = tmin self.tmax = tmax def call_time_(self, t: float): return self.ymin + (self.ymax - self.ymin) * self.base_schedule.call_time_( self.tmin + (self.tmax - self.tmin) * t) class ProductSchedule_(Schedule): def __init__(self, first: Schedule, second: Schedule): super().__init__() self.first = first self.second = second def get_value(self): return self.first.get_value() * self.second.get_value() def update(self, learner): self.first.update(learner) self.second.update(learner) class ProductTimeSchedule_(TimeSchedule): def __init__(self, first: TimeSchedule, second: TimeSchedule): super().__init__() self.first = first self.second = second def call_time_(self, t: float): return self.first.call_time_(t) * self.second.call_time_(t) class SumSchedule_(Schedule): def __init__(self, first: Schedule, second: Schedule): super().__init__() self.first = first self.second = second def get_value(self): return self.first.get_value() + self.second.get_value() def update(self, learner): self.first.update(learner) self.second.update(learner) class SumTimeSchedule_(TimeSchedule): def __init__(self, first: TimeSchedule, second: TimeSchedule): super().__init__() self.first = first self.second = second def call_time_(self, t: float): return self.first.call_time_(t) + self.second.call_time_(t) class ScheduleSequence(TimeSchedule): def __init__(self, lengths, schedules): super().__init__() self.lengths = np.array(lengths) self.event_times = np.hstack([[0.], np.cumsum(self.lengths)]) self.schedules = schedules def call_time_(self, t: float): idx = np.max(np.argwhere(self.event_times <= t)) idx = min(idx, len(self.schedules)-1) start = self.event_times[idx] end = self.event_times[idx+1] return self.schedules[idx].call_time_((t-start)/(end-start)) class ExponentialSchedule(TimeSchedule): def __init__(self, start, end): super().__init__() self.log_start = np.log(start) self.log_end = np.log(end) def call_time_(self, t: float): return np.exp(self.log_start + t * (self.log_end - self.log_start)) def cos_warm_func(x): if x < 2 ** (-10): return 1.0 else: base_x = 2**(int(np.log2(x))-1) # negative float values are rounded up return 0.5 + 0.5*np.cos(np.pi * (x/base_x - 1)) def combine_scheds(lengths, schedules): return ScheduleSequence(lengths, schedules) def get_cos_sched() -> FunctionSchedule: return FunctionSchedule(lambda x: 0.5 * (1.0 - math.cos(math.pi * x))) def get_id_sched() -> FunctionSchedule: return FunctionSchedule(lambda x: x) def get_lin_sched() -> FunctionSchedule: return FunctionSchedule(lambda x: 1.-x) def get_cos_warm_sched() -> FunctionSchedule: return FunctionSchedule(cos_warm_func) def connect_cos_scheds(times, values): return combine_scheds([t2 - t1 for t1, t2 in zip(times[:-1], times[1:])], [get_cos_sched().scaled(v1, v2) for v1, v2 in zip(values[:-1], values[1:])]) def connect_lin_scheds(times, values): return combine_scheds([t2 - t1 for t1, t2 in zip(times[:-1], times[1:])], [get_id_sched().scaled(v1, v2) for v1, v2 in zip(values[:-1], values[1:])]) class FirstToLastSchedule(TimeSchedule): def
__init__(self, n_params): super().__init__() argmax_points = np.linspace(0.2, 0.6, n_params) self.scheds = [combine_scheds([t, 1.-t], [get_cos_sched().scaled(0.04, 1.), get_cos_sched().scaled(1., 1e-5)]) for t in argmax_points] def call_time_(self, t: float): return np.array([s.call_time_(t) for s in self.scheds]) class StepFunctionSchedule(Schedule): def __init__(self, f): self.step = 0 self.f = f def update(self, learner): self.step = learner.progress.total_steps def get_value(self): return self.f(self.step) class EpochLengthSqMomSchedule(Schedule): def __init__(self, min_value: float = 0.95, base_value: float = 0.5): self.value = min_value self.min_value = min_value self.base_value = base_value def update(self, learner): n_batches_per_epoch = len(learner.data_loader) self.value = max(self.min_value, self.base_value ** (1 / n_batches_per_epoch)) def get_value(self): return self.value class CoslogFunc: def __init__(self, n_cycles: int): self.n_cycles = n_cycles def __call__(self, t): return 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2 ** self.n_cycles - 1) * t))) class GenCoslogFunc: def __init__(self, n_cycles: int, base: float): self.n_cycles = n_cycles self.base = base def __call__(self, t): return 0.5 * (1 - np.cos(2 * np.pi * np.log(1 + (self.base ** self.n_cycles - 1) * t) / np.log(self.base))) class AltCoslogFunc: def __init__(self, n_cycles: int): self.n_cycles = n_cycles def __call__(self, t): return 0.5 * (1 - np.cos(2 * np.pi * np.log2(np.sqrt(2) + (2 ** self.n_cycles - np.sqrt(2)) * t))) def cos_func(x): return 0.5 * (1.0 - math.cos(math.pi * x)) def identity_func(x): return x def lin_func(x): return 1 - x def get_schedule(sched_name: str) -> Schedule: sched_type = sched_name base_sched = None cos_sched = FunctionSchedule(cos_func) # from 0 to 1 # id_sched = FunctionSchedule(identity_func) lin_sched = FunctionSchedule(lin_func) # from 1 to 0 cos_warm_sched = FunctionSchedule(cos_warm_func) identity_sched = FunctionSchedule(identity_func) constant_sched = ConstantSchedule(1.0) one_cycle_lr_sched = combine_scheds([0.25, 0.75], [cos_sched.scaled(0.04, 1.), cos_sched.scaled(1., 1e-5)]) fastai1_lr_sched = combine_scheds([0.3, 0.7], [cos_sched.scaled(0.04, 1.), cos_sched.scaled(1., 4e-6)]) mod_one_cycle_lr_sched = combine_scheds([0.25, 0.75], [cos_sched.scaled(1e-5, 1.), cos_sched.scaled(1., 1e-5)]) if not isinstance(sched_type, str): base_sched = sched_type elif sched_type == 'linear': return lin_sched elif sched_type == 'constant' or sched_type == 'flat': return ConstantSchedule(1.0) elif sched_type == 'one_cycle': base_sched = one_cycle_lr_sched elif sched_type == 'two_cycle': base_sched = combine_scheds([0.5, 0.5], [one_cycle_lr_sched] * 2) elif sched_type == 'three_cycle': base_sched = combine_scheds([0.25, 0.25, 0.5], [one_cycle_lr_sched] * 3) elif sched_type == 'four_cycle': base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [one_cycle_lr_sched] * 4) elif sched_type == 'c4': base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [mod_one_cycle_lr_sched] * 4) elif sched_type == 'c5': base_sched = combine_scheds([0.0625, 0.0625, 0.125, 0.25, 0.5], [mod_one_cycle_lr_sched] * 5) elif sched_type == 'long_plateau': base_sched = combine_scheds([0.2, 0.6, 0.2], [cos_sched.scaled(0.04, 1), ConstantSchedule(1.0), cos_sched.scaled(1, 1e-5)]) elif sched_type == 'sched1': base_sched = connect_cos_scheds([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], [0.04, 1.0, 0.01, 1.0, 1.0, 1e-5]) elif sched_type == 'sched2': base_sched = connect_cos_scheds([0.0, 0.125, 0.375, 0.5, 0.75, 1.0], [0.04, 1.0, 0.05, 
1.0, 1.0, 1e-5]) elif sched_type == 'sched3': base_sched = connect_cos_scheds([0.0, 8 / 64, 16 / 64, 24 / 64, 32 / 64, 56 / 64, 1.0], [1e-3, 1.0, 1.0, 1e-3, 1.0, 1.0, 1e-3]) elif sched_type == 'sched4': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) elif sched_type == 'sched5': base_sched = connect_cos_scheds([0.0, 0.75, 1.0], [0.04, 1.0, 1e-5]) elif sched_type == 'sched6': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.5, 0.5], [base_sched] * 2) elif sched_type == 'sched7': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.25, 0.25, 0.5], [base_sched] * 3) elif sched_type == 'sched8': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [base_sched] * 4) elif sched_type == 'sched9': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125]*8, [base_sched] * 8) elif sched_type == 'sched10': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.0625, 0.0625, 0.125, 0.25, 0.5], [base_sched] * 5) elif sched_type == 'sched11': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [ConstantSchedule(lr) * base_sched for lr in [0.6, 0.8, 1.0, 1.5]]) elif sched_type == 'sched12': base_sched = connect_cos_scheds([0.0, 0.5, 1.0], [0.04, 1.0, 1e-5]) base_sched = combine_scheds([0.125, 0.125, 0.25, 0.5], [ConstantSchedule(lr) * base_sched for lr in [1.0, 1.0, 1.0, 1.5]]) elif sched_type == 'custom1': sched = connect_cos_scheds([0.0, 0.5, 1.0], [4e-2, 1.0, 1e-5]) base_sched = combine_scheds([0.5, 0.5], [sched] * 2) elif sched_type == 'flat_anneal': base_sched = combine_scheds([0.6, 0.4], [ConstantSchedule(1.0), cos_sched.scaled(1., 1e-5)]) elif sched_type == 'flat_cos': base_sched = combine_scheds([0.5, 0.5], [ConstantSchedule(1.0), cos_sched.scaled(1., 0.)]) elif sched_type == 'cos_anneal': base_sched = cos_sched.scaled(1.0, 1e-4) elif sched_type == 'fastai1': base_sched = fastai1_lr_sched elif sched_type == 'cos_warm': base_sched = cos_warm_sched elif sched_type == 'cos_warm_4': base_sched = connect_cos_scheds([0.0, 1/15, 1/15, 3/15, 3/15, 7/15, 7/15, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]) elif sched_type == 'datarobot': # described in https://www.youtube.com/watch?v=WPQOkoXhdBQ base_sched = combine_scheds([0.25, 0.5, 0.25], [cos_sched.scaled(0.1, 1.), cos_sched.scaled(1., 0.1), cos_sched.scaled(0.1, 0.003)]) elif sched_type == 'one_cycle_0.1': base_sched = combine_scheds([0.1, 0.9], [cos_sched.scaled(0.04, 1.), cos_sched.scaled(1., 1e-5)]) elif sched_type == 'one_cycle_mom': base_sched = combine_scheds([0.25, 0.75], [cos_sched.scaled(0.95, 0.85), cos_sched.scaled(0.85, 0.95)]) elif sched_type == '1-1/step': base_sched = StepFunctionSchedule(lambda step: 1-1/(step+1)) elif sched_type == 'epoch_length': base_sched = EpochLengthSqMomSchedule() elif sched_type == 'epoch_length_2': base_sched = EpochLengthSqMomSchedule(base_value=0.1) elif sched_type == 'epoch_length_3': base_sched = EpochLengthSqMomSchedule(base_value=0.05) elif sched_type == 'cos_log_15': base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif sched_type == 'cos_log_31': base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 31 * t)))) elif sched_type == 'cos_log_63': base_sched = FunctionSchedule(lambda t: 0.5 * 
(1 - np.cos(2 * np.pi * np.log2(1 + 63 * t)))) elif sched_type == 'cos_log_31_sq_mom': base_sched = FunctionSchedule(lambda t: np.exp(-0.05 * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 31 * t)))) - 1e-8) elif sched_type == 'cos_sched': base_sched = cos_sched.scaled(1., 0.) elif sched_type == 'cos': base_sched = cos_sched.scaled(1., 0.) elif sched_type == 'cos_increasing': base_sched = cos_sched.scaled(0., 1.) elif sched_type == 'quad': base_sched = FunctionSchedule(lambda t: (1-t)**2) elif sched_type == 'cubic': base_sched = FunctionSchedule(lambda t: (1 - t) ** 3) elif sched_type == 'lin_cos_log_15': base_sched = FunctionSchedule(lambda t: 2 * t * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif sched_type == 'lin2_cos_log_15': base_sched = FunctionSchedule(lambda t: (0.5 + t) * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif sched_type == 'lin3_cos_log_15': base_sched = FunctionSchedule(lambda t: (1.5 - t) * 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + 15 * t)))) elif isinstance(sched_type, str) and sched_type.startswith('coslin'): n_cycles = int(sched_type[len('coslin')]) base_sched = FunctionSchedule(lambda t, c=n_cycles: 0.5 * (1 - np.cos(2 * np.pi * c * t))) # base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2**n_cycles-1) * t)))) elif isinstance(sched_type, str) and sched_type.startswith('coslog'): n_cycles = int(sched_type[len('coslog'):]) base_sched = FunctionSchedule(CoslogFunc(n_cycles)) elif isinstance(sched_type, str) and sched_type.startswith('gencoslog'): components = sched_type[len('gencoslog'):].split('-') assert len(components)==2 n_cycles = int(components[0]) base = float(components[1]) base_sched = FunctionSchedule(GenCoslogFunc(n_cycles, base)) elif sched_type == 'warmup_0.05_cos': base_sched = connect_cos_scheds([0.0, 0.05, 1.0], [0.0, 1.0, 0.0]) elif sched_type == 'expm4t': base_sched = FunctionSchedule(lambda t: np.exp(-4*t)) elif sched_type == 'expm3t': base_sched = FunctionSchedule(lambda t: np.exp(-3*t)) elif sched_type == 'expm5t': base_sched = FunctionSchedule(lambda t: np.exp(-5*t)) elif sched_type == 'expm6t': base_sched = FunctionSchedule(lambda t: np.exp(-6*t)) elif sched_type == 'expm8t': base_sched = FunctionSchedule(lambda t: np.exp(-8 * t)) elif sched_type == 'invp1e-2': base_sched = FunctionSchedule(lambda t: 1e-2 / (t + 1e-2)) elif sched_type == 'invsqrtp1e-3': base_sched = FunctionSchedule(lambda t: np.sqrt(1e-3) / np.sqrt(t + 1e-3)) elif sched_type == 'quartic': base_sched = FunctionSchedule(lambda t: (1.-t)**4) elif sched_type == 'pow5': base_sched = FunctionSchedule(lambda t: (1.-t)**5) elif sched_type == 'pow6': base_sched = FunctionSchedule(lambda t: (1.-t)**6) elif sched_type == 'warmup_inv': # base_sched = FunctionSchedule(lambda t: min(20*t, np.sqrt(0.05)/(np.sqrt(t)+1e-8))) base_sched = FunctionSchedule(lambda t: min(20*t, 0.05/(t+1e-8))) elif sched_type == 'sqrt_cos': # base_sched = FunctionSchedule(lambda t: min(20*t, np.sqrt(0.05)/(np.sqrt(t)+1e-8))) base_sched = FunctionSchedule(lambda t: 0.05/(np.sqrt(t)+0.05) * (0.5 - 0.5*np.cos(5 * 2 * np.pi * t))) elif sched_type == 'lin_cos': # base_sched = FunctionSchedule(lambda t: min(20*t, np.sqrt(0.05)/(np.sqrt(t)+1e-8))) base_sched = FunctionSchedule(lambda t: (1-t) * (0.5 - 0.5*np.cos(20 * 2 * np.pi * t))) elif sched_type == 'linwarm.05eps': base_sched = combine_scheds([0.05, 0.95], [identity_sched.scaled(1e-3, 1.0), constant_sched]) elif isinstance(sched_type, str) and sched_type.startswith('altcoslog'): n_cycles = 
int(sched_type[len('altcoslog')]) base_sched = FunctionSchedule(AltCoslogFunc(n_cycles)) # base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2**n_cycles-1) * t)))) elif isinstance(sched_type, str) and sched_type.startswith('altquadcyc'): n_cycles = int(sched_type[len('altquadcyc')]) single_cycle = FunctionSchedule(lambda t: 4 * (t-0.5)**2) cycle_sched = single_cycle for i in range(n_cycles-1): cycle_sched = combine_scheds([0.5, 0.5], [cycle_sched, single_cycle]) cycle_sched = cycle_sched.scaled(tmax=0.75) base_sched = cycle_sched # base_sched = FunctionSchedule(lambda t: 0.5 * (1 - np.cos(2 * np.pi * np.log2(1 + (2**n_cycles-1) * t)))) if base_sched is None: raise ValueError(f'Unknown schedule type "{sched_type}"') return base_sched ================================================ FILE: pytabkit/models/utils.py ================================================ import multiprocessing as mp import os import os.path import heapq import glob import gzip import shutil import timeit from pathlib import Path from typing import List, Tuple, Any, Dict, Union, Optional, Callable import copy import uuid import multiprocessing import time import json from torch import multiprocessing as mp try: from yaml import CLoader as Loader, CDumper as Dumper except ImportError: from yaml import Loader, Dumper from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import QuantileTransformer from sklearn.base import check_is_fitted import numpy as np def select_from_config(config: Dict, keys: List): selected = {} for key in keys: if key in config: selected[key] = config[key] return selected def adapt_config(config, **kwargs): new_config = copy.deepcopy(config) for key, value in kwargs.items(): new_config[key] = value return new_config def existsDir(directory): if directory != '': if not os.path.exists(directory): return False return True def existsFile(file_path): return os.path.isfile(file_path) def ensureDir(file_path): directory = os.path.dirname(file_path) if directory != '': if not os.path.exists(directory): os.makedirs(directory) def matchFiles(file_matcher): return glob.glob(file_matcher) def newDirname(prefix): i = 0 name = prefix if existsDir(prefix): while existsDir(prefix + "_" + str(i)): i += 1 name = prefix + "_" + str(i) os.makedirs(name) return name def getSubfolderNames(folder): return [os.path.basename(name) for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))] def getSubfolders(folder): return [os.path.join(folder, name) for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))] def writeToFile(filename, content): ensureDir(filename) file = open(filename, 'w') file.truncate() file.write(content) file.close() def readFromFile(filename): if not os.path.isfile(filename): return '' file = open(filename, 'r') result = file.read() file.close() return result def create_dir(path): os.makedirs(path) def delete_file(path): os.remove(path) def serialize(filename: Union[Path, str], obj: Any, compressed: bool = False, use_json: bool = False, use_yaml: bool = False, use_msgpack: bool = False, use_pickle: bool = False): # json only works for nested dicts ensureDir(filename) if compressed: file = gzip.open(filename, 'wt' if (use_json or use_yaml) else 'wb', compresslevel=5) else: file = open(filename, 'w' if (use_json or use_yaml) else 'wb') if use_json: json.dump(obj, file) elif use_yaml: import yaml yaml.dump(obj, file, Dumper=Dumper) elif use_msgpack: import msgpack msgpack.dump(obj, file) elif use_pickle: 
import pickle pickle.dump(obj, file) else: # dill can dump lambdas, and dill also dumps the class and not only the contents import dill dill.dump(obj, file) file.close() def deserialize(filename: Union[Path, str], compressed: bool = False, use_json: bool = False, use_yaml: bool = False, use_msgpack: bool = False, use_pickle: bool = False): # json only works for nested dicts if compressed: file = gzip.open(filename, 'rt' if (use_json or use_yaml) else 'rb') else: file = open(filename, 'r' if (use_json or use_yaml) else 'rb') if use_json: result = json.load(file) elif use_yaml: import yaml result = yaml.load(file, Loader=Loader) elif use_msgpack: import msgpack result = msgpack.load(file) elif use_pickle: import pickle result = pickle.load(file) else: import dill result = dill.load(file) file.close() return result def copyFile(src, dst): ensureDir(dst) shutil.copyfile(src, dst) def nsmallest(n, inputList): return heapq.nsmallest(n, inputList)[-1] def identity(x): return x def set_none_except(lst, idxs): for i in range(len(lst)): if i not in idxs: lst[i] = None def argsort(lst): # from https://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python return sorted(range(len(lst)), key=lst.__getitem__) def join_dicts(*dicts): # Attention: arguments do not commute since later dicts can override entries from earlier dicts! result = copy.copy(dicts[0]) for d in dicts[1:]: result.update(d) return result def update_dict(d: dict, update: Optional[dict] = None, remove_keys: Optional[Union[Any, List[Any]]] = None): d = copy.copy(d) if update is not None: d.update(update) if remove_keys is not None: if isinstance(remove_keys, List): for key in remove_keys: if key in d: d.pop(key) else: if remove_keys in d: d.pop(remove_keys) return d def map_nested(obj: Union[List, Dict, Any], f: Callable, dim: int): """ dim=0 will apply f to obj directly, dim=1 to all elements in obj, etc. 
""" if dim <= 0: return f(obj) elif isinstance(obj, dict): return {key: map_nested(value, f, dim-1) for key, value in obj.items()} elif isinstance(obj, list): return [map_nested(value, f, dim-1) for value in obj] def select_nested(obj: Union[List, Dict], idx: Any, dim: int): return map_nested(obj, lambda x: x[idx], dim) def shift_dim_nested(obj: Union[List, Dict], dim1: int, dim2: int): # in a nested combination of lists and dicts, shift the indexing dimension dim1 to dim2 # example: if d = {'a': [{'b': 1}, {'b': 2}]}, dim1 = 1, dim2 = 2, then the result should be # {'a': {'b': [1, 2]}} if dim1 < 0 or dim2 < 0: raise ValueError(f'expected dim1 >= 0 and dim2 >= 0, but got {dim1=} and {dim2=}') # if dim2 <= dim1: # raise ValueError(f'expected dim2 > dim1, but got {dim1=} and {dim2=}') if dim1 > 0 and dim2 > 0: if isinstance(obj, dict): return {key: shift_dim_nested(value, dim1-1, dim2-1) for key, value in obj.items()} else: # assume that value is a list return [shift_dim_nested(value, dim1-1, dim2-1) for value in obj] elif dim1 > 1: # dim1 > dim2, shift backwards return shift_dim_nested(shift_dim_nested(obj, dim1, dim1 - 1), dim1 - 1, dim2) elif dim2 > 1: # dim2 > dim1, shift forwards return shift_dim_nested(shift_dim_nested(obj, dim1, dim1 + 1), dim1 + 1, dim2) else: # switch dimensions 0 and 1 if isinstance(obj, dict): first = next(iter(obj.values())) if isinstance(first, dict): # swap two dicts return {key2: {key1: obj[key1][key2] for key1 in obj} for key2 in first} else: # assume it is a list return [{key1: obj[key1][i] for key1 in obj} for i in range(len(first))] else: first = obj[0] if isinstance(first, dict): return {key2: [obj[i][key2] for i in range(len(obj))] for key2 in first} else: # assume it is a list return [[obj[i][j] for i in range(len(obj))] for j in range(len(first))] pass pass def pretty_table_str(str_table): if len(str_table) == 0: return '' max_lens = [np.max([len(row[i]) for row in str_table]) for i in range(len(str_table[0]))] whole_str = '' for row in str_table: for i, entry in enumerate(row): whole_str += entry + (' ' * (max_lens[i] - len(entry))) whole_str += '\n' return whole_str[:-1] # remove last newline def get_uuid_str(): pid_str = str(multiprocessing.current_process().pid) time_str = str(time.time_ns()) rand_str = str(uuid.UUID(bytes=os.urandom(16), version=4)) return '_'.join([time_str, pid_str, rand_str]) def get_batch_intervals(n_total: int, batch_size: int) -> List[Tuple[int, int]]: boundaries = [i * batch_size for i in range(1 + n_total // batch_size)] if boundaries[-1] != n_total: boundaries.append(n_total) return [(start, stop) for start, stop in zip(boundaries[:-1], boundaries[1:])] def all_equal(lst: List): # see https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical return not lst or [lst[0]]*len(lst) == lst class Timer: def __init__(self): self.start_time_total = None self.start_time_process = None self.acc_time_total = 0.0 self.acc_time_process = 0.0 def start(self): if self.start_time_total is None or self.start_time_process is None: self.start_time_total = timeit.default_timer() self.start_time_process = time.process_time() def pause(self): if self.start_time_total is None or self.start_time_process is None: return # has already been paused or not been started self.acc_time_total += timeit.default_timer() - self.start_time_total self.acc_time_process += time.process_time() - self.start_time_process self.start_time_total = None self.start_time_process = None def get_result_dict(self): return {'total': 
self.acc_time_total, 'process': self.acc_time_process} class TimePrinter: def __init__(self, desc: str): self.desc = desc self.timer = Timer() def __enter__(self): self.timer.start() def __exit__(self, exc_type, exc_val, exc_tb): self.timer.pause() print(f'Time for {self.desc}: {self.timer.get_result_dict()["total"]:g}s') def extract_params(config: Dict[str, Any], param_configs: List[Union[Tuple[str, Optional[Union[str, List[str]]]], Tuple[str, Optional[Union[str, List[str]]], Any]]]) -> Dict[str, Any]: """ Convert parameters in config to correct parameter names for another method and (optionally) insert default values :param config: Dictionary with values for parameters :param param_configs: Tuples specifying parameter names, e.g.: ('eta', None) specifies that result['eta'] = config['eta'] should be set if 'eta' is in config ('eta', 'lr') specifies that result['eta'] = config['lr'] should be set if 'lr' is in config ('eta', ['eta', 'lr']) specifies that either config['eta'] or config['lr'] should be used, if available A third value in the tuple specifies a default value that should be used if no value is available in config. :return: A dictionary as specified above. """ result = {} for param_config in param_configs: target_name = param_config[0] source_names = param_config[1] if source_names is None: source_names = [target_name] elif isinstance(source_names, str): source_names = [source_names] source_names_in_config = [source_name for source_name in source_names if source_name in config] if len(source_names_in_config) == 0: if len(param_config) >= 3: # default value specified result[target_name] = param_config[2] # use the default value elif len(source_names_in_config) == 1: result[target_name] = config[source_names_in_config[0]] else: raise ValueError(f'Found multiple parameter names encoding the same parameter: {source_names_in_config}') return result def reverse_argmin(x: Union[List, np.ndarray]): """ Does the same as np.argmin but in case of equality selects the last best one :param x: list or array of numbers :return: index of last minimum """ if isinstance(x, list): x = np.asarray(x) assert(len(x.shape) == 1) return len(x) - 1 - int(np.argmin(x[::-1])) def combine_seeds(seed_1: int, seed_2: int) -> int: """ Combines two random seeds to a new seed in a hopefully "typically injective" way :param seed_1: First random seed. :param seed_2: Second random seed.
:return: Another random seed """ generator = np.random.default_rng(seed=seed_1) return int(generator.integers(low=0, high=2**24) + seed_2) def numpy_to_native_rec(obj: Any): if isinstance(obj, list): return [numpy_to_native_rec(o) for o in obj] elif isinstance(obj, dict): return {key: numpy_to_native_rec(value) for key, value in obj.items()} else: # https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types # works for arrays as well as numpy scalars return getattr(obj, "tolist", lambda: obj)() class ProcessPoolMapper: def __init__(self, n_processes: int, chunksize=1): self.n_processes = n_processes self.chunksize = chunksize pass def _apply(self, f_and_args_serialized: str) -> str: import dill f, args = dill.loads(f_and_args_serialized) return dill.dumps(f(*args)) def map(self, f, args_tuples: List[Tuple]) -> Any: import dill if self.n_processes == 1: return [f(*args) for args in args_tuples] mp_ctx = mp.get_context('spawn') pool = mp_ctx.Pool(self.n_processes) serialized_args = [dill.dumps(args) for args in args_tuples] results = pool.map(self._apply, serialized_args, chunksize=self.chunksize) pool.terminate() return [dill.loads(s) for s in results] # adapted from https://github.com/yandex-research/tabular-dl-tabr/blob/75105013189c76bc4f247633c2fb856bc948e579/lib/data.py#L262 class TabrQuantileTransformer(BaseEstimator, TransformerMixin): def __init__(self, noise=1e-3, random_state=None, n_quantiles=1000, subsample=1_000_000_000, output_distribution="normal"): self.noise = noise self.random_state = random_state self.n_quantiles = n_quantiles self.subsample = subsample self.output_distribution = output_distribution def fit(self, X, y=None): # Calculate the number of quantiles based on data size n_quantiles = max(min(X.shape[0] // 30, self.n_quantiles), 10) # Initialize QuantileTransformer normalizer = QuantileTransformer( output_distribution=self.output_distribution, n_quantiles=n_quantiles, subsample=self.subsample, random_state=self.random_state ) # Add noise if required X_modified = self._add_noise(X) if self.noise > 0 else X # Fit the normalizer normalizer.fit(X_modified) # show that it's fitted self.normalizer_ = normalizer return self def transform(self, X, y=None): check_is_fitted(self) return self.normalizer_.transform(X) def _add_noise(self, X): stds = np.std(X, axis=0, keepdims=True) noise_std = self.noise / np.maximum(stds, self.noise) rng = np.random.default_rng(self.random_state) return X + noise_std * rng.standard_normal(X.shape) class FunctionRunner: def __init__(self, dill_f_args_kwargs, result_queue): self.dill_f_args_kwargs = dill_f_args_kwargs self.result_queue = result_queue def __call__(self): # print(f'DEBUG: FunctionRunner start') import dill f, args, kwargs = dill.loads(self.dill_f_args_kwargs) result = f(*args, **kwargs) self.result_queue.put(result) self.result_queue.join() class FunctionProcess: """ Helper class to run a single function in a separate process. 
""" def __init__(self, f, *args, **kwargs): import dill self.result_queue = mp.JoinableQueue() self.process = mp.Process(target=FunctionRunner(dill.dumps((f, args, kwargs)), self.result_queue)) def start(self) -> 'FunctionProcess': self.process.start() return self def is_done(self) -> bool: return not self.result_queue.empty() def get_ram_usage_gb(self) -> float: import psutil return psutil.Process(self.process.pid).memory_info().rss / 1024 ** 3 def pop_result(self) -> Any: result = self.result_queue.get() self.result_queue.task_done() time.sleep(1e-2) self.process.terminate() return result class ObjectLoadingContext: def __init__(self, obj: Any, filename: Optional[Union[str, Path]] = None): self.obj = obj self.filename = filename self.saved = False def __enter__(self) -> Any: # use pickle since it works better with torch than dill if self.saved: self.obj = deserialize(self.filename, use_pickle=True) return self.obj def __exit__(self, type, value, traceback) -> None: if self.filename is not None: serialize(self.filename, self.obj, use_pickle=True) self.saved = True del self.obj # taken from TabArena def convert_numpy_dtypes(data: dict) -> dict: """Converts NumPy dtypes in a dictionary to Python dtypes. Some hyperparameter search space's generate configs with numpy dtypes which aren't serializable to yaml. This fixes that. """ converted_data = {} for key, value in data.items(): if isinstance(value, np.generic): converted_data[key] = value.item() elif isinstance(value, dict): converted_data[key] = convert_numpy_dtypes(value) elif isinstance(value, list): converted_data[key] = [ convert_numpy_dtypes({i: v})[i] if isinstance(v, (dict, np.generic)) else v for i, v in enumerate(value) ] else: converted_data[key] = value return converted_data ================================================ FILE: scripts/analyze_hpo_best_params.py ================================================ import msgpack_numpy as m m.patch() import numbers from typing import Optional import fire import numpy as np from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection, TaskDescription from pytabkit.bench.run.results import ResultManager from pytabkit.models import utils def analyze_hpo_best(alg_name: str, coll_name: str, n_splits: int = 10, data_path: Optional[str] = None): print(f'Analyzing {coll_name}:') if data_path is not None: paths = Paths(data_path) else: paths = Paths.from_env_variables() if '/' in coll_name: task_infos = [TaskDescription(*coll_name.split('/')).load_info(paths)] else: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) best_params = [] for task_info in task_infos: for split_id in range(n_splits): results_path = paths.results_alg_task_split(task_info.task_desc, alg_name, n_cv=1, split_type=SplitType.RANDOM, split_id=split_id) result_manager = ResultManager.load(results_path, load_other=True, load_preds=False) if (not isinstance(result_manager.other_dict['cv'], dict) or 'fit_params' not in result_manager.other_dict['cv']): raise ValueError( f'Did not get a dict containing fit_params, instead got {result_manager.other_dict["cv"]=}') fit_params = result_manager.other_dict['cv']['fit_params'] print(f'{fit_params=}') # print(fit_params) best_params.append(fit_params['hyper_fit_params'] if 'hyper_fit_params' in fit_params else (fit_params['sub_fit_params'] if 'sub_fit_params' in fit_params else fit_params)) if isinstance(best_params[-1], list): best_params[-1] = best_params[-1][0] # add keys 
from sub-dicts like in scikit-learn with __ flattened_params = {} for key, value in best_params[-1].items(): if isinstance(value, dict): for sub_key, sub_value in value.items(): flattened_params[f'{key}__{sub_key}'] = sub_value best_params[-1] = utils.join_dicts(best_params[-1], flattened_params) # print(best_params[-1]) # print(result_manager.other_dict) # return param_names = sorted(list(best_params[0].keys())) for param_name in param_names: values = [config[param_name] for config in best_params] unique_values = [] # do it manually so that it only requires equality comparison and not hashing or other comparisons for v in values: if v not in unique_values: unique_values.append(v) # print(f'Processing {param_name=} with {unique_values=}') if len(unique_values) == 1: continue # a hyperparam that hasn't been tuned, most likely elif len(unique_values) <= 10: print(f'Frequencies of best values for hyperparameter {param_name}:') for value in unique_values: n_best = len([v for v in values if v == value]) print(f'{value}: {n_best}') print() elif all(isinstance(v, numbers.Number) for v in unique_values): print(f'Hyperparameter {param_name}: mean={np.mean(values):g}, quantiles:') for q in np.linspace(0.0, 1.0, 11): print(f'alpha={q:g}: {np.quantile(values, q)}') print() else: print(f'No method for printing values of hyperparameter {param_name}') print() # # for act_name in ['relu', 'selu', 'mish']: # n_best = len([config for config in best_params if config['act'] == act_name]) # print(f'Number of times that {act_name} was best: {n_best}') if __name__ == '__main__': fire.Fire(analyze_hpo_best) ================================================ FILE: scripts/analyze_tasks.py ================================================ from pathlib import Path from typing import List, Optional import fire import matplotlib.pyplot as plt import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.models import utils def print_task_analysis(coll_name: str, paths: Paths): coll = TaskCollection.from_name(coll_name, paths) # coll.save(paths) task_infos = coll.load_infos(paths) print(f'Data sets in task collection {coll_name}:') str_table = [['Data set: ', 'n ', 'k ', 'd ', 'd_one_hot ', 'd_one_hot_leq_10 ', 'd_one_hot_target ', 'largest_cat']] for task_info in task_infos: name = task_info.task_desc.task_name n = task_info.n_samples # k = number of classes k = task_info.tensor_infos['y'].get_cat_sizes()[0].item() n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() d = n_cont + len(cat_sizes) # ignore 'missing' categories d_one_hot = n_cont + sum([1 if cs==3 else cs-1 for cs in cat_sizes]) d_one_hot_leq_10 = n_cont + sum([(1 if cs==3 else cs-1) if cs <= 11 else 1 for cs in cat_sizes]) n_target = 1 if k <= 2 else k d_one_hot_target = n_cont + sum([(1 if cs==3 else min(n_target, cs-1)) for cs in cat_sizes]) largest_cat = 0 if cat_sizes is not None and len(cat_sizes) > 0: largest_cat = int(np.max(task_info.tensor_infos['x_cat'].get_cat_sizes().numpy())) str_table.append([name + ' ', str(n) + ' ', str(k) + ' ', str(d.item()) + ' ', str(d_one_hot.item()) + ' ', str(d_one_hot_leq_10.item()) + ' ', str(d_one_hot_target.item()) + ' ', str(largest_cat) + ' ']) print(utils.pretty_table_str(str_table)) print() print(f'Number of tasks with more than 1000 samples: {len([ti for ti in task_infos if ti.n_samples >= 1000])}') print(f'Number of tasks: {len(task_infos)}') print() print() def 
plot_tasks(coll_name: str, paths: Paths): coll = TaskCollection.from_name(coll_name, paths) task_infos = coll.load_infos(paths) plt.figure(figsize=(5, 4)) for task_info in task_infos: n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() d = n_cont + len(cat_sizes) n = task_info.n_samples plt.loglog(n, d, 'k.') plt.xlabel('Number of samples') plt.ylabel('Number of features') plt.tight_layout() filename = Path('../plots') / f'{coll_name}.pdf' utils.ensureDir(filename) plt.savefig(filename, bbox_inches='tight') def plot_tasks_multi(coll_names: List[str], paths: Paths): plt.figure(figsize=(7, 5)) for coll_name in coll_names: coll = TaskCollection.from_name(coll_name, paths) task_infos = coll.load_infos(paths) ds = [] ns = [] for task_info in task_infos: n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() d = n_cont + len(cat_sizes) n = task_info.n_samples ds.append(d) ns.append(n) plt.loglog(ns, ds, '.', label=coll_name) plt.legend() plt.xlabel('Number of samples') plt.ylabel('Number of features') plt.tight_layout() filename = Path('../plots') / f'data_set_characteristics.pdf' utils.ensureDir(filename) plt.savefig(filename, bbox_inches='tight') def analyze_tasks(coll_name: Optional[str] = None): paths = Paths.from_env_variables() # coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', # 'grinsztajn-class', 'grinsztajn-reg', # # 'grinsztajn-cat-class', 'grinsztajn-num-class', 'grinsztajn-cat-reg', 'grinsztajn-num-reg', # # 'grinsztajn-cat-class-15k', 'grinsztajn-num-class-15k', 'grinsztajn-cat-reg-15k', # # 'grinsztajn-num-reg-15k' # ] if coll_name is None: coll_names = [dir.stem for dir in paths.tasks().iterdir()] for coll_name in coll_names: print_task_analysis(coll_name, paths) plot_tasks(coll_name, paths) else: print_task_analysis(coll_name, paths) # plot_tasks_multi(coll_names, paths) if __name__ == '__main__': fire.Fire(analyze_tasks) # paths = Paths.from_env_variables() # # # coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', # # 'grinsztajn-class', 'grinsztajn-reg', # # # 'grinsztajn-cat-class', 'grinsztajn-num-class', 'grinsztajn-cat-reg', 'grinsztajn-num-reg', # # # 'grinsztajn-cat-class-15k', 'grinsztajn-num-class-15k', 'grinsztajn-cat-reg-15k', # # # 'grinsztajn-num-reg-15k' # # ] # # coll_names = [dir.stem for dir in paths.tasks().iterdir()] # # for coll_name in coll_names: # print_task_analysis(coll_name, paths) # plot_tasks(coll_name, paths) # # plot_tasks_multi(coll_names, paths) # print_task_analysis('cc18-bin-class', paths) # print_task_analysis('cc18-multi-class', paths) ================================================ FILE: scripts/check_missing_values.py ================================================ from typing import Optional import fire import openml from pytabkit.bench.data.import_tasks import set_openml_cache_dir, PandasTask from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection def check_missing_values(openml_cache_dir: Optional[str] = None): paths = Paths.from_env_variables() for coll_name in ['meta-test-class', 'meta-test-reg']: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) # task_infos = [task_info for task_info in task_infos if task_info.n_samples < 5000] task_infos_no_missing_numeric = [] task_infos_no_missing = [] for task_info in task_infos: openml_task_id = 
task_info.more_info_dict['openml_task_id'] with paths.new_tmp_folder() as tmp_folder: set_openml_cache_dir(openml_cache_dir or tmp_folder) task = openml.tasks.get_task(openml_task_id, download_data=False) dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False) print(f'Analyzing {dataset.name}:') pd_task = PandasTask.from_openml_task_id(openml_task_id) has_column_nan = pd_task.x_df.isna().any() has_numeric_nan = has_column_nan[pd_task.cont_indicator].any(axis=None) has_categorical_nan = has_column_nan[pd_task.cat_indicator].any(axis=None) print(f'{has_numeric_nan=}, {has_categorical_nan=}') if not has_numeric_nan: task_infos_no_missing_numeric.append(task_info) if not has_categorical_nan: task_infos_no_missing.append(task_info) # task = openml.tasks.get_task(openml_task_id, download_data=False) # dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False) # x_df, y_df, cat_indicator, names = dataset.get_data(target=task.target_name, dataset_format='dataframe') # has_column_nan = x_df.isna().any() TaskCollection(coll_name + '-no-missing-numeric', [task_info.task_desc for task_info in task_infos_no_missing_numeric]).save(paths) TaskCollection(coll_name + '-no-missing', [task_info.task_desc for task_info in task_infos_no_missing]).save(paths) if __name__ == '__main__': fire.Fire(check_missing_values) pass ================================================ FILE: scripts/copy_algs.py ================================================ import shutil from typing import List import fire from pytabkit.bench.data.paths import Paths def copy_algs_in_paths(paths_1: Paths, paths_2: Paths, alg_names: List[str]): for alg_name in alg_names: print(f'Copying alg {alg_name}') shutil.copytree(paths_1.algs() / alg_name, paths_2.algs() / alg_name) shutil.copytree(paths_1.results() / alg_name, paths_2.results() / alg_name) shutil.copytree(paths_1.result_summaries() / alg_name, paths_2.result_summaries() / alg_name) def copy_specific_algs(): paths_1 = Paths('first_path') paths_2 = Paths('second_path') alg_names = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost'] for version in ['D', 'TD-class', 'TD-reg', 'HPO']] alg_names.extend( [an + suffix for an in ['MLP-RTDL-D', 'ResNet-RTDL-D', 'TabR-S-D'] for suffix in ['-class', '-reg']]) alg_names.extend(['MLP-HPO', 'MLP-RTDL-HPO', 'RF-SKL-D', 'XGB-PBB-D']) copy_algs_in_paths(paths_1, paths_2, alg_names) def copy_algs(path_1: str, path_2: str, *alg_names): paths_1 = Paths(path_1) paths_2 = Paths(path_2) copy_algs_in_paths(paths_1, paths_2, list(alg_names)) if __name__ == '__main__': fire.Fire(copy_algs) ================================================ FILE: scripts/create_plots_and_tables.py ================================================ from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.analysis import ResultsTables from pytabkit.bench.eval.plotting import plot_schedule, plot_schedules, plot_benchmark_bars, plot_scatter, \ plot_pareto, plot_winrates, plot_stopping, plot_cumulative_ablations, plot_cdd from pytabkit.bench.eval.tables import generate_ds_table, generate_collections_table, generate_individual_results_table, \ generate_ablations_table, generate_refit_table, generate_preprocessing_table, generate_stopping_table, \ generate_architecture_table if __name__ == '__main__': paths = Paths.from_env_variables() coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', 'grinsztajn-class-filtered', 'grinsztajn-reg'] tables = 
ResultsTables(paths) arrow_alg_names = [('MLP-PLR-D', 'RealMLP-TD'), ('TabR-S-D', 'RealTabR-D'), ('XGB-D', 'XGB-TD'), ('LGBM-D', 'LGBM-TD'), ('CatBoost-D', 'CatBoost-TD'), ('MLP-PLR-HPO', 'RealMLP-HPO')] alg_names = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost', 'BestModel', 'Ensemble'] for version in ['D', 'TD', 'HPO']] alg_names.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO', 'MLP-RTDL-D', 'MLP-RTDL-HPO', 'MLP-PLR-D', 'MLP-PLR-HPO', 'RealTabR-D', 'FTT-D', 'FTT-HPO', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO', 'RF-SKL-D', 'RF-HPO', 'XGB-PBB-D', 'TabR-S-D', 'TabR-HPO']) alg_names_short = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost'] for version in ['D', 'TD', 'HPO']] alg_names_short.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO', 'MLP-RTDL-D', 'MLP-RTDL-HPO', 'MLP-PLR-D', 'MLP-PLR-HPO', 'FTT-D', 'FTT-HPO', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO', 'RF-SKL-D', 'RF-HPO', 'XGB-PBB-D', 'TabR-S-D', 'RealTabR-D', 'TabR-HPO']) alg_names_hpo_vs_tpe = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost'] for version in ['D', 'TD', 'HPO', 'HPO-TPE']] alg_names_hpo_vs_tpe.extend(['RealMLP-TD', 'RealMLP-HPO']) # extra plot for the README.md plot_pareto(paths, tables, coll_names=['meta-test-class', 'meta-test-reg'], alg_names=alg_names, use_ranks=False, use_normalized_errors=False, use_grinnorm_errors=False, use_geometric_mean=True, use_validation_errors=False, arrow_alg_names=arrow_alg_names) for use_ranks, use_normalized_errors, use_geometric_mean, use_grinnorm_errors in [[False, False, False, False], [False, False, True, False], [True, False, False, False], [False, True, False, False], [False, False, False, True]]: plot_pareto(paths, tables, coll_names=['grinsztajn-class-filtered', 'grinsztajn-reg'], alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, arrow_alg_names=arrow_alg_names) plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, arrow_alg_names=arrow_alg_names) plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, arrow_alg_names=arrow_alg_names, use_2x3=True) plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=use_ranks, use_normalized_errors=use_normalized_errors, use_grinnorm_errors=use_grinnorm_errors, use_geometric_mean=use_geometric_mean, use_validation_errors=True, arrow_alg_names=arrow_alg_names) # alg_names_rssc = alg_names + ['MLP-RTDL-D_rssc', 'ResNet-RTDL-D_rssc', 'TabR-S-D_rssc'] without_rssc = ['MLP-RTDL-D', 'ResNet-RTDL-D', 'TabR-S-D', 'FTT-D', 'MLP-PLR-D'] alg_names_rssc = without_rssc + [an + '_rssc' for an in without_rssc] + ['BestModel_' + an + '_prep' for an in without_rssc] alg_names_rssc = alg_names_rssc + ['RealMLP-TD', 'RealTabR-D'] # alg_names_rssc = alg_names_rssc + ['MLP-RTDL-HPO', 'ResNet-RTDL-HPO', 'FTT-D-HPO', 'MLP-PLR-HPO', 'TabR-HPO'] plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names_rssc, filename='pareto_rssc.pdf') # plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-train-reg'], alg_names=alg_names_rssc, # filename='pareto_rssc_meta-train.pdf') # plot_pareto(paths, tables, coll_names=['meta-test-class', 
'meta-test-reg'], alg_names=alg_names_rssc, # filename='pareto_rssc_meta-test.pdf') plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg'], alg_names=alg_names_hpo_vs_tpe, plot_pareto_frontier=False, use_ranks=False, use_normalized_errors=False, use_geometric_mean=True, filename='pareto_hpo-rs-vs-tpe.pdf') plot_pareto(paths, tables, coll_names=['meta-test-class-no-missing', 'meta-test-reg-no-missing'], alg_names=alg_names, arrow_alg_names=arrow_alg_names, filename='pareto_no-missing_geometric.pdf') alg_names_auc = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost', 'BestModel'] for version in ['D', 'TD', 'HPO_best-1-auc-ovr']] alg_names_auc.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO_best-1-auc-ovr', 'RealMLP-TD_no-ls', 'RealMLP-TD-S_no-ls', 'MLP-RTDL-D', 'MLP-RTDL-HPO_best-1-auc-ovr', 'MLP-PLR-D', 'MLP-PLR-HPO_best-1-auc-ovr', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO_best-1-auc-ovr', 'RF-SKL-D', 'RF-HPO_best-1-auc-ovr', 'XGB-PBB-D', 'TabR-S-D', 'RealTabR-D', 'RealTabR-D_no-ls', 'TabR-HPO_best-1-auc-ovr', 'BestModel-HPO']) arrow_alg_names_auc = [('MLP-PLR-D', 'RealMLP-TD_no-ls'), ('TabR-S-D', 'RealTabR-D_no-ls'), ('XGB-D', 'XGB-TD'), ('LGBM-D', 'LGBM-TD'), ('CatBoost-D', 'CatBoost-TD'), ('MLP-PLR-HPO_best-1-auc-ovr', 'RealMLP-HPO_best-1-auc-ovr')] plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class'], alg_names=alg_names_auc, arrow_alg_names=arrow_alg_names_auc, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', filename='pareto_mtrc_mtec_auc-ovr_val-acc.pdf') plot_pareto(paths, tables, coll_names=['meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_auc, arrow_alg_names=arrow_alg_names_auc, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', filename='pareto_mtec_gcf_auc-ovr_val-acc.pdf') plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_auc, arrow_alg_names=arrow_alg_names_auc, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', filename='pareto_mtrc_mtec_gcf_auc-ovr_val-acc.pdf') alg_names_ext = [an + '_val-ce' for an in alg_names] + ['RealMLP-TD_val-ce_no-ls', 'RealMLP-TD-S_val-ce_no-ls', 'RealTabR-D_val-ce_no-ls', 'BestModel-TD_val-ce', 'BestModel-D_val-ce'] arrow_alg_names_valce = [('MLP-PLR-D_val-ce', 'RealMLP-TD_val-ce_no-ls'), ('TabR-S-D_val-ce', 'RealTabR-D_val-ce_no-ls'), ('XGB-D_val-ce', 'XGB-TD_val-ce'), ('LGBM-D_val-ce', 'LGBM-TD_val-ce'), ('CatBoost-D_val-ce', 'CatBoost-TD_val-ce')] plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class'], alg_names=alg_names_ext, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', tag='paper_val_ce', arrow_alg_names=arrow_alg_names_valce, filename='pareto_mtrc_mtec_auc-ovr_val-cross-entropy.pdf') plot_pareto(paths, tables, coll_names=['meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_ext, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', tag='paper_val_ce', arrow_alg_names=arrow_alg_names_valce, filename='pareto_mtec_gcf_auc-ovr_val-cross-entropy.pdf') plot_pareto(paths, tables, coll_names=['meta-train-class', 'meta-test-class', 'grinsztajn-class-filtered'], alg_names=alg_names_ext, val_metric_name='1-auc_ovr', test_metric_name='1-auc_ovr', tag='paper_val_ce', arrow_alg_names=arrow_alg_names_valce, filename='pareto_mtrc_mtec_gcf_auc-ovr_val-cross-entropy.pdf') # ----- other plots ----- plot_cumulative_ablations(paths, tables) plot_cdd(paths, tables, coll_names=coll_names, 
alg_names=alg_names_short) plot_cdd(paths, tables, coll_names=coll_names[0:2], alg_names=alg_names_short) plot_cdd(paths, tables, coll_names=coll_names[2:4], alg_names=alg_names_short) generate_architecture_table(paths, tables) plot_stopping(paths, tables, classification=True) plot_stopping(paths, tables, classification=False) generate_preprocessing_table(paths, tables) generate_refit_table(paths, tables, 'RealMLP') generate_refit_table(paths, tables, 'LGBM') generate_ablations_table(paths, tables) generate_collections_table(paths) for coll_name in coll_names: plot_winrates(paths=paths, tables=tables, coll_name=coll_name, alg_names=alg_names) for coll_name in coll_names: for algs_name, new_alg_names in [ ('defaults', ['RealMLP-TD', 'RealTabR-D', 'TabR-S-D', 'MLP-PLR-D', 'MLP-RTDL-D', 'CatBoost-TD', 'LGBM-TD', 'XGB-TD', 'RF-SKL-D']), ('hpo', ['RealMLP-HPO', 'TabR-HPO', 'MLP-PLR-HPO', 'FTT-HPO', 'ResNet-RTDL-HPO', 'MLP-RTDL-HPO', 'CatBoost-HPO', 'LGBM-HPO', 'XGB-HPO', 'RF-HPO'])]: generate_individual_results_table(paths, tables, f'individual_results_{coll_name}_{algs_name}.tex', coll_name=coll_name, alg_names=new_alg_names) generate_ds_table(paths, TaskCollection.from_name('meta-train-class', paths), include_openml_ids=False) generate_ds_table(paths, TaskCollection.from_name('meta-train-reg', paths), include_openml_ids=False) generate_ds_table(paths, TaskCollection.from_name('meta-test-class', paths), include_openml_ids=True) generate_ds_table(paths, TaskCollection.from_name('meta-test-reg', paths), include_openml_ids=True) generate_ds_table(paths, TaskCollection.from_name('grinsztajn-class-filtered', paths), include_openml_ids=True) generate_ds_table(paths, TaskCollection.from_name('grinsztajn-reg', paths), include_openml_ids=True) plot_schedule(paths, filename='coslog4.pdf', sched_name='coslog4') plot_schedules(paths, filename='coslog4_and_flatcos.pdf', sched_names=['coslog4', 'flat_cos'], sched_labels=[r'$\mathrm{coslog}_4$', r'$\mathrm{flat\_cos}$']) for coll_name in ['meta-test-class', 'meta-test-reg']: plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_BestModel-TD_CatBoost-HPO.pdf', coll_names=[coll_name], alg_name_1='BestModel-TD', alg_name_2='CatBoost-HPO') # plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_HPO-on-BestModel-TD_MLP-TD-HPO.pdf', # coll_names=[coll_name], # alg_name_2='RealMLP-HPO', alg_name_1='HPO-on-BestModel-TD') # plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_HPO-on-BestModel-TD_BestModel-HPO.pdf', # coll_names=[coll_name], # alg_name_2='BestModel-HPO', alg_name_1='HPO-on-BestModel-TD') # plot_scatter(paths, tables=tables, filename=f'scatter_{coll_name}_HPO-on-BestModel-TD_BestModel-TD.pdf', # coll_names=[coll_name], # alg_name_2='BestModel-TD', alg_name_1='HPO-on-BestModel-TD') for coll_name in coll_names: for alg_name_1, alg_name_2 in [('RealMLP-TD', 'CatBoost-TD'), ('RealMLP-TD', 'RealMLP-HPO'), ('RealMLP-HPO', 'CatBoost-HPO'), ('CatBoost-TD', 'CatBoost-HPO'), ('BestModel-TD', 'BestModel-HPO'), ('Ensemble-TD', 'BestModel-TD'), ('BestModel-TD', 'CatBoost-HPO'), ('RealMLP-TD', 'MLP-RTDL-D'), ('CatBoost-TD', 'LGBM-TD'), ('BestModel-TD', 'BestModel-D')]: plot_scatter(paths, tables=tables, filename=f'scatter_3x2_{alg_name_1}_{alg_name_2}.pdf', coll_names=coll_names, alg_name_1=alg_name_1, alg_name_2=alg_name_2) plot_scatter(paths, tables=tables, filename=f'scatter_3x2_CatBoost-TD_CatBoost-HPO_valid-errors.pdf', coll_names=coll_names, alg_name_1='CatBoost-TD', alg_name_2='CatBoost-HPO', use_validation_errors=True) 
================================================ FILE: scripts/create_probclass_plots.py ================================================ from typing import Optional, List import numpy as np import pandas as pd import torch from adjustText import adjust_text from tueplots import bundles, fonts, fontsizes, figsizes import matplotlib matplotlib.rcParams.update(bundles.icml2024()) matplotlib.rcParams.update(fonts.icml2024_tex()) matplotlib.rcParams.update(fontsizes.icml2024()) from matplotlib import pyplot as plt, ticker import matplotlib.colors as mcolors import matplotlib.patheffects import seaborn as sns from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.eval.analysis import ResultsTables, get_benchmark_results from pytabkit.bench.run.results import ResultManager from pytabkit.models import utils def load_stopping_times(paths: Paths, alg_name: str, n_cv: int, n_tt_splits: int, val_metric_name: str, coll_name: str = 'talent-class-small') -> np.ndarray: results = [] coll = TaskCollection.from_name(coll_name, paths) for task_desc in coll.task_descs: for split_id in range(n_tt_splits): results_path = paths.results_alg_task_split(task_desc, alg_name, n_cv=n_cv, split_type=SplitType.RANDOM, split_id=split_id) rm = ResultManager.load(results_path, load_other=True, load_preds=False) fit_params = rm.other_dict['cv']['fit_params'] while True: if 'sub_fit_params' in fit_params: fit_params = fit_params['sub_fit_params'] elif isinstance(fit_params, list): assert len(fit_params) == 1 fit_params = fit_params[0] else: break result = None if 'stop_epoch' in fit_params: result = fit_params['stop_epoch'] elif 'n_estimators' in fit_params: result = fit_params['n_estimators'] else: print(f'No stopping epoch found in {fit_params=}') if isinstance(result, dict): result = result[val_metric_name] results.append(result) return np.asarray(results) def get_desired_symlog_ticks(): pos_small = np.arange(0.1, 1.0, 0.1) pos_mid = np.arange(1, 10, 1) pos_large = np.arange(10, 101, 10) pos_ticks = np.concatenate([pos_small, pos_mid, pos_large]) neg_ticks = -pos_ticks[::-1] return np.concatenate([neg_ticks, [0], pos_ticks]) def plot_barscatter_ax(ax: plt.Axes, df: pd.DataFrame, xlabel: Optional[str], ylabel: str, threshold: Optional[float] = None, use_symlog: bool = False): # hues = list(cal_methods.values()) hues = df['hue'].unique().tolist() # adapted from https://cduvallet.github.io/posts/2018/03/boxplots-in-python sns.set_style('white') # colors = ['#B25116', '#FB84D1'] # colors = ['tab:blue', 'tab:orange'] colors = [(0.6, 0.8, 1.0), (1.0, 0.8, 0.6), (0.6, 1.0, 0.8)] if len(hues) == 1: if 'XGB' in hues[0]: colors = colors[2:3] elif hues[0].startswith('MLP'): colors = colors[1:2] pal = {key: value for key, value in zip(hues, colors[:len(hues)])} # Set up another palette for the boxplots, with slightly lighter shades # light_colors = ['#E5B699', '#FFC9EC'] light_colors = colors face_pal = {key: value for key, value in zip(hues, light_colors[:len(hues)])} hue_order = hues # Make sure to remove the 'facecolor': 'w' property here, otherwise # the palette gets overridden boxprops = {'edgecolor': 'k', 'linewidth': 1} lineprops = {'color': 'k', 'linewidth': 1} boxplot_kwargs = {'boxprops': boxprops, 'medianprops': lineprops, 'whiskerprops': lineprops, 'capprops': lineprops, 'width': 0.75, 'palette': face_pal, 'whis': (10, 90), # use 10% and 90% quantiles for whiskers 'hue_order': hue_order} stripplot_kwargs = 
{'linewidth': 0.4, 'size': 2.5, 'alpha': 0.6, 'palette': pal, 'hue_order': hue_order} ax.axhline(y=0, color='#888888', linestyle='--') ax.grid(True, which='both') sns.boxplot(x='label', y='value', hue='hue', data=df, ax=ax, fliersize=0, **boxplot_kwargs) sns.stripplot(x='label', y='value', hue='hue', data=df, ax=ax, dodge=True, jitter=0.18, **stripplot_kwargs) if threshold is not None: ax.set_ylim(-threshold, threshold) if use_symlog: ax.set_yscale('symlog', linthresh=1) ax.yaxis.set_minor_formatter(matplotlib.ticker.ScalarFormatter()) ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter()) # ax.yaxis.set_minor_locator(matplotlib.ticker.AutoMinorLocator()) ax.ticklabel_format(style='plain', axis='y') # Get your custom minor tick positions minor_ticks = get_desired_symlog_ticks() # Exclude major ticks (1, 10, 100 and their negatives) major_ticks = np.array([-100, -10, -1, 0, 1, 10, 100]) minor_ticks = [tick for tick in minor_ticks if tick not in major_ticks] # Set the minor ticks ax.yaxis.set_minor_locator(plt.FixedLocator(minor_ticks)) # Remove minor tick labels ax.yaxis.set_minor_formatter(plt.NullFormatter()) # Disable minor grid lines ax.yaxis.grid(False, which='minor') print(f'{len(hues)=}') # Fix the legend, keep only the first len(hues) legend elements # (there would be twice as many because there are also the ones for the scatter plot if len(hues) > 1: handles, hues_ax = ax.get_legend_handles_labels() # ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), # fancybox=True, shadow=True, ncol=5) lgd = ax.legend(handles[:len(hues)], hues_ax[:len(hues)], loc='upper center', bbox_to_anchor=(0.5, -0.25 if xlabel is not None else -0.15), ncol=len(hues), # fancybox=True, shadow=True # fontsize='large', # handletextpad=0.5, ) # lgd.legend_handles[0]._sizes = [40] # lgd.legend_handles[1]._sizes = [40] else: ax.get_legend().remove() ax.set_ylabel(ylabel, fontsize='small') if xlabel is not None: ax.set_xlabel(xlabel, fontsize='small') else: ax.set_xlabel('', fontsize='small') # Draw a clean downward arrow labeled "better" under "Logloss+TS" x_labels = df['label'].unique().tolist() y_min, _ = ax.get_ylim() if 'Logloss+TS' in x_labels: x_idx = x_labels.index('Logloss+TS') # Position arrow so it is visually below all data, regardless of y scale arrow_tip_y = 8e-3 * y_min # it's a symlog scale arrow_base_y = 0.2 * y_min text_y = arrow_base_y + 0.02 * y_min ax.annotate( '', xy=(x_idx, arrow_base_y), xytext=(x_idx, arrow_tip_y), arrowprops=dict(arrowstyle='-|>', color='black', lw=1.1, shrinkA=0, shrinkB=0) ) ax.text(x_idx, text_y, 'lower=better', ha='center', va='top', fontsize='small', style='italic', color='black') def plot_results(paths: Paths, tables: ResultsTables, base_names: List[str], n_hpo_steps: int, n_tt_splits: int, coll_name: str = 'talent-class-small', metric_name: str = 'n_cross_entropy', use_mean_results: bool = False, use_percentages: bool = False, plot_stopping_times: bool = False, n_cv: int = 1, threshold: Optional[float] = 0.02, use_validation_errors: bool = False, use_small_plot: bool = False, use_medium_plot: bool = False, title: Optional[str] = None): val_metrics = {'cross_entropy': 'Logloss', '1-auroc-ovr': 'AUROC', 'brier': 'Brier', 'ref-ll-ts': 'TS-Ref.', 'ref-br-ts': 'Brier-ref.', 'class_error': 'Accuracy'} cal_methods = {'': 'No post-hoc cal.', '_ts-mix': 'Temp. scaling'} metric_display_name_dict = {'n_cross_entropy': 'normalized Logloss', 'cross_entropy': 'Logloss', 'n_brier': 'normalized Brier loss', 'brier': 'Brier loss', 'class_error': 'Class. 
err.', '1-auroc-ovr': '1-AUROC'} metric_display_name = metric_display_name_dict.get(metric_name, metric_name) # assert use_small_plot or all(len(bn) == 1 for bn in base_names) assert use_small_plot or use_medium_plot or len(base_names) == 1 with (plt.rc_context(figsizes.icml2024_half() if use_small_plot else figsizes.icml2024_full())): # fig, axs = plt.subplots(1, len(base_names)) fig, ax = plt.subplots() dfs = [] for base_name in base_names: cv_suffix = '' if n_cv == 1 else f'-cv{n_cv}' bag_suffix = f' [bag-{n_cv}]' hpo_steps_suffix = f'-{n_hpo_steps}' if 'HPO' in base_name else '' means = None if not plot_stopping_times: if use_mean_results: means_dicts = [] for tag in [f'paper_hpo_{base_name}{cv_suffix}', f'paper_hpo-calib_{base_name}{cv_suffix}']: table = tables.get(coll_name, tag=tag, n_cv=n_cv) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=False, use_validation_errors=use_validation_errors, use_geometric_mean=False) means_dicts.append(means) all_means = utils.join_dicts(*means_dicts) means = dict() print(f'Available alg names before aggregating:') for alg_name in all_means: print(alg_name) print() for val_metric_key, val_metric_label in val_metrics.items(): for cal_method_key, cal_method_name in cal_methods.items(): alg_name = f'{base_name}{cv_suffix}-{n_hpo_steps}_val-{val_metric_key}{cal_method_key}{bag_suffix}' alg_names_source = [ f'{base_name}{cv_suffix}_step-{i}_val-{val_metric_key}{cal_method_key}{bag_suffix}' for i in range(n_hpo_steps)] means[alg_name] = np.mean(np.stack([all_means[an] for an in alg_names_source], axis=1), axis=1) else: table = tables.get(coll_name, tag=f'paper_{base_name}{cv_suffix}' if n_cv == 1 else f'paper{cv_suffix}', n_cv=n_cv) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case use_validation_errors=use_validation_errors, simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=False, use_geometric_mean=False) # df should contain columns 'value', 'val_metric', 'cal_method' alg_dfs = [] if means is not None: print(f'Available alg names:') for alg_name in means: print(alg_name) print() rel_alg = f'{base_name}{cv_suffix}{hpo_steps_suffix}_val-cross_entropy_ts-mix{bag_suffix}' if use_small_plot: if plot_stopping_times: combinations = [ ('cross_entropy', '', 'Logloss'), ('brier', '', 'Brier'), ('1-auroc-ovr', '', 'AUROC'), ('ref-ll-ts', '', 'TS-Ref.'), ('ref-br-ts', '', 'Brier-Ref.'), ('class_error', '', 'Accuracy'), ] else: combinations = [ ('cross_entropy', '', 'Logloss'), ('cross_entropy', '_ts-mix', 'Logloss+TS'), ('ref-ll-ts', '_ts-mix', 'TS-Ref.+TS'), ('class_error', '_ts-mix', 'Accuracy+TS'), ] elif use_medium_plot: combinations = [ ('cross_entropy', '', 'Logloss'), ('cross_entropy', '_ts-mix', 'Logloss+TS'), ('brier', '_ts-mix', 'Brier+TS'), ('1-auroc-ovr', '_ts-mix', 'AUROC+TS'), ('ref-ll-ts', '_ts-mix', 'TS-Ref.+TS'), ('ref-br-ts', '_ts-mix', 'Brier-Ref.+TS'), ('class_error', '_ts-mix', 'Accuracy+TS'), ] if not any('-HPO' in base_name for base_name in base_names): combinations.insert(5, 
('ref-ll-ts-cv5', '_ts-mix', 'TS-Ref.-5CV+TS')) else: combinations = [(val_metric_key, cal_method_key, val_metric_label) for val_metric_key, val_metric_label in val_metrics.items() for cal_method_key in cal_methods] for val_metric_key, cal_method_key, label in combinations: alg_name = f'{base_name}{cv_suffix}{hpo_steps_suffix}_val-{val_metric_key}{cal_method_key}' print(f'Adding results for {alg_name}') if plot_stopping_times: assert not use_mean_results values = load_stopping_times(paths, alg_name=alg_name, n_cv=n_cv, n_tt_splits=n_tt_splits, val_metric_name=val_metric_key, coll_name=coll_name) else: if use_percentages: values = 100 * (means[alg_name + bag_suffix] / means[rel_alg] - 1) else: values = means[alg_name + bag_suffix] - means[rel_alg] if threshold is not None: values = np.clip(values, -threshold, threshold) if use_small_plot or use_medium_plot: hue = base_name.split('-')[0] if hue == 'XGB': hue = 'XGBoost' else: hue = cal_methods[cal_method_key] alg_dfs.append(pd.DataFrame(dict( value=values.tolist(), label=[label] * len(values), hue=[hue] * len(values), ))) df = pd.concat(alg_dfs, axis='index', ignore_index=True) dfs.append(df) df = pd.concat(dfs, axis='index', ignore_index=True) ylabel = ('Stopping iteration' if 'XGB' in base_name else f'Stopping epoch') \ if plot_stopping_times else f'{metric_display_name} diff.\\ to baseline' if use_percentages: ylabel = ylabel + r' [\%]' plot_barscatter_ax(ax=ax, df=df, xlabel=None, # 'Validation and optimization metric', ylabel=ylabel, use_symlog=use_percentages, threshold=threshold if plot_stopping_times else None) if title: ax.set_title(title) suffix = '_mean' if use_mean_results else '' suffix = suffix + ('_rel' if use_percentages else '') suffix = suffix + ('_stoptime' if plot_stopping_times else '') suffix = suffix + ('_valid' if use_validation_errors else '') suffix = suffix + ('' if coll_name == 'talent-class-small' else '_' + coll_name) suffix = suffix + ('_small' if use_small_plot else ('_medium' if use_medium_plot else '')) threshold_str = f'None' if threshold is None else f'{threshold:g}' file_path = paths.plots() / f'boxplot_{"-".join(base_names)}{cv_suffix}_{metric_name}_{threshold_str}{suffix}.pdf' plt.tight_layout() utils.ensureDir(file_path) plt.savefig(file_path) plt.close() def plot_calib_benchmark(paths: Paths, tables: ResultsTables, metric_name: str = 'cross_entropy', n_tt_splits: int = 5, use_validation_errors: bool = False, use_extra_methods: bool = False): times_df = pd.read_csv(paths.base() / 'calib_times' / 'times.csv') methods = list(times_df['calib_name'].unique()) coll_name = 'talent-class-small' table = tables.get(coll_name, tag=f'paper_calib-bench', n_cv=1) means, _ = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=True, use_validation_errors=use_validation_errors, use_geometric_mean=False) # ----- get reference score without post-hoc calibration means_nocalib, _ = get_benchmark_results(paths, tables.get(coll_name, tag=f'paper_XGB-D', n_cv=1), coll_name=coll_name, use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=n_tt_splits, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case simplify_name_fn=lambda s: s, 
return_percentages=False, use_task_mean=True, use_validation_errors=use_validation_errors, use_geometric_mean=False) orig_score = means_nocalib['XGB-D_val-class_error [bag-1]'] avg_times = dict() min_n_val = 10_000 df = times_df.loc[times_df['n_val'] >= min_n_val] for method in methods: where = df['calib_name'] == method # * 1000 for per 1K, *1000 for milliseconds avg_times[method] = np.mean(df.loc[where, 'time'] / df.loc[where, 'n_val']) * 1_000_000 print(repr(means)) print(repr(avg_times)) val_metrics = {'cross_entropy': 'Logloss', '1-auroc-ovr': 'AUROC', 'brier': 'Brier', 'ref-ll-ts': 'TS-Ref.', 'n_cross_entropy': 'norm. Logloss', 'n_brier': 'norm. Brier', 'ref-br-ts': 'Brier-ref.', 'class_error': 'Accuracy'} methods_with_labels = {'ts': r'TS (ours)', # 'ts-mix': r'Bisection + smoothing (ours)', 'ag-ts': r'TS (AutoGluon)', # 'ag-inv-ts': r'AutoGluon + inv. temp.', 'torchunc-ts': 'TS (TorchUncertainty)', 'torchcal-ts': 'TS (TorchCal)', 'guo-ts': 'TS (Guo et al., 2017)', } if use_extra_methods: methods_with_labels = utils.join_dicts(methods_with_labels, { 'ir-mix': 'Isotonic (sklearn) + LS', 'ts-mix': 'TS+LS (ours)' }) labels_list = list(methods_with_labels.values()) with plt.rc_context(figsizes.icml2024_half(height_to_width_ratio=0.5 if use_extra_methods else 0.4)): fig, ax = plt.subplots() # sns.set_theme(style="whitegrid", font_scale=2) plt.ylabel(f'Mean {val_metrics[metric_name]}') plt.xlabel(f'Mean runtime (ms) per 1K samples') colors = ['tab:green', 'tab:blue', 'tab:orange', 'tab:red', 'tab:purple', 'tab:cyan', 'tab:olive'] lines = [] lines.append( ax.axhline(y=means['XGB-D_val-class_error_calib-bench_ts [bag-1]'], color=colors[0], linestyle='--', linewidth=1.0, zorder=-50)) times_list = [avg_times[method] for method in methods_with_labels.keys()] metrics_list = [means[f'XGB-D_val-class_error_calib-bench_{method} [bag-1]'] for method in methods_with_labels.keys()] plt.scatter(times_list, metrics_list, c=colors[:len(times_list)], s=10) # Prepare to annotate the points texts = [] for i, point in enumerate(ax.collections[0].get_offsets()): model_name = labels_list[i] x, y = point if x < np.mean(times_list): # x = 0.7 * x + 0.3 * np.max(times_list) x += 0.15 * (np.max(times_list) - np.min(times_list)) else: # x = 0.7 * x + 0.3 * np.min(times_list) x -= 0.15 * (np.max(times_list) - np.min(times_list)) y = 0.8 * y + 0.2 * np.mean(metrics_list) text_color = colors[i] # Annotate the model names display_name = model_name # with plt.rc_context({'font.family': 'sans-serif', "font.sans-serif": "DejaVu Sans"}): # from matplotlib import font_manager # font_path = font_manager.findfont("DejaVu Sans") # print(f'{font_path=}') with plt.rc_context({'font.family': 'sans-serif', "text.usetex": False}): text = ax.text(x, y, display_name, color=text_color, fontsize=8, ha='center', va='center', font='Arial') # text.set_path_effects([matplotlib.patheffects.withStroke(linewidth=1.2, foreground='white')]) texts.append(text) # import matplotlib.font_manager as fm # print([f.name for f in fm.fontManager.ttflist]) lines.append(ax.axhline(y=orig_score, color='tab:gray', linestyle='--', linewidth=1.0, zorder=-50)) # with plt.rc_context({'font.family': 'sans-serif', "font.sans-serif": "DejaVu Sans"}): with plt.rc_context({'font.family': 'sans-serif', "text.usetex": False}): text = ax.text(np.mean(times_list), orig_score - 0.1 * (np.max(metrics_list) - np.min(metrics_list)), 'No post-hoc cal.', color='tab:gray', fontsize=8, ha='center', va='center', font='Arial') texts.append(text) plt.xlim(left=0) # plt.grid(True, 
which='both', zorder=-100) ax.set_axisbelow(True) print(ax.collections) # line = ax.axhline(y=means['XGB-D_val-class_error_calib-bench_ts [bag-1]']-0.01, color='white', linestyle='--', # linewidth=1.5, # zorder=-50) # Use adjust_text to repel the labels from each other and the points adjust_text(texts, # force_text=(0.01, 0.02), # objects=lines, x=times_list, y=metrics_list, # force_pull=(0.1, 0.1), # force_explode=(0.1, 0.2), avoid_self=False, expand=(1.15, 1.3), ax=ax, ) if use_extra_methods: ymin, ymax = ax.get_ylim() ymin = ymin - 0.15 * (ymax - ymin) plt.ylim(ymin, ymax) suffix = '_extra' if use_extra_methods else '' filename = f'calib_benchmark_{coll_name}_{metric_name}{suffix}' if use_validation_errors: filename = filename + '_valid' filename = filename + '.pdf' file_path = paths.plots() / filename utils.ensureDir(file_path) plt.tight_layout() plt.savefig(file_path) plt.close(fig) def plot_gap_vs_ds_size(paths: Paths, tables: ResultsTables, base_name: str, metric_name: str, n_hpo_steps: int, use_smallest_class: bool = False, use_2nd_largest_class: bool = False, use_entropy: bool = False, use_percentages: bool = False, color_by_total_loss: bool = False): table = tables.get('talent-class-small', tag=f'paper_{base_name}', n_cv=1) coll_name = 'talent-class-small' task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) means, intervals = get_benchmark_results(paths, table, coll_name='talent-class-small', use_relative_score=False, test_metric_name=metric_name, val_metric_name=metric_name, n_splits=5, # don't replace '-class' because it occurs in val-class_error # also don't replace ' [bag-1]' for the cv case use_validation_errors=False, simplify_name_fn=lambda s: s, return_percentages=False, use_task_mean=False, use_geometric_mean=False) print(f'Available alg names:') for alg_name in means: print(alg_name) extended_base_name = f'{base_name}-{n_hpo_steps}' if 'HPO' in base_name else base_name alg_name_1 = f'{extended_base_name}_val-cross_entropy_ts-mix [bag-1]' alg_name_2 = f'{extended_base_name}_val-ref-ll-ts_ts-mix [bag-1]' if use_percentages: diffs = 100 * (means[alg_name_2] / means[alg_name_1] - 1) else: diffs = means[alg_name_2] - means[alg_name_1] suffix = '_rel' if use_percentages else '' if use_smallest_class: suffix = suffix + '_smallest-class' x = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy() x.append(np.min(class_frequencies)) elif use_2nd_largest_class: suffix = suffix + '_2nd-largest-class' x = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy() x.append(np.sort(class_frequencies)[-2]) elif use_entropy: suffix = suffix + '_entropy' x = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy() class_probs = class_frequencies.astype(np.float32) / task_info.n_samples x.append(-task_info.n_samples * np.dot(class_probs, np.log2(class_probs + 1e-30))) else: x = [ti.n_samples for ti in task_infos] if color_by_total_loss: cbar_label = 'Sum of losses of both versions' suffix = suffix + '_col-loss' colors = means[alg_name_1] + means[alg_name_2] else: cbar_label = 'Total Entropy of Y' colors = [] for task_info in task_infos: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)).numpy().astype( np.float32) p = class_frequencies / np.sum(class_frequencies) entropy = -np.dot(p, np.log(p)) 
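# The quantity above is the Shannon entropy (in nats) of the empirical class distribution;
# e.g. a balanced binary task gives ln(2) ~= 0.693 and p = [0.5, 0.25, 0.25] gives
# 0.5*ln(2) + 0.5*ln(4) ~= 1.04, while heavily imbalanced class distributions
# give values close to 0.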
colors.append(entropy) metric_display_names = {'cross_entropy': 'Logloss', '1-auroc-ovr': 'AUROC', 'brier': 'Brier', 'ref-ll-ts': 'TS-Ref.', 'n_cross_entropy': 'norm. Logloss', 'n_brier': 'norm. Brier', 'ref-br-ts': 'Brier-Ref.', 'class_error': 'Accuracy'} with (plt.rc_context(figsizes.icml2024_half(height_to_width_ratio=0.8))): with plt.rc_context(fontsizes.icml2024(default_smaller=0)): fig, ax = plt.subplots() norm = matplotlib.colors.LogNorm(vmin=np.min(colors), vmax=np.max(colors)) cmap = plt.cm.plasma_r # You can use other colormaps like 'plasma', 'coolwarm', etc. colors = cmap(norm(colors)) # Plot with color based on z for i in range(len(x)): ax.plot(x[i], diffs[i], '.', color=colors[i]) if use_percentages: ax.set_yscale('symlog', linthresh=1) method_display_name = base_name.replace('-HPO', ' (tuned)') method_display_name = method_display_name.replace('-TD', ' (default)') method_display_name = method_display_name.replace('-D', ' (default)') ax.set_title(r'\textbf{' + method_display_name + r'}') ax.set_xscale('log') sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) sm.set_array([]) # Set an empty array as required # Add the colorbar plt.colorbar(sm, label=cbar_label, ax=ax) # plt.semilogx(x, diffs, '.', color='tab:blue') plt.xlabel('Number of samples') plt.ylabel(f'Relative difference in {metric_display_names[metric_name]} [\\%]') plt.axhline(y=0, color='k', linestyle='--', zorder=-1) # plt.tight_layout() file_path = paths.plots() / f'gap_vs_ds_size_{base_name}_{metric_name}{suffix}.pdf' utils.ensureDir(file_path) plt.savefig(file_path) plt.close() if __name__ == '__main__': paths = Paths.from_env_variables() tables = ResultsTables(paths) # calibration methods (label positions can be improved post-hoc using Inkscape) for metric_name in ['cross_entropy', 'n_cross_entropy', 'brier', 'n_brier']: for use_extra_methods in [True, False]: plot_calib_benchmark(paths, tables, metric_name=metric_name, use_extra_methods=use_extra_methods) # results for individual datasets for base_name in ['MLP-HPO', 'XGB-HPO', 'RealMLP-HPO', 'MLP-D', 'XGB-D', 'RealMLP-TD']: plot_gap_vs_ds_size(paths, tables, base_name=base_name, metric_name='cross_entropy', n_hpo_steps=30, use_smallest_class=False, use_2nd_largest_class=False, use_entropy=False, use_percentages=True, color_by_total_loss=True) # plot main benchmark results for use_small_plot in [False, True]: for base_names in [['RealMLP-HPO', 'MLP-HPO', 'XGB-HPO'], ['RealMLP-TD', 'MLP-D', 'XGB-D']]: for coll_name in ['talent-class-small-above10k', 'talent-class-small']: for metric_name in ['cross_entropy', 'class_error', '1-auroc-ovr', 'brier']: plot_results(paths, tables, base_names, n_hpo_steps=30, n_tt_splits=5, use_percentages=True, metric_name=metric_name, coll_name=coll_name, use_validation_errors=False, use_small_plot=use_small_plot, use_medium_plot=not use_small_plot, use_mean_results=False, threshold=100, n_cv=1, title=r'\textbf{Tabular data, tuned hyperparameters}' if 'RealMLP-HPO' in base_names else r'\textbf{Tabular data, default hyperparameters}') # plot stopping times for base_names in [ # ['RealMLP-HPO'], ['MLP-HPO'], ['XGB-HPO'], ['RealMLP-TD'], ['MLP-D'], ['XGB-D']]: for coll_name in ['talent-class-small-above10k', 'talent-class-small']: plot_results(paths, tables, base_names, n_hpo_steps=30, n_tt_splits=5, use_percentages=False, metric_name='cross_entropy', coll_name=coll_name, use_validation_errors=False, use_small_plot=True, use_medium_plot=False, use_mean_results=False, n_cv=1, plot_stopping_times=True, threshold=None, 
title=r'\textbf{Tabular data, tuned hyperparameters}' if any( '-HPO' in base_name for base_name in base_names) else r'\textbf{Tabular data, default hyperparameters}') ================================================ FILE: scripts/create_xrfm_ablations_table.py ================================================ from typing import List, Optional import numpy as np from pytabkit.bench.run.results import ResultManager from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.analysis import ResultsTables, get_benchmark_results from pytabkit.bench.eval.tables import _get_table_str from pytabkit.models import utils def generate_xrfm_ablations_results_table(paths: Paths, tables: ResultsTables, filename: str, coll_name: str, test_metric_name: Optional[str] = None, val_metric_name: Optional[str] = None): table = tables.get(coll_name, tag='default') alg_display_names = { 'xRFM-HPO-paper-large_new': 'AGOP', 'xRFM-HPO-large-temptune_new': 'AGOP + TT', 'xRFM-HPO-large-temptune-pca_new': 'PCA + TT', 'xRFM-HPO-large-temptune-rf_new': 'RF + TT' } alg_names = list(alg_display_names.keys()) means, intervals = get_benchmark_results(paths, table, coll_name=coll_name, use_relative_score=False, test_metric_name=test_metric_name, val_metric_name=val_metric_name, return_percentages=False, use_task_mean=False, use_geometric_mean=False, n_splits=1) alg_names = [an for an in alg_names if an in means] table_head = [['', r'\multicolumn{4}{c}{Splitting method}'], ['Dataset'] + [alg_display_names[an] for an in alg_names]] table_body = [] enumerated_task_infos = list(enumerate(table.test_table.task_infos)) enumerated_task_infos.sort(key=lambda tup: tup[1].task_desc.task_name.lower()) print(f'{coll_name=}') print(f'{list(means.keys())=}') def get_score_strings(scores: List[float], maximize: bool = False, use_int: bool = False) -> List[str]: best_row_score = np.max(scores) if maximize else np.min(scores) is_best_list = [score == best_row_score for score in scores] row_strs = [] for is_best, row_score in zip(is_best_list, scores): cur_str = str(round(row_score)) if use_int else f'{row_score:5.4f}' if is_best: cur_str = r'\textbf{' + cur_str + r'}' row_strs.append(cur_str) return row_strs for task_idx, task_info in enumerated_task_infos: row_scores = [means[alg_name][task_idx] for alg_name in alg_names] table_body.append([task_info.task_desc.task_name] + get_score_strings(row_scores)) # escape underscores for latex table_head = [[val.replace('_', r'\_') for val in row] for row in table_head] table_body = [[val.replace('_', r'\_') for val in row] for row in table_body] # generate bottom part, first average scores # indexed by [task][alg] scores_matrix = np.asarray( [[means[alg_name][task_idx] for alg_name in alg_names] for task_idx, _ in enumerated_task_infos]) n_wins = (scores_matrix == np.min(scores_matrix, axis=1)[:, None]).astype(np.int32).sum(axis=0).tolist() table_foot = [['Number of wins:'] + get_score_strings(n_wins, maximize=True, use_int=True), ['Shifted geometric mean:'] \ + get_score_strings(np.exp(np.mean(np.log(scores_matrix + 0.01), axis=0)).tolist()), ['Arithmetic mean:'] \ + get_score_strings(np.mean(scores_matrix, axis=0).tolist()) ] # get runtimes mean_fit_times = [] mean_eval_times = [] for alg_name in alg_names: fit_times = [] eval_times = [] for task_idx, task_info in enumerated_task_infos: fit_time = 0.0 eval_time = 0.0 for hpo_step in range(30): path = paths.results_alg_task_split(task_desc=task_info.task_desc, alg_name=alg_name + 
f'_step-{hpo_step}', n_cv=1, split_type=SplitType.RANDOM, split_id=0) rm = ResultManager.load(path, load_preds=False) fit_time += rm.other_dict['cv']['fit_time_s'] eval_time += rm.other_dict['cv']['eval_time_s'] fit_times.append(fit_time) eval_times.append(eval_time) mean_fit_times.append(np.mean(fit_times)) mean_eval_times.append(np.mean(eval_times)) table_foot.append(['Average fit time [s]:'] + get_score_strings(mean_fit_times, use_int=True)) table_foot.append(['Average eval time [s]:'] + get_score_strings(mean_eval_times, use_int=True)) table_str = _get_table_str(table_head, table_body, table_foot) file_path = paths.plots() / filename utils.writeToFile(file_path, table_str) if __name__ == '__main__': paths = Paths.from_env_variables() tables = ResultsTables(paths) for coll_name in ['meta-test-large-class', 'meta-test-large-reg']: generate_xrfm_ablations_results_table(paths, tables, f'individual_results_{coll_name}.tex', coll_name=coll_name) ================================================ FILE: scripts/custom_paths.py.default ================================================ def get_base_folder(): return 'tab_bench_data' ================================================ FILE: scripts/download_data.py ================================================ from typing import Optional import fire from pytabkit.bench.data.common import TaskSource from pytabkit.bench.data.get_uci import download_all_uci from pytabkit.bench.data.import_talent_benchmark import import_talent_benchmark from pytabkit.bench.data.import_tasks import import_uci_tasks, get_openml_task_ids, import_openml, get_openml_ds_names from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection, TaskDescription, TaskInfo def run_import(openml_cache_dir: str = None, import_meta_train: bool = False, import_meta_test: bool = False, import_openml_class_bin_extra: bool = False, import_grinsztajn: bool = False, import_grinsztajn_medium: bool = False, import_tabzilla_hard: bool = False, import_automl_class_small: bool = False, import_talent_class_small: bool = False, import_talent_reg_small: bool = False, import_tabarena: bool = False, talent_folder: Optional[str] = None): paths = Paths.from_env_variables() min_n_samples = 1000 if import_meta_train: print(f'Importing meta-train') # import UCI download_all_uci(paths) import_uci_tasks(paths) # generate task collections uci_multi_class_descs = TaskCollection.from_source(TaskSource.UCI_MULTI_CLASS, paths).task_descs uci_bin_class_descs = TaskCollection.from_source(TaskSource.UCI_BIN_CLASS, paths).task_descs uci_multi_class_task_names = [td.task_name for td in uci_multi_class_descs] uci_class_descs = uci_multi_class_descs + [td for td in uci_bin_class_descs if td.task_name not in uci_multi_class_task_names] uci_class_descs = [td for td in uci_class_descs if td.load_info(paths).n_samples >= min_n_samples] TaskCollection('meta-train-class', uci_class_descs).save(paths) uci_reg_descs = TaskCollection.from_source(TaskSource.UCI_REGRESSION, paths).task_descs uci_reg_descs = [td for td in uci_reg_descs if td.load_info(paths).n_samples >= min_n_samples] TaskCollection('meta-train-reg', uci_reg_descs).save(paths) # maybe could use faster pyarrow backend for pandas if v2 is available? 
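# note (illustrative, assuming pandas >= 2.0): pyarrow-backed dtypes can also be requested
# per call rather than globally, e.g. pd.read_csv(path, dtype_backend='pyarrow'),
# which can speed up IO and reduce memory for string-heavy tables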
# pd.options.mode.dtype_backend = "pyarrow" if import_meta_test or import_openml_class_bin_extra or import_automl_class_small: # import AutoML Benchmark and CTR-23 benchmark # could also import the TabZilla suite # https://www.openml.org/search?type=study&study_type=task&id=379&sort=tasks_included # but the selection criteria for this one are based a lot on the performance of different algorithms automl_class_task_ids = get_openml_task_ids(271) automl_reg_task_ids = get_openml_task_ids(269) ctr23_reg_task_ids = get_openml_task_ids(353) sarcos_duplicated_task_id = 361254 sarcos_deduplicated_task_id = 361011 if sarcos_duplicated_task_id in ctr23_reg_task_ids: # use the version of sarcos without the duplicated test set print(f'Using a different version of the sarcos data set for the CTR-23 benchmark') ctr23_reg_task_ids.remove(sarcos_duplicated_task_id) ctr23_reg_task_ids.append(sarcos_deduplicated_task_id) all_reg_task_ids = list(set(automl_reg_task_ids + ctr23_reg_task_ids)) # todo automl_class_ds_names = get_openml_ds_names(automl_class_task_ids) automl_reg_ds_names = get_openml_ds_names(automl_reg_task_ids) ctr23_reg_ds_names = get_openml_ds_names(ctr23_reg_task_ids) def check_task(td: TaskDescription, min_n_samples: Optional[int] = None, max_one_hot_size: Optional[int] = None) -> bool: task_info = td.load_info(paths) if min_n_samples is not None and task_info.n_samples < min_n_samples: print(f'Ignoring task {str(td)} because it has too few samples') return False n_cont = task_info.tensor_infos['x_cont'].get_n_features() cat_sizes = task_info.tensor_infos['x_cat'].get_cat_sizes().numpy() # ignore 'missing' categories # todo: is this really the way we should handle this? d_one_hot = n_cont + sum([1 if cs == 3 else cs - 1 for cs in cat_sizes]) if max_one_hot_size is not None and d_one_hot > max_one_hot_size: print(f'Ignoring task {str(td)} because it is too high-dimensional after one-hot encoding') return False return True if import_meta_test: print(f'Importing meta-test') # treat dionis separately because we want to subsample it to 100k instead of 500k samples for speed and RAM reasons automl_class_task_ids_not_dionis = [id for id, name in zip(automl_class_task_ids, automl_class_ds_names) if name != 'dionis'] automl_class_task_ids_dionis = [id for id, name in zip(automl_class_task_ids, automl_class_ds_names) if name == 'dionis'] assert len(automl_class_task_ids_dionis) == 1 assert len(automl_class_task_ids_not_dionis) == len(automl_class_task_ids) - 1 import_openml(automl_class_task_ids_not_dionis, TaskSource.OPENML_CLASS, paths, openml_cache_dir, max_n_samples=500_000, rerun=False) import_openml(automl_class_task_ids_dionis, TaskSource.OPENML_CLASS, paths, openml_cache_dir, max_n_samples=100_000, rerun=True) import_openml(all_reg_task_ids, TaskSource.OPENML_REGRESSION, paths, openml_cache_dir, normalize_y=True, max_n_samples=500000, rerun=False) class_descs = TaskCollection.from_source(TaskSource.OPENML_CLASS, paths).task_descs # generate task collections exclude_automl_class = ['kr-vs-kp', 'wilt', 'ozone-level-8hr', 'first-order-theorem-proving', 'GesturePhaseSegmentationProcessed', 'PhishingWebsites', 'wine-quality-white', 'nomao', 'bank-marketing', 'adult'] filtered_class_descs = [td for td in class_descs if td.task_name not in exclude_automl_class and td.task_name in automl_class_ds_names and check_task(td, min_n_samples=min_n_samples, max_one_hot_size=10000)] TaskCollection('meta-test-class', filtered_class_descs).save(paths) # we exclude Brazilian_houses because there is already 
brazilian_houses in ctr-23, # and Brazilian_houses includes three features that should not be used for predicting the target, # while brazilian_houses should not contain them exclude_automl_reg = ['wine_quality', 'abalone', 'OnlineNewsPopularity', 'Brazilian_houses'] exclude_ctr23_reg = ['abalone', 'physiochemical_protein', 'naval_propulsion_plant', 'superconductivity', 'white_wine', 'red_wine', 'grid_stability'] reg_descs = TaskCollection.from_source(TaskSource.OPENML_REGRESSION, paths).task_descs filtered_reg_descs = [td for td in reg_descs if td.task_name not in exclude_automl_reg + exclude_ctr23_reg and td.task_name in automl_reg_ds_names + ctr23_reg_ds_names and check_task(td, min_n_samples=min_n_samples, max_one_hot_size=10000)] TaskCollection('meta-test-reg', filtered_reg_descs).save(paths) if import_openml_class_bin_extra: print(f'Importing openml-class-bin-extra') # also import binary version of multiclass tasks # requires that meta_test has already been imported class_descs = TaskCollection.from_source(TaskSource.OPENML_CLASS, paths).task_descs multiclass_names = [td.task_name for td in class_descs if td.load_info(paths).get_n_classes() > 2] # print(f'{multiclass_names=}') import_openml(automl_class_task_ids, TaskSource.OPENML_CLASS_BIN_EXTRA, paths, openml_cache_dir, max_n_classes=2, include_only_ds_names=multiclass_names) if import_automl_class_small: print(f'Importing automl-class-small') import_openml(automl_class_task_ids, TaskSource.AUTOML_CLASS_SMALL, paths, openml_cache_dir, ignore_above_n_classes=50, min_n_samples=1000, max_n_samples=100_000) descs = TaskCollection.from_source(TaskSource.AUTOML_CLASS_SMALL, paths).task_descs filtered_descs = [td for td in descs if check_task(td, max_one_hot_size=1000)] TaskCollection('automl-class-small-filtered', filtered_descs).save(paths) if import_grinsztajn: print(f'Importing grinsztajn benchmark') import_grinsztajn_datasets(openml_cache_dir) if import_grinsztajn_medium: print(f'Importing grinsztajn medium benchmark') import_grinsztajn_medium_datasets(openml_cache_dir) if import_tabzilla_hard: print(f'Importing TabZilla hard benchmark') import_tabzilla_hard_datasets(openml_cache_dir) if import_talent_class_small: if talent_folder is None: raise ValueError(f'Please specify talent_folder to import datasets from the TALENT benchmark') import_talent_benchmark(paths, talent_folder=talent_folder, source_name='talent-class-small', allow_regression=False, min_n_samples=1000, max_n_samples=100_000, ignore_above_n_classes=100) task_infos = TaskCollection.from_source('talent-class-small', paths).load_infos(paths) bin_task_descs = [ti.task_desc for ti in task_infos if ti.get_n_classes() == 2] multi_task_descs = [ti.task_desc for ti in task_infos if ti.get_n_classes() != 2] TaskCollection('talent-bin-class-small', bin_task_descs).save(paths) TaskCollection('talent-multi-class-small', multi_task_descs).save(paths) above10k_descs = [ti.task_desc for ti in task_infos if ti.n_samples >= 10_000] below10k_descs = [ti.task_desc for ti in task_infos if ti.n_samples < 10_000] TaskCollection('talent-class-small-above10k', above10k_descs).save(paths) TaskCollection('talent-class-small-below10k', below10k_descs).save(paths) talent_class_tabpfn_task_descs = [ti.task_desc for ti in task_infos if ti.get_n_classes() <= 10 and ti.n_samples <= 10_000 and ti.tensor_infos[ 'x_cont'].get_n_features() + ti.tensor_infos[ 'x_cat'].get_n_features() <= 500] TaskCollection('talent-class-tabpfn', talent_class_tabpfn_task_descs).save(paths) if import_talent_reg_small: if 
talent_folder is None: raise ValueError(f'Please specify talent_folder to import datasets from the TALENT benchmark') import_talent_benchmark(paths, talent_folder=talent_folder, source_name='talent-reg-small', allow_regression=True, allow_classification=False, min_n_samples=1000, max_n_samples=100_000) task_infos = TaskCollection.from_source('talent-reg-small', paths).load_infos(paths) talent_reg_tabpfn_task_descs = [ti.task_desc for ti in task_infos if ti.n_samples <= 10_000 and ti.tensor_infos[ 'x_cont'].get_n_features() + ti.tensor_infos[ 'x_cat'].get_n_features() <= 500] TaskCollection('talent-reg-tabpfn', talent_reg_tabpfn_task_descs).save(paths) if import_tabarena: all_ids = { TaskSource.TABARENA_REG: [363611, 363612, 363615, 363622, 363625, 363631, 363672, 363675, 363678, 363686, 363693, 363697, 363698, 363701, 363705, 363708, 363709], TaskSource.TABARENA_CLASS: [363613, 363614, 363616, 363617, 363618, 363619, 363620, 363621, 363623, 363624, 363626, 363627, 363628, 363629, 363630, 363632, 363671, 363673, 363674, 363676, 363677, 363679, 363680, 363681, 363682, 363683, 363684, 363685, 363687, 363688, 363689, 363691, 363692, 363694, 363695, 363696, 363699, 363700, 363702, 363703, 363704, 363706, 363707] } for task_source, ids in all_ids.items(): print(f'Importing {task_source}') class_descs = TaskCollection.from_source(TaskSource.OPENML_CLASS, paths).task_descs multiclass_names = [td.task_name for td in class_descs if td.load_info(paths).get_n_classes() > 2] # print(f'{multiclass_names=}') import_openml(ids, task_source, paths, openml_cache_dir, min_n_samples=500) TaskCollection.from_source(task_source, paths).save(paths) def import_grinsztajn_datasets(openml_cache_dir: str = None): # import data sets from the benchmark of Grinsztajn et al. paths = Paths.from_env_variables() import_openml(get_openml_task_ids(334), 'grinsztajn-cat-class', paths, openml_cache_dir, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(335), 'grinsztajn-cat-reg', paths, openml_cache_dir, normalize_y=True, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(336), 'grinsztajn-num-reg', paths, openml_cache_dir, normalize_y=True, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(337), 'grinsztajn-num-class', paths, openml_cache_dir, max_n_samples=500000, rerun=False) import_openml(get_openml_task_ids(334), 'grinsztajn-cat-class-15k', paths, openml_cache_dir, max_n_samples=15_000, rerun=False) import_openml(get_openml_task_ids(335), 'grinsztajn-cat-reg-15k', paths, openml_cache_dir, normalize_y=True, max_n_samples=15_000, rerun=False) import_openml(get_openml_task_ids(336), 'grinsztajn-num-reg-15k', paths, openml_cache_dir, normalize_y=True, max_n_samples=15_000, rerun=False) import_openml(get_openml_task_ids(337), 'grinsztajn-num-class-15k', paths, openml_cache_dir, max_n_samples=15_000, rerun=False) def import_grinsztajn_medium_datasets(openml_cache_dir: str = None): paths = Paths.from_env_variables() for bench_name, bench_id_cat, bench_id_num in [('grinsztajn-class', 334, 337), ('grinsztajn-reg', 335, 336)]: task_ids_cat = get_openml_task_ids(bench_id_cat) task_ids_num = get_openml_task_ids(bench_id_num) task_ids = task_ids_cat + [task_id for task_id in task_ids_num if task_id not in task_ids_cat] import_openml(task_ids, bench_name, paths, openml_cache_dir, max_n_samples=500_000, # normalize_y=(bench_name=='grinsztajn-reg'), rerun=False) task_infos = TaskCollection.from_source(bench_name, paths).load_infos(paths) for task_info in task_infos: # use 13333 so 
the 75%-25% train-val split will use 10k training samples task_info.max_n_trainval = 13_333 task_info.save(paths) tc_orig = TaskCollection.from_source('grinsztajn-class', paths) tc_orig.save(paths) # exclude eye_movements because it has a leak according to the TabR paper tc = TaskCollection('grinsztajn-class-filtered', [task_desc for task_desc in tc_orig.task_descs if task_desc.task_name != 'eye_movements']) tc.save(paths) def import_tabzilla_hard_datasets(openml_cache_dir: str = None): # import data sets from the TabZilla hard benchmark (OpenML suite 379) paths = Paths.from_env_variables() import_openml(get_openml_task_ids(379), 'tabzilla-hard-class', paths, openml_cache_dir, rerun=False) def split_meta_test(paths: Paths): for task_type in ['class', 'reg']: coll_name = f'meta-test-{task_type}' task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) def is_ood(task_info: TaskInfo): if task_info.n_samples < 1500 or task_info.n_samples > 60000: return True n_features = (task_info.tensor_infos['x_cont'].get_n_features() + task_info.tensor_infos['x_cat'].get_n_features()) if n_features > 750: return True x_cat_info = task_info.tensor_infos['x_cat'] if x_cat_info.get_n_features() > 0 and x_cat_info.get_cat_sizes().max().item() > 50: return True return False id_task_descs = [task_info.task_desc for task_info in task_infos if not is_ood(task_info)] ood_task_descs = [task_info.task_desc for task_info in task_infos if is_ood(task_info)] TaskCollection(f'{coll_name}-indist', id_task_descs).save(paths) TaskCollection(f'{coll_name}-oodist', ood_task_descs).save(paths) print(f'{len(id_task_descs)=}, {len(ood_task_descs)=}') # could extend this for other task collections like openml-cc18, pmlb, uci121 or uci-small if __name__ == '__main__': fire.Fire(run_import) # import_grinsztajn_datasets() # paths = Paths.from_env_variables() # split_meta_test(paths) # meta_train = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) # only_bin_class = [info.task_desc for info in meta_train if info.get_n_classes() == 2] # only_multi_class = [info.task_desc for info in meta_train if info.get_n_classes() > 2] # TaskCollection('meta-train-bin-class', only_bin_class).save(paths) # TaskCollection('meta-train-multi-class', only_multi_class).save(paths) # print(get_openml_ds_names([361011])) # ctr23_reg_task_ids = get_openml_task_ids(353) # ctr23_reg_ds_names = get_openml_ds_names(ctr23_reg_task_ids) # for ds_name in ctr23_reg_ds_names: # print(ds_name) # test brazilian houses data set # import openml # import pandas as pd # task = openml.tasks.get_task(361267, download_data=False) # dataset = openml.datasets.get_dataset(task.dataset_id, download_data=True) # df: pd.DataFrame = dataset.get_data()[0] # print(df.head()) # print(dataset.dataset_id) # test sarcos dataset # import openml # task = openml.tasks.get_task(361011, download_data=False) # dataset = openml.datasets.get_dataset(task.dataset_id, download_data=False) # print(dataset.dataset_id) ================================================ FILE: scripts/estimate_resource_params.py ================================================ import multiprocessing import time from typing import List, Dict, Any, Callable import numpy as np import sklearn import torch from sklearn.base import BaseEstimator from pytabkit.models.utils import FunctionProcess from pytabkit.models.alg_interfaces.resource_computation import UniformSampler, FeatureSpec, get_resource_features, \ process_resource_features, \ Sampler, ds_to_xy, fit_resource_factors, TimeWrapper from 
pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.models import utils from pytabkit.models.data.data import DictDataset from pytabkit.models.sklearn.sklearn_interfaces import CatBoost_TD_Classifier, XGB_TD_Classifier, LGBM_TD_Classifier def get_param_grid(grids_1d: Dict[str, List[Any]]) -> List[Dict[str, Any]]: configs = [dict()] for key, values in grids_1d.items(): configs = [utils.update_dict(c, {key: val}) for val in values for c in configs] return configs def estimate_params(paths: Paths, exp_name: str, coll_name: str, estimator: BaseEstimator, is_lgbm: bool = False, rerun: bool = False): if is_lgbm: # use num_leaves instead of max_depth learner_space = dict( n_estimators=UniformSampler(2, 2, log=True, is_int=True), n_threads=UniformSampler(4, 4, log=True, is_int=True), num_leaves=UniformSampler(10, 100, log=True, is_int=True), ) time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'log_num_leaves', 'num_leaves'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'log_num_leaves', 'num_leaves'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) else: learner_space = dict( n_estimators=UniformSampler(2, 2, log=True, is_int=True), n_threads=UniformSampler(4, 4, log=True, is_int=True), max_depth=UniformSampler(3, 10, is_int=True), ) time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'max_depth', '2_power_maxdepth'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'max_depth', '2_power_maxdepth'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) coefs = calibrate_resources(exp_name, paths=paths, learner_space=learner_space, coll_name=coll_name, time_feature_spec=time_feature_spec, ram_feature_spec=ram_feature_spec, sklearn_learner=estimator, n_combinations=300, rerun=rerun) print(f'time_params={coefs["time_s"]}') print(f'cpu_ram_params={coefs["ram_gb"]}') ram_params = coefs['ram_gb'] time_params = coefs['time_s'] print(f'Analyzing dionis:') task_info = TaskDescription('openml-class', 'dionis').load_info(paths) # task_info = TaskDescription('uci-bin-class', 'madelon').load_info(paths) ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) config = dict(n_estimators=1000, n_threads=4, max_depth=6, num_leaves=31) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1) ram_features = process_resource_features(raw_features, ram_feature_spec) ram_gb = sum([ram_features[key] * ram_params[key] for key in ram_params]) time_features = process_resource_features(raw_features, time_feature_spec) time_s = sum([time_features[key] * time_params[key] for key in time_params]) print(f'{ram_gb=}, {time_s=}') def estimate_params_new(paths: Paths, exp_name: str, coll_name: str, estimator: BaseEstimator, hparam_grid: List[Dict[str, Any]], short_name: str, is_lgbm: bool = False, rerun: bool = False): if is_lgbm: # use num_leaves instead of max_depth 
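# The resource estimates are linear models over the feature specs constructed below:
# each spec enumerates candidate features (raw dataset sizes plus products such as
# n_samples * n_features * n_estimators / n_threads), fit_resource_factors fits one
# coefficient per feature, and predictions are then formed as
# sum(features[key] * params[key] for key in params), as in the dionis sanity check
# at the end of this function.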
time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'log_num_leaves', 'num_leaves'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'log_num_leaves', 'num_leaves'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) else: time_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product('n_cv_refit', 'n_splits', ['', 'max_depth', '2_power_maxdepth'], 'n_estimators', '1/n_threads', FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) ram_feature_spec = FeatureSpec.concat('', 'ds_size_gb', 'ds_prep_size_gb', 'ds_onehot_size_gb', FeatureSpec.product(['', 'max_depth', '2_power_maxdepth'], FeatureSpec.powerset_products('n_features', 'n_samples', 'n_tree_repeats'))) coefs = calibrate_resources_new_2(exp_name, paths=paths, hparam_grid=hparam_grid, coll_name=coll_name, time_feature_spec=time_feature_spec, ram_feature_spec=ram_feature_spec, sklearn_learner=estimator, rerun=rerun) print(f'{short_name}_time={coefs["time_s"]}') print(f'{short_name}_ram={coefs["ram_gb"]}') ram_params = coefs['ram_gb'] time_params = coefs['time_s'] print(f'Analyzing dionis:') task_info = TaskDescription('openml-class', 'dionis').load_info(paths) # task_info = TaskDescription('uci-bin-class', 'madelon').load_info(paths) ds = DictDataset(tensors=None, tensor_infos=task_info.tensor_infos, device='cpu', n_samples=task_info.n_samples) config = dict(n_estimators=1000, n_threads=4, max_depth=6, num_leaves=31) raw_features = get_resource_features(config, ds, n_cv=1, n_refit=0, n_splits=1) ram_features = process_resource_features(raw_features, ram_feature_spec) ram_gb = sum([ram_features[key] * ram_params[key] for key in ram_params]) time_features = process_resource_features(raw_features, time_feature_spec) time_s = sum([time_features[key] * time_params[key] for key in time_params]) print(f'{ram_gb=}, {time_s=}') if __name__ == '__main__': print(get_param_grid(dict(n_estimators=[2], max_depth=[4, 6, 7, 9]))) paths = Paths.from_env_variables() # estimate_catboost_params(paths) # estimate_params(paths, 'CB-class-7', 'meta-test-class', CatBoostTDClassifier(verbosity=2)) # # estimate_params(paths, 'CB-reg-7', 'meta-test-reg', CatBoostTDClassifier(verbosity=2)) # estimate_params(paths, 'XGB-class-2', 'meta-test-class', # XGBTDClassifier(verbosity=2, subsample=1.0, colsample_bytree=1.0, colsample_bylevel=1.0)) # estimate_params(paths, 'LGBM-class-3', 'meta-test-class', # LGBMTDClassifier(subsample=1.0), is_lgbm=True) estimate_params_new(paths, 'CB-class-11', 'meta-test-class', CatBoost_TD_Classifier(subsample=1.0), hparam_grid=get_param_grid(dict(n_estimators=[2], n_threads=[4], max_depth=[4, 6, 7, 9])), short_name='cb_class') # estimate_params(paths, 'CB-reg-7', 'meta-test-reg', CatBoostTDClassifier(verbosity=2)) estimate_params_new(paths, 'XGB-class-3', 'meta-test-class', XGB_TD_Classifier(subsample=1.0, colsample_bytree=1.0, colsample_bylevel=1.0), hparam_grid=get_param_grid(dict(n_estimators=[2], n_threads=[4], max_depth=[4, 6, 8, 11])), short_name='xgb_class') estimate_params_new(paths, 'LGBM-class-4', 'meta-test-class', LGBM_TD_Classifier(subsample=1.0, colsample_bytree=1.0), hparam_grid=get_param_grid(dict(n_estimators=[2], n_threads=[4], num_leaves=[31, 
100, 300, 1000])), short_name='lgbm_class', is_lgbm=True) pass def calibrate_resources(exp_name: str, paths: Paths, learner_space: Dict[str, Sampler], coll_name: str, time_feature_spec: List[str], ram_feature_spec: List[str], sklearn_learner: BaseEstimator, n_combinations: int, rerun: bool) \ -> Dict[str, Dict[str, float]]: if multiprocessing.get_start_method() != 'spawn': multiprocessing.set_start_method('spawn', force=True) all_results = [] task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) for i in range(n_combinations): np.random.seed(i) torch.manual_seed(i) file_path = paths.resources_exp_it(exp_name, i) / 'results.yaml' learner_params = {key: value.sample() for key, value in learner_space.items()} task_idx = np.random.randint(len(task_infos)) task_info = task_infos[task_idx] print(f'Iteration {i + 1}/{n_combinations}: Evaluating {type(sklearn_learner)} with \n' f'{str(task_info.task_desc)=}\n' f'{learner_params=}', flush=True) if utils.existsFile(file_path) and not rerun: print(f'Loading saved result') all_results.append(utils.deserialize(file_path, use_yaml=True)) else: print(f'Running estimator...') # compute it learner: BaseEstimator = sklearn.base.clone(sklearn_learner) learner.set_params(**learner_params) ds = task_info.load_task(paths).ds X, y = ds_to_xy(ds) f = lambda learner_=learner, X_=X, y_=y[:, 0]: learner_.fit(X_, y_) new_results: Dict[str, Dict[str, Any]] = dict() new_results['measured'] = measure_resources(f) new_results['features'] = get_resource_features(config=learner_params, ds=ds, n_cv=1, n_refit=0, n_splits=1) # new_results['features'] = {'time_s': time_feature_map.get_features(ds), # 'ram_gb': ram_feature_map.get_features(ds)} all_results.append(new_results) utils.serialize(file_path, new_results, use_yaml=True) print(all_results[-1]['measured']) coefs = dict() coefs['time_s'] = fit_resource_factors([(process_resource_features(results['features'], time_feature_spec), results['measured']['time_s']) for results in all_results], pessimistic=False) coefs['ram_gb'] = fit_resource_factors([(process_resource_features(results['features'], ram_feature_spec), results['measured']['ram_gb']) for results in all_results], pessimistic=True) return coefs def calibrate_resources_new_2(exp_name: str, paths: Paths, hparam_grid: List[Dict[str, Any]], coll_name: str, time_feature_spec: List[str], ram_feature_spec: List[str], sklearn_learner: BaseEstimator, rerun: bool) \ -> Dict[str, Dict[str, float]]: if multiprocessing.get_start_method() != 'spawn': multiprocessing.set_start_method('spawn', force=True) all_results = [] task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) for idx_1, task_info in enumerate(task_infos): for idx_2, learner_params in enumerate(hparam_grid): i = idx_1 * len(hparam_grid) + idx_2 np.random.seed(i) torch.manual_seed(i) file_path = paths.resources_exp_it(exp_name, i) / 'results.yaml' print(f'Iteration {i + 1}/{len(task_infos)*len(hparam_grid)}: Evaluating {type(sklearn_learner)} with \n' f'{str(task_info.task_desc)=}\n' f'{learner_params=}', flush=True) if utils.existsFile(file_path) and not rerun: print(f'Loading saved result') all_results.append(utils.deserialize(file_path, use_yaml=True)) else: print(f'Running estimator...') # compute it learner: BaseEstimator = sklearn.base.clone(sklearn_learner) learner.set_params(**learner_params) ds = task_info.load_task(paths).ds X, y = ds_to_xy(ds) f = lambda learner_=learner, X_=X, y_=y[:, 0]: learner_.fit(X_, y_) new_results: Dict[str, Dict[str, Any]] = dict() 
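# measure_resources (defined below) runs the fit closure in a child process via
# FunctionProcess, polls that process's RAM usage every 10 ms while it runs, and
# returns a dict with the wall-clock fit time ('time_s') and the peak observed
# RAM ('ram_gb').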
new_results['measured'] = measure_resources(f) new_results['features'] = get_resource_features(config=learner_params, ds=ds, n_cv=1, n_refit=0, n_splits=1) # new_results['features'] = {'time_s': time_feature_map.get_features(ds), # 'ram_gb': ram_feature_map.get_features(ds)} all_results.append(new_results) utils.serialize(file_path, new_results, use_yaml=True) print(all_results[-1]['measured']) coefs = dict() coefs['time_s'] = fit_resource_factors([(process_resource_features(results['features'], time_feature_spec), results['measured']['time_s']) for results in all_results], pessimistic=True) coefs['ram_gb'] = fit_resource_factors([(process_resource_features(results['features'], ram_feature_spec), results['measured']['ram_gb']) for results in all_results], pessimistic=True, coef_factor=1.6) return coefs def measure_resources(f: Callable[[], None]) -> Dict[str, float]: # open function in one process (that measures the time), poll the RAM usages from another process process = FunctionProcess(TimeWrapper(f)) process.start() time_interval = 0.01 max_ram_usage_gb = 0.0 while not process.is_done(): max_ram_usage_gb = max(max_ram_usage_gb, process.get_ram_usage_gb()) time.sleep(time_interval) process_time = process.pop_result() return {'time_s': process_time, 'ram_gb': max_ram_usage_gb} ================================================ FILE: scripts/get_sklearn_names.py ================================================ import importlib # get the names of all sklearn interfaces, for exporting them in __all__ to import them from a higher-level module if __name__ == '__main__': # Import the module module = importlib.import_module("pytabkit.models.sklearn.sklearn_interfaces") # Get all top-level attributes of the module (like classes, functions) attrs = [attr_name for attr_name in dir(module) if not attr_name.startswith('_') and not 'Mixin' in attr_name and hasattr(getattr(module, attr_name), '__module__') and getattr(module, attr_name).__module__ == module.__name__] print(f'"' + '", "'.join(attrs) + '"') ================================================ FILE: scripts/make_plot_animation.py ================================================ from typing import List from pytabkit.bench.eval.plotting import plot_pareto from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.analysis import ResultsTables from pathlib import Path def plot_animations(coll_names: List[str]): paths = Paths.from_env_variables() tables = ResultsTables(paths) arrow_alg_names = [('MLP-PLR-D', 'RealMLP-TD'), ('TabR-S-D', 'RealTabR-D'), ('XGB-D', 'XGB-TD'), ('LGBM-D', 'LGBM-TD'), ('CatBoost-D', 'CatBoost-TD'), ('MLP-PLR-HPO', 'RealMLP-HPO')] alg_names = [f'{method}-{version}' for method in ['XGB', 'LGBM', 'CatBoost', 'BestModel', 'Ensemble'] for version in ['D', 'TD', 'HPO']] alg_names.extend(['RealMLP-TD', 'RealMLP-TD-S', 'RealMLP-HPO', 'MLP-RTDL-D', 'MLP-RTDL-HPO', 'MLP-PLR-D', 'MLP-PLR-HPO', 'RealTabR-D', 'FTT-D', 'FTT-HPO', 'ResNet-RTDL-D', 'ResNet-RTDL-HPO', 'RF-SKL-D', 'RF-HPO', 'XGB-PBB-D', 'TabR-S-D', 'TabR-HPO']) alg_names_to_keep = ["MLP-RTDL-D", "MLP-PLR-D", "RealMLP-TD", "MLP-HPO", "MLP-PLR-HPO", "RealMLP-HPO", "MLP-RTDL-HPO"] # #all # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # plot_pareto_frontier=False, # filename_suffix='_1', # subfolder='animations', # alg_names_to_hide=[])#alg_name for alg_name in alg_names if alg_name not in 
black_border_alg_names]) # # # show pareto frontier # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_2', # subfolder='animations', # alg_names_to_hide=[])#alg_name for alg_name in alg_names if alg_name not in black_border_alg_names]) # # # show only MLP models # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_3', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # # add NN baselines # alg_names_to_keep = ["MLP-RTDL-D", "MLP-PLR-D", "RealMLP-TD", "MLP-HPO", "MLP-PLR-HPO", "RealMLP-HPO", # "MLP-RTDL-HPO", "TabR-S-D", "TabR-HPO", "FTT-D", "FTT-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_4', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # # show that we can also improve TabR with RealTabr # alg_names_to_keep = ["MLP-RTDL-D", "MLP-PLR-D", "RealMLP-TD", "MLP-HPO", "MLP-PLR-HPO", "RealMLP-HPO", # "MLP-RTDL-HPO", "TabR-S-D", "TabR-HPO", "RealTabR-D", "FTT-D", "FTT-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_5', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # #show that we can also create TD for trees # alg_names_to_keep = ["CatBoost-D", "CatBoost-TD", "CatBoost-HPO", # "XGB-D", "XGB-TD", "XGB-HPO", # "LGBM-D", "LGBM-TD", "LGBM-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_6', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # # # show that ensembles work well for td # alg_names_to_keep = ["CatBoost-TD", "CatBoost-HPO", # "XGB-TD", "XGB-HPO", # "LGBM-TD", "LGBM-HPO", # "RealMLP-TD", "RealMLP-HPO", # "Ensemble-D", "BestModel-D", # "Ensemble-TD", "Ensemble-HPO", # "BestModel-TD", "BestModel-HPO"] # # plot_pareto(paths, tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_7', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # alg_names_to_keep = ["CatBoost-D", "CatBoost-TD", #"CatBoost-HPO", # "XGB-D", "XGB-TD", #"XGB-HPO", # "LGBM-D", "LGBM-TD", #"LGBM-HPO", # "MLP-PLR-D", "MLP-PLR-HPO", # "RealMLP-TD", "RealMLP-HPO", # "TabR-S-D", "RealTabR-D"] # # plot_pareto(paths, 
tables, # coll_names=coll_names, # alg_names=alg_names, # use_ranks=False, use_normalized_errors=False, # use_grinnorm_errors=False, # use_geometric_mean=True, arrow_alg_names=arrow_alg_names, # pareto_frontier_width=4., # filename_suffix='_8', # subfolder='animations', # alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) alg_names_to_keep = ["CatBoost-D", "CatBoost-TD", "CatBoost-HPO", "MLP-PLR-D", "MLP-PLR-HPO", "RealMLP-TD", "RealMLP-HPO", "TabR-S-D", "RealTabR-D", "TabR-HPO", "BestModel-D", "BestModel-TD", "BestModel-HPO"] plot_pareto(paths, tables, coll_names=coll_names, alg_names=alg_names, use_ranks=False, use_normalized_errors=False, use_grinnorm_errors=False, use_geometric_mean=True, arrow_alg_names=arrow_alg_names, pareto_frontier_width=4., filename_suffix='_9', subfolder='animations', alg_names_to_hide=[alg_name for alg_name in alg_names if alg_name not in alg_names_to_keep]) # animation # everything # then bigger pareto front # then remove everything except the algorithms of interest if __name__ == '__main__': coll_names = ['meta-train-class', 'meta-train-reg', 'meta-test-class', 'meta-test-reg', 'grinsztajn-class-filtered', 'grinsztajn-reg'] plot_animations(['meta-test-class', 'meta-test-reg']) plot_animations(['grinsztajn-class-filtered', 'grinsztajn-reg']) plot_animations(['meta-train-class', 'meta-train-reg']) plot_animations(['meta-test-class', 'grinsztajn-class-filtered']) plot_animations(['meta-test-reg', 'grinsztajn-reg']) ================================================ FILE: scripts/meta_hyperopt.py ================================================ from typing import Optional, Tuple, Any, Dict import numpy as np from pytabkit.bench.alg_wrappers.interface_wrappers import LGBMInterfaceWrapper, XGBInterfaceWrapper, \ CatBoostInterfaceWrapper from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.bench.eval.evaluation import FunctionAlgFilter, MultiResultsTable, DefaultEvalModeSelector, \ MeanTableAnalyzer from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.hyper_opt.coord_opt import Hyperparameter, CoordOptimizer from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer, f_unpack_dict from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.models.nn_models.categorical import EncodingFactory, SingleOrdinalEncodingFactory from pytabkit.models.training.logging import StdoutLogger def load_score(alg_name: Optional[str] = None, coll_name: str = 'meta-train-class', n_cv: int = 1, val_metric_name: Optional[str] = None, test_metric_name: Optional[str] = None, split_type: str = SplitType.RANDOM, use_task_weighting: bool = True, data_path: Optional[str] = None) -> Tuple[float, Any]: paths = Paths(data_path) if data_path is not None else Paths.from_env_variables() if '/' in coll_name: # use a single task parts = coll_name.split('/') if len(parts) != 2: raise ValueError(f'Too many / in coll_name {coll_name}') task_collection = TaskCollection(coll_name, [TaskDescription(*parts)]) else: task_collection = TaskCollection.from_name(coll_name, paths) # print('load table') # table = MultiResultsTable.load_summaries(task_collection, n_cv=n_cv, paths=paths) alg_filter = FunctionAlgFilter(lambda an, tags, aw: an == alg_name) table = 
MultiResultsTable.load(task_collection, n_cv=n_cv, paths=paths, split_type=split_type, alg_filter=alg_filter) # print('process table') test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict={}, val_metric_name=val_metric_name, test_metric_name=test_metric_name) analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + 1e-2) - np.log(1e-2), use_weighting=use_task_weighting) means = analyzer.get_means(test_table) print(f'Mean scores for {alg_name}: {means}') return means[0], None class AlgConfigRunner: def __init__(self, paths: Paths, coll_name: str, create_wrapper, base_name: str, tag: Optional[str] = None, short_key_map: Dict[str, str] = None, **default_params): self.paths = paths self.coll_name = coll_name self.create_wrapper = create_wrapper self.base_name = base_name self.tag = tag or base_name self.default_params = default_params self.short_key_map = short_key_map or {} def __call__(self, config): config = f_unpack_dict(config) print(f'HPO config: {config}') # compute alg_name, potentially round config arguments alg_name_parts = [self.base_name] rounded_config = {} for key, value in config.items(): if key in self.short_key_map: short_key = self.short_key_map[key] else: short_key = key if isinstance(value, float): alg_name_parts.append(f'{short_key}-{value:g}') rounded_config[key] = float(f'{value:g}') else: alg_name_parts.append(f'{short_key}-{value}') rounded_config[key] = value alg_name = '_'.join(alg_name_parts) try: # if already computed, return the computed result return load_score(alg_name, self.coll_name) except IndexError: pass # call wrapper with alg_name, tag, default_params and config wrapper = self.create_wrapper(**utils.join_dicts(self.default_params, config)) # run on task_infos task_infos = TaskCollection.from_name(self.coll_name, self.paths).load_infos(self.paths) job_mgr = TabBenchJobManager(self.paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0) job_mgr.add_jobs(task_infos, config_10_1_0, alg_name, wrapper, tags=[self.tag]) job_mgr.run_jobs(scheduler) # load result return load_score(alg_name, self.coll_name) def test_hyperopt_seed(): from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(5e-3), np.log(3e-1)), 'num_leaves': hp.qloguniform('num_leaves', np.log(7), np.log(256), 1), 'feature_fraction': hp.uniform('feature_fraction', 0.3, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -16, 5), 'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), 'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]), } fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100, hyperopt_algo='tpe') def print_params(params): print(params) return 0.0, None opt.optimize(print_params, seed=1234, opt_desc='LGBM-tuning-1', logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class(): short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='train-class', create_wrapper=LGBMInterfaceWrapper, base_name='LGBM-tuning-1', short_key_map=short_key_map) 
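# Naming/caching note (illustrative, with hypothetical values): AlgConfigRunner turns each
# sampled config into an alg_name using short_key_map and '{:g}' rounding, so a config like
# {'learning_rate': 0.0523, 'num_leaves': 31.0} would become 'LGBM-tuning-1_lr-0.0523_nl-31'.
# Since __call__ first tries load_score(alg_name, ...), configs that were already benchmarked
# are looked up instead of being re-run.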
from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(5e-3), np.log(3e-1)), 'num_leaves': hp.qloguniform('num_leaves', np.log(7), np.log(256), 1), 'feature_fraction': hp.uniform('feature_fraction', 0.3, 1), 'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1), 'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', -16, 5), 'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]), 'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]), } fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100, hyperopt_algo='tpe') opt.optimize(acr, seed=1234, opt_desc='LGBM-tuning-1', logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_smac(): short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name='LGBM-tuning-smac-1', short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (5e-3, 3e-1), log=True), Integer('num_leaves', (7, 256), log=True), Float('feature_fraction', (0.3, 1)), Float('bagging_fraction', (0.3, 1)), Integer('min_data_in_leaf', (1, 64), log=True), Float('min_sum_hessian_in_leaf', (np.exp(-16), np.exp(5)), log=True), Float('lambda_l1', (np.exp(-16), np.exp(2)), log=True), Float('lambda_l2', (np.exp(-16), np.exp(2)), log=True), ]) fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder) opt.optimize(acr, seed=1234, opt_desc='LGBM-tuning-smac-1', logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_smac_2(use_reg: bool = False): base_name = 'LGBM-tuning-smac-2-reg' if use_reg else 'LGBM-tuning-smac-2' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('num_leaves', (16, 64), log=True, default=31), Float('feature_fraction', (0.5, 1), default=0.75), Float('bagging_fraction', (0.5, 1), default=0.75), Integer('min_data_in_leaf', (1, 64), log=True, default=5), Float('min_sum_hessian_in_leaf', (1e-7, 1e-2), log=True, default=1e-5), Float('lambda_l1', (1e-7, 1e-3), log=True, default=1e-7), Float('lambda_l2', (1e-7, 1e-3), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder) 
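# The AlgConfigRunner instance is the objective itself: it is called with a config dict and
# returns a (score, aux) tuple, so it can be passed directly to opt.optimize() below, just
# as in the hyperopt-based tuning functions above.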
opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_smac_3(use_reg: bool = False): base_name = 'LGBM-tuning-smac-3-reg' if use_reg else 'LGBM-tuning-smac-3' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('num_leaves', (16, 128), log=True, default=31), # larger max num_leaves than for smac-2 Float('feature_fraction', (0.5, 1), default=0.75), Float('bagging_fraction', (0.5, 1), default=0.75), Integer('min_data_in_leaf', (1, 64), log=True, default=5), Float('min_sum_hessian_in_leaf', (1e-7, 1e-2), log=True, default=1e-5), Float('lambda_l1', (1e-7, 1e-3), log=True, default=1e-7), Float('lambda_l2', (1e-7, 1e-3), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=200 if use_reg else 100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_lgbm_train_class_coord(): base_name = 'LGBM-tuning-coord-1' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-class', create_wrapper=LGBMInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) space = { 'learning_rate': Hyperparameter(start_value=np.log(0.1), min_step_size=0.1, importance=1.0, log_scale=True), 'num_leaves': Hyperparameter(np.log(31), 0.1, 0.2, log_scale=True, only_int=True), 'feature_fraction': Hyperparameter(1.0, 0.01, 0.4, min_value=0.3, max_value=1.0), 'bagging_fraction': Hyperparameter(1.0, 0.01, 0.4, min_value=0.3, max_value=1.0), 'min_data_in_leaf': Hyperparameter(np.log(20), 0.1, 0.2, log_scale=True, only_int=True, max_value=np.log(128)), 'min_sum_hessian_in_leaf': Hyperparameter(np.log(1e-3), 0.1, 0.6, log_scale=True), 'lambda_l1': Hyperparameter(np.log(1e-5), 0.1, 0.2, log_scale=True, min_value=-16.0, max_value=2.0), 'lambda_l2': Hyperparameter(np.log(1e-5), 0.1, 0.2, log_scale=True, min_value=-16.0, max_value=2.0), } fixed_params = { 'n_estimators': 1000, 'bagging_freq': 1 } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = CoordOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_xgb_train_class_smac(use_reg: bool = False): # XGB-tuning-smac-1 accidentally used LightGBM base_name = 'XGB-tuning-smac-2-reg' if use_reg else 'XGB-tuning-smac-2' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', 
max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') oe_perm_factory = EncodingFactory(SingleOrdinalEncodingFactory(permute_ordinal_encoding=True)) acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=lambda **kwargs: XGBInterfaceWrapper(factory=oe_perm_factory, **kwargs), base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('max_depth', (4, 8), default=6), Float('subsample', (0.5, 1), default=0.75), Float('colsample_bytree', (0.6, 1), default=1.0), Float('colsample_bylevel', (0.6, 1), default=1.0), Float('colsample_bynode', (0.6, 1), default=1.0), Float('min_child_weight', (1e-7, 1e-2), log=True, default=1e-5), Float('reg_alpha', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_lambda', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_gamma', (1e-7, 1e-2), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=200 if use_reg else 100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_xgb_train_class_smac_3(use_reg: bool = False): # XGB-tuning-smac-1 accidentally used LightGBM base_name = 'XGB-tuning-smac-3-reg' if use_reg else 'XGB-tuning-smac-3' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') oe_perm_factory = EncodingFactory(SingleOrdinalEncodingFactory(permute_ordinal_encoding=True)) acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=lambda **kwargs: XGBInterfaceWrapper(factory=oe_perm_factory, **kwargs), base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('max_depth', (4, 10), default=6), # increased upper bound to 10 Float('subsample', (0.5, 1), default=0.75), Float('colsample_bytree', (0.6, 1), default=1.0), Float('colsample_bylevel', (0.6, 1), default=1.0), Float('colsample_bynode', (0.6, 1), default=1.0), Float('min_child_weight', (1e-7, 1e-2), log=True, default=1e-5), Float('reg_alpha', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_lambda', (1e-7, 1e-2), log=True, default=1e-7), Float('reg_gamma', (1e-7, 1e-2), log=True, default=1e-7), ]) fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=200 if use_reg else 100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, 
logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_smac(use_reg: bool = False): base_name = 'CatBoost-tuning-smac-reg' if use_reg else 'CatBoost-tuning-smac' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from ConfigSpace import Float, Integer, ConfigurationSpace space = ConfigurationSpace() space.add_hyperparameters([ Float('learning_rate', (2e-2, 1e-1), log=True, default=6e-2), Integer('max_depth', (4, 10), default=8), # increased upper bound to 10 Float('l2_leaf_reg', (1e-7, 1e-2), log=True, default=1e-5), Float('bagging_temperature', (0.0, 1.0), default=1.0), Float('random_strength', (1e-2, 20.0), log=True, default=1.0), Integer('one_hot_max_size', (0, 25), default=10), Integer('leaf_estimation_iterations', (1, 20), default=1) ]) # todo: also try min_child_samples? # todo: try boosting_type and bootstrap_type? ("Bayesian", "Bernoulli", "MVS") # possibly subsample for other bootstrap_type? # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = SMACOptimizer(space, fixed_params, n_hyperopt_steps=100, tmp_folder=tmp_folder, n_initial_design=25) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_hyperopt(use_reg: bool = False): base_name = 'CatBoost-tuning-hyperopt-reg' if use_reg else 'CatBoost-tuning-hyperopt' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', bootstrap_type='boot', boosting_type='boost', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(2e-2), np.log(2e-1)), 'max_depth': hp.quniform('max_depth', 4, 10, 1), # this was ignored due to an implementation error 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-6), np.log(1e-2)), 'random_strength': hp.loguniform('random_strength', np.log(1e-3), np.log(5.0)), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), 'boosting_type': 'Plain', 
#hp.choice('boosting_type', ['Ordered', 'Plain']), 'bootstrap_type': hp.choice('bootstrap_type', [ {'bootstrap_type': 'Bayesian', 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)}, {'bootstrap_type': 'Bernoulli', 'subsample': hp.uniform('subsample', 0.5, 1.0)} ]), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1.0), np.log(100.0), 1), } # todo: also try min_child_samples? # todo: try boosting_type and bootstrap_type? ("Bayesian", "Bernoulli", "MVS") # possibly subsample for other bootstrap_type? # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_hyperopt_2(use_reg: bool = False): base_name = 'CatBoost-tuning-hyperopt-2-reg' if use_reg else 'CatBoost-tuning-hyperopt-2' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', bootstrap_type='boot', boosting_type='boost', max_depth='md', min_child_weight='mcw', reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(2e-2), np.log(2e-1)), 'max_depth': hp.quniform('max_depth', 4, 10, 1), # this was ignored due to an implementation error 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-6), np.log(1e-2)), 'random_strength': hp.loguniform('random_strength', np.log(1e-3), np.log(5.0)), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), 'boosting_type': 'Plain', #hp.choice('boosting_type', ['Ordered', 'Plain']), 'bootstrap_type': hp.choice('bootstrap_type', [ {'bootstrap_type': 'Bayesian', 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)}, {'bootstrap_type': 'Bernoulli', 'subsample': hp.uniform('subsample', 0.5, 1.0)} ]), 'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', np.log(1.0), np.log(100.0), 1), } # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) def run_catboost_train_class_hyperopt_3(use_reg: bool = False): base_name = 'CatBoost-tuning-hyperopt-3-reg' if use_reg else 'CatBoost-tuning-hyperopt-3' short_key_map = dict(n_estimators='nest', bagging_freq='bfreq', learning_rate='lr', num_leaves='nl', colsample_bylevel='cbl', colsample_bytree='cbt', colsample_bynode='cbn', bootstrap_type='boot', boosting_type='boost', max_depth='md', min_child_weight='mcw', 
reg_alpha='alph', reg_lambda='lam', reg_gamma='gam', subsample='ss', l2_leaf_reg='l2lr', bagging_temperature='bt', random_strength='rs', one_hot_max_size='ohms', leaf_estimation_iterations='lei', feature_fraction='ff', bagging_fraction='bfrac', min_data_in_leaf='mdil', min_sum_hessian_in_leaf='mshil', lambda_l1='ll1', lambda_l2='ll2') acr = AlgConfigRunner(paths=Paths.from_env_variables(), coll_name='meta-train-reg' if use_reg else 'meta-train-class', create_wrapper=CatBoostInterfaceWrapper, base_name=base_name, short_key_map=short_key_map) from hyperopt import hp space = { 'learning_rate': hp.loguniform('learning_rate', np.log(2e-2), np.log(2e-1)), 'max_depth': hp.quniform('max_depth', 4, 10, 1), # this was ignored due to an implementation error 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-6), np.log(1e-2)), 'random_strength': hp.loguniform('random_strength', np.log(1e-3), np.log(5.0)), 'one_hot_max_size': hp.quniform('one_hot_max_size', 0, 25, 1), 'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1), 'boosting_type': 'Plain', #hp.choice('boosting_type', ['Ordered', 'Plain']), 'bootstrap_type': hp.choice('bootstrap_type', [ {'bootstrap_type': 'Bayesian', 'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)}, {'bootstrap_type': 'Bernoulli', 'subsample': hp.uniform('subsample', 0.5, 1.0)} ]), # removed min_data_in_leaf since it is not used with SymmetricTree } # https://www.kaggle.com/code/saurabhshahane/catboost-hyperparameter-tuning-with-optuna/notebook fixed_params = { 'n_estimators': 1000, } paths = Paths.from_env_variables() with paths.new_tmp_folder() as tmp_folder: opt = HyperoptOptimizer(space, fixed_params, n_hyperopt_steps=100) opt.optimize(acr, seed=1234, opt_desc=base_name, logger=StdoutLogger(verbosity_level=1)) if __name__ == '__main__': # load_score('NN-class-special-2', 'train-class') # run_lgbm_train_class() # run_lgbm_train_class_smac() # run_lgbm_train_class_smac_2() # run_lgbm_train_class_coord() # run_xgb_train_class_smac() # run_lgbm_train_class_smac_3(use_reg=True) # run_xgb_train_class_smac_3(use_reg=True) # run_catboost_train_class_smac(use_reg=True) # run_catboost_train_class_hyperopt_2(use_reg=True) # run_catboost_train_class_hyperopt_2(use_reg=False) run_catboost_train_class_hyperopt_3(use_reg=False) # test_hyperopt_seed() pass ================================================ FILE: scripts/move_algs.py ================================================ import shutil from typing import Optional import fire from pytabkit.bench.data.paths import Paths from pytabkit.models import utils def move_algs(base_path_1: str, base_path_2: str, *alg_names, startswith: Optional[str] = None, dry_run: bool = False): paths_1 = Paths(base_folder=base_path_1) paths_2 = Paths(base_folder=base_path_2) if startswith is not None: all_alg_names = [path.name for path in paths_1.algs().iterdir()] alg_names = list(alg_names) + [alg_name for alg_name in all_alg_names if alg_name.startswith(startswith)] for alg_name in alg_names: print(f'Moving alg {alg_name}') if dry_run: continue assert isinstance(alg_name, str) assert utils.existsDir(base_path_1) assert utils.existsDir(base_path_2) assert not utils.existsDir(paths_2.algs() / alg_name) assert not utils.existsDir(paths_2.results() / alg_name) assert not utils.existsDir(paths_2.result_summaries() / alg_name) if utils.existsDir(paths_1.algs() / alg_name): shutil.move(paths_1.algs() / alg_name, paths_2.algs() / alg_name) if utils.existsDir(paths_1.results() / alg_name): shutil.move(paths_1.results() 
/ alg_name, paths_2.results() / alg_name) if utils.existsDir(paths_1.result_summaries() / alg_name): shutil.move(paths_1.result_summaries() / alg_name, paths_2.result_summaries() / alg_name) def move_specific_algs(base_path_1: str, base_path_2: str): paths_1 = Paths(base_folder=base_path_1) alg_names = [] for path in paths_1.algs().iterdir(): name = path.name # if name.startswith('MLP-cumul-abl-') and not name.startswith('MLP-cumul-abl-new'): if name.startswith('MLP-RTDL-HPO') and not name.startswith('MLP-cumul-abl-new'): alg_names.append(name) # print(alg_names) move_algs(base_path_1, base_path_2, *alg_names) if __name__ == '__main__': fire.Fire(move_algs) # fire.Fire(move_specific_algs) ================================================ FILE: scripts/move_many_algs.py ================================================ from typing import Optional import fire from pytabkit.models import utils from scripts.move_algs import move_algs def move_many_algs(base_path_1: str, base_path_2: str, algs_filename: Optional[str] = None, prefixes_filename: Optional[str] = None, dry_run: bool = False): if algs_filename is None: algs = [] else: algs = [name.strip() for name in utils.readFromFile(algs_filename).split('\n') if name.strip() != ''] if prefixes_filename is None: prefixes = [] else: prefixes = [name.strip() for name in utils.readFromFile(prefixes_filename).split('\n') if name.strip() != ''] move_algs(base_path_1, base_path_2, *algs, dry_run=dry_run) for prefix in prefixes: move_algs(base_path_1, base_path_2, startswith=prefix, dry_run=dry_run) if __name__ == '__main__': fire.Fire(move_many_algs) ================================================ FILE: scripts/print_complete_results.py ================================================ import fire from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.analysis import ResultsTables from pytabkit.bench.eval.evaluation import DefaultEvalModeSelector def print_complete_results(coll_name: str, n_splits: int = 10): """ Only show alg_names for which results for all splits exist.
:param coll_name: Name of the task collection, e.g., 'meta-train-class'. :param n_splits: Number of train-test splits that have to be available for an algorithm to be shown. :return: """ paths = Paths.from_env_variables() tables = ResultsTables(paths) table = tables.get(coll_name) test_table = table.get_test_results_table(DefaultEvalModeSelector()) test_table = test_table.filter_n_splits(n_splits) alg_names = test_table.alg_names alg_names.sort(key=lambda x: x.lower()) print(f'Algorithms with {n_splits} splits available on all datasets of {coll_name}:') for alg_name in alg_names: print(alg_name) if __name__ == '__main__': fire.Fire(print_complete_results) ================================================ FILE: scripts/print_runtimes.py ================================================ from pytabkit.bench.data.paths import Paths from pytabkit.bench.eval.runtimes import get_avg_train_times, get_avg_predict_times if __name__ == '__main__': paths = Paths.from_env_variables() for coll_name in ['meta-train-class', 'meta-train-reg']: times_dict = get_avg_train_times(paths, coll_name, per_1k_samples=True) print(f'Average training times per 1K samples for {coll_name}:') for alg_name, time_s in times_dict.items(): print(f'{alg_name}: {time_s:g} s') print(times_dict) for coll_name in ['meta-train-class', 'meta-train-reg']: times_dict = get_avg_predict_times(paths, coll_name, per_1k_samples=True) print(f'Average inference times per 1K samples for {coll_name}:') for alg_name, time_s in times_dict.items(): print(f'{alg_name}: {time_s:g} s') print(times_dict) ================================================ FILE: scripts/ray_slurm_launch.py ================================================ # from https://docs.ray.io/en/latest/cluster/examples/slurm-launch.html#slurm-launch # slurm-launch.py # Usage: # python slurm-launch.py --exp-name test \ # --command "rllib train --run PPO --env CartPole-v0" import argparse # import subprocess import sys import time import os from pathlib import Path from pytabkit.models import utils template_file = Path(__file__).parent / "ray_slurm_template.sh" JOB_NAME = "${JOB_NAME}" NUM_NODES = "${NUM_NODES}" NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}" PARTITION_OPTION = "${PARTITION_OPTION}" ACCOUNT_OPTION = "${ACCOUNT_OPTION}" COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}" GIVEN_NODE = "${GIVEN_NODE}" LOAD_ENV = "${LOAD_ENV}" TIME = "${TIME}" MEM_CMD = "${MEM_CMD}" MAIL_USER = "${MAIL_USER}" LOG_FOLDER = "${LOG_FOLDER}" CONDA_ENV_NAME = "${CONDA_ENV_NAME}" if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--exp_name", type=str, required=True, help="The job name and path to logging file (exp_name.out / exp_name.err).") parser.add_argument( "--conda-env-name", type=str, required=True, help="Conda environment name") parser.add_argument( "--num_nodes", "-n", type=int, default=1, help="Number of nodes to use.") parser.add_argument( "--mem", type=str, default=None, help="Memory (int + suffix 'mb').") parser.add_argument( "--time", "-t", type=str, help="Maximum time of job") # parser.add_argument( # "--mem", # type=str, # help="Maximum memory of job") parser.add_argument( "--mail_user", "-m", type=str, default="", help="Mail address to which job updates will be sent") parser.add_argument( "--log_folder", "-l", type=str, default="", help="Folder in which to save log files" ) parser.add_argument( "--node", "-w", type=str, help="The specified nodes to use. Same format as the " "return of 'sinfo'. Default: ''.") parser.add_argument( "--num-gpus", type=int, default=0, help="Number of GPUs to use in each node.
(Default: 0)") parser.add_argument( "--queue", "-q", type=str, default=None ) parser.add_argument( "--partition", "-p", type=str, default="", ) parser.add_argument( "--account", "-a", type=str, default="", ) parser.add_argument( "--load-env", type=str, default="", help="The script to load your environment ('module load cuda/10.1')") parser.add_argument( "--command", type=str, required=True, help="The command you wish to execute. For example: " " --command 'python test.py'. " "Note that the command must be a string.") args = parser.parse_args() if args.node: # assert args.num_nodes == 1 node_info = "#SBATCH -w {}".format(args.node) else: node_info = "" job_name = "{}_{}".format(args.exp_name, time.strftime("%y%m%d-%H%M", time.localtime())) partition_option = "#SBATCH --partition={}".format( args.partition) if args.partition else "" account_option = "#SBATCH --account={}".format( args.account) if args.account else "" # ===== Modified the template script ===== with open(template_file, "r") as f: text = f.read() text = text.replace(JOB_NAME, job_name) text = text.replace(NUM_NODES, str(args.num_nodes)) text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus)) text = text.replace(PARTITION_OPTION, partition_option) text = text.replace(ACCOUNT_OPTION, account_option) text = text.replace(COMMAND_PLACEHOLDER, str(args.command)) text = text.replace(LOAD_ENV, str(args.load_env)) text = text.replace(GIVEN_NODE, node_info) text = text.replace(TIME, args.time) mem_cmd = '' if args.mem is None else f'SBATCH --mem={args.mem}' text = text.replace(MEM_CMD, mem_cmd) text = text.replace(MAIL_USER, args.mail_user) text = text.replace(LOG_FOLDER, args.log_folder) text = text.replace(CONDA_ENV_NAME, args.conda_env_name) text = text.replace( "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO " "PRODUCTION!", "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE " "RUNNABLE!") # ===== Save the script ===== script_file = "slurm_scripts/{}.sh".format(job_name) # os.makedirs("slurm_scripts") # todo: ensure this utils.ensureDir(Path('slurm_scripts') / 'test.sh') # ensure that slurm_scripts directory exists with open(script_file, "w") as f: f.write(text) # ===== Submit the job ===== print("Starting to submit job!") cmd = f"sbatch {script_file}" if args.queue is None else f"sbatch -p {args.queue} {script_file}" # subprocess.Popen(cmd) os.system(cmd) print( "Job submitted! Script file is at: <{}>. Log file is at: <{}>".format( script_file, "{}.log".format(job_name))) sys.exit(0) ================================================ FILE: scripts/ray_slurm_template.sh ================================================ #!/bin/bash # shellcheck disable=SC2206 # THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT! # THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION! 
${PARTITION_OPTION} ${ACCOUNT_OPTION} #SBATCH --job-name=${JOB_NAME} #SBATCH --output=${LOG_FOLDER}/${JOB_NAME}.out #SBATCH --error=${LOG_FOLDER}/${JOB_NAME}.err ${GIVEN_NODE} ### This script works for any number of nodes, Ray will find and manage all resources #SBATCH --nodes=${NUM_NODES} #SBATCH --time=${TIME} #SBATCH --mail-user=${MAIL_USER} #SBATCH --exclusive ### Give all resources to a single Ray task, ray can manage the resources internally #SBATCH --ntasks-per-node=1 #SBATCH --gpus-per-task=${NUM_GPUS_PER_NODE} #${MEM_CMD} # Load modules or your own conda environment here # module load pytorch/v1.4.0-gpu # conda activate ${CONDA_ENV} # ${LOAD_ENV} module load devel/miniconda conda init bash source ~/.bashrc conda deactivate conda activate ${CONDA_ENV_NAME} cd ~/git/pytabkit export RAY_DEDUP_LOGS=0 # to disable ray from trying to deduplicate log messages # ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ===== # This script is a modification to the implementation suggest by gregSchwartz18 here: # https://github.com/ray-project/ray/issues/826#issuecomment-522116599 redis_password=$(uuidgen) export redis_password nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names nodes_array=($nodes) node_1=${nodes_array[0]} ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address # if we detect a space character in the head node IP, we'll # convert it to an ipv4 address. This step is optional. if [[ "$ip" == *" "* ]]; then IFS=' ' read -ra ADDR <<< "$ip" if [[ ${#ADDR[0]} -gt 16 ]]; then ip=${ADDR[1]} else ip=${ADDR[0]} fi echo "IPV6 address detected. We split the IPV4 address as $ip" fi port=6379 ip_head=$ip:$port export ip_head echo "IP Head: $ip_head" echo "STARTING HEAD at $node_1" srun --nodes=1 --ntasks=1 -w "$node_1" \ ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block & sleep 30 worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node for ((i = 1; i <= worker_num; i++)); do node_i=${nodes_array[$i]} echo "STARTING WORKER $i at $node_i" srun --nodes=1 --ntasks=1 -w "$node_i" ray start --address "$ip_head" --redis-password="$redis_password" --block & sleep 5 done # ===== Call your code below ===== ${COMMAND_PLACEHOLDER} ================================================ FILE: scripts/rename_alg.py ================================================ import os import shutil from pathlib import Path import fire from pytabkit.bench.data.paths import Paths from pytabkit.models import utils def rename_alg(old_name: str, new_name: str, copy: bool = False, rename_prefixes: bool = False): # what to rename: # results folder # result_summaries folder # alg_name in algs/alg_name/extended_config.yaml and in the path # cannot realistically change the code in src/ # maybe change alg_name in algs/alg_name/wrapper.pkl (if it can be loaded) paths = Paths.from_env_variables() if rename_prefixes: alg_names = [path.name for path in paths.algs().iterdir()] for alg_name in alg_names: if alg_name.startswith(old_name): rename_alg(alg_name, new_name + alg_name[len(old_name):], copy=copy, rename_prefixes=False) return if utils.existsDir(paths.algs() / new_name): raise ValueError(f'Directory for new name {new_name} already exists') def rename_or_copy(src: Path, dst: Path): if copy: shutil.copytree(src, dst) else: os.rename(src, dst) rename_or_copy(paths.algs() / old_name, paths.algs() / new_name) if utils.existsDir(paths.results() / old_name): rename_or_copy(paths.results() / 
old_name, paths.results() / new_name) if utils.existsDir(paths.result_summaries() / old_name): rename_or_copy(paths.result_summaries() / old_name, paths.result_summaries() / new_name) # change alg_name in extended_config.yaml extended_config_path = paths.algs() / new_name / 'extended_config.yaml' extended_config = utils.deserialize(extended_config_path, use_yaml=True) extended_config['alg_name'] = new_name utils.serialize(extended_config_path, extended_config, use_yaml=True) # try to change alg_name in wrapper.pkl try: alg_wrapper_path = paths.algs() / new_name / 'wrapper.pkl' alg_wrapper = utils.deserialize(alg_wrapper_path) alg_wrapper.config['alg_name'] = new_name utils.serialize(alg_wrapper_path, alg_wrapper) except Exception as e: print(f'Could not modify alg_wrapper.pkl, got an exception: {e}') if __name__ == '__main__': fire.Fire(rename_alg) ================================================ FILE: scripts/rename_tag.py ================================================ import fire from pytabkit.bench.data.paths import Paths from pytabkit.models import utils def rename_tag(old_name: str, new_name: str): paths = Paths.from_env_variables() for alg_path in paths.algs().iterdir(): tags_path = alg_path / 'tags.yaml' if utils.existsFile(tags_path): tags = utils.deserialize(tags_path, use_yaml=True) tags = [tag if tag != old_name else new_name for tag in tags] utils.serialize(tags_path, tags, use_yaml=True) if __name__ == '__main__': fire.Fire(rename_tag) ================================================ FILE: scripts/run_evaluation.py ================================================ import time from typing import Optional import numpy as np import fire from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskDescription, TaskCollection from pytabkit.bench.eval.analysis import get_opt_groups from pytabkit.bench.eval.evaluation import MultiResultsTable, DefaultEvalModeSelector, MeanTableAnalyzer, \ alg_results_str, \ alg_comparison_str, WinsTableAnalyzer, RankTableAnalyzer, NormalizedLossTableAnalyzer, \ GreedyAlgSelectionTableAnalyzer def show_eval(coll_name: str = 'meta-train-class', n_cv: int = 1, show_alg_groups: bool = True, val_metric_name: str = None, metric_name: str = None, split_type: str = SplitType.RANDOM, use_task_weighting: Optional[bool] = None, shift_eps: float = 0.01, data_path: Optional[str] = None, alg_name: Optional[str] = None, alg_name_2: Optional[str] = None, tag: Optional[str] = None, max_n_splits: Optional[int] = None, max_n_algs: Optional[int] = None, show_val_results: bool = False, show_train_results: bool = False, algs_prefix: Optional[str] = None, algs_suffix: Optional[str] = None, algs_contains: Optional[str] = None, exclude_datasets: Optional[str] = None): """ Prints evaluation tables on the selected datasets/algorithms. The following aggregate statistics will be printed, all of which are based on the specified metric and validation metric: - log shifted geometric mean test metric when greedily creating an algorithm portfolio based on the validation results. The algorithms are sorted by order of inclusion into the portfolio. The scores are the scores of selecting the best algorithm out of the portfolio up to this point on every dataset separately, based on the validation sets. - Win fraction: Fraction of datasets (may be weighted) on which this algorithm is the best one. 
- Arithmetic mean rank - Arithmetic mean normalized test metric: The best method is normalized to 0 and the worst one to 1. - Arithmetic mean test metric - Log shifted geometric mean test metric: mean(log(metric+shift_eps)) - Shifted geometric mean test metric: exp(mean(log(metric+shift_eps))) :param coll_name: Name of the task collection, e.g., 'meta-train-class' :param n_cv: Number of cross-validation folds. Will only print results for algorithms that have been evaluated with this number of cross-validation folds. :param show_alg_groups: Whether to show aggregate algorithms, such as the one that picks the best method on the validation set out of the displayed methods. :param val_metric_name: Name of the validation metric, used for the algorithm groups. By default, the same value as metric_name will be used. :param metric_name: Name of the metric that should be displayed (default = classification error / RMSE). :param split_type: Type of the split, normally random_split. :param use_task_weighting: Whether to weight tasks for the evaluation. If false, uniform weights are used. If True, weights based on prefixes are used. By default, weights are used only for meta-train collections. :param shift_eps: Epsilon parameter used in the shifted geometric mean. :param data_path: Path to the data folder where results are saved. By default, this function will take the path from Paths.from_env_variables(). :param alg_name: Algorithm for which results on individual datasets should be printed :param alg_name_2: Second algorithm for which results on individual datasets should be printed. :param tag: If specified, only print algorithms whose tags include the given tag. :param max_n_splits: If specified, only evaluate the given number of train-test splits. :param max_n_algs: Maximum number of methods that should be processed and displayed. This does not contain groups of methods (e.g. "all algs") that will be added on top later. :param show_val_results: Whether to show validation errors instead of test errors. :param show_train_results: Whether to show training errors instead of test errors. :param algs_prefix: If specified, only methods with this prefix will be displayed. :param algs_suffix: If specified, only methods with this suffix will be displayed. :param algs_contains: If specified, only methods containing this substring will be displayed. :param exclude_datasets: Optional comma-separated list of datasets that will be excluded from the analysis. 
:return: """ print('start show eval') paths = Paths(data_path) if data_path is not None else Paths.from_env_variables() start_time = time.time() if '/' in coll_name: # use a single task parts = coll_name.split('/') if len(parts) != 2: print(f'Too many / in coll_name {coll_name}') return task_collection = TaskCollection(coll_name, [TaskDescription(*parts)]) else: task_collection = TaskCollection.from_name(coll_name, paths) if exclude_datasets: exclude_names = exclude_datasets.split(',') task_collection = TaskCollection(task_collection.coll_name, [td for td in task_collection.task_descs if td.task_name not in exclude_names]) print('load table') # table = MultiResultsTable.load_summaries(task_collection, n_cv=n_cv, paths=paths) # commas are converted to tuples in the command line, apparently show_tags = tag.split(',') if isinstance(tag, str) else (list(tag) if tag is not None else []) alg_filter = lambda an, tags, config: ((tag is None or np.any([show_tag in tags for show_tag in show_tags])) and (algs_prefix is None or an.startswith(algs_prefix)) and (algs_suffix is None or an.endswith(algs_suffix)) and (algs_contains is None or algs_contains in an)) table = MultiResultsTable.load(task_collection, n_cv=n_cv, paths=paths, max_n_algs=max_n_algs, split_type=split_type, alg_filter=alg_filter, max_n_splits=max_n_splits) print('process table') # alg_group_dict = {'all algs': (lambda an, tags, config: True)} if show_alg_groups else None task_type_name = 'class' if 'class' in coll_name else 'reg' opt_groups = get_opt_groups(task_type_name) alg_group_dict = {'BestModel': (lambda an, tags, config: not an.startswith('Ensemble')), **{ f'BestModel{group_name}': (lambda an, tags, config, ans=alg_names: an in ans) for group_name, alg_names in opt_groups.items() }} if not show_alg_groups: alg_group_dict = None if alg_name is not None and alg_name_2 is not None and show_alg_groups: alg_group_dict['selected algs'] = (lambda an, tags, config, grp=[alg_name, alg_name_2]: np.any([g.startswith(an) for g in grp])) val_test_groups = {f'HPO-on-BestModel-TD-{task_type_name}': {f'{family}-TD-{task_type_name}': f'{family}-HPO' for family in ['XGB', 'LGBM', 'CatBoost', 'MLP']} for task_type_name in ['class', 'reg']} if val_metric_name is None: val_metric_name = metric_name test_table = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=alg_group_dict, test_metric_name=metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=show_val_results, use_train_errors=show_train_results) val_table_single = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=dict(), test_metric_name=metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=True) test_table_single = table.get_test_results_table(DefaultEvalModeSelector(), alg_group_dict=dict(), test_metric_name=metric_name, val_metric_name=val_metric_name, val_test_groups=val_test_groups, use_validation_errors=show_val_results, use_train_errors=show_train_results) if len(test_table.alg_task_results) == 0: print(f'No results found') return subset = 'train' if show_train_results else ('val' if show_val_results else 'test') if use_task_weighting is None: use_task_weighting = coll_name.startswith('meta-train') or coll_name.startswith('uci') separate_task_names = ['facebook_comment_volume', 'facebook_live_sellers_thailand_shares'] if n_cv == 1: # fails for n_cv > 1 because proper selection on the validation set is not implemented print( f'Greedy algorithm selection 
cumulative best log shifted geometric mean (err+{shift_eps:g}) {subset} error:') analyzer = GreedyAlgSelectionTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names, f=lambda x: np.log(x + shift_eps)) analyzer.print_analysis(test_table_single, val_table_single) print() print('Win fraction:') analyzer = WinsTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print('Arithmetic mean rank:') analyzer = RankTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print(f'Arithmetic mean normalized {subset} metric:') analyzer = NormalizedLossTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print(f'Arithmetic mean {subset} metric:') analyzer = MeanTableAnalyzer(use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() print(f'Shifted geometric mean (err+{shift_eps:g}) {subset} metric:') analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + shift_eps), use_weighting=use_task_weighting, separate_task_names=separate_task_names, post_f=lambda x: np.exp(x)) analyzer.print_analysis(test_table) print() print(f'Log shifted geometric mean (err+{shift_eps:g}) {subset} metric:') analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + shift_eps), use_weighting=use_task_weighting, separate_task_names=separate_task_names) analyzer.print_analysis(test_table) print() # print('Mean modlog test error:') # todo: name modlog is suboptimal, people could associate mod with modulo # analyzer = MeanTableAnalyzer(f=lambda x: np.log(x + 1e-3) - np.log(1e-3), use_weighting=use_task_weighting) # analyzer.print_analysis(test_table) if alg_name is not None: if alg_name_2 is None: print(f'Errors for alg {alg_name}:') print(alg_results_str(test_table, alg_name)) else: print(f'Comparison: {alg_name} vs. 
{alg_name_2}') print(alg_comparison_str(test_table, [alg_name, alg_name_2])) print(f'Time for printing: {time.time() - start_time:g} s') if __name__ == '__main__': fire.Fire(show_eval) ================================================ FILE: scripts/run_experiments.py ================================================ from typing import Optional, Dict, Any, List import numpy as np from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.alg_wrappers.interface_wrappers import \ LGBMInterfaceWrapper, \ XGBInterfaceWrapper, LGBMHyperoptInterfaceWrapper, XGBHyperoptInterfaceWrapper, CatBoostHyperoptInterfaceWrapper, \ CatBoostInterfaceWrapper, RFInterfaceWrapper, XGBSklearnInterfaceWrapper, LGBMSklearnInterfaceWrapper, \ CatBoostSklearnInterfaceWrapper, SklearnMLPInterfaceWrapper, NNInterfaceWrapper, CaruanaEnsembleWrapper, \ LoadResultsWrapper, RandomParamsNNInterfaceWrapper, AlgorithmSelectionWrapper, ResNetRTDLInterfaceWrapper, \ MLPRTDLInterfaceWrapper, RandomParamsRTDLMLPInterfaceWrapper, RandomParamsResnetInterfaceWrapper, \ TabRInterfaceWrapper, RandomParamsXGBInterfaceWrapper, RandomParamsLGBMInterfaceWrapper, \ RandomParamsCatBoostInterfaceWrapper, AutoGluonModelInterfaceWrapper, RandomParamsTabRInterfaceWrapper, \ RandomParamsRFInterfaceWrapper, FTTransformerInterfaceWrapper, RandomParamsFTTransformerInterfaceWrapper from pytabkit.bench.eval.analysis import get_ensemble_groups from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager, run_alg_selection from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.alg_interfaces.nn_interfaces import RealMLPParamSampler from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.models.sklearn.default_params import DefaultParams def run_gbdt_rs_configs(paths: Optional[Paths] = None, min_step_idx: int = 0, n_steps: int = 50, rerun: bool = False, with_lgbm: bool = True, with_xgb: bool = True, with_cb: bool = True, min_split_idx: int = 0, n_splits: int = 10, only_meta_train: bool = False): if paths is None: paths = Paths.from_env_variables() job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(available_cpu_ram_multiplier=0.5)) run_config = RunConfig(min_split_idx=min_split_idx, n_tt_splits=min_split_idx + n_splits, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos if only_meta_train: all_task_infos = train_task_infos else: all_task_infos = class_task_infos + reg_task_infos for step_idx in range(min_step_idx, min_step_idx + n_steps): if with_xgb: job_mgr.add_jobs(all_task_infos, run_config, f'XGB-HPO_step-{step_idx}', 
RandomParamsXGBInterfaceWrapper(model_idx=step_idx), tags=['paper_xgb_rs'], rerun=rerun) if with_lgbm: job_mgr.add_jobs(all_task_infos, run_config, f'LGBM-HPO_step-{step_idx}', RandomParamsLGBMInterfaceWrapper(model_idx=step_idx), tags=['paper_lgbm_rs'], rerun=rerun) if with_cb: job_mgr.add_jobs(all_task_infos, run_config, f'CatBoost-HPO_step-{step_idx}', RandomParamsCatBoostInterfaceWrapper(model_idx=step_idx), tags=['paper_cb_rs'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rf_rs_configs(paths: Optional[Paths] = None, min_step_idx: int = 0, n_steps: int = 50, rerun: bool = False, min_split_idx: int = 0, n_splits: int = 10): # took 18h30m on the Grinsztajn benchmark if paths is None: paths = Paths.from_env_variables() job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(available_cpu_ram_multiplier=0.5)) run_config = RunConfig(min_split_idx=min_split_idx, n_tt_splits=min_split_idx + n_splits, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos for step_idx in range(min_step_idx, min_step_idx + n_steps): job_mgr.add_jobs(grinsztajn_reg_task_infos + grinsztajn_class_task_infos, run_config, f'RF-HPO_step-{step_idx}', RandomParamsRFInterfaceWrapper(model_idx=step_idx), tags=['paper_rf-hpo'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_realmlp_tuning_configs(paths: Paths, n_steps: int = 50, tag: str = 'paper', rerun: bool = False): # 2h37m for 10 steps on meta-train-class # for 5 steps on all: 1h20m + 13h4m # 1h50m for 10 steps on grinsztajn-benchmark job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(all_task_infos, config_10_1_0, 
f'RealMLP-HPO_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rtdl_tuning_configs(paths: Paths, n_steps: int = 50, rerun: bool = False, with_mlp: bool = True, with_resnet: bool = True, with_mlp_plr: bool = True, with_ftt: bool = True, only_meta_train: bool = False, only_meta_test: bool = False, start_split=0, end_split=10): # MLP-PLR takes about 1h5m per step # takes around 4d6h for MLP-HPO and MLP-PLR-HPO together job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos all_train_task_infos = train_class_task_infos + train_reg_task_infos grinsztajn_task_infos = grinsztajn_class_task_infos + grinsztajn_reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos all_task_infos = train_class_task_infos + train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos all_task_infos = test_class_task_infos + test_reg_task_infos for step_idx in range(n_steps): if with_mlp: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'MLP-RTDL-HPO_step-{step_idx}', RandomParamsRTDLMLPInterfaceWrapper(model_idx=step_idx), tags=['paper_mlp-rtdl-hpo'], rerun=rerun) if with_resnet: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'ResNet-RTDL-HPO_step-{step_idx}', RandomParamsResnetInterfaceWrapper(model_idx=step_idx), tags=['paper_resnet-hpo'], rerun=rerun) if with_mlp_plr: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'MLP-PLR-HPO_step-{step_idx}', RandomParamsRTDLMLPInterfaceWrapper(model_idx=step_idx, num_emb_type='plr'), tags=['paper_mlp-plr-hpo'], rerun=rerun) if with_ftt: job_mgr.add_jobs(grinsztajn_task_infos, config_10_1_0, f'FTT-HPO_step-{step_idx}', RandomParamsFTTransformerInterfaceWrapper(model_idx=step_idx), tags=['paper_ftt-hpo'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_tabr_tuning_configs(paths: Paths, n_steps: int = 50, rerun: bool = False, start_split=0, end_split=10): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) 
test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos grinsztajn_task_infos = grinsztajn_class_task_infos + grinsztajn_reg_task_infos all_task_infos = class_task_infos + reg_task_infos all_train_task_infos = train_class_task_infos + train_reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(grinsztajn_task_infos, config_10_1_0, f'TabR-HPO_step-{step_idx}', RandomParamsTabRInterfaceWrapper(model_idx=step_idx), tags=['paper_tabr-hpo'], rerun=rerun) job_mgr.add_jobs(grinsztajn_task_infos, config_10_1_0, f'RealTabR-HPO_step-{step_idx}', RandomParamsTabRInterfaceWrapper(model_idx=step_idx, hpo_space_name='realtabr'), tags=['paper_realtabr-hpo'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_refit_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # refit experiments took 3 to 3.5 days job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(available_cpu_ram_multiplier=0.5)) config_10_5_5 = RunConfig(n_tt_splits=10, n_cv=5, n_refit=5, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for mean_cv, mean_refit in [(False, False), (True, True)]: extra_str = f'mean-cv-{mean_cv}_mean-refit-{mean_refit}' job_mgr.add_jobs(class_task_infos, config_10_5_5, f'RealMLP-TD-class_{extra_str}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_CLASS, use_best_mean_epoch_for_cv=mean_cv, use_best_mean_epoch_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_5_5, f'RealMLP-TD-reg_{extra_str}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG, use_best_mean_epoch_for_cv=mean_cv, use_best_mean_epoch_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_5_5, f'LGBM-TD-class_{extra_str}', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_CLASS, use_best_mean_iteration_for_cv=mean_cv, use_best_mean_iteration_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_5_5, f'LGBM-TD-reg_{extra_str}', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_REG, use_best_mean_iteration_for_cv=mean_cv, use_best_mean_iteration_for_refit=mean_refit, ), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_ablations(paths: Paths, param_configs: Dict[str, Any], with_class: bool = True, with_reg: bool = True, tune_lr: bool = True, tag: str = 'paper_mlp_ablations', rerun: bool = False): job_mgr = 
TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False) # todo: it's false train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos combinations = [] if with_class: combinations.append((train_class_task_infos, DefaultParams.RealMLP_TD_CLASS, 'class')) if with_reg: combinations.append((train_reg_task_infos, DefaultParams.RealMLP_TD_REG, 'reg')) # lr_factors = [1.5**k for k in range(-3, 4)] if tune_lr else [1] # lr_factors = [0.3, 0.5, 0.7, 1.0, 1.4, 2.0, 3.0] if tune_lr else [1.0] # lr_factors = [0.3, 0.5, 0.7, 1.0, 1.4, 2.0, 3.0, 4.0, 6.0] if tune_lr else [1.0] lr_factors = [0.1, 0.15, 0.25, 0.35, 0.5, 0.7, 1.0, 1.4, 2.0, 3.0, 4.0] if tune_lr else [1.0] for task_infos, default_params, task_type_name in combinations: for param_config_name, extra_params in param_configs.items(): for lr_factor_idx, lr_factor in enumerate(lr_factors): params = utils.update_dict(default_params, extra_params) params['lr'] *= lr_factor # todo: what if the lr is a dict? alg_name = f'RealMLP-TD-{task_type_name}-ablation_{param_config_name}_lrfactor-{lr_factor}' job_mgr.add_jobs(task_infos, config_10_1_0, alg_name, NNInterfaceWrapper(**params), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_td_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_S_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealMLP-TD-reg', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG), tags=[tag], rerun=rerun) 
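# Naming note (editorial, based on pytabkit's README conventions, not part of the original script):
# '-TD' marks the meta-learned tuned-default hyperparameters, '-TD-S' a simplified and cheaper
# variant of them, '-D' the library defaults, and '-HPO_step-{i}' the i-th random-search
# configuration that the later algorithm-selection runs pick from.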
job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealMLP-TD-S-reg', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_S_REG), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'LGBM-TD-class', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-TD-class', XGBInterfaceWrapper(**DefaultParams.XGB_TD_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'CatBoost-TD-class', CatBoostInterfaceWrapper(**DefaultParams.CB_TD_CLASS), tags=[tag], rerun=rerun) # regression job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'LGBM-TD-reg', LGBMInterfaceWrapper(**DefaultParams.LGBM_TD_REG), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'XGB-TD-reg', XGBInterfaceWrapper(**DefaultParams.XGB_TD_REG), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'CatBoost-TD-reg', CatBoostInterfaceWrapper(**DefaultParams.CB_TD_REG), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_default_ce_configs(paths: Paths, tag: str = 'paper_val_ce', rerun: bool = False): # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_val-ce', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_val-ce_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict(val_metric_name='cross_entropy', use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class_val-ce', NNInterfaceWrapper(**utils.join_dicts(DefaultParams.RealMLP_TD_S_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class_val-ce_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_S_CLASS, dict(val_metric_name='cross_entropy', use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'LGBM-TD-class_val-ce', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-TD-class_val-ce', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, 
dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'CatBoost-TD-class_val-ce', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'LGBM-D-class_val-ce', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_D, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-D-class_val-ce', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_D, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'CatBoost-D-class_val-ce', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_D, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-PBB-D_val-ce', # Probst, Boulesteix, and Bischl, "Tunability: Importance of ..." XGBInterfaceWrapper(n_estimators=4168, lr=0.018, min_child_weight=2.06, max_depth=13, reg_lambda=0.982, reg_alpha=1.113, subsample=0.839, colsample_bytree=0.752, colsample_bylevel=0.585, tree_method='hist', max_n_threads=64, val_metric_name='cross_entropy', tfms=['one_hot'], max_one_hot_cat_size=20), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-RTDL-D-class_val-ce', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_RTDL_D_CLASS_TabZilla, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-PLR-D-class_val-ce', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_PLR_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'ResNet-RTDL-D-class_val-ce', ResNetRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.RESNET_RTDL_D_CLASS_TabZilla, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_val-ce', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class_val-ce', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.RealTABR_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class_val-ce_no-ls', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.RealTABR_D_CLASS, dict(ls_eps=0.0, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(grinsztajn_class_task_infos + train_class_task_infos, config_10_1_0, 'FTT-D-class_val-ce', FTTransformerInterfaceWrapper( **utils.join_dicts(DefaultParams.FTT_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_nns_no_ls(paths: Paths, tag: str = 'paper', rerun: bool = False): # this took around 48m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos =
TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-S-class_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_S_CLASS, dict(use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_no-ls', NNInterfaceWrapper( **utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict(use_ls=False, ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class_no-ls', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.RealTABR_D_CLASS, dict(ls_eps=0.0))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_tabr_configs(paths: Paths, rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealTabR-D-class', TabRInterfaceWrapper(**DefaultParams.RealTABR_D_CLASS), tags=['paper'], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealTabR-D-reg', TabRInterfaceWrapper(**DefaultParams.RealTABR_D_REG), tags=['paper'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_val-ce', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(val_metric_name='cross_entropy'))), tags=['paper_val_ce'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=['paper'], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=['paper'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict())), 
tags=['paper'], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_REG, dict())), tags=['paper'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_early_stopping_configs(paths: Paths, tag: str = 'paper_early_stopping', rerun: bool = False): # around 4h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for esr in [10, 20, 50, 100, 300, 1000]: job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'LGBM-TD-class_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'XGB-TD-class_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'CatBoost-TD-class_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) # regression job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'LGBM-TD-reg_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_REG, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'XGB-TD-reg_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_REG, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'CatBoost-TD-reg_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_REG, dict(early_stopping_rounds=esr))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_brier_stopping_configs(paths: Paths, tag: str = 'paper_early_stopping', rerun: bool = False): # around 4h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for esr in [10, 20, 50, 100, 300, 1000]: # for esr in [300]: 
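# Each pass through this loop retrains the three GBDT tuned defaults with the Brier score as the
# early-stopping validation metric under patience esr; together with the default-metric sweep in
# run_early_stopping_configs above and the cross-entropy sweep in run_cross_entropy_stopping_configs
# below, this compares early-stopping metrics across patience values.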
job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'LGBM-TD-class_val-brier_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='brier'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'XGB-TD-class_val-brier_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='brier'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'CatBoost-TD-class_val-brier_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='brier'))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_cross_entropy_stopping_configs(paths: Paths, tag: str = 'paper_early_stopping', rerun: bool = False): # around 4h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for esr in [10, 20, 50, 100, 300, 1000]: # for esr in [300]: job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'LGBM-TD-class_val-ce_esr-{esr}', LGBMInterfaceWrapper( **utils.join_dicts(DefaultParams.LGBM_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'XGB-TD-class_val-ce_esr-{esr}', XGBInterfaceWrapper( **utils.join_dicts(DefaultParams.XGB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'CatBoost-TD-class_val-ce_esr-{esr}', CatBoostInterfaceWrapper( **utils.join_dicts(DefaultParams.CB_TD_CLASS, dict(early_stopping_rounds=esr, val_metric_name='cross_entropy'))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_ensemble_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # around 20 minutes or so job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = 
train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for task_infos, task_type_name in [(class_task_infos, 'class'), (reg_task_infos, 'reg')]: for alg_group_name, alg_names in get_ensemble_groups(task_type_name).items(): job_mgr.add_jobs(task_infos, config_10_1_0, f'Ensemble{alg_group_name}', CaruanaEnsembleWrapper([LoadResultsWrapper(alg_name) for alg_name in alg_names]), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_realmlp_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=32)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos alg_names = [f'RealMLP-HPO_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(reg_task_infos, 'rmse'), (class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'RealMLP-HPO', alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, class_task_infos, f'RealMLP-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr') msd_alg_names = [f'RealMLP-HPO-moresigmadim_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(train_reg_task_infos, 'rmse'), (train_class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'RealMLP-HPO-moresigmadim', msd_alg_names, val_metric_name, tags=[tag]) def run_rtdl_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=32)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + 
grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos alg_names = [f'MLP-RTDL-HPO_step-{i}' for i in range(n_hpo_steps)] plr_alg_names = [f'MLP-PLR-HPO_step-{i}' for i in range(n_hpo_steps)] resnet_alg_names = [f'ResNet-RTDL-HPO_step-{i}' for i in range(n_hpo_steps)] ftt_alg_names = [f'FTT-HPO_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(reg_task_infos, 'rmse'), (class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'MLP-RTDL-HPO', alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, task_infos, f'MLP-PLR-HPO', plr_alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, task_infos, f'ResNet-RTDL-HPO', resnet_alg_names, val_metric_name) for task_infos, val_metric_name in [(grinsztajn_reg_task_infos, 'rmse'), (grinsztajn_class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'FTT-HPO', ftt_alg_names, val_metric_name) run_alg_selection(paths, config_10_1_0, class_task_infos, f'MLP-RTDL-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr') run_alg_selection(paths, config_10_1_0, class_task_infos, f'MLP-PLR-HPO_best-1-auc-ovr', plr_alg_names, '1-auc_ovr') run_alg_selection(paths, config_10_1_0, class_task_infos, f'ResNet-RTDL-HPO_best-1-auc-ovr', resnet_alg_names, '1-auc_ovr') run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'FTT-HPO_best-1-auc-ovr', ftt_alg_names, '1-auc_ovr') def run_tabr_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=32)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos alg_names = [f'TabR-HPO_step-{i}' for i in range(n_hpo_steps)] realtabr_alg_names = [f'RealTabR-HPO_step-{i}' for i in range(n_hpo_steps)] for task_infos, val_metric_name in [(grinsztajn_reg_task_infos, 'rmse'), (grinsztajn_class_task_infos, 'class_error')]: run_alg_selection(paths, config_10_1_0, task_infos, f'TabR-HPO', alg_names, val_metric_name, tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, task_infos, f'RealTabR-HPO', realtabr_alg_names, val_metric_name, tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'TabR-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr', tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'RealTabR-HPO_best-1-auc-ovr', 
realtabr_alg_names, '1-auc_ovr', tags=[tag], rerun=rerun) def run_gbdt_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=16)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for gbdt_name in ['XGB', 'LGBM', 'CatBoost']: alg_names = [f'{gbdt_name}-HPO_step-{i}' for i in range(n_hpo_steps)] job_mgr.add_jobs(all_task_infos, config_10_1_0, f'{gbdt_name}-HPO', AlgorithmSelectionWrapper([LoadResultsWrapper(alg_name) for alg_name in alg_names]), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, f'{gbdt_name}-HPO_best-1-auc-ovr', AlgorithmSelectionWrapper([LoadResultsWrapper(alg_name) for alg_name in alg_names], alg_sel_metric_name='1-auc_ovr'), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rf_hpo_alg_selection(paths: Paths, n_hpo_steps: int, tag: str = 'paper', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager(max_n_threads=16)) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for task_infos, val_metric_name in [(grinsztajn_reg_task_infos, 'rmse'), (grinsztajn_class_task_infos, 'class_error')]: alg_names = [f'RF-HPO_step-{i}' for i in range(n_hpo_steps)] run_alg_selection(paths, config_10_1_0, task_infos, f'RF-HPO', alg_names, val_metric_name, tags=[tag], rerun=rerun) run_alg_selection(paths, config_10_1_0, grinsztajn_class_task_infos, f'RF-HPO_best-1-auc-ovr', alg_names, '1-auc_ovr', tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rtdl_default_configs(paths: Paths, tag: str = 'paper', rerun: bool =
False, with_mlp: bool = True, with_resnet: bool = True, only_meta_train: bool = False, only_meta_test: bool = False, tabzilla_defaults: bool = True, with_plr: bool = True, with_ftt: bool = True): # ca 50 min for meta-train-reg job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos if with_resnet: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'ResNet-RTDL-D-class_grinsztajn' if not tabzilla_defaults else 'ResNet-RTDL-D-class', ResNetRTDLInterfaceWrapper( **DefaultParams.RESNET_RTDL_D_CLASS_Grinsztajn if not tabzilla_defaults else DefaultParams.RESNET_RTDL_D_CLASS_TabZilla), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'ResNet-RTDL-D-reg_grinsztajn' if not tabzilla_defaults else 'ResNet-RTDL-D-reg', ResNetRTDLInterfaceWrapper( **DefaultParams.RESNET_RTDL_D_REG_Grinsztajn if not tabzilla_defaults else DefaultParams.RESNET_RTDL_D_REG_TabZilla), tags=[tag], rerun=rerun) if with_mlp: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-RTDL-D-class_grinsztajn' if not tabzilla_defaults else 'MLP-RTDL-D-class', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_RTDL_D_CLASS_Grinsztajn if not tabzilla_defaults else DefaultParams.MLP_RTDL_D_CLASS_TabZilla), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-RTDL-D-reg_grinsztajn' if not tabzilla_defaults else 'MLP-RTDL-D-reg', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_RTDL_D_REG_Grinsztajn if not tabzilla_defaults else DefaultParams.MLP_RTDL_D_REG_TabZilla), tags=[tag], rerun=rerun) if with_plr: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-PLR-D-class', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_PLR_D_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-PLR-D-reg', MLPRTDLInterfaceWrapper( **DefaultParams.MLP_PLR_D_REG), tags=[tag], rerun=rerun) if with_ftt: job_mgr.add_jobs(grinsztajn_class_task_infos + train_class_task_infos, config_10_1_0, 'FTT-D-class', FTTransformerInterfaceWrapper( **DefaultParams.FTT_D_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(grinsztajn_reg_task_infos + train_reg_task_infos, config_10_1_0, 'FTT-D-reg', FTTransformerInterfaceWrapper( **DefaultParams.FTT_D_REG), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_rtdl_rssc_default_configs(paths: Paths, tag: str = 'paper', rerun: bool = False, 
with_mlp: bool = True, with_resnet: bool = True, with_plr: bool = True, with_tabr: bool = True, with_ftt: bool = True, only_meta_train: bool = False, only_meta_test: bool = False): # ca 50 min for meta-train-reg (without TabR/FTT) # ca 8h30m for FTT (on meta-train + grinsztajn) job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos if with_resnet: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'ResNet-RTDL-D-class_rssc', ResNetRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.RESNET_RTDL_D_CLASS_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'ResNet-RTDL-D-reg_rssc', ResNetRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.RESNET_RTDL_D_REG_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if with_mlp: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-RTDL-D-class_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_RTDL_D_CLASS_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-RTDL-D-reg_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_RTDL_D_REG_TabZilla, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if with_plr: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-PLR-D-class_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_PLR_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-PLR-D-reg_rssc', MLPRTDLInterfaceWrapper( **utils.join_dicts(DefaultParams.MLP_PLR_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if with_tabr: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg_rssc', TabRInterfaceWrapper( **utils.join_dicts(DefaultParams.TABR_S_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) if 
with_ftt: job_mgr.add_jobs(grinsztajn_class_task_infos + train_class_task_infos, config_10_1_0, 'FTT-D-class_rssc', FTTransformerInterfaceWrapper( **utils.join_dicts(DefaultParams.FTT_D_CLASS, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.add_jobs(grinsztajn_reg_task_infos + train_reg_task_infos, config_10_1_0, 'FTT-D-reg_rssc', FTTransformerInterfaceWrapper( **utils.join_dicts(DefaultParams.FTT_D_REG, dict(tfms=['median_center', 'robust_scale', 'smooth_clip']))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_tabr_default_configs(paths: Paths, tag: str = 'paper', rerun: bool = False, only_meta_train: bool = False, only_meta_test: bool = False, start_split: int = 0, end_split: int = 10): # ca 50 min for meta-train-reg job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos job_mgr.add_jobs(class_task_infos, config_10_1_0, 'TabR-S-D-class', TabRInterfaceWrapper( **DefaultParams.TABR_S_D_CLASS), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'TabR-S-D-reg', TabRInterfaceWrapper( **DefaultParams.TABR_S_D_REG), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_default_configs(paths: Paths, tag: str = 'paper', rerun: bool = False): # took 12h55s job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + 
test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(all_task_infos, config_10_1_0, 'LGBM-D', LGBMInterfaceWrapper(**DefaultParams.LGBM_D), tags=[tag], rerun=rerun) job_mgr.add_jobs(all_task_infos, config_10_1_0, 'XGB-D', XGBInterfaceWrapper(**DefaultParams.XGB_D), tags=[tag], rerun=rerun) job_mgr.add_jobs(all_task_infos, config_10_1_0, 'CatBoost-D', CatBoostInterfaceWrapper(**DefaultParams.CB_D), tags=[tag], rerun=rerun) # it was too bad to include in the plots # job_mgr.add_jobs(all_task_infos, config_10_1_0, # 'MLP-SKL-D', # SklearnMLPInterfaceWrapper(tfms=['mean_center', 'l2_normalize', 'one_hot']), # tags=[tag], rerun=rerun) job_mgr.add_jobs(all_task_infos, config_10_1_0, 'RF-SKL-D', RFInterfaceWrapper(tfms=['ordinal_encoding'], permute_ordinal_encoding=True), tags=[tag, 'paper_val_ce'], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'XGB-PBB-D', # Probst, Boulesteix, and Bischl, "Tunability: Importance of ..." XGBInterfaceWrapper(n_estimators=4168, lr=0.018, min_child_weight=2.06, max_depth=13, reg_lambda=0.982, reg_alpha=1.113, subsample=0.839, colsample_bytree=0.752, colsample_bylevel=0.585, tree_method='hist', max_n_threads=64, tfms=['one_hot'], max_one_hot_cat_size=20), tags=['paper']) job_mgr.run_jobs(scheduler) def run_gbdts_hpo_tpe(paths: Paths, n_estimators: int = 1000, early_stopping_rounds: int = 300, tag: str = 'paper'): # this generates about 10GB of data # took 7h17m for n_estimators=2 # took about 6h30m for n_estimators=1 (but slightly more tasks were run for that because of the rerun=True) job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) config_5_1_0 = RunConfig(n_tt_splits=5, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for task_infos, config in [(train_task_infos, config_10_1_0), (test_task_infos, config_10_1_0)]: job_mgr.add_jobs(task_infos, config, f'XGB-HPO-TPE', XGBHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, early_stopping_rounds=early_stopping_rounds, tree_method='hist', space='grinsztajn'), tags=[tag]) job_mgr.add_jobs(task_infos, config, f'CatBoost-HPO-TPE', CatBoostHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, early_stopping_rounds=early_stopping_rounds, space='shwartz-ziv'), tags=[tag]) job_mgr.add_jobs(task_infos, config, f'LGBM-HPO-TPE', LGBMHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, early_stopping_rounds=early_stopping_rounds, space='catboost_quality_benchmarks'), tags=[tag]) job_mgr.run_jobs(scheduler) def
run_preprocessing_experiments(paths: Paths, tag: str = 'paper_preprocessing'): # this took 7h9m for just two different scikit-learn based transformation configurations! job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) for task_infos, defaults in [(train_class_task_infos, DefaultParams.RealMLP_TD_S_CLASS), (train_reg_task_infos, DefaultParams.RealMLP_TD_S_REG)]: job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-mc-rs-sc-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['median_center', 'robust_scale', 'smooth_clip', 'one_hot'] ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-mc-rs-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['median_center', 'robust_scale', 'one_hot'] ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-std-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['mean_center', 'l2_normalize', 'one_hot'], l2_normalize_eps=1e-30, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-std-sc-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['mean_center', 'l2_normalize', 'smooth_clip', 'one_hot'], l2_normalize_eps=1e-30, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-kdi1-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['kdi', 'one_hot'], kdi_alpha=1.0, max_n_vectorized=1, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-quantile-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['quantile', 'one_hot'], max_n_vectorized=1, ))), [tag]) job_mgr.add_jobs(task_infos, config_10_1_0, 'RealMLP-TD-S_tfms-quantiletabr-oh', NNInterfaceWrapper(**utils.update_dict(defaults, dict( tfms=['quantile_tabr', 'one_hot'], max_n_vectorized=1, ))), [tag]) job_mgr.run_jobs(scheduler) def run_all_ablations(paths: Paths, with_class: bool = True, with_reg: bool = True): run_ablations(paths, { 'default': dict(), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'lr-cos-decay': dict(lr_sched='cos'), 'lr-constant': dict(lr_sched='constant'), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'wd-0.0': dict(wd=0.0, wd_sched='constant', bias_wd_factor=0.0), 'wd-0.02': dict(wd=0.02, wd_sched='constant', bias_wd_factor=0.0), }, with_class=with_class, with_reg=with_reg) # run_ablations(paths, { # 'wd-0.01-flatcos': dict(wd=0.01, wd_sched='flat_cos', bias_wd_factor=0.0), # 'wd-0.01': dict(wd=0.01, wd_sched='constant', bias_wd_factor=0.0), # }, with_class=False, with_reg=with_reg) # run_ablations(paths, { # 'wd-0.0': dict(wd=0.0, wd_sched='constant', bias_wd_factor=0.0), # 'wd-0.01': dict(wd=0.01, wd_sched='constant', bias_wd_factor=0.0), # }, with_class=with_class, with_reg=False) run_ablations(paths, { 'pdrop-0.0': dict(p_drop=0.0, p_drop_sched='constant'), 'pdrop-0.15': dict(p_drop=0.15, p_drop_sched='constant'), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'no-front-scale': dict(first_layer_config=dict()), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'normal-init': dict(bias_init_mode='zeros', weight_init_mode='normal'), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 
'standard-param_no-wd': dict(weight_param='standard', bias_lr_factor=1 / 16, weight_lr_factor=1 / 16, wd=0.0), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'non-parametric-act': dict(use_parametric_act=False), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'act-relu': dict(act='relu'), 'act-mish': dict(act='mish') }, with_class=with_class, with_reg=False) run_ablations(paths, { 'act-relu': dict(act='relu'), 'act-selu': dict(act='selu') }, with_class=False, with_reg=with_reg) run_ablations(paths, { 'no-label-smoothing': dict(use_ls=False, ls_eps=0.0), }, with_class=with_class, with_reg=False) run_ablations(paths, { 'num-embeddings-plr': dict(plr_act_name='relu', plr_use_densenet=False, plr_use_cos_bias=False), 'num-embeddings-pl': dict(plr_act_name='linear', plr_use_densenet=False, plr_use_cos_bias=False), 'num-embeddings-none': dict(use_plr_embeddings=False) }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'beta2-0.999': dict(sq_mom=0.999), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'first-best-epoch': dict(use_last_best_epoch=False), }, with_class=with_class, with_reg=with_reg) run_ablations(paths, { 'no-cat-embs': dict(max_one_hot_cat_size=-1), }, with_class=with_class, with_reg=with_reg) def run_architecture_ablations(paths: Paths, tag: str = 'paper', rerun: bool = False, only_meta_train: bool = False, only_meta_test: bool = False, start_split: int = 0, end_split: int = 10): # ca 1h45m + 40m + 2h job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos lr_grid_std = [1.5e-3, 7e-4, 1e-3, 4e-4, 2.5e-3, 4e-3, 7e-3, 1e-2, 1.5e-2] lr_grid_ntp = [0.04, 0.2, 0.1, 0.02, 0.07, 0.01, 0.3, 0.03, 0.4] mlp_rtdl_repr_config_class = dict( hidden_sizes=[128, 256, 128], p_drop=0.1, block_str='w-b-a-d', lr=2.5e-3, # will be changed later opt='adam', tfms=['median_center', 'robust_scale', 'smooth_clip', 'embedding'], embedding_size=8, batch_size=128, n_epochs=1000, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=20, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. 
/ np.sqrt(3.), bias_init_mode='pytorch-default', use_last_best_epoch=False, emb_init_mode='kaiming-uniform-t', ) mlp_rtdl_repr_config_reg = utils.join_dicts(mlp_rtdl_repr_config_class, dict( normalize_output=True, lr=1.5e-3)) mlp_rtdl_num_emb_repr_config_class = utils.join_dicts(mlp_rtdl_repr_config_class, dict( num_emb_type='plr', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, # todo: or pl embeddings? lr=2.5e-3, )) mlp_rtdl_num_emb_repr_config_reg = utils.join_dicts(mlp_rtdl_num_emb_repr_config_class, dict(normalize_output=True, lr=7e-4)) mlp_rtdl_pl_config_class = utils.join_dicts(mlp_rtdl_num_emb_repr_config_class, dict( num_emb_type='pl', lr=4e-3)) mlp_rtdl_pl_config_reg = utils.join_dicts(mlp_rtdl_pl_config_class, dict(normalize_output=True, lr=4e-4)) realmlp_arch_class = dict( hidden_sizes=[128, 256, 128], p_drop=0.1, block_str='w-b-a-d', opt='adam', tfms=['median_center', 'robust_scale', 'smooth_clip', 'embedding'], embedding_size=8, batch_size=128, n_epochs=1000, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=20, weight_init_mode='uniform', weight_init_gain=1. / np.sqrt(3.), bias_init_mode='pytorch-default', use_last_best_epoch=False, emb_init_mode='kaiming-uniform-t', lr=2e-2, num_emb_type='pbld', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1, weight_param='ntk', bias_lr_factor=0.1, act='selu', use_parametric_act=True, act_lr_factor=0.1, add_front_scale=True, scale_lr_factor=6.0, ) realmlp_arch_reg = utils.join_dicts(realmlp_arch_class, dict(act='mish', normalize_output=True, lr=1e-2)) def add_jobs(name: str, config_class: dict, config_reg: dict, lr_grid: List[float], with_meta_test: bool = True): for task_infos, all_task_infos, task_type_name, config in [ (train_class_task_infos, class_task_infos, 'class', config_class), (train_reg_task_infos, reg_task_infos, 'reg', config_reg)]: for lr in lr_grid: job_mgr.add_jobs(task_infos, config_10_1_0, f'{name}_lr-{lr:g}', NNInterfaceWrapper(**utils.update_dict(config, dict(lr=lr))), tags=['paper_arch-lr-tuning'], rerun=rerun) if with_meta_test: job_mgr.add_jobs(all_task_infos, config_10_1_0, f'{name}', NNInterfaceWrapper(**config), tags=['paper'], rerun=rerun) add_jobs('MLP-RTDL-reprod', mlp_rtdl_repr_config_class, mlp_rtdl_repr_config_reg, lr_grid_std) add_jobs('MLP-RTDL-reprod-plr', mlp_rtdl_num_emb_repr_config_class, mlp_rtdl_num_emb_repr_config_reg, lr_grid_std, with_meta_test=False) add_jobs('MLP-RTDL-reprod-pl', mlp_rtdl_pl_config_class, mlp_rtdl_pl_config_reg, lr_grid_std) add_jobs('MLP-RTDL-reprod-RealMLP-arch', realmlp_arch_class, realmlp_arch_reg, lr_grid_ntp) job_mgr.run_jobs(scheduler) def run_cumulative_ablations_new(paths: Paths, n_lrs: int = -1, tag: str = 'paper_cumulative_ablations_new', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False) # todo: it's false train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) # lr_grid_ntp = [0.01, 0.015, 0.025, 0.04, 0.07, 0.1, 0.2, 0.3, 0.4] # lr_grid_std = [4e-4, 7e-4, 1e-3, 1.5e-3, 2.5e-3, 4e-3, 7e-3, 1e-2, 2e-2] lr_grid_std = [1.5e-3, 7e-4, 1e-3, 4e-4, 2.5e-3, 4e-3, 7e-3, 1e-2, 1.5e-2] lr_grid_ntp = [0.04, 0.2, 0.1, 0.02, 0.07, 0.01, 0.3, 0.03, 0.4] if n_lrs > 0: lr_grid_std = lr_grid_std[:n_lrs] lr_grid_ntp = lr_grid_ntp[:n_lrs] 
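# How the cumulative ablations work: add_config (defined below) merges its add/add_class/add_reg
# dicts into the shared config_class/config_reg via the nonlocal declarations, so each named step
# inherits all changes from the preceding steps, going from the vanilla MLP towards a RealMLP-like
# configuration one modification at a time. Each step is swept over the learning-rate grids above,
# which are not sorted, so truncating them via n_lrs still covers several orders of magnitude.
# Minimal sketch of the accumulation pattern (illustrative only, not part of the experiments):
#   cfg = {}
#   for name, delta in [('a', dict(x=1)), ('b', dict(y=2))]:
#       cfg = utils.join_dicts(cfg, delta)  # step 'b' sees both x=1 and y=2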
config_class = dict() config_reg = dict() ablation_counter = 1 def add_config(name: str, lr_grid: List[float], add: Optional[Dict[str, Any]] = None, add_class: Optional[Dict[str, Any]] = None, add_reg: Optional[Dict[str, Any]] = None, run_this: bool = True): nonlocal ablation_counter nonlocal config_class nonlocal config_reg if add is not None: config_class = utils.join_dicts(config_class, add) config_reg = utils.join_dicts(config_reg, add) if add_class is not None: config_class = utils.join_dicts(config_class, add_class) if add_reg is not None: config_reg = utils.join_dicts(config_reg, add_reg) if run_this: for lr in lr_grid: for task_infos, task_type_name, config in [(train_class_task_infos, 'class', config_class), (train_reg_task_infos, 'reg', config_reg)]: job_mgr.add_jobs(task_infos, config_10_1_0, f'MLP-cumul-abl-new-{ablation_counter}-{task_type_name}_{name}_lr-{lr:g}', NNInterfaceWrapper(**utils.update_dict(config, dict(lr=lr))), tags=[tag], rerun=rerun) ablation_counter += 1 vanilla_config_class = dict( hidden_sizes=[256] * 3, p_drop=0.0, block_str='w-b-a-d', opt='adam', tfms=['quantile', 'embedding'], embedding_size=8, batch_size=256, n_epochs=256, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=40, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. / np.sqrt(3.), bias_init_mode='pytorch-default', max_n_vectorized=1, use_last_best_epoch=False, ) add_config('vanilla', lr_grid_std, add=vanilla_config_class, add_reg=dict(normalize_output=True), run_this=True) # quantile_tabr was not well-suited for vectorization, now we can vectorize add_config('robust-scale-smooth-clip', lr_grid_std, dict(tfms=['median_center', 'robust_scale', 'smooth_clip', 'embedding'], max_n_vectorized=50)) add_config('one-hot-small-cat', lr_grid_std, dict(tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], max_one_hot_cat_size=9)) add_config('no-early-stop', lr_grid_std, dict(use_early_stopping=False)) add_config('last-best-epoch', lr_grid_std, dict(use_last_best_epoch=True)) add_config('lr-multi-cycle', lr_grid_std, dict(lr_sched='coslog4')) add_config('beta2-0.95', lr_grid_std, dict(sq_mom=0.95)) add_config('label-smoothing', lr_grid_std, add_class=dict(use_ls=True, ls_eps=0.1)) add_config('output-clipping', lr_grid_std, add_reg=dict(clamp_output=True)) add_config('ntp', lr_grid_ntp, dict(weight_param='ntk', bias_lr_factor=0.1)) add_config('different-act', lr_grid_ntp, add_class=dict(act='selu'), add_reg=dict(act='mish')) add_config('param-act', lr_grid_ntp, dict(use_parametric_act=True, act_lr_factor=0.1)) add_config('front-scale', lr_grid_ntp, dict(add_front_scale=True, scale_lr_factor=6.0)) add_config('num-emb-pl', lr_grid_ntp, dict(num_emb_type='pl', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1)) add_config('num-emb-pbld', lr_grid_ntp, dict(num_emb_type='pbld')) add_config('alt-pdrop-0.15', lr_grid_ntp, dict(p_drop=0.15)) add_config('alt-pdrop-flat-cos', lr_grid_ntp, dict(p_drop_sched='flat_cos')) add_config('alt-wd-0.02', lr_grid_ntp, dict(wd=0.02, bias_wd_factor=0.0)) add_config('alt-wd-flat-cos', lr_grid_ntp, dict(wd_sched='flat_cos')) add_config('alt-bias-init-he+5', lr_grid_ntp, dict(bias_init_mode='he+5')) add_config('alt-weight-init-std', lr_grid_ntp, dict(weight_init_mode='std', weight_init_gain=1.0)) # add_config('bias-init-he+5', lr_grid_ntp, dict(bias_init_mode='he+5')) # add_config('weight-init-std', lr_grid_ntp, dict(weight_init_mode='std', 
weight_init_gain=1.0)) # add_config('pdrop-0.15', lr_grid_ntp, dict(p_drop=0.15)) # add_config('pdrop-flat-cos', lr_grid_ntp, dict(p_drop_sched='flat_cos')) # add_config('wd-0.02', lr_grid_ntp, dict(wd=0.02, bias_wd_factor=0.0)) # add_config('wd-flat-cos', lr_grid_ntp, dict(wd_sched='flat_cos')) job_mgr.run_jobs(scheduler) if __name__ == '__main__': paths = Paths.from_env_variables() run_td_configs(paths, tag='paper', rerun=False) run_default_configs(paths, tag='paper', rerun=False) run_rtdl_default_configs(paths, tag='paper', tabzilla_defaults=True) run_tabr_configs(paths) run_gbdt_rs_configs() run_rf_rs_configs() for i in range(50): if (i + 1) % 10 == 0: run_rtdl_tuning_configs(paths, n_steps=i + 1, with_resnet=True, only_meta_train=False) for i in range(50): if (i + 1) % 10 == 0: run_realmlp_tuning_configs(paths, n_steps=i + 1, tag='paper_mlp-hpo', rerun=False) for n_steps in [1, 2, 5, 10, 20, 30, 40, 50]: run_tabr_tuning_configs(paths, n_steps=n_steps) run_rtdl_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper') run_gbdt_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper') run_rf_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper') run_realmlp_hpo_alg_selection(paths, n_hpo_steps=50, tag='paper', rerun=False) run_tabr_hpo_alg_selection(paths, n_hpo_steps=50) run_ensemble_configs(paths, tag='paper') # ----- ablations (mostly for the appendix) ----- for n_lrs in [10]: # range(1, 10): run_cumulative_ablations_new(paths, n_lrs=n_lrs) run_rtdl_rssc_default_configs(paths, tag='paper') run_default_ce_configs(paths) run_nns_no_ls(paths) run_all_ablations(paths) run_architecture_ablations(paths) run_preprocessing_experiments(paths) run_refit_configs(paths, tag='paper', rerun=False) run_early_stopping_configs(paths) run_brier_stopping_configs(paths) run_cross_entropy_stopping_configs(paths) ================================================ FILE: scripts/run_experiments_unused.py ================================================ from typing import List, Optional, Dict, Any import numpy as np from pytabkit.bench.alg_wrappers.interface_wrappers import RandomParamsNNInterfaceWrapper, NNInterfaceWrapper, \ AutoGluonModelInterfaceWrapper, CatBoostInterfaceWrapper, LGBMInterfaceWrapper, XGBInterfaceWrapper, \ XGBHyperoptInterfaceWrapper, CatBoostHyperoptInterfaceWrapper, LGBMHyperoptInterfaceWrapper from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.alg_interfaces.nn_interfaces import RealMLPParamSampler from pytabkit.models.sklearn.default_params import DefaultParams def run_extra_realmlp_tuning_configs(paths: Paths, n_steps: int = 50, tag: str = 'paper_realmlp-hpo-clr', rerun: bool = False): # 1h8m for 5 steps of clr on meta-train. 2h40m for 5 steps of ms on meta-train.
job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-clr_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='clr'), tags=['realmlp-hpo-clr'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigma_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigma'), tags=['realmlp-hpo-ms'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadim_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadim'), tags=['realmlp-hpo-msd'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadimreg_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadimreg'), tags=['realmlp-hpo-msdr'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadimsize_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadimsize'), tags=['realmlp-hpo-msds'], rerun=rerun) job_mgr.add_jobs(train_task_infos, config_10_1_0, f'RealMLP-HPO-moresigmadimlr_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='moresigmadimlr'), tags=['realmlp-hpo-msdl'], rerun=rerun) job_mgr.run_jobs(scheduler) def run_mlp_random_configs(paths: Paths, n_steps: int = 50, tag: str = 'mlp_random', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos sampler = RealMLPParamSampler(is_classification=False) for step_idx in range(n_steps): params = sampler.sample_params(seed=step_idx) relevant_params = {key: value for key, value in params.items() if key in ['num_emb_type', 'add_front_scale', 'lr', 'p_drop', 'wd', 'plr_sigma', 'act', 'hidden_sizes', 'ls_eps']} config_str = '' for key, value in relevant_params.items(): if key == 'hidden_sizes': value = 
f'{value[0]}x{len(value)}' config_str = config_str + '_' + key.replace('_', '-') + '-' + str(value) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'RealMLP-reg' + config_str, NNInterfaceWrapper(**params), tags=[tag], rerun=rerun) sampler = RealMLPParamSampler(is_classification=True) for step_idx in range(n_steps): params = sampler.sample_params(seed=step_idx) relevant_params = {key: value for key, value in params.items() if key in ['num_emb_type', 'add_front_scale', 'lr', 'p_drop', 'wd', 'plr_sigma', 'act', 'hidden_sizes', 'ls_eps']} config_str = '' for key, value in relevant_params.items(): if key == 'hidden_sizes': value = f'{value[0]}x{len(value)}' config_str = config_str + '_' + key.replace('_', '-') + '-' + str(value) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'RealMLP-class' + config_str, NNInterfaceWrapper(**params), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_mlp_random_seed_configs(paths: Paths, n_steps: int = 50, tag: str = 'mlp_random_seeds', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos for step_idx in range(n_steps): job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'RealMLP-reg_seed-offset-{step_idx}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG, random_seed_offset=step_idx), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_additional_configs(paths: Paths, tag: str = 'paper_additional', rerun: bool = False): # not in the paper # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos # run class-on-reg and reg-on-class job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, 'RealMLP-TD-class-on-reg', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_CLASS, 
dict(use_ls=False, ls_eps=0.0, normalize_output=True, clamp_output=True))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, 'RealMLP-TD-reg-on-class', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_REG, dict(use_ls=True, ls_eps=0.1, normalize_output=False, clamp_output=False))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, 'RealMLP-TD-S-class-on-reg', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_S_CLASS, dict(use_ls=False, ls_eps=0.0, normalize_output=True))), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_class_task_infos, config_10_1_0, 'RealMLP-TD-S-reg-on-class', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_S_REG, dict(use_ls=True, ls_eps=0.1, normalize_output=False))), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'RealMLP-TD-class_only-one-hot', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_CLASS, dict(max_one_hot_cat_size=-1))), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'RealMLP-TD-reg_only-one-hot', NNInterfaceWrapper(**utils.update_dict(DefaultParams.RealMLP_TD_REG, dict(max_one_hot_cat_size=-1))), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_seed_opt_configs(paths: Paths, random_seed_offset: int, tag: str = 'paper', rerun: bool = False): # not used in the paper # this took around 17h24m job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(train_class_task_infos, config_10_1_0, f'RealMLP-TD-class_alt-seed-{random_seed_offset}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_CLASS, random_seed_offset=random_seed_offset), tags=[tag], rerun=rerun) job_mgr.add_jobs(train_reg_task_infos, config_10_1_0, f'RealMLP-TD-reg_alt-seed-{random_seed_offset}', NNInterfaceWrapper(**DefaultParams.RealMLP_TD_REG, random_seed_offset=random_seed_offset), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_ag_nn_configs(paths: Paths, tag: str = 'paper', rerun: bool = False, only_meta_train: bool = False, only_meta_test: bool = False, with_ftt: bool = True, start_split: int = 0, end_split: int = 10): # ca 50 min for meta-train-reg job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=end_split, min_split_idx=start_split, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) 
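# Each runner in these scripts rebuilds the same unions of task infos by hand
# (meta-train/meta-test x class/reg, plus the grinsztajn collections below). A
# sketch of a hypothetical helper (not part of pytabkit) that would centralize
# this, using only TaskCollection.from_name and load_infos as above:
def _load_task_groups_sketch(paths: Paths) -> Dict[str, Any]:
    groups = {name: TaskCollection.from_name(name, paths).load_infos(paths)
              for name in ['meta-train-class', 'meta-train-reg',
                           'meta-test-class', 'meta-test-reg']}
    # derived unions that the runners re-create inline
    groups['class'] = groups['meta-train-class'] + groups['meta-test-class']
    groups['reg'] = groups['meta-train-reg'] + groups['meta-test-reg']
    groups['all'] = groups['class'] + groups['reg']
    return groups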
grinsztajn_class_task_infos = TaskCollection.from_name('grinsztajn-class', paths).load_infos(paths) grinsztajn_reg_task_infos = TaskCollection.from_name('grinsztajn-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos + grinsztajn_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos + grinsztajn_reg_task_infos train_task_infos = train_class_task_infos + train_reg_task_infos test_task_infos = test_class_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos if only_meta_train: class_task_infos = train_class_task_infos reg_task_infos = train_reg_task_infos elif only_meta_test: class_task_infos = test_class_task_infos reg_task_infos = test_reg_task_infos # fastai on meta-train took 40 GPU-minutes # MLP-AGT took 1h31m on one RTX 3090 # FT-T with some RAM estimates: ca 44m + 17h + 40m job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-FAI-D-class', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='FASTAI', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-FAI-D-reg', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='FASTAI', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(class_task_infos, config_10_1_0, 'MLP-AGT-D-class', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='NN_TORCH', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'MLP-AGT-D-reg', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default', model_types='NN_TORCH', max_n_models_per_type=1), tags=[tag], rerun=rerun) if with_ftt: job_mgr.add_jobs(class_task_infos, config_10_1_0, 'FT-Transformer-D-class', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default_FTT', model_types='FT_TRANSFORMER', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.add_jobs(reg_task_infos, config_10_1_0, 'FT-Transformer-D-reg', AutoGluonModelInterfaceWrapper(use_gpu=True, hp_family='default_FTT', model_types='FT_TRANSFORMER', max_n_models_per_type=1), tags=[tag], rerun=rerun) job_mgr.run_jobs(scheduler) def run_trees_custom(paths: Paths, n_estimators: int, tag: str = 'paper', with_defaults: bool = True): # only for speed-testing # this generates about 10GB of data # took 7h17m for n_estimators=2 # took about 6h30m for n_estimators=1 (but slightly more tasks were run for that because of the rerun=True) # the large main overhead is probably mainly for evaluating the metrics job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=True) train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) test_class_task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths) test_reg_task_infos = TaskCollection.from_name('meta-test-reg', paths).load_infos(paths) class_task_infos = train_class_task_infos + test_class_task_infos reg_task_infos = train_reg_task_infos + test_reg_task_infos all_task_infos = class_task_infos + reg_task_infos job_mgr.add_jobs(all_task_infos, config_10_1_0, f'XGB_hyperopt-50_grinsztajn_nest-{n_estimators}', XGBHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, tree_method='hist', space='grinsztajn'), tags=[tag], rerun=True) 
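# The alg names in this file encode hyperparameters as '_key-value' suffixes
# (e.g. f'XGB-TD-class_nest-{n_estimators}'). An illustrative round-trip for that
# convention; both helpers are hypothetical (not part of pytabkit), and the
# parser assumes keys and the base name contain no '_':
def _make_alg_name_sketch(base: str, **params) -> str:
    return base + ''.join(f'_{k}-{v:g}' if isinstance(v, float) else f'_{k}-{v}'
                          for k, v in params.items())

def _parse_alg_name_sketch(name: str):
    base, *parts = name.split('_')
    return base, dict(p.rsplit('-', 1) for p in parts)

# _make_alg_name_sketch('XGB-TD-class', nest=2) == 'XGB-TD-class_nest-2'
# _parse_alg_name_sketch('XGB-TD-class_nest-2') == ('XGB-TD-class', {'nest': '2'})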
job_mgr.add_jobs(all_task_infos, config_10_1_0, f'CatBoost_hyperopt-50_shwartz-ziv_nest-{n_estimators}', CatBoostHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, space='shwartz-ziv'), tags=[tag], rerun=True) job_mgr.add_jobs(all_task_infos, config_10_1_0, f'LGBM_hyperopt-50_cqb_nest-{n_estimators}', LGBMHyperoptInterfaceWrapper(n_estimators=n_estimators, n_hyperopt_steps=50, space='catboost_quality_benchmarks'), tags=[tag], rerun=True) if with_defaults: # optimized default parameters # classification job_mgr.add_jobs(class_task_infos, config_10_1_0, f'LGBM-TD-class_nest-{n_estimators}', LGBMInterfaceWrapper(**utils.update_dict(DefaultParams.LGBM_TD_CLASS, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(class_task_infos, config_10_1_0, f'XGB-TD-class_nest-{n_estimators}', XGBInterfaceWrapper(**utils.update_dict(DefaultParams.XGB_TD_CLASS, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(class_task_infos, config_10_1_0, f'CatBoost-TD-class_nest-{n_estimators}', CatBoostInterfaceWrapper(**utils.update_dict(DefaultParams.CB_TD_CLASS, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) # regression job_mgr.add_jobs(reg_task_infos, config_10_1_0, f'LGBM-TD-reg_nest-{n_estimators}', LGBMInterfaceWrapper(**utils.update_dict(DefaultParams.LGBM_TD_REG, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(reg_task_infos, config_10_1_0, f'XGB-TD-reg_nest-{n_estimators}', XGBInterfaceWrapper(**utils.update_dict(DefaultParams.XGB_TD_REG, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.add_jobs(reg_task_infos, config_10_1_0, f'CatBoost-TD-reg_nest-{n_estimators}', CatBoostInterfaceWrapper(**utils.update_dict(DefaultParams.CB_TD_REG, dict(n_estimators=n_estimators))), tags=[tag], rerun=True) job_mgr.run_jobs(scheduler) def run_cumulative_ablations(paths: Paths, tag: str = 'paper_cumulative_ablations', rerun: bool = False): job_mgr = TabBenchJobManager(paths) scheduler = SimpleJobScheduler(RayJobManager()) config_10_1_0 = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False) # todo: it's false train_class_task_infos = TaskCollection.from_name('meta-train-class', paths).load_infos(paths) train_reg_task_infos = TaskCollection.from_name('meta-train-reg', paths).load_infos(paths) # lr_grid_ntp = [0.01, 0.015, 0.025, 0.04, 0.07, 0.1, 0.2, 0.3, 0.4] # lr_grid_std = [4e-4, 7e-4, 1e-3, 1.5e-3, 2.5e-3, 4e-3, 7e-3, 1e-2, 2e-2] lr_grid_std = [2e-3, 4e-3] lr_grid_ntp = [0.04, 0.2] config_class = dict() config_reg = dict() ablation_counter = 1 def add_config(name: str, lr_grid: List[float], add: Optional[Dict[str, Any]] = None, add_class: Optional[Dict[str, Any]] = None, add_reg: Optional[Dict[str, Any]] = None, run_this: bool = True): nonlocal ablation_counter nonlocal config_class nonlocal config_reg if add is not None: config_class = utils.join_dicts(config_class, add) config_reg = utils.join_dicts(config_reg, add) if add_class is not None: config_class = utils.join_dicts(config_class, add_class) if add_reg is not None: config_reg = utils.join_dicts(config_reg, add_reg) if run_this: for lr in lr_grid: for task_infos, task_type_name, config in [(train_class_task_infos, 'class', config_class), (train_reg_task_infos, 'reg', config_reg)]: job_mgr.add_jobs(task_infos, config_10_1_0, f'MLP-cumul-abl-{ablation_counter}-{task_type_name}_{name}_lr-{lr:g}', NNInterfaceWrapper(**utils.update_dict(config, dict(lr=lr))), tags=[tag], rerun=rerun) ablation_counter += 1 mlp_rtdl_repr_config_class = dict(
hidden_sizes=[128, 256, 128], p_drop=0.1, block_str='w-b-a-d', lr=1e-3, # will be overridden by the lrs from the grid anyway opt='adam', tfms=['quantile_tabr', 'embedding'], embedding_size=8, batch_size=128, n_epochs=1000, use_early_stopping=True, early_stopping_multiplicative_patience=1, early_stopping_additive_patience=20, act='relu', weight_param='standard', weight_init_mode='uniform', weight_init_gain=1. / np.sqrt(3.), bias_init_mode='pytorch-default', max_n_vectorized=1, use_last_best_epoch=False, emb_init_mode='kaiming-uniform-t', ) # for reproducing: weight decay # initialize missing embeddings to zero # have a different early stopping tolerance threshold # MLP-RTDL also uses the two-output + cross-entropy thing # hard to reproduce: handling unknown classes with different embedding category initialized to zero # todo: include all lr factors etc. add_config('rtdl-d-reprod', [1e-3], add=mlp_rtdl_repr_config_class, add_reg=dict(normalize_output=True), run_this=True) add_config('tune-lr', lr_grid_std) add_config('max-epochs-256', lr_grid_std, dict(n_epochs=256)) add_config('batch-size-256', lr_grid_std, dict(batch_size=256)) add_config('hidden-256x3', lr_grid_std, dict(hidden_sizes=[256] * 3)) add_config('normal-emb-init', lr_grid_std, dict(emb_init_mode='normal')) add_config('one-hot-small-cat', lr_grid_std, dict(tfms=['quantile_tabr', 'one_hot', 'embedding'], max_one_hot_cat_size=9)) # quantile_tabr was not well-suited for vectorization, now we can vectorize add_config('robust-scale-smooth-clip', lr_grid_std, dict(tfms=['one_hot', 'median_center', 'robust_scale', 'smooth_clip', 'embedding'], max_n_vectorized=50)) add_config('no-early-stop', lr_grid_std, dict(use_early_stopping=False)) add_config('last-best-epoch', lr_grid_std, dict(use_last_best_epoch=True)) add_config('lr-multi-cycle', lr_grid_std, dict(lr_sched='coslog4')) add_config('beta2-0.95', lr_grid_std, dict(sq_mom=0.95)) add_config('label-smoothing', lr_grid_std, add_class=dict(use_ls=True, ls_eps=0.1)) add_config('output-clipping', lr_grid_std, add_reg=dict(clamp_output=True)) add_config('ntp', lr_grid_ntp, dict(weight_param='ntk', bias_lr_factor=0.1)) add_config('weight-init-std', lr_grid_ntp, dict(weight_init_mode='std', weight_init_gain=1.0)) add_config('bias-init-he+5', lr_grid_ntp, dict(bias_init_mode='he+5')) add_config('different-act', lr_grid_ntp, add_class=dict(act='selu'), add_reg=dict(act='mish')) add_config('param-act', lr_grid_ntp, dict(use_parametric_act=True, act_lr_factor=0.1)) add_config('front-scale', lr_grid_ntp, dict(add_front_scale=True, scale_lr_factor=6.0)) add_config('num-emb-pl', lr_grid_ntp, dict(num_emb_type='pl', plr_sigma=0.1, plr_hidden_1=16, plr_hidden_2=4, plr_lr_factor=0.1)) add_config('num-emb-pbld', lr_grid_ntp, dict(num_emb_type='pbld')) add_config('pdrop-0.15', lr_grid_ntp, dict(p_drop=0.15)) add_config('pdrop-flat-cos', lr_grid_ntp, dict(p_drop_sched='flat_cos')) add_config('wd-0.02', lr_grid_ntp, dict(wd=0.02, bias_wd_factor=0.0)) add_config('wd-flat-cos', lr_grid_ntp, dict(wd_sched='flat_cos'), run_this=True) job_mgr.run_jobs(scheduler) pass if __name__ == '__main__': pass # ----- not in the paper, only experimental ----- # for i in range(50): # if (i + 1) % 5 == 0: # run_extra_realmlp_tuning_configs(paths, n_steps=i + 1) # run_additional_configs(paths) # run_ag_nn_configs(paths, tag='paper', only_meta_train=True, with_ftt=True) # run_mlp_random_configs(paths, n_steps=50) # not in the paper # run_mlp_random_seed_configs(paths, n_steps=20) # not in the paper ## 
run_seed_opt_configs(paths, random_seed_offset=1, tag='paper_seeds') # run_cumulative_ablations(paths) ================================================ FILE: scripts/run_probclass_experiments.py ================================================ import copy import time from typing import List, Optional, Dict, Any import numpy as np import pandas as pd import sklearn import torch from pytabkit.bench.alg_wrappers.interface_wrappers import RandomParamsNNInterfaceWrapper, \ RandomParamsXGBInterfaceWrapper, LoadResultsWrapper, NNInterfaceWrapper, XGBInterfaceWrapper from pytabkit.bench.data.common import SplitType from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskCollection from pytabkit.bench.run.results import ResultManager from pytabkit.bench.run.task_execution import TabBenchJobManager, RunConfig, run_alg_selection from pytabkit.bench.scheduling.execution import RayJobManager from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler from pytabkit.models import utils from pytabkit.models.data.data import TaskType from pytabkit.models.data.splits import SplitInfo from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.metrics import Metrics class ProbclassExperiments: def __init__(self, paths: Paths, n_tt_splits: int, n_cv: int, n_hpo_steps: int, hpo_models: List[str], default_models: List[str]): self.paths = paths self.n_tt_splits = n_tt_splits self.n_cv = n_cv self.n_hpo_steps = n_hpo_steps self.hpo_models = hpo_models self.default_models = default_models self.job_mgr = None self.scheduler = None self.config = None self.task_infos = None self.val_metric_names = None self.calib_options = None self.hpo_names = None def setup(self): # don't do this in the constructor so we have a new job_mgr etc. 
every time (to be safe) self.job_mgr = TabBenchJobManager(self.paths) self.scheduler = SimpleJobScheduler(RayJobManager()) metrics = Metrics(metric_names=[ 'cross_entropy', 'brier', 'n_cross_entropy', 'n_brier', 'logloss-clip1e-06', 'smece', 'ece-15', 'rmsce-15', 'mce-15', 'class_error', '1-mcc', '1-auroc-ovr', 'ref-ll-ts', 'ref-br-ts', 'cal-ll-ts', 'cal-br-ts', ], val_metric_name='logloss', # probably unused anyway task_type=TaskType.CLASSIFICATION) self.config = RunConfig(n_tt_splits=self.n_tt_splits, n_cv=self.n_cv, n_refit=0, save_y_pred=True, metrics=metrics, train_fraction=0.8) self.task_infos = TaskCollection.from_name('talent-class-small', self.paths).load_infos(self.paths) self.val_metric_names = ['cross_entropy', 'brier', 'class_error', '1-auroc-ovr', 'ref-ll-ts', 'ref-br-ts'] self.hpo_names = copy.copy(self.hpo_models) if self.n_cv != 1: self.hpo_names = [bn + f'-cv{self.n_cv}' for bn in self.hpo_names] self.calib_options = {'ts-mix': dict(calibration_method='temp-scaling', calibrate_with_mixture=True)} def run_hpo_configs(self, n_hpo_steps: Optional[int] = None, rerun: bool = False): # for RealMLP: # 10 steps with 2 splits and n_cv=1: 2h5m # 2 steps with 2 splits and n_cv=5: 48m # 10 steps with 2 splits and n_cv=5: 4h6m # -> run 50 steps with 10 splits and n_cv=1: a bit more than 50h # for XGB: # 10 steps with 2 splits and n_cv=5: 10h22m (but waiting long for results on volkert, otherwise more like 6h30m) # 20 steps with 2 splits and n_cv=1: 3h # -> run 50 steps with 10 splits and n_cv=1: 37h self.setup() if n_hpo_steps is None: n_hpo_steps = self.n_hpo_steps # tag = f'paper_hpo-cv{n_cv}' if n_cv != 1 else 'paper_hpo' tag = 'paper_hpo' cv_str = f'-cv{self.n_cv}' if self.n_cv != 1 else '' for step_idx in range(n_hpo_steps): for base_name in self.hpo_names: if base_name.startswith('RealMLP-HPO'): self.job_mgr.add_jobs(self.task_infos, self.config, f'RealMLP-HPO{cv_str}_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='probclass', val_metric_names=self.val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('XGB-HPO'): self.job_mgr.add_jobs(self.task_infos, self.config, f'XGB-HPO{cv_str}_step-{step_idx}', RandomParamsXGBInterfaceWrapper(model_idx=step_idx, hpo_space_name='probclass', n_estimators=1000, early_stopping_rounds=1000, val_metric_names=self.val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('MLP-HPO'): self.job_mgr.add_jobs(self.task_infos, self.config, f'MLP-HPO{cv_str}_step-{step_idx}', RandomParamsNNInterfaceWrapper(model_idx=step_idx, hpo_space_name='probclass-mlp', val_metric_names=self.val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_hpo_alg_selection(self, rerun: bool = False): tag = 'paper' self.setup() for base_name in self.hpo_names: for val_metric_name in self.val_metric_names: alg_names = [f'{base_name}_step-{i}_val-{val_metric_name}' for i in range(self.n_hpo_steps)] run_alg_selection(self.paths, self.config, self.task_infos, f'{base_name}-{self.n_hpo_steps}_val-{val_metric_name}', alg_names, val_metric_name, tags=[tag + '_' + base_name], rerun=rerun) def run_hpo_calibration_configs(self, rerun: bool = False): tag = 'paper' self.setup() for base_name in self.hpo_names: for calib_name, calib_params in self.calib_options.items(): for val_metric_name in self.val_metric_names: alg_name = f'{base_name}-{self.n_hpo_steps}_val-{val_metric_name}' self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_{calib_name}',
LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_step_calibration_configs(self, rerun: bool = False): # took 1h10m for 20 steps and 2 tt splits of RealMLP-HPO tag = 'paper_hpo-calib' self.setup() for calib_name, calib_params in self.calib_options.items(): for val_metric_name in self.val_metric_names: for step_idx in range(self.n_hpo_steps): for base_name in self.hpo_names: alg_name = f'{base_name}_step-{step_idx}_val-{val_metric_name}' self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_{calib_name}', LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_default_configs(self, rerun: bool = False): tag = 'paper' cv_str = f'-cv{self.n_cv}' if self.n_cv != 1 else '' self.setup() val_metric_names = self.val_metric_names + ['ref-ll-ts-cv5', 'ref-ll-is'] for base_name in self.default_models: if base_name.startswith('RealMLP-TD'): self.job_mgr.add_jobs(self.task_infos, self.config, f'RealMLP-TD{cv_str}', NNInterfaceWrapper(**utils.join_dicts(DefaultParams.RealMLP_TD_CLASS, dict( use_ls=False, val_metric_names=val_metric_names, ))), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('XGB-D'): self.job_mgr.add_jobs(self.task_infos, self.config, f'XGB-D{cv_str}', XGBInterfaceWrapper(**DefaultParams.XGB_D, val_metric_names=val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) elif base_name.startswith('MLP-D'): self.job_mgr.add_jobs(self.task_infos, self.config, f'MLP-D{cv_str}', NNInterfaceWrapper(**DefaultParams.VANILLA_MLP_CLASS, val_metric_names=val_metric_names), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_default_calibration_configs(self, rerun: bool = False): tag = 'paper' self.setup() val_metric_names = self.val_metric_names + ['ref-ll-ts-cv5', 'ref-ll-is'] for base_name in self.default_models: for calib_name, calib_params in self.calib_options.items(): for val_metric_name in val_metric_names: alg_name = f'{base_name}_val-{val_metric_name}' self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_{calib_name}', LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag + '_' + base_name], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) @staticmethod def get_extended_calib_methods() -> Dict[str, Dict[str, Any]]: return { 'ts': dict(calibration_method='temp-scaling'), 'ts-mix': dict(calibration_method='temp-scaling', calibrate_with_mixture=True), 'ag-ts': dict(calibration_method='autogluon-ts'), 'ag-ts-mix': dict(calibration_method='autogluon-ts', calibrate_with_mixture=True), 'ag-inv-ts': dict(calibration_method='autogluon-inv-ts'), 'ag-inv-ts-mix': dict(calibration_method='autogluon-inv-ts', calibrate_with_mixture=True), 'torchunc-ts': dict(calibration_method='torchunc-ts'), 'torchunc-ts-mix': dict(calibration_method='torchunc-ts', calibrate_with_mixture=True), 'torchcal-ts': dict(calibration_method='torchcal-ts'), 'torchcal-ts-mix': dict(calibration_method='torchcal-ts', calibrate_with_mixture=True), 'guo-ts': dict(calibration_method='guo-ts'), 'guo-ts-mix': dict(calibration_method='guo-ts', calibrate_with_mixture=True), 'ir': dict(calibration_method='isotonic'), 'ir-mix': dict(calibration_method='isotonic', calibrate_with_mixture=True), } def run_calibration_benchmark(self, rerun: bool = False): tag = 'paper_calib-bench' self.setup() alg_name = 'XGB-D_val-class_error' calib_methods = self.get_extended_calib_methods()
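# Most entries in get_extended_calib_methods are temperature-scaling variants. A
# minimal standalone sketch of plain temperature scaling on validation logits
# (illustrative only; the benchmark itself goes through
# probmetrics.calibrators.get_calibrator instead):
def _fit_temperature_sketch(logits: torch.Tensor, labels: torch.Tensor,
                            n_steps: int = 200, lr: float = 0.01) -> float:
    log_t = torch.zeros(1, requires_grad=True)  # optimize log(T) so T stays positive
    opt = torch.optim.Adam([log_t], lr=lr)
    for _ in range(n_steps):
        opt.zero_grad()
        loss = torch.nn.functional.cross_entropy(logits / log_t.exp(), labels)
        loss.backward()
        opt.step()
    return log_t.exp().item()  # divide logits by this temperature before softmax

# usage sketch: T = _fit_temperature_sketch(val_logits, val_labels)
#               calibrated = torch.softmax(test_logits / T, dim=-1)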
for calib_name, calib_params in calib_methods.items(): self.job_mgr.add_jobs(self.task_infos, self.config, f'{alg_name}_calib-bench_{calib_name}', LoadResultsWrapper(alg_name=alg_name, **calib_params), tags=[tag], rerun=rerun) self.job_mgr.run_jobs(self.scheduler) def run_calibration_timing(self, rerun: bool = False): import probmetrics.calibrators from probmetrics.distributions import CategoricalLogits self.setup() results_list = [] csv_path = self.paths.base() / 'calib_times' / 'times.csv' if utils.existsFile(csv_path) and not rerun: return alg_name = 'XGB-D_val-class_error' calib_methods = self.get_extended_calib_methods() for i, task_info in enumerate(self.task_infos): print(f'Running calibration timing on {task_info.task_desc} ({i+1}/{len(self.task_infos)})') ds = task_info.load_task(self.paths).ds y_full = ds.tensors['y'].squeeze(-1) random_splits = task_info.get_random_splits(self.n_tt_splits, train_fraction=self.config.train_fraction, trainval_fraction=self.config.trainval_fraction) for split_idx in range(self.n_tt_splits): random_split: SplitInfo = random_splits[split_idx] trainval_split = random_split.splitter.split_ds(ds) trainval_idxs = trainval_split.get_sub_idxs(0) trainval_ds = trainval_split.get_sub_ds(0) sub_splits = random_split.get_sub_splits(trainval_ds, n_splits=self.n_cv, is_cv=True) path = self.paths.results_alg_task_split(task_info.task_desc, alg_name, n_cv=self.n_cv, split_type=SplitType.RANDOM, split_id=split_idx) rm = ResultManager.load(path, load_other=False, load_preds=True) y_logits_torch = torch.as_tensor(rm.y_preds_cv, dtype=torch.float32) for cv_idx in range(self.n_cv): sub_split = sub_splits[cv_idx] val_idxs = trainval_idxs[sub_split.get_sub_idxs(0)] y_val = y_full[val_idxs] y_pred_val = CategoricalLogits(y_logits_torch[cv_idx, val_idxs]) for calib_name, calib_params in calib_methods.items(): cal = probmetrics.calibrators.get_calibrator(**calib_params) if i == 0 and split_idx == 0 and cv_idx == 0: # dry run to avoid measuring import times cal_tmp = sklearn.base.clone(cal) cal_tmp.fit_torch(y_pred_val, y_val) start_time = time.time() cal.fit_torch(y_pred_val, y_val) end_time = time.time() results_list.append(dict( alg_name=alg_name, calib_name=calib_name, task=str(task_info.task_desc), n_val=len(val_idxs), tt_split_idx=split_idx, cv_split_idx=cv_idx, time=end_time - start_time)) results_df = pd.DataFrame(results_list) utils.ensureDir(csv_path) results_df.to_csv(csv_path) if __name__ == '__main__': n_hpo_steps = 30 n_tt_splits = 5 n_cv = 1 paths = Paths.from_env_variables() exp = ProbclassExperiments(paths=paths, n_tt_splits=n_tt_splits, n_cv=n_cv, n_hpo_steps=n_hpo_steps, hpo_models=['MLP-HPO', 'XGB-HPO', 'RealMLP-HPO'], default_models=['MLP-D', 'XGB-D', 'RealMLP-TD']) exp.run_default_configs() exp.run_default_calibration_configs() # took 9h for 20 steps with 5 splits for MLP-HPO # for RealMLP + XGB-HPO: 9h45m + 1h34m + ...
# 30 hpo steps with 5 splits for MLP + RealMLP + XGB: 9h + 9h45m + 1h34m + 17h52m = 20h19m + 17h52m = 38h11m exp.run_hpo_configs() exp.run_hpo_alg_selection() exp.run_hpo_calibration_configs() exp.run_calibration_timing() exp.run_calibration_benchmark() # not used in the paper # exp.run_step_calibration_configs() ================================================ FILE: scripts/run_single_task.py ================================================ import time import numpy as np import torch from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskPackage, TaskDescription from pytabkit.bench.scheduling.resources import NodeResources from pytabkit.models import utils from pytabkit.models.sklearn.default_params import DefaultParams from pytabkit.models.training.logging import StdoutLogger from pytabkit.bench.alg_wrappers.interface_wrappers import NNInterfaceWrapper, MLPRTDLInterfaceWrapper, ResNetRTDLInterfaceWrapper, \ TabRInterfaceWrapper from pytabkit.bench.alg_wrappers.extra_interface_wrappers import IterativeImportanceNNInterfaceWrapper, \ IterativeWeightNNInterfaceWrapper, IterativeReinitNNInterfaceWrapper from pytabkit.models.training.metrics import Metrics def run_example(paths: Paths): start_time = time.time() use_gpu = torch.cuda.is_available() wrapper = NNInterfaceWrapper(**utils.join_dicts(DefaultParams.RealMLP_TD_REG)) task_info = TaskDescription('uci-reg', 'parkinson_motor').load_info(paths) print('n_samples:', task_info.n_samples) print('n_cont:', task_info.tensor_infos['x_cont'].get_n_features()) print('x_cat cat sizes:', task_info.tensor_infos['x_cat'].get_cat_sizes()) print('n_classes:', task_info.tensor_infos['y'].get_cat_sizes()) if task_info.tensor_infos['y'].get_cat_sizes() > 0: class_frequencies = torch.bincount(task_info.load_task(paths).ds.tensors['y'].squeeze(-1)) print(f'class frequencies: {class_frequencies.numpy()}') is_nn = (isinstance(wrapper, NNInterfaceWrapper) or isinstance(wrapper, MLPRTDLInterfaceWrapper) or isinstance(wrapper, ResNetRTDLInterfaceWrapper) or isinstance(wrapper, IterativeImportanceNNInterfaceWrapper) or isinstance(wrapper, IterativeWeightNNInterfaceWrapper) or isinstance(wrapper, IterativeReinitNNInterfaceWrapper) or isinstance(wrapper, TabRInterfaceWrapper)) use_gpu = use_gpu and is_nn print(f'Running on task {task_info.task_desc}') if is_nn: split_infos = task_info.get_random_splits(10)[0:1] task_package = TaskPackage(task_info, split_infos=split_infos, n_cv=1, n_refit=0, paths=paths, rerun=False, alg_name='test', save_y_pred=False) else: split_infos = task_info.get_random_splits(10)[1:2] task_package = TaskPackage(task_info, split_infos=split_infos, n_cv=1, n_refit=0, paths=paths, rerun=True, alg_name='test', save_y_pred=False) logger = StdoutLogger(verbosity_level=2) metric_name = Metrics.default_eval_metric_name(task_info.task_type) required_resources = wrapper.get_required_resources(task_package) print(f'Predicted time usage in s: {required_resources.time_s:g}') print(f'Predicted CPU RAM usage in GB: {required_resources.cpu_ram_gb:g}') print(f'Requested n_threads: {required_resources.n_threads:g}') # metric_name = '1-auroc' gpu_usages = np.array([1.0]) if use_gpu and is_nn else np.array([], dtype=np.float32) gpu_rams_gb = np.array([5.0]) if use_gpu and is_nn else np.array([], dtype=np.float32) tmp_folders = [paths.results_alg_task_split(task_package.task_info.task_desc, alg_name=task_package.alg_name, n_cv=task_package.n_cv, split_type=split_info.split_type, split_id=split_info.id) / 'tmp' for split_info in 
task_package.split_infos] result_managers = wrapper.run(task_package, logger, assigned_resources=NodeResources(node_id=0, n_threads=16.0, cpu_ram_gb=2.0, gpu_usages=gpu_usages, gpu_rams_gb=gpu_rams_gb, physical_core_usages=np.array([0.0])), tmp_folders=tmp_folders) for rm in result_managers: print(rm.metrics_dict) print(rm.other_dict) result_pairs = [('val', [rm.metrics_dict['cv']['val']['1']['0'][metric_name] for rm in result_managers])] for is_cv in [True, False] if task_package.n_refit > 0 else [True]: cv_str = 'cv' if is_cv else 'refit' max_n_models = task_package.n_cv if is_cv else task_package.n_refit for n_models in {1, max_n_models}: # use a set in case max_n_models == 1 try: name = 'test-' + cv_str + '-' + str(n_models) results = [rm.metrics_dict[cv_str]['test'][str(n_models)][str(start_idx)][metric_name] for rm in result_managers for start_idx in range(1 if n_models > 1 else max_n_models)] result_pairs.append((name, results)) except KeyError as e: print(e) pass # might happen if wrapper is not a randomized alg and therefore does not do ensembling for name, results in result_pairs: print(f'Mean {name} error: {np.mean(results):g} +- {np.std(results) / np.sqrt(len(results)):g}') # for rm in rms: # print('val:', rm.val_dict) # print('test:', rm.test_dict) print(f'Time: {time.time() - start_time:g} s') if __name__ == '__main__': run_example(Paths.from_env_variables()) ================================================ FILE: scripts/run_slurm.py ================================================ import functools import fire from run_experiments import run_gbdt_rs_configs from pytabkit.bench.data.paths import Paths if __name__ == '__main__': # paths = Paths.from_env_variables() # run_configs(paths) fire.Fire(run_gbdt_rs_configs) ================================================ FILE: scripts/run_time_measurement.py ================================================ import random import time import torch import numpy as np import sklearn from pytabkit.bench.data.paths import Paths from pytabkit.bench.data.tasks import TaskInfo, TaskCollection from pytabkit.models import utils from pytabkit.models.data.splits import RandomSplitter from pytabkit.models.sklearn.sklearn_base import AlgInterfaceEstimator from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier, CatBoost_TD_Classifier, \ LGBM_TD_Classifier, \ XGB_TD_Classifier, LGBM_D_Classifier, CatBoost_D_Classifier, XGB_D_Classifier, LGBM_HPO_Classifier, \ CatBoost_HPO_Classifier, \ XGB_HPO_Classifier, RealMLP_HPO_Classifier, XGB_PBB_D_Classifier, RF_SKL_D_Classifier, MLP_SKL_D_Classifier, \ MLP_SKL_D_Regressor, \ RF_SKL_D_Regressor, RealMLP_HPO_Regressor, XGB_HPO_Regressor, CatBoost_HPO_Regressor, LGBM_HPO_Regressor, \ XGB_D_Regressor, \ CatBoost_D_Regressor, LGBM_D_Regressor, RealMLP_TD_Regressor, RealMLP_TD_S_Regressor, RealMLP_TD_S_Classifier, \ XGB_TD_Regressor, \ CatBoost_TD_Regressor, LGBM_TD_Regressor, MLP_RTDL_D_Classifier, Resnet_RTDL_D_Classifier, MLP_RTDL_D_Regressor, \ Resnet_RTDL_D_Regressor, TabR_S_D_Classifier, TabR_S_D_Regressor, MLP_RTDL_HPO_Classifier, \ MLP_RTDL_HPO_Regressor, XGB_HPO_TPE_Regressor, LGBM_HPO_TPE_Regressor, \ CatBoost_HPO_TPE_Regressor, XGB_HPO_TPE_Classifier, LGBM_HPO_TPE_Classifier, CatBoost_HPO_TPE_Classifier, \ MLP_PLR_D_Classifier, MLP_PLR_HPO_Classifier, MLP_PLR_D_Regressor, MLP_PLR_HPO_Regressor, Resnet_RTDL_HPO_Regressor, \ Resnet_RTDL_HPO_Classifier, RealTabR_D_Classifier, RealTabR_D_Regressor, TabR_HPO_Classifier, TabR_HPO_Regressor, \ RF_HPO_Classifier, RF_HPO_Regressor, 
FTT_HPO_Classifier, FTT_D_Classifier, FTT_D_Regressor, FTT_HPO_Regressor def measure_times(paths: Paths, alg_name: str, estimator: AlgInterfaceEstimator, coll_name: str, device: str, rerun: bool = False, n_predict_reps: int = 20) -> None: task_infos = TaskCollection.from_name(coll_name, paths).load_infos(paths) times_list = [] for task_info in task_infos: file_path = paths.times_alg_task(alg_name=alg_name, task_desc=task_info.task_desc) / 'times.yaml' if utils.existsFile(file_path) and not rerun: times_list.append(utils.deserialize(file_path, use_yaml=True)) # print(f'Results exist already') continue print(f'Measuring time for alg {alg_name} on task {task_info.task_desc}: ', end='') estimator: AlgInterfaceEstimator = sklearn.base.clone(estimator) estimator.device = device task = task_info.load_task(paths) ds = task.ds seed = task_info.n_samples random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) trainval_test_split = RandomSplitter(seed).split_ds(ds) trainval_ds, test_ds = trainval_test_split.get_sub_ds(0), trainval_test_split.get_sub_ds(1) train_val_split = RandomSplitter(seed + 1, first_fraction=0.75).split_ds(trainval_ds) val_idxs = train_val_split.get_sub_idxs(1).numpy() x_trainval = trainval_ds.without_labels().to_df() y_trainval = trainval_ds.tensors['y'].numpy().squeeze(-1) x_test = test_ds.without_labels().to_df() start_time = time.time() estimator.fit(x_trainval, y_trainval, val_idxs=val_idxs) end_time = time.time() fit_time = end_time - start_time start_time = time.time() for i in range(n_predict_reps): estimator.predict(x_test) end_time = time.time() predict_time = (end_time - start_time) / n_predict_reps times = {'fit_time': fit_time, 'predict_time': predict_time} utils.serialize(file_path, times, use_yaml=True) times_list.append(times) print(f'{fit_time=:g}s, {predict_time=:g}s') avg_fit_time = np.mean([times['fit_time'] for times in times_list]) avg_predict_time = np.mean([times['predict_time'] for times in times_list]) print(f'Average times for {alg_name} on {coll_name}: {avg_fit_time=:g}s, {avg_predict_time=:g}s') def measure_times_cpu_class(n_threads: int, rerun: bool = False): paths = Paths.from_env_variables() estimators = { 'LGBM-TD_CPU': LGBM_TD_Classifier(n_threads=n_threads, verbosity=-1), 'CatBoost-TD_CPU': CatBoost_TD_Classifier(n_threads=n_threads), 'XGB-TD_CPU': XGB_TD_Classifier(n_threads=n_threads), 'RealMLP-TD_CPU': RealMLP_TD_Classifier(n_threads=n_threads), 'RealMLP-TD-S_CPU': RealMLP_TD_S_Classifier(n_threads=n_threads), 'LGBM-D_CPU': LGBM_D_Classifier(n_threads=n_threads, verbosity=-1), 'CatBoost-D_CPU': CatBoost_D_Classifier(n_threads=n_threads), 'XGB-D_CPU': XGB_D_Classifier(n_threads=n_threads), 'RF-SKL-D_CPU': RF_SKL_D_Classifier(n_threads=n_threads), 'MLP-SKL-D_CPU': MLP_SKL_D_Classifier(n_threads=n_threads), 'MLP-RTDL-D_CPU': MLP_RTDL_D_Classifier(n_threads=n_threads), 'MLP-PLR-D_CPU': MLP_PLR_D_Classifier(n_threads=n_threads), 'ResNet-RTDL-D_CPU': Resnet_RTDL_D_Classifier(n_threads=n_threads), 'XGB-PBB-D_CPU': XGB_PBB_D_Classifier(n_threads=n_threads), 'TabR-S-D_CPU': TabR_S_D_Classifier(n_threads=n_threads), 'RealTabR-D_CPU': RealTabR_D_Classifier(n_threads=n_threads), 'FTT-D_CPU': FTT_D_Classifier(n_threads=n_threads), 'RealMLP-HPO-2_CPU': RealMLP_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-RTDL-HPO-2_CPU': MLP_RTDL_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-PLR-HPO-2_CPU': MLP_PLR_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'ResNet-RTDL-HPO-2_CPU': 
Resnet_RTDL_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'XGB-HPO-TPE_CPU': XGB_HPO_TPE_Classifier(n_threads=n_threads), 'LGBM-HPO-TPE_CPU': LGBM_HPO_TPE_Classifier(n_threads=n_threads, verbosity=-1), 'CatBoost-HPO-TPE_CPU': CatBoost_HPO_TPE_Classifier(n_threads=n_threads), 'XGB-HPO-2_CPU': XGB_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'LGBM-HPO-2_CPU': LGBM_HPO_Classifier(n_threads=n_threads, verbosity=-1, n_hyperopt_steps=2), 'CatBoost-HPO-2_CPU': CatBoost_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'RF-HPO-2_CPU': RF_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=2), 'TabR-HPO-1_CPU': TabR_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=1), 'FTT-HPO-1_CPU': FTT_HPO_Classifier(n_threads=n_threads, n_hyperopt_steps=1), 'LGBM-D_val-ce_CPU': LGBM_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy', verbosity=-1), 'XGB-D_val-ce_CPU': XGB_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'CatBoost-D_val-ce_CPU': CatBoost_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'LGBM-TD_val-ce_CPU': LGBM_TD_Classifier(n_threads=n_threads, val_metric_name='cross_entropy', verbosity=-1), 'XGB-TD_val-ce_CPU': XGB_TD_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'CatBoost-TD_val-ce_CPU': CatBoost_TD_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'XGB-PBB-D_val-ce_CPU': XGB_PBB_D_Classifier(n_threads=n_threads, val_metric_name='cross_entropy'), 'RealMLP-TD_val-ce_no-ls_CPU': RealMLP_TD_Classifier(val_metric_name='cross_entropy', use_ls=False, n_threads=n_threads), 'RealMLP-TD-S_val-ce_no-ls_CPU': RealMLP_TD_S_Classifier(val_metric_name='cross_entropy', use_ls=False, n_threads=n_threads), 'RealMLP-TD_no-ls_CPU': RealMLP_TD_Classifier(device='cpu', use_ls=False, n_threads=n_threads), 'RealMLP-TD-S_no-ls_CPU': RealMLP_TD_S_Classifier(device='cpu', use_ls=False, n_threads=n_threads), 'RealMLP-TD_val-ce_CPU': RealMLP_TD_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'RealMLP-TD-S_val-ce_CPU': RealMLP_TD_S_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'MLP-RTDL-D_val-ce_CPU': MLP_RTDL_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'MLP-PLR-D_val-ce_CPU': MLP_PLR_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'ResNet-RTDL-D_val-ce_CPU': Resnet_RTDL_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'TabR-S-D_val-ce_CPU': TabR_S_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'RealTabR-D_val-ce_CPU': RealTabR_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'RealTabR-D_no-ls_CPU': RealTabR_D_Classifier(ls_eps=0.0, n_threads=n_threads), 'RealTabR-D_val-ce_no-ls_CPU': RealTabR_D_Classifier(ls_eps=0.0, val_metric_name='cross_entropy', n_threads=n_threads), 'FTT-D_val-ce_CPU': FTT_D_Classifier(val_metric_name='cross_entropy', n_threads=n_threads), 'MLP-RTDL-D_rssc_CPU': MLP_RTDL_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'ResNet-RTDL-D_rssc_CPU': Resnet_RTDL_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'TabR-S-D_rssc_CPU': TabR_S_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'FTT-D_rssc_CPU': FTT_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'MLP-PLR-D_rssc_CPU': MLP_PLR_D_Classifier(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), } for 
alg_name, estimator in estimators.items(): measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-class', device='cpu', rerun=rerun) def measure_times_cpu_reg(n_threads: int, rerun: bool = False): paths = Paths.from_env_variables() estimators = { 'LGBM-TD_CPU': LGBM_TD_Regressor(n_threads=n_threads, verbosity=-1), 'CatBoost-TD_CPU': CatBoost_TD_Regressor(n_threads=n_threads), 'XGB-TD_CPU': XGB_TD_Regressor(n_threads=n_threads), 'RealMLP-TD_CPU': RealMLP_TD_Regressor(n_threads=n_threads), 'RealMLP-TD-S_CPU': RealMLP_TD_S_Regressor(n_threads=n_threads), 'LGBM-D_CPU': LGBM_D_Regressor(n_threads=n_threads, verbosity=-1), 'CatBoost-D_CPU': CatBoost_D_Regressor(n_threads=n_threads), 'XGB-D_CPU': XGB_D_Regressor(n_threads=n_threads), 'RF-SKL-D_CPU': RF_SKL_D_Regressor(n_threads=n_threads), 'MLP-SKL-D_CPU': MLP_SKL_D_Regressor(n_threads=n_threads), 'MLP-RTDL-D_CPU': MLP_RTDL_D_Regressor(n_threads=n_threads), 'MLP-PLR-D_CPU': MLP_PLR_D_Regressor(n_threads=n_threads), 'ResNet-RTDL-D_CPU': Resnet_RTDL_D_Regressor(n_threads=n_threads), 'TabR-S-D_CPU': TabR_S_D_Regressor(n_threads=n_threads), 'RealTabR-D_CPU': RealTabR_D_Regressor(n_threads=n_threads), 'FTT-D_CPU': FTT_D_Regressor(n_threads=n_threads), 'RealMLP-HPO-2_CPU': RealMLP_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-RTDL-HPO-2_CPU': MLP_RTDL_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'MLP-PLR-HPO-2_CPU': MLP_PLR_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'ResNet-RTDL-HPO-2_CPU': Resnet_RTDL_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'XGB-HPO-2_CPU': XGB_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'LGBM-HPO-2_CPU': LGBM_HPO_Regressor(n_threads=n_threads, verbosity=-1, n_hyperopt_steps=2), 'CatBoost-HPO-2_CPU': CatBoost_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'XGB-HPO-TPE_CPU': XGB_HPO_TPE_Regressor(n_threads=n_threads), 'LGBM-HPO-TPE_CPU': LGBM_HPO_TPE_Regressor(n_threads=n_threads, verbosity=-1), 'CatBoost-HPO-TPE_CPU': CatBoost_HPO_TPE_Regressor(n_threads=n_threads), 'RF-HPO-2_CPU': RF_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=2), 'TabR-HPO-1_CPU': TabR_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=1), 'FTT-HPO-1_CPU': FTT_HPO_Regressor(n_threads=n_threads, n_hyperopt_steps=1), 'MLP-RTDL-D_rssc_CPU': MLP_RTDL_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'ResNet-RTDL-D_rssc_CPU': Resnet_RTDL_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'TabR-S-D_rssc_CPU': TabR_S_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'FTT-D_rssc_CPU': FTT_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), 'MLP-PLR-D_rssc_CPU': MLP_PLR_D_Regressor(n_threads=n_threads, tfms=['median_center', 'robust_scale', 'smooth_clip']), } for alg_name, estimator in estimators.items(): measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-reg', device='cpu', rerun=rerun) def measure_times_gpu_class(n_threads: int, rerun: bool = False): paths = Paths.from_env_variables() # todo: add XGB-GPU and CatBoost-GPU? 
def measure_times_gpu_class(n_threads: int, rerun: bool = False):
    paths = Paths.from_env_variables()
    # todo: add XGB-GPU and CatBoost-GPU?
    estimators = {
        'MLP-TD_GPU': RealMLP_TD_Classifier(device='cuda:0', n_threads=n_threads),
        'MLP-TD-S_GPU': RealMLP_TD_S_Classifier(device='cuda:0', n_threads=n_threads),
        'MLP-HPO-2_GPU': RealMLP_HPO_Classifier(device='cuda:0', n_threads=n_threads, n_hyperopt_steps=2),
    }
    import torch
    # trigger torch CUDA initialization before running the first NN
    _ = torch.zeros(1, device='cuda:0')
    for alg_name, estimator in estimators.items():
        measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-class', device='cuda:0',
                      rerun=rerun)


def measure_times_gpu_reg(n_threads: int, rerun: bool = False):
    paths = Paths.from_env_variables()
    estimators = {
        'MLP-TD_GPU': RealMLP_TD_Regressor(device='cuda:0', n_threads=n_threads),
        'MLP-TD-S_GPU': RealMLP_TD_S_Regressor(device='cuda:0', n_threads=n_threads),
        'MLP-HPO-2_GPU': RealMLP_HPO_Regressor(device='cuda:0', n_threads=n_threads, n_hyperopt_steps=2),
    }
    import torch
    # trigger torch CUDA initialization before running the first NN
    _ = torch.zeros(1, device='cuda:0')
    for alg_name, estimator in estimators.items():
        measure_times(paths, alg_name=alg_name, estimator=estimator, coll_name='meta-train-reg', device='cuda:0',
                      rerun=rerun)


if __name__ == '__main__':
    # may take a day or so on a good CPU
    n_threads = 32
    measure_times_cpu_class(n_threads=n_threads, rerun=False)
    measure_times_cpu_reg(n_threads=n_threads, rerun=False)
    # measure_times_gpu_class(n_threads=n_threads, rerun=False)  # not used in the paper
    # measure_times_gpu_reg(n_threads=n_threads, rerun=False)  # not used in the paper


================================================
FILE: scripts/run_xrfm_large_ablations.py
================================================
import fire

from pytabkit.bench.alg_wrappers.interface_wrappers import RandomParamsxRFMInterfaceWrapper
from pytabkit.bench.run.task_execution import RunConfig, TabBenchJobManager, run_alg_selection
from pytabkit.bench.data.paths import Paths
from pytabkit.bench.data.tasks import TaskCollection
from pytabkit.bench.scheduling.execution import RayJobManager
from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler
from pytabkit.models.data.data import TaskType


def run_xrfm_large_ablations(hpo_space_name: str, n_hpo_steps: int = 30, rerun: bool = False):
    # todo: install xrfm directly from the repo
    # todo: set env variable for the tab_bench_data_path
    # todo: measure runtime
    # todo: ensure that only one job runs per GPU, so that the time measurements are accurate
    # todo: make sure to install the version with kermac
    paths = Paths.from_env_variables()
    task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths)
    task_infos.extend(TaskCollection.from_name('meta-test-reg', paths).load_infos(paths))
    task_infos = [ti for ti in task_infos if 70_000 <= ti.n_samples]
    class_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.CLASSIFICATION]
    reg_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.REGRESSION]
    TaskCollection('meta-test-large-class', [info.task_desc for info in class_task_infos]).save(paths)
    TaskCollection('meta-test-large-reg', [info.task_desc for info in reg_task_infos]).save(paths)

    for name, infos in [('class', class_task_infos), ('reg', reg_task_infos)]:
        print(f'{name} task infos:')
        for info in infos:
            print(f'{info.task_desc}: n_samples={info.n_samples}')
        print()

    config = RunConfig(n_tt_splits=1, n_cv=1, n_refit=0, save_y_pred=False)
    job_mgr = TabBenchJobManager(paths)
    scheduler = SimpleJobScheduler(RayJobManager())
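    # Each random-search step below is registered as a separately named alg;
    # run_alg_selection afterwards picks the best step per task based on the validation metric.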
    for step_idx in range(n_hpo_steps):
        job_mgr.add_jobs(task_infos, config, f'xRFM-HPO-{hpo_space_name}_new_step-{step_idx}',
                         RandomParamsxRFMInterfaceWrapper(model_idx=step_idx, hpo_space_name=hpo_space_name,
                                                          M_batch_size=8192, max_leaf_size=40_000),
                         tags=[f'xrfm_hpo_{hpo_space_name}_new_steps'], rerun=rerun)

    job_mgr.run_jobs(scheduler)

    alg_names = [f'xRFM-HPO-{hpo_space_name}_new_step-{i}' for i in range(n_hpo_steps)]
    run_alg_selection(paths, config, class_task_infos, f'xRFM-HPO-{hpo_space_name}_new', alg_names,
                      val_metric_name='class_error', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
    run_alg_selection(paths, config, reg_task_infos, f'xRFM-HPO-{hpo_space_name}_new', alg_names,
                      val_metric_name='rmse', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)


def run_xrfm_large_ablations_old(hpo_space_name: str = 'paper-large-pca', n_hpo_steps: int = 30, rerun: bool = False):
    # todo: install xrfm directly from the repo
    # todo: set env variable for the tab_bench_data_path
    # todo: measure runtime
    # todo: ensure that only one job runs per GPU, so that the time measurements are accurate
    # todo: make sure to install the version with kermac
    paths = Paths.from_env_variables()
    task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths)
    task_infos.extend(TaskCollection.from_name('meta-test-reg', paths).load_infos(paths))
    task_infos = [ti for ti in task_infos if 70_000 <= ti.n_samples <= 200_000]
    class_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.CLASSIFICATION]
    reg_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.REGRESSION]
    TaskCollection('meta-test-medlarge-class', [info.task_desc for info in class_task_infos]).save(paths)
    TaskCollection('meta-test-medlarge-reg', [info.task_desc for info in reg_task_infos]).save(paths)

    for name, infos in [('class', class_task_infos), ('reg', reg_task_infos)]:
        print(f'{name} task infos:')
        for info in infos:
            print(f'{info.task_desc}: n_samples={info.n_samples}')
        print()

    config = RunConfig(n_tt_splits=1, n_cv=1, n_refit=0, save_y_pred=False)
    job_mgr = TabBenchJobManager(paths)
    scheduler = SimpleJobScheduler(RayJobManager())

    for step_idx in range(n_hpo_steps):
        job_mgr.add_jobs(task_infos, config, f'xRFM-HPO-{hpo_space_name}_Mbs-8192_step-{step_idx}',
                         RandomParamsxRFMInterfaceWrapper(model_idx=step_idx, hpo_space_name=hpo_space_name,
                                                          M_batch_size=8192),
                         tags=[f'xrfm_hpo_{hpo_space_name}_steps'], rerun=rerun)

    job_mgr.run_jobs(scheduler)

    alg_names = [f'xRFM-HPO-{hpo_space_name}_Mbs-8192_step-{i}' for i in range(n_hpo_steps)]
    run_alg_selection(paths, config, class_task_infos, f'xRFM-HPO-{hpo_space_name}_Mbs-8192', alg_names,
                      val_metric_name='class_error', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
    run_alg_selection(paths, config, reg_task_infos, f'xRFM-HPO-{hpo_space_name}_Mbs-8192', alg_names,
                      val_metric_name='rmse', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
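# Note: run_xrfm_large_ablations_old above differs from run_xrfm_large_ablations mainly in the task filter
# (70k-200k samples instead of >= 70k), the saved collection names, and in not setting max_leaf_size.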
def run_xrfm_small_test_ablations(n_hpo_steps: int = 50, rerun: bool = False):
    # todo: install xrfm directly from the repo
    # todo: set env variable
    # todo: measure runtime
    # todo: ensure that only one job runs per GPU, so that the time measurements are accurate
    paths = Paths.from_env_variables()
    task_infos = TaskCollection.from_name('meta-test-class', paths).load_infos(paths)
    task_infos.extend(TaskCollection.from_name('meta-test-reg', paths).load_infos(paths))
    task_infos = [ti for ti in task_infos if 100 <= ti.n_samples <= 2000]
    class_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.CLASSIFICATION]
    reg_task_infos = [ti for ti in task_infos if ti.task_type == TaskType.REGRESSION]
    TaskCollection('meta-test-small-class', [info.task_desc for info in class_task_infos]).save(paths)
    TaskCollection('meta-test-small-reg', [info.task_desc for info in reg_task_infos]).save(paths)

    for name, infos in [('class', class_task_infos), ('reg', reg_task_infos)]:
        print(f'{name} task infos:')
        for info in infos:
            print(f'{info.task_desc}: n_samples={info.n_samples}')
        print()

    config = RunConfig(n_tt_splits=10, n_cv=1, n_refit=0, save_y_pred=False)
    job_mgr = TabBenchJobManager(paths)
    scheduler = SimpleJobScheduler(RayJobManager())

    hpo_space_name = 'paper-large-pca'
    for step_idx in range(n_hpo_steps):
        job_mgr.add_jobs(task_infos, config, f'xRFM-HPO-{hpo_space_name}_small_step-{step_idx}',
                         RandomParamsxRFMInterfaceWrapper(model_idx=step_idx, hpo_space_name=hpo_space_name,
                                                          max_leaf_size=200),
                         tags=[f'xrfm_hpo_{hpo_space_name}_steps'], rerun=rerun)

    job_mgr.run_jobs(scheduler)

    alg_names = [f'xRFM-HPO-{hpo_space_name}_small_step-{i}' for i in range(n_hpo_steps)]
    run_alg_selection(paths, config, class_task_infos, f'xRFM-HPO-{hpo_space_name}_small', alg_names,
                      val_metric_name='class_error', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)
    run_alg_selection(paths, config, reg_task_infos, f'xRFM-HPO-{hpo_space_name}_small', alg_names,
                      val_metric_name='rmse', tags=[f'xrfm_hpo_{hpo_space_name}', 'xrfm_hpo', 'default'],
                      rerun=True)


if __name__ == '__main__':
    # run_xrfm_small_test_ablations()
    fire.Fire(run_xrfm_large_ablations)
    # run_xrfm_large_ablations(hpo_space_name='paper-large-pca')
    # run_xrfm_large_ablations(hpo_space_name='paper-large')
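# Usage sketch (python-fire turns the function arguments into CLI flags; assumes the
# data-path environment variables expected by Paths.from_env_variables() are set):
#
#   python scripts/run_xrfm_large_ablations.py --hpo_space_name=paper-large-pca --n_hpo_steps=30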
================================================
FILE: tests/__init__.py
================================================



================================================
FILE: tests/test_bench.py
================================================
from pathlib import Path

from sklearn.datasets import make_classification
import torch

from pytabkit import XGB_TD_Classifier
from pytabkit.bench.alg_wrappers.interface_wrappers import XGBInterfaceWrapper
from pytabkit.bench.data.paths import Paths
from pytabkit.bench.data.tasks import TaskDescription, TaskInfo, Task, TaskCollection
from pytabkit.bench.run.task_execution import TabBenchJobManager, RunConfig
from pytabkit.bench.scheduling.execution import RayJobManager
from pytabkit.bench.scheduling.schedulers import SimpleJobScheduler
from pytabkit.models import utils
from pytabkit.models.data.data import TensorInfo, DictDataset
from pytabkit.models.sklearn.default_params import DefaultParams

# Running this test before the sklearn tests can cause an error in the pickling test for NNs using skorch:
# _pickle.PicklingError: Can't pickle <built-in function print>: it's not the same object as builtins.print
# The error occurs when ray.init() and FunctionProcess() are both used.
# def test_bench_simple(tmp_path: Path):
#     paths = Paths(base_folder=str(tmp_path / 'tab_bench_data'))
#
#     # ----- import dataset -----
#
#     n_samples = 1000
#
#     X, Y = make_classification(
#         n_samples=n_samples,
#         random_state=1
#     )
#     x_cont = torch.as_tensor(X, dtype=torch.float32)
#     x_cat = torch.zeros(n_samples, 0, dtype=torch.long)
#     print(f'{Y.shape=}')
#     y = torch.as_tensor(Y, dtype=torch.long)
#     tensors = dict(x_cont=x_cont, x_cat=x_cat, y=y[:, None])
#     tensor_infos = dict(x_cont=TensorInfo(feat_shape=[x_cont.shape[1]]), x_cat=TensorInfo(feat_shape=[0]),
#                         y=TensorInfo(cat_sizes=[2]))
#     ds = DictDataset(tensors, tensor_infos)
#
#     task_desc = TaskDescription('custom-class', 'ds_custom')
#     task_info = TaskInfo.from_ds(task_desc=task_desc, ds=ds)
#     task = Task(task_info=task_info, ds=ds)
#     task.save(paths)
#     TaskCollection.from_source('custom-class', paths).save(paths)
#
#     # ----- run benchmark -----
#     job_mgr = TabBenchJobManager(paths)
#     scheduler = SimpleJobScheduler(RayJobManager())
#     config_10_1_0 = RunConfig(n_tt_splits=2, n_cv=1, n_refit=0, save_y_pred=False)
#     task_infos = TaskCollection.from_name('custom-class', paths).load_infos(paths)
#
#     ds_x, ds_y = task_infos[0].load_task(paths).ds.split_xy()
#     # xgb = XGBInterfaceWrapper(**utils.join_dicts(DefaultParams.XGB_D, dict(n_estimators=2)))
#     xgb = XGB_TD_Classifier(n_estimators=2)
#     xgb.fit(ds_x.to_df(), ds_y.to_df())
#
#     job_mgr.add_jobs(task_infos, config_10_1_0,
#                      'XGB-D-class',
#                      XGBInterfaceWrapper(**utils.join_dicts(DefaultParams.XGB_D, dict(n_estimators=2))),
#                      tags=['default'], rerun=False)
#
#     job_mgr.run_jobs(scheduler)


================================================
FILE: tests/test_ensemble.py
================================================
import pytest
import sklearn.base
import numpy as np

from pytabkit import Ensemble_TD_Classifier, Ensemble_TD_Regressor
from pytabkit.models.sklearn.sklearn_interfaces import Ensemble_HPO_Classifier, Ensemble_HPO_Regressor


@pytest.mark.parametrize('model', [
    Ensemble_TD_Classifier(calibration_method='ts-mix', val_metric_name='ref-ll-ts', device='cpu'),
    Ensemble_TD_Regressor(device='cpu'),
    Ensemble_HPO_Classifier(calibration_method='ts-mix', val_metric_name='ref-ll-ts', n_hpo_steps=1, device='cpu'),
    Ensemble_HPO_Regressor(n_hpo_steps=1, device='cpu'),
])
def test_ensemble(model):
    np.random.seed(0)
    X = np.random.randn(100, 2)
    y = np.random.randn(100, 1)
    if sklearn.base.is_classifier(model):
        y = y > 0.0
    model.fit(X, y)
    model.predict(X)


================================================
FILE: tests/test_metrics.py
================================================
import numpy as np
import torch
import sklearn.metrics

from pytabkit.models.training.metrics import Metrics


def test_pinball():
    torch.manual_seed(0)
    y_pred = torch.randn(100)[:, None]
    y = torch.randn(100)[:, None]
    loss = Metrics.apply(y_pred, y, 'pinball(0.95)').item()
    sklearn_loss = sklearn.metrics.mean_pinball_loss(y.numpy(), y_pred.numpy(), alpha=0.95)
    assert np.isclose(loss, sklearn_loss)
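# For reference, the pinball loss at quantile q is mean(max(q * (y - y_pred), (q - 1) * (y - y_pred)));
# the assertion above checks the Metrics implementation against sklearn's mean_pinball_loss.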
================================================
FILE: tests/test_rtdl_nns.py
================================================
import numpy as np
import pandas as pd
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

from pytabkit.models.sklearn.sklearn_interfaces import Resnet_RTDL_D_Classifier, Resnet_RTDL_D_Regressor, \
    MLP_RTDL_D_Classifier, MLP_RTDL_D_Regressor, FTT_D_Classifier, FTT_D_Regressor
from sklearn.datasets import make_classification, make_regression
import pytest
import torch


# def test_estimator_compliance():
#     # Check if the custom estimators comply with scikit-learn's conventions
#     check_estimator(Resnet_RTDL_D_Classifier())
#     check_estimator(Resnet_RTDL_D_Regressor())


# @pytest.mark.parametrize("n_classes", [2, 3])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_numerical_data(n_classes, model_name):
#     # Generate synthetic data
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     X = pd.DataFrame(X)
#     y = pd.Series(y)
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier
#     if model_name == "resnet":
#         clf = Resnet_RTDL_D_Classifier(device="cpu")
#     elif model_name == "mlp":
#         clf = MLP_RTDL_D_Classifier(device="cpu")
#     elif model_name == "ft_transformer":
#         clf = FTT_D_Classifier(device="cpu")
#     clf.fit(X_train, y_train, cat_indicator=[False] * 20)  # Assuming no categorical features
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#
# @pytest.mark.parametrize("n_classes", [2, 3])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_categorical_data(n_classes, model_name):
#     # Generate synthetic data with a categorical feature
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     # Add a categorical feature
#     cat_col = np.random.choice([0, 1, 2], size=X.shape[0])
#     X = np.hstack((X, cat_col.reshape(-1, 1)))
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier with categorical feature
#     if model_name == "resnet":
#         clf = Resnet_RTDL_D_Classifier(device="cpu")
#     elif model_name == "mlp":
#         clf = MLP_RTDL_D_Classifier(device="cpu")
#     elif model_name == "ft_transformer":
#         clf = FTT_D_Classifier(device="cpu")
#     clf.fit(X_train, y_train, cat_indicator=[False] * 20 + [True])
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#     # Check if the classifier can handle unseen categories
#     X_test[0, -1] = -1  # Unseen category
#     predictions = clf.predict(X_test)
#     # If no error is raised, the classifier can handle unseen categories
#
#
# @pytest.mark.parametrize("transformed_target", [True, False])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_regressor_numerical_categorical(transformed_target, model_name):
#     # Generate synthetic data with a mix of numerical and categorical features
#     X, y = make_regression(n_samples=1000, n_features=3, n_informative=2, random_state=43)
#     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
#     X = np.column_stack((X, cat_feature))
#
#     X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(X.shape[1] - 1)] + ['cat'])
#     cat_features = [False] * 3 + [True]
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
#
#     if model_name == "resnet":
#         regressor = Resnet_RTDL_D_Regressor(transformed_target=transformed_target, random_state=41, device="cpu")
#     elif model_name == "mlp":
#         regressor = MLP_RTDL_D_Regressor(transformed_target=transformed_target, random_state=41, device="cpu")
#     elif model_name == "ft_transformer":
#         regressor = FTT_D_Regressor(transformed_target=transformed_target, random_state=41, device="cpu")
#     regressor.fit(X_train, y_train, cat_indicator=cat_features)
#     predictions = regressor.predict(X_test)
#
#     # Evaluate the regressor with R2 score
#     score = r2_score(y_test, predictions)
#     assert score > 0.1, f"Regressor R2 score too low with mixed features, got {score}"
#
#     # Test handling of unseen categories
#     X_test.iloc[0, -1] = 4  # Introduce a new category
#     predictions = regressor.predict(X_test)
#     # If no errors and predictions are returned, the regressor can handle unseen categories during test time
#
#
# def create_model(regression, model_name, **kwargs):
#     if model_name == "resnet":
#         model = Resnet_RTDL_D_Regressor(device="cpu", **kwargs) if regression else Resnet_RTDL_D_Classifier(device="cpu", **kwargs)
#     elif model_name == "mlp":
#         model = MLP_RTDL_D_Regressor(device="cpu", **kwargs) if regression else MLP_RTDL_D_Classifier(device="cpu", **kwargs)
#     elif model_name == "ft_transformer":
#         model = FTT_D_Regressor(device="cpu", **kwargs) if regression else FTT_D_Classifier(device="cpu", **kwargs)
#     return model
#
#
# # @pytest.mark.parametrize("regression", [True, False])
# # @pytest.mark.parametrize("resnet_or_mlp", ["resnet", "mlp"])
# # def test_determinist(regression, resnet_or_mlp):
# #     # generate toy data
# #     if regression:
# #         X, y = make_regression(n_samples=300, n_features=20, n_informative=2, random_state=42)
# #     else:
# #         X, y = make_classification(n_samples=300, n_features=20, n_informative=2, random_state=42)
# #
# #     # add categorical feature
# #     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
# #     X = np.column_stack((X, cat_feature))
# #
# #     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# #
# #     random_states = [42, 42, 43]
# #     res_list = []
# #     for random_state in random_states:
# #         model = create_model(regression, resnet_or_mlp, random_state=random_state)
# #         model.fit(X_train, y_train, cat_features=[False] * 20 + [True])
# #         predictions = model.predict(X_test)
# #         res_list.append(predictions)
# #
# #     assert np.allclose(res_list[0], res_list[1]), "Predictions should be the same with the same random_state"
# #     assert not np.allclose(res_list[0], res_list[2]), "Predictions should be different with different random_state"
#
#
# @pytest.mark.parametrize("regression", [True, False])
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# @pytest.mark.parametrize("n_classes", [2, 3])
# def test_all_categorical(regression, model_name, n_classes):
#     X = np.random.randint(n_classes, size=(1000, 10))
#     if regression:
#         y = np.random.rand(1000)
#     else:
#         y = np.random.randint(n_classes, size=(1000,))
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#
#     model = create_model(regression, model_name, random_state=42)
#     model.fit(X_train, y_train, cat_indicator=[True] * 10)
#
#     model.predict(X_test)
#
#
# @pytest.mark.parametrize("seed", list(range(10)))
# @pytest.mark.parametrize("model_name", ["resnet", "mlp", "ft_transformer"])
# def test_high_cardinality(seed, model_name):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#
#     x_df = pd.DataFrame({'cat_1': [270, 86, 154, 80, 56, 80, 80, 283, 199, 291]}).astype('category')
#     y = np.zeros(len(x_df))
#
#     reg = create_model(True, model_name, random_state=seed)
#     reg.fit(x_df, y, cat_indicator=[True])
#     reg.predict(x_df)


# # @pytest.mark.parametrize("resnet_or_mlp", ["resnet", "mlp"])
@pytest.mark.parametrize("transformed_target", [True, False]) # # def test_constant_predictor(resnet_or_mlp, transformed_target): # # # test that the prediction are replaced by the mean of the training set if the val loss # # # is infinite or too bad # # X, y = make_regression(n_samples=1000, n_features=20, n_informative=2, random_state=42) # # # # # first lr 3 to get bad but finite val_loss # # model = create_model(True, resnet_or_mlp, random_state=42, lr=1, max_epochs=10, transformed_target=transformed_target) # # model.fit(X, y, val_idxs=np.arange(100)) # # # check that val_loss is finite # # history = model.alg_interface_.sub_split_interfaces[0].model.history # # assert np.isfinite(history[:, 'valid_loss']).any() # # predictions = model.predict(X) # # assert np.allclose(predictions, np.mean(y[100:])), "Predictions should be the mean of the training set" # # # this should also correspond to model.alg_interface_.sub_split_interfaces[0].model.y_train_mean if transformed_target=False # # if not transformed_target: # # assert np.allclose(model.alg_interface_.sub_split_interfaces[0].model.y_train_mean, np.mean(y[100:])) # # assert model.alg_interface_.sub_split_interfaces[0].model.predict_mean == True # # # # # now lr 1000 to get bad but infinite val_loss # # model = create_model(True, resnet_or_mlp, random_state=42, lr=10000, max_epochs=10, transformed_target=transformed_target) # # model.fit(X, y, val_idxs=np.arange(100)) # # # check that val_loss is infinite # # history = model.alg_interface_.sub_split_interfaces[0].model.history # # assert ~np.isfinite(history[:, 'valid_loss']).all() # # predictions = model.predict(X) # # assert np.allclose(predictions, np.mean(y[100:])), "Predictions should be the mean of the training set" # # # this should also correspond to model.alg_interface_.sub_split_interfaces[0].model.y_train_mean if transformed_target=False # # if not transformed_target: # # assert np.allclose(model.alg_interface_.sub_split_interfaces[0].model.y_train_mean, np.mean(y[100:])) # # assert model.alg_interface_.sub_split_interfaces[0].model.predict_mean == True # # # # # now lr=1e-5 to check that the predictions are not replaced by the mean of the training set # # model = create_model(True, resnet_or_mlp, random_state=42, lr=1e-5, max_epochs=10, transformed_target=transformed_target) # # model.fit(X, y, val_idxs=np.arange(100)) # # # check that val_loss is finite # # history = model.alg_interface_.sub_split_interfaces[0].model.history # # assert np.isfinite(history[:, 'valid_loss']).any() # # predictions = model.predict(X) # # assert not np.allclose(predictions, np.mean(y[100:])), "Predictions should not be the mean of the training set" # # assert model.alg_interface_.sub_split_interfaces[0].model.predict_mean == False ================================================ FILE: tests/test_sklearn_interfaces.py ================================================ import pytest from sklearn.utils.estimator_checks import parametrize_with_checks from pytabkit import XRFM_D_Classifier, XRFM_D_Regressor from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier, RealMLP_TD_Regressor, \ RealMLP_TD_S_Regressor, LGBM_TD_Classifier, LGBM_TD_Regressor, XGB_TD_Classifier, XGB_TD_Regressor, \ CatBoost_TD_Classifier, \ CatBoost_TD_Regressor, MLP_RTDL_D_Classifier, MLP_RTDL_D_Regressor, Resnet_RTDL_D_Classifier, TabR_S_D_Classifier, \ Resnet_RTDL_D_Regressor, TabR_S_D_Regressor, TabM_D_Classifier, TabM_D_Regressor, MLP_PLR_D_Regressor, \ MLP_PLR_D_Classifier, FTT_D_Classifier, 
# Decrease min_data_in_leaf for LGBM_TD_Classifier since otherwise the check_classifiers_classes test fails,
# because LGBM only predicts a single class on the training set.
# Also increase subsample to 1.0 because otherwise LightGBM fails with n_samples=1.
@parametrize_with_checks([
    XRFM_D_Classifier(device='cpu'),
    XRFM_D_Regressor(device='cpu'),
    LGBM_TD_Classifier(min_data_in_leaf=2, subsample=1.0, calibration_method='ts-mix', val_metric_name='ref-ll-ts',
                       n_estimators=100),
    LGBM_TD_Classifier(min_data_in_leaf=2, subsample=1.0, n_estimators=100),
    LGBM_TD_Regressor(subsample=1.0, n_estimators=100),
    XGB_TD_Classifier(n_estimators=100),
    XGB_TD_Regressor(n_estimators=100),
    CatBoost_TD_Classifier(n_estimators=100),
    CatBoost_TD_Regressor(n_estimators=100),
    # use CPU to avoid macOS errors with the MPS backend
    RealMLP_TD_Classifier(n_epochs=8, device='cpu'),
    RealMLP_TD_Regressor(n_epochs=64, device='cpu'),
    TabM_D_Classifier(device='cpu', tabm_k=2, num_emb_type='pwl', arch_type='tabm-mini', num_emb_n_bins=2),
    TabM_D_Regressor(device='cpu', tabm_k=2, num_emb_type='pwl', arch_type='tabm-mini', num_emb_n_bins=2),
    MLP_RTDL_D_Classifier(device='cpu', max_epochs=50),
    # MLP_RTDL_D_Regressor(device='cpu'),
    Resnet_RTDL_D_Classifier(device='cpu'),
    Resnet_RTDL_D_Regressor(device='cpu'),
    MLP_PLR_D_Classifier(device='cpu'),
    MLP_PLR_D_Regressor(device='cpu'),
    FTT_D_Classifier(device='cpu', module_d_token=128, module_n_heads=8, max_epochs=32),
    FTT_D_Regressor(device='cpu', module_d_token=128, module_n_heads=8, max_epochs=32),
    # Tabr_D_Classifier(), Tabr_D_Regressor(),  # needs faiss, which is not in the dependencies, so don't test
])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)
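# To debug a single failing check in isolation, one can run check_estimator directly (sketch):
#
# from sklearn.utils.estimator_checks import check_estimator
# check_estimator(LGBM_TD_Classifier(min_data_in_leaf=2, subsample=1.0, n_estimators=100))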
================================================
FILE: tests/test_tabr.py
================================================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

from pytabkit.models.sklearn.sklearn_interfaces import TabR_S_D_Classifier, TabR_S_D_Regressor
from sklearn.datasets import make_classification, make_regression
import pytest
import torch


# These tests are currently not executed since TabR needs faiss, which is not available via pip,
# so they cannot run via hatch test / in CI.
# @pytest.mark.parametrize("n_classes", [2, 3])
# def test_numerical_data(n_classes):
#     # Generate synthetic data
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     X = pd.DataFrame(X)
#     y = pd.Series(y)
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier
#     clf = TabR_S_D_Classifier(n_epochs=5)
#     clf.fit(X_train, y_train, cat_features=[False] * 20)  # Assuming no categorical features
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#
# @pytest.mark.parametrize("n_classes", [2, 3])
# def test_categorical_data(n_classes):
#     # Generate synthetic data with a categorical feature
#     X, y = make_classification(n_samples=1000, n_features=20, n_informative=3, n_classes=n_classes, random_state=42)
#     # Add a categorical feature
#     cat_col = np.random.choice([0, 1, 2], size=X.shape[0])
#     X = np.hstack((X, cat_col.reshape(-1, 1)))
#
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the classifier with categorical feature
#     clf = TabR_S_D_Classifier(n_epochs=5)
#     clf.fit(X_train, y_train, cat_features=[False] * 20 + [True])
#
#     # Predict and evaluate
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)
#     assert accuracy > 0.5, "Accuracy should be greater than 50%"
#
#     # Check if the classifier can handle unseen categories
#     X_test[0, -1] = -1  # Unseen category
#     predictions = clf.predict(X_test)
#     # If no error is raised, the classifier can handle unseen categories
#
#
# @pytest.mark.parametrize("transformed_target", [True, False])
# def test_regressor_numerical_categorical(transformed_target):
#     # Generate synthetic data with a mix of numerical and categorical features
#     X, y = make_regression(n_samples=1000, n_features=5, n_informative=3, random_state=42)
#     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
#     X = np.column_stack((X, cat_feature))
#
#     X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(X.shape[1] - 1)] + ['cat'])
#     cat_features = [False] * 5 + [True]
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     # Train the regressor
#     regressor = TabR_S_D_Regressor(n_epochs=20, transformed_target=transformed_target)
#     regressor.fit(X_train, y_train, cat_features=cat_features)
#     predictions = regressor.predict(X_test)
#
#     # Evaluate the regressor with R2 score
#     score = r2_score(y_test, predictions)
#     assert score > 0.1, f"Regressor R2 score too low with mixed features, got {score}"
#
#     # Test handling of unseen categories
#     X_test.iloc[0, -1] = 4  # Introduce a new category
#     predictions = regressor.predict(X_test)
#     # If no errors and predictions are returned, the regressor can handle unseen categories during test time
#
#
# @pytest.mark.parametrize("regression", [True, False])
# def test_determinist(regression):
#     # generate toy data
#     if regression:
#         X, y = make_regression(n_samples=300, n_features=20, n_informative=2, random_state=42)
#     else:
#         X, y = make_classification(n_samples=300, n_features=20, n_informative=2, random_state=42)
#
#     # add categorical feature
#     cat_feature = np.random.choice([1, 2, 3], size=X.shape[0])
#     X = np.column_stack((X, cat_feature))
#
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
#     random_states = [42, 42, 43]
#     res_list = []
#     for random_state in random_states:
#         if regression:
#             model = TabR_S_D_Regressor(random_state=random_state, n_epochs=5)
#         else:
#             model = TabR_S_D_Classifier(random_state=random_state, n_epochs=5)
#         model.fit(X_train, y_train, cat_features=[False] * 20 + [True])
#         predictions = model.predict(X_test)
#         res_list.append(predictions)
#
#     assert np.allclose(res_list[0], res_list[1]), "Predictions should be the same with the same random_state"
#     assert not np.allclose(res_list[0], res_list[2]), "Predictions should be different with different random_state"
#
#
# @pytest.mark.parametrize("regression", [True, False])
# @pytest.mark.parametrize("n_classes", [2, 3])
# @pytest.mark.parametrize("cat_size", [2, 5])
# def test_all_categorical(regression, n_classes, cat_size):
#     X = np.random.randint(cat_size, size=(1000, 10))
#     if regression:
#         y = np.random.rand(1000)
#     else:
#         y = np.random.randint(n_classes, size=(1000,))
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#
#     model = TabR_S_D_Regressor(n_epochs=5) if regression else TabR_S_D_Classifier(n_epochs=5)
#     model.fit(X_train, y_train, cat_features=[True] * 10)
#
#     model.predict(X_test)
#
#
# @pytest.mark.parametrize("seed", list(range(10)))
# def test_high_cardinality(seed):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#
#     x_df = pd.DataFrame({'cat_1': [270, 86, 154, 80, 56, 80, 80, 283, 199, 291]}).astype('category')
#     y = np.zeros(len(x_df))
#
#     reg = TabR_S_D_Regressor(n_epochs=5)
#     reg.fit(x_df, y, cat_features=[True])
#     reg.predict(x_df)


================================================
FILE: tests/test_variants.py
================================================
import pytest
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import ClassifierMixin
import torch

from pytabkit import TabM_D_Classifier, RealMLP_HPO_Classifier, Ensemble_HPO_Classifier, TabM_HPO_Regressor, \
    TabM_HPO_Classifier, LGBM_HPO_Classifier, CatBoost_HPO_Classifier, XGB_HPO_Classifier, Ensemble_HPO_Regressor, \
    LGBM_HPO_TPE_Regressor, RealMLP_TD_Regressor, RealMLP_HPO_Regressor, TabM_D_Regressor, XRFM_D_Classifier, \
    XRFM_D_Regressor, XRFM_HPO_Classifier, XRFM_HPO_Regressor


@pytest.mark.parametrize('estimator', [
    RealMLP_TD_Regressor(n_cv=2, n_refit=2, n_repeats=2),
    RealMLP_HPO_Regressor(n_hyperopt_steps=2, train_metric_name='multi_pinball(0.1,0.9)',
                          val_metric_name='multi_pinball(0.1,0.9)'),
    TabM_D_Classifier(val_metric_name='cross_entropy', num_emb_type='pwl', tabm_k=16, random_state=0),
    TabM_D_Regressor(val_metric_name='cross_entropy', num_emb_type='pwl', tabm_k=16, random_state=0),
    TabM_HPO_Regressor(val_metric_name='mae', n_hyperopt_steps=2, hpo_space_name='tabarena', random_state=0),
    TabM_HPO_Classifier(val_metric_name='mae', n_hyperopt_steps=2, hpo_space_name='default', random_state=0,
                        use_caruana_ensembling=True),
    XRFM_D_Classifier(val_metric_name='cross_entropy'),
    XRFM_D_Regressor(),
    XRFM_HPO_Classifier(n_hyperopt_steps=2),
    XRFM_HPO_Regressor(n_hyperopt_steps=2),
    # use CPU since the GPU might not support some features in the search space
    # (it has problems with rsm for CatBoost)
    LGBM_HPO_Classifier(use_caruana_ensembling=True, n_hyperopt_steps=2, hpo_space_name='tabarena', device='cpu'),
    XGB_HPO_Classifier(use_caruana_ensembling=True, n_hyperopt_steps=2, hpo_space_name='tabarena', device='cpu'),
    CatBoost_HPO_Classifier(use_caruana_ensembling=True, n_hyperopt_steps=2, hpo_space_name='tabarena', device='cpu'),
    RealMLP_HPO_Classifier(val_metric_name='cross_entropy', n_hyperopt_steps=3, use_caruana_ensembling=True,
                           hpo_space_name='tabarena', n_caruana_steps=10, random_state=0),
    Ensemble_HPO_Classifier(val_metric_name='brier', n_hpo_steps=2, use_full_caruana_ensembling=True,
                            use_tabarena_spaces=True),
    Ensemble_HPO_Regressor(val_metric_name='brier', n_hpo_steps=2, use_full_caruana_ensembling=True,
                           use_tabarena_spaces=True),
    LGBM_HPO_TPE_Regressor(n_cv=2, n_refit=2, n_hyperopt_steps=2),
])
def test_sklearn_not_crash(estimator):
    np.random.seed(0)
    n_train = 100
    X = pd.DataFrame({'a': np.random.randn(n_train), 'b': np.random.randint(5, size=(n_train,))})
    X['b'] = X['b'].astype('category')
    est = sklearn.base.clone(estimator)
    if not torch.cuda.is_available():
        # don't use mps even if it's available
        est.device = 'cpu'
    if isinstance(est, ClassifierMixin):
        y = np.random.randint(3, size=(n_train,))
    else:
        y = np.random.randn(n_train)
    est.fit(X, y)
    est.predict(X)
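# A minimal usage sketch outside the test, under the same assumptions (CPU device, sklearn-style API;
# predict_proba is the standard sklearn classifier method, assumed to be available here):
#
# clf = TabM_D_Classifier(device='cpu', random_state=0)
# clf.fit(X, y)                  # X as above, y integer class labels
# proba = clf.predict_proba(X)   # probability predictions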