Repository: nebuly-ai/optimate
Branch: main
Commit: a6d302f912b4
Files: 306
Total size: 1.6 MB

Directory structure:
gitextract_7q29s3ew/

├── .gitignore
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── README.md
├── monitoring/
│   └── nebuly/
│       └── __init__.py
└── optimization/
    ├── .github/
    │   └── workflows/
    │       └── tests.yml
    ├── chatllama/
    │   ├── LICENSE
    │   ├── README.md
    │   ├── artifacts/
    │   │   ├── config/
    │   │   │   ├── config.yaml
    │   │   │   ├── ds_config.json
    │   │   │   └── peft_config.yaml
    │   │   ├── datasets/
    │   │   │   ├── actor_dataset.json
    │   │   │   ├── reward_dataset.json
    │   │   │   └── rlhf_dataset.json
    │   │   ├── download_dataset.py
    │   │   ├── extend_rlhf_dataset.py
    │   │   ├── generate_actor_dataset.py
    │   │   ├── generate_rewards.py
    │   │   ├── main.py
    │   │   └── templates.json
    │   ├── chatllama/
    │   │   ├── __init__.py
    │   │   ├── langchain_modules/
    │   │   │   ├── __init__.py
    │   │   │   └── prompt_templates.py
    │   │   ├── llama_model.py
    │   │   └── rlhf/
    │   │       ├── __init__.py
    │   │       ├── actor.py
    │   │       ├── config.py
    │   │       ├── dataset.py
    │   │       ├── model_list.py
    │   │       ├── model_loader.py
    │   │       ├── reward.py
    │   │       ├── trainer.py
    │   │       └── utils.py
    │   └── setup.py
    ├── cloud_surfer/
    │   └── README.md
    ├── forward_forward/
    │   ├── README.md
    │   ├── forward_forward/
    │   │   ├── __init__.py
    │   │   ├── api/
    │   │   │   ├── __init__.py
    │   │   │   └── functions.py
    │   │   ├── app.py
    │   │   ├── operations/
    │   │   │   ├── __init__.py
    │   │   │   ├── build_models.py
    │   │   │   ├── data.py
    │   │   │   ├── fetch_operations.py
    │   │   │   └── trainers.py
    │   │   ├── root_op.py
    │   │   └── utils/
    │   │       ├── __init__.py
    │   │       ├── labels.py
    │   │       ├── modules.py
    │   │       └── utils.py
    │   ├── requirements.txt
    │   └── setup.py
    ├── large_speedster/
    │   └── README.md
    ├── nebullvm/
    │   ├── .pre-commit-config.yaml
    │   ├── CONTRIBUTING.md
    │   ├── Dockerfile
    │   ├── LICENSE
    │   ├── MANIFEST.in
    │   ├── README.md
    │   ├── azure-pipelines.yml
    │   ├── docker_build.sh
    │   ├── docs/
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── conf.py
    │   │   ├── index.rst
    │   │   ├── modules/
    │   │   │   ├── api.rst
    │   │   │   ├── converters.rst
    │   │   │   ├── index.rst
    │   │   │   ├── inference_learners.rst
    │   │   │   ├── installers.rst
    │   │   │   └── optimizers.rst
    │   │   └── requirements-docs.txt
    │   ├── nebullvm/
    │   │   ├── __init__.py
    │   │   ├── api/
    │   │   │   └── __init__.py
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   └── base.py
    │   │   ├── config.py
    │   │   ├── core/
    │   │   │   ├── __init__.py
    │   │   │   ├── models.py
    │   │   │   ├── tests/
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── test_models.py
    │   │   │   └── types.py
    │   │   ├── installers/
    │   │   │   ├── __init__.py
    │   │   │   ├── auto_installer.py
    │   │   │   ├── install_bladedisc.sh
    │   │   │   ├── install_fastertransformer.sh
    │   │   │   ├── install_tensor_rt.sh
    │   │   │   ├── install_tvm.sh
    │   │   │   ├── install_tvm_prerequisites.sh
    │   │   │   ├── installers.py
    │   │   │   ├── tests/
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── test_install_frameworks.py
    │   │   │   └── tvm_installers/
    │   │   │       ├── arm/
    │   │   │       │   └── config.cmake
    │   │   │       ├── arm_cuda/
    │   │   │       │   └── config.cmake
    │   │   │       ├── x86/
    │   │   │       │   └── config.cmake
    │   │   │       └── x86_cuda/
    │   │   │           └── config.cmake
    │   │   ├── operations/
    │   │   │   ├── __init__.py
    │   │   │   ├── base.py
    │   │   │   ├── conversions/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── converters.py
    │   │   │   │   ├── huggingface.py
    │   │   │   │   ├── pytorch.py
    │   │   │   │   ├── tensorflow.py
    │   │   │   │   └── utils.py
    │   │   │   ├── fetch_operations/
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── local.py
    │   │   │   ├── inference_learners/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── base.py
    │   │   │   │   ├── blade_disc.py
    │   │   │   │   ├── builders.py
    │   │   │   │   ├── deepsparse.py
    │   │   │   │   ├── faster_transformer.py
    │   │   │   │   ├── huggingface.py
    │   │   │   │   ├── neural_compressor.py
    │   │   │   │   ├── onnx.py
    │   │   │   │   ├── openvino.py
    │   │   │   │   ├── tensor_rt.py
    │   │   │   │   ├── tensorflow.py
    │   │   │   │   ├── torch_dynamo.py
    │   │   │   │   ├── torch_neuron.py
    │   │   │   │   ├── torch_xla.py
    │   │   │   │   ├── torchscript.py
    │   │   │   │   ├── tvm.py
    │   │   │   │   └── utils.py
    │   │   │   ├── measures/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── base.py
    │   │   │   │   ├── measures.py
    │   │   │   │   └── utils.py
    │   │   │   └── optimizations/
    │   │   │       ├── __init__.py
    │   │   │       ├── compilers/
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── base.py
    │   │   │       │   ├── deepsparse.py
    │   │   │       │   ├── faster_transformer/
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   ├── bert/
    │   │   │       │   │   │   ├── __init__.py
    │   │   │       │   │   │   ├── checkpoint_quantization.py
    │   │   │       │   │   │   └── modeling_bert.py
    │   │   │       │   │   └── gpt/
    │   │   │       │   │       ├── __init__.py
    │   │   │       │   │       └── utils/
    │   │   │       │   │           ├── __init__.py
    │   │   │       │   │           ├── gpt_decoder.py
    │   │   │       │   │           └── huggingface_gpt_convert.py
    │   │   │       │   ├── intel_neural_compressor.py
    │   │   │       │   ├── onnxruntime.py
    │   │   │       │   ├── openvino.py
    │   │   │       │   ├── quantizations/
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   ├── intel_neural_compressor.py
    │   │   │       │   │   ├── onnx.py
    │   │   │       │   │   ├── openvino.py
    │   │   │       │   │   ├── pytorch.py
    │   │   │       │   │   ├── tensor_rt.py
    │   │   │       │   │   ├── tensorflow.py
    │   │   │       │   │   ├── tvm.py
    │   │   │       │   │   └── utils.py
    │   │   │       │   ├── tensor_rt.py
    │   │   │       │   ├── tensorflow.py
    │   │   │       │   ├── torch_dynamo.py
    │   │   │       │   ├── torch_neuron.py
    │   │   │       │   ├── torch_xla.py
    │   │   │       │   ├── torchscript.py
    │   │   │       │   ├── tvm.py
    │   │   │       │   └── utils.py
    │   │   │       ├── compressors/
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── base.py
    │   │   │       │   ├── intel.py
    │   │   │       │   ├── scripts/
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   └── neural_magic_training.py
    │   │   │       │   └── sparseml.py
    │   │   │       ├── optimize_inference.py
    │   │   │       ├── optimizers/
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── base.py
    │   │   │       │   └── optimizers.py
    │   │   │       ├── tests/
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── test_deepsparse.py
    │   │   │       │   ├── test_intel_neural_compressor.py
    │   │   │       │   ├── test_onnxruntime.py
    │   │   │       │   ├── test_openvino.py
    │   │   │       │   ├── test_tensor_rt.py
    │   │   │       │   ├── test_tensorflow.py
    │   │   │       │   ├── test_torch_dynamo.py
    │   │   │       │   ├── test_torchscript.py
    │   │   │       │   ├── test_tvm.py
    │   │   │       │   └── utils.py
    │   │   │       └── utils.py
    │   │   ├── optional_modules/
    │   │   │   ├── __init__.py
    │   │   │   ├── blade_disc.py
    │   │   │   ├── deepsparse.py
    │   │   │   ├── diffusers.py
    │   │   │   ├── dummy.py
    │   │   │   ├── huggingface.py
    │   │   │   ├── neural_compressor.py
    │   │   │   ├── onnx.py
    │   │   │   ├── onnxruntime.py
    │   │   │   ├── onnxsim.py
    │   │   │   ├── openvino.py
    │   │   │   ├── tensor_rt.py
    │   │   │   ├── tensorflow.py
    │   │   │   ├── torch.py
    │   │   │   ├── torch_neuron.py
    │   │   │   ├── torch_tensorrt.py
    │   │   │   ├── torch_xla.py
    │   │   │   ├── tvm.py
    │   │   │   └── utils.py
    │   │   └── tools/
    │   │       ├── __init__.py
    │   │       ├── adapters.py
    │   │       ├── benchmark.py
    │   │       ├── data.py
    │   │       ├── diffusers.py
    │   │       ├── feedback_collector.py
    │   │       ├── hardware_utils.py
    │   │       ├── huggingface.py
    │   │       ├── logger.py
    │   │       ├── onnx.py
    │   │       ├── pytorch.py
    │   │       ├── tests/
    │   │       │   ├── __init__.py
    │   │       │   ├── test_data.py
    │   │       │   ├── test_hardware_utils.py
    │   │       │   └── test_utils.py
    │   │       ├── tf.py
    │   │       ├── transformations.py
    │   │       ├── utils.py
    │   │       └── venv.py
    │   ├── nebullvm.toml
    │   ├── requirements-dev.txt
    │   ├── requirements.txt
    │   └── setup.py
    ├── open_alpha_tensor/
    │   ├── README.md
    │   ├── config.json
    │   ├── main.py
    │   ├── open_alpha_tensor/
    │   │   ├── __init__.py
    │   │   ├── api/
    │   │   │   ├── __init__.py
    │   │   │   └── functions.py
    │   │   ├── config.py
    │   │   ├── core/
    │   │   │   ├── __init__.py
    │   │   │   ├── actors/
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── stage.py
    │   │   │   ├── data/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── basis_change.py
    │   │   │   │   ├── dataset.py
    │   │   │   │   ├── generation.py
    │   │   │   │   └── utils.py
    │   │   │   ├── modules/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── alpha_tensor.py
    │   │   │   │   ├── attention.py
    │   │   │   │   ├── extras.py
    │   │   │   │   ├── heads.py
    │   │   │   │   └── torso.py
    │   │   │   └── training.py
    │   │   ├── operations/
    │   │   │   ├── __init__.py
    │   │   │   ├── checkpoint_op.py
    │   │   │   ├── model_op.py
    │   │   │   └── training_op.py
    │   │   └── root_op.py
    │   ├── resources/
    │   │   └── open_alpha_tensor.md
    │   └── setup.py
    ├── optimate/
    │   └── README.md
    └── speedster/
        ├── README.md
        ├── docs/
        │   └── en/
        │       ├── docs/
        │       │   ├── advanced_options.md
        │       │   ├── benchmarks.md
        │       │   ├── getting_started/
        │       │   │   ├── diffusers_getting_started.md
        │       │   │   ├── hf_getting_started.md
        │       │   │   ├── onnx_getting_started.md
        │       │   │   ├── pytorch_getting_started.md
        │       │   │   └── tf_getting_started.md
        │       │   ├── hardware.md
        │       │   ├── installation.md
        │       │   ├── key_concepts.md
        │       │   ├── notebooks.md
        │       │   ├── overview.md
        │       │   └── telemetry.md
        │       └── mkdocs.yaml
        ├── notebooks/
        │   ├── README.md
        │   ├── diffusers/
        │   │   ├── Accelerate_Stable_Diffusion_with_Speedster.ipynb
        │   │   └── Readme.md
        │   ├── huggingface/
        │   │   ├── Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb
        │   │   ├── Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb
        │   │   ├── Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb
        │   │   ├── Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
        │   │   ├── Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb
        │   │   ├── Readme.md
        │   │   └── faster_transformer_bert.py
        │   ├── onnx/
        │   │   ├── Accelerate_ONNX_ResNet50_with_Speedster.ipynb
        │   │   └── Readme.md
        │   ├── pytorch/
        │   │   ├── Accelerate_PyTorch_ResNet50_with_Speedster.ipynb
        │   │   ├── Accelerate_PyTorch_ViT_with_Speedster.ipynb
        │   │   ├── Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb
        │   │   ├── Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb
        │   │   ├── Accelerate_fast_ai_Resnet34_with_Speedster.ipynb
        │   │   └── Readme.md
        │   └── tensorflow/
        │       ├── Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb
        │       └── Readme.md
        ├── requirements.txt
        ├── setup.py
        ├── speedster/
        │   ├── __init__.py
        │   ├── api/
        │   │   ├── __init__.py
        │   │   ├── functions.py
        │   │   └── tests/
        │   │       ├── __init__.py
        │   │       ├── test_huggingface.py
        │   │       ├── test_onnx.py
        │   │       ├── test_pytorch.py
        │   │       ├── test_tensorflow.py
        │   │       └── utils.py
        │   ├── root_op.py
        │   ├── speedster.py
        │   ├── tests/
        │   │   ├── __init__.py
        │   │   └── test_root_op.py
        │   └── utils.py
        └── speedster.toml

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
optimization/nebullvm/docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version
.idea

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# MacOS DS_Store
.DS_Store

# Pickle folder
.pkl_memoize_py3

# Folder where optimized models are stored
optimized_model

# Config file for tests coverage
.coveragerc


================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Nebuly"
  given-names: "S.r.l"
- family-names: "Fiori"
  given-names: "Diego"
  orcid: "https://orcid.org/0000-0003-1910-0565"
- family-names: "Sofi"
  given-names: "Valerio"
  orcid: "https://orcid.org/0000-0001-5978-897X"
title: "nebullvm"
version: 0.4.3
date-released: 2022-10-10
url: "https://github.com/nebuly-ai/nebullvm"


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
social@nebuly.ai.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior,  harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.


================================================
FILE: README.md
================================================
# OptiMate

**[Legacy]**

This repository is now in a legacy phase and is no longer actively maintained. Although the source code is still available in the Git history, there will be no additional updates or official support.

**[About Nebuly]**

Our team is fully committed to creating the best user-experience platform for LLMs so that companies can understand user behavior at scale when interacting with their LLM-based products.
- To learn more about how to get started, visit our [official documentation](https://docs.nebuly.com/welcome/overview)
- If you need enterprise support, please contact us [here](https://www.nebuly.com/nebuly-book-a-demo)

**[About optimate]**

We have open-sourced a couple of internal projects to the community, but we are not currently maintaining them. Optimate is a collection of libraries designed to help you optimize your AI models. It is an open-source project developed by Nebuly AI but is **not actively maintained**.

The tools available to assist you in your optimization are:

✅ [Speedster](https://github.com/nebuly-ai/optimate/tree/main/optimization/speedster): reduce inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs)

✅ [Nos](https://github.com/nebuly-ai/nos): reduce infrastructure costs by leveraging real-time dynamic partitioning and elastic quotas to maximize the utilization of your Kubernetes GPU cluster

✅ [ChatLLaMA](https://github.com/nebuly-ai/optimate/tree/main/optimization/chatllama): reduce hardware and data costs by leveraging fine-tuning optimization techniques and RLHF alignment


================================================
FILE: monitoring/nebuly/__init__.py
================================================


================================================
FILE: optimization/.github/workflows/tests.yml
================================================
name: Run tests

on:
  push:
    branches:
      - "main"
    paths-ignore:
      - ".github/**"
      - "*.md"
      - "docs/**"
      - "notebooks/**"
  pull_request:
    branches:
      - "main"
    paths-ignore:
      - ".github/**"
      - "*.md"
      - "docs/**"
      - "notebooks/**"

jobs:
  test_on_ubuntu_cpu:
    runs-on: ubuntu-20.04

    strategy:
      matrix:
        # Run in all these versions of Python
        python-version: [ 3.8, 3.9, "3.10" ]

    steps:
      # Checkout the latest code from the repo
      - name: Checkout repo
        uses: actions/checkout@v2
        # Setup which version of Python to use
      - name: Set Up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
        # Display the Python version being used
      - name: Display Python version
        run: python -c "import sys; print(sys.version)"
        # Install nebullvm
      - name: Install nebullvm
        run: |
          python -m pip install --upgrade pip
          pip install .
        # Install Speedster
      - name: Install Speedster
        run: |
          cd apps/accelerate/speedster
          pip install .
          cd ../../..
        # Install PyTorch
      - name: Install PyTorch
        run: python -m pip install torch==2.0.0
        # Install compilers except tvm
      - name: Install deep learning compilers
        run: python -m nebullvm.installers.auto_installer --compilers all
        # Install requirements for testing
      - name: Install requirements for testing
        run: pip install -r "requirements-dev.txt"
        # Run api tests
      - name: Run api tests
        run: |
          export SPEEDSTER_DISABLE_TELEMETRY=1
          cd apps/accelerate/speedster
          pytest
          cd ../../..
        # Run components tests
      - name: Run components tests
        run: |
          cd nebullvm
          pytest
          cd ../

#  test_on_windows_cpu:
#    runs-on: windows-latest
#
#    strategy:
#      matrix:
#        # Run in all these versions of Python
#        python-version: [ 3.8, 3.9, "3.10" ]
#
#    steps:
#      # Checkout the latest code from the repo
#      - name: Checkout repo
#        uses: actions/checkout@v2
#        # Setup which version of Python to use
#      - name: Set Up Python ${{ matrix.python-version }}
#        uses: actions/setup-python@v2
#        with:
#          python-version: ${{ matrix.python-version }}
#        # Display the Python version being used
#      - name: Display Python version
#        run: python -c "import sys; print(sys.version)"
#        # Install nebullvm
#      - name: Install nebullvm
#        run: |
#          python -m pip install --upgrade pip
#          pip install .
#        # Install Speedster
#      - name: Install Speedster
#        run: |
#          cd apps/accelerate/speedster
#          pip install .
#          cd ../../..
#      - name: Install PyTorch
#        run: python -m pip install torch==2.0.0
#        # Install compilers except tvm
#      - name: Install deep learning compilers
#        run: python -m nebullvm.installers.auto_installer --compilers all
#        # Install requirements for testing
#      - name: Install requirements for testing
#        run: pip install -r "requirements-dev.txt"
#        # Run api tests
#      - name: Run api tests
#        run: |
#          $env:SPEEDSTER_DISABLE_TELEMETRY=1
#          cd apps/accelerate/speedster
#          pytest
#          cd ../../..
#        # Run components tests
#      - name: Run components tests
#        run: |
#          cd nebullvm
#          pytest
#          cd ../
#


================================================
FILE: optimization/chatllama/LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

================================================
FILE: optimization/chatllama/README.md
================================================
# **🦙 ChatLLaMA**

> :warning: Please note that this library does NOT contain LLaMA's weights; to access the weights, you need to apply through Meta's form.

`ChatLLaMA` 🦙 is a library that allows you to efficiently leverage LLMs' fine-tuning capabilities using your own data and the least amount of compute possible.
Its purpose is to give developers peace of mind, by abstracting the efforts required for computational optimization and for the collection of large amounts of data.

If you like the project, please show your support by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers).

## Quick install
You can install the package with pip:
```bash
pip install chatllama-py
```
Then you need to install the Llama models cloned from [Meta's repository](https://github.com/facebookresearch/llama):
```bash
git clone https://github.com/facebookresearch/llama.git
cd llama
pip install -r requirements.txt
pip install -e .
```
Follow the instructions in the Llama repository to download the model weights and tokenizer.

## What can ChatLLaMA help with?

`ChatLLaMA` 🦙 has been designed to help developers with various use cases, all related to RLHF training and optimized inference. These are some of the use cases that resonate most with our community wishlist:

- I want to train an efficient ChatGPT-like assistant on my local hardware infrastructure using a limited amount of data;
- I want to create my own personalized version of ChatGPT-like assistant without costs getting out of control;
- I want to understand which model architecture (LLaMA, OPT, GPTJ, etc.) best fits my requirements in terms of hardware, compute budget, and performance;

## Getting started

In this Getting Started we will set up a local RLHF training that will allow you to create your own ChatGPT-like assistant. In this example, we used OPT-1.3B, wherever possible we used open-source datasets and ran the training on a NVIDIA A100. If you want to use other models or hardware, we recommend reading the [supported models](#supported-models), [hardware requirements](#hardware-requirements) and [dataset preparation](#dataset-preparation) sections. In this example, we ran a few epochs of the training; this took a few hours. Any feedback on total training time, on any hardware, would be greatly appreciated. Please share your experience with our community on our Discord channel.

To quickly get you started, we will focus on 3 key steps:

1. Download YAML files to customize your training process. Please note that all the parameters of the library can be managed in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml);
2. Prepare the 3 datasets needed to train the actor model, the reward model and perform RLHF;
3. Train the models on your local infrastructure.

<details>
<summary>1 - YAML download </summary>
First, let’s get the artifacts for running ChatLLaMA. The artifacts contain:

- [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml): config file for model and data set. This allows you to 1) select the model you prefer (LLaMA, OPT, BLOOM, etc) 2) change all the hyperparameters of the training process;
- [`ds_config.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/ds_config.json): config file to define DeepSpeed training parameters;
- [`peft_config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/peft_config.yaml): config file to define PEFT parameters; PEFT is used for efficient training with Hugging Face models. It can be used for setting the LoRA parameters as rank and precision.

- [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json): synthetic data generation templates that can be used to personalize the creation of the dataset. The templates are used for feeding LLMs during the data generation. Note that the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file contains a dictionary having as *keys* the training steps (`actor`, `reward`, `rlhf`) and as *values* a string containing the personalization requests of the user. For more details see the [dataset preparation](#dataset-preparation) section;
- [`main.py`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/main.py): file to train the model.
        
```bash
wget -O artifacts.zip https://nbllabartifacts.blob.core.windows.net/chatllama/artifacts.zip\?sp\=r\&st\=2023-03-08T14:53:24Z\&se\=2100-03-08T22:53:24Z\&spr\=https\&sv\=2021-06-08\&sr\=b\&sig\=jqr%2B2ZkR0SW9RjV0pDOdQ%2BDulLXLjbZ36vmNd4XxxyQ%3D
unzip artifacts.zip 
```
        
Once you have run the command above, you will find all the artifacts in the [`artifacts/`](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama/artifacts) directory. Now you can move on to the next section regarding the dataset preparation.

</details>

<details>
<summary> 2 - Dataset preparation </summary>
    
Before training the model, we need to prepare 3 datasets:

- `actor_training_data`: this is the JSON dataset used in the supervised fine-tuning. It consists of examples of unlabelled conversations, e.g. collection of prompts and responses;
- `rlhf_training_data`: this is the JSON dataset used for RLHF training. It consists of a collection of possible input user prompts;
- `reward_training_data`: this is the JSON dataset used to train the reward model. It consists of responses with associated scores.

In this example, we are using only publicly available datasets and synthetic generation; if you want to use your own data instead, please see the [Dataset preparation](#dataset-preparation) section.

First, let’s download the `actor_training_data` and the `rlhf_training_data`: 

```bash
python artifacts/download_dataset.py ARLHF --path ./datasets --number_of_samples 200
```

Finally, let’s create the `reward_training_data` using `davinci-003` for synthetic data generation.

```bash
export OPENAI_API_KEY=YOUR_API_KEY
python artifacts/generate_rewards.py ./datasets/reward_training_data.json
```

> :warning: Creating the `reward_training_data` with `davinci-003` is not free, i.e. it costs a few $$. If you prefer avoiding external paid APIs, we suggest using HuggingFace’s models (e.g. flan_t5_xl) as described in more detail in the [Supported models](#supported-models) section.
> 
> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI".

At this point, we have successfully created the 3 datasets. We can therefore move on to the final section and start the training.

</details>

<details>
<summary> 3 - Training </summary>
    
You can train the 3 models in separate steps:

- Train the Reward Model

    ```bash
    python artifacts/main.py artifacts/config/config.yaml --type REWARD
    ```

- Pre-Train the Actor Model

    ```bash
    python artifacts/main.py artifacts/config/config.yaml --type ACTOR
    ```

- Training the Actor with reinforcement learning.

    ```bash
    python artifacts/main.py artifacts/config/config.yaml --type RL
    ```


or, equivalently, the 3 trainings can also be pipelined using the flag ALL.

```bash
python artifacts/main.py artifacts/config/config.yaml --type ALL
```

Note that the path to the datasets and the training hyper-parameters of the training process are specified in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml) file.

</details>

## Contributing and Roadmap

As an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see our [Roadmap page](https://github.com/users/nebuly-ai/projects/1/views/1) for more information on how to get involved.

You can participate in the following ways:

1. Submit an issue or PR on GitHub
2. Join our [Discord group](https://discord.gg/77d5kGSa8e) to chat

## Supported models

<details><summary><b><i> Actor models </i></b></summary>

We support models that can be run efficiently with a limited amount of compute, such as LLaMA and 🤗 transformers. These are the models with less than 20B parameters currently supported :

- LLaMA: 7B and 13B, please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply to Meta's [form](https://forms.gle/jk851eBVbX1m5TAv5).
- GPTJ: 6B
- GPTNeoX: 1.3B, 20B
- **(⚠️WIP)** Flan-T5: 80M, 259M, 780M, 3B, 11B
- OPT: 125M, 359M, 1.3B, 2.7B, 6.7B, 13B
- BLOOM: 560M, 1.1B, 1.7B, 3B, 7.1B
- BLOOMZ: 560M, 1.1B, 1.7B, 3B, 7.1B
- Galactica: 125M, 1.3B, 6.7B
</details>

<details><summary><b><i> Reward models </i></b></summary>

We suggest using models under 6B from 🤗 transformers: 

- GPT2: 124M, 355M, 774M, 1.5B
- OPT: 125M, 359M, 1.3B, 2.7B
- GPTJ: 6B
- BLOOMZ: 560M, 1.1B, 1.7B, 3B
- **(⚠️WIP)** OpenAssistant [pre-trained reward models](https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2)
</details>

<details>
<summary><b><i> Synthetic data generation models </i></b></summary>

We support both APIs from OpenAI and  🤗 transformers:

- OpenAI: text-davinci-003, gpt-3.5-turbo **(⚠️WIP)**
- HuggingFace: Flan-T5 (3B and 11B)

> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI".

If you need support for different models, please open an issue and we will get to work.
</details>

## Hardware requirements

<details><summary><b><i> Training </i></b></summary>

Larger actor models require more powerful hardware. Here is a rough hardware recommendation table, suggesting the right type of hardware for different actor model sizes:

- 125M to 1.3B → 1x Nvidia 3090/4090
- 1.3B to 3B → 1x Nvidia A100 (80Gb)
- 3B with DeepSpeed CPU off-loading → 1x Nvidia 3090/4090
- 3B to 7B with DeepSpeed ZeRO → 4x Nvidia T4
- 3B to 13B → 4x Nvidia A100 (80Gb)
- 13B to 20B with DeepSpeed ZeRO → 4x Nvidia A100 (80Gb)
- 13B to 20B → 8x Nvidia A100 (80Gb)
</details>

<details><summary><b><i> Inference </i></b></summary>

**(⚠️WIP)** When it comes to inference optimization, ChatLLaMA will support the following optimization techniques:

- [ ]  DeepSpeed ZeRO
- [ ]  FlexGen
- [ ]  HF Accelerate
- [ ]  PyTorch Vanilla
</details>

Please note that inference optimization has yet to be implemented. If you would like to contribute, please see the **issue roadmap**, community contributions are always welcome 😊.

## Dataset preparation

To successfully train a ChatLLaMA assistant, you need 3 different datasets: `actor_training_data`, `rlhf_training_data` and `reward_training_data`.

<details>
<summary> Dataset for supervised fine-tuning of the actor model </summary>
    
The `actor_training_data` is a collection of prompts with the associated responses as highlighted below:

```json
[
  {
      "user_input": "here the input of the user",
      "completion": "here the model completion"
  }
]
```

ChatLLaMA supports 4 different options to prepare the `actor_training_data`:

* <details><summary> Use 100% synthetic data </summary>

  The dataset can be synthetically generated by running the following command:

  ```bash
  python artifacts/generate_actor_dataset.py
  ```

  > :warning: Note that this command will require a subscription to OpenAI. Generating the full dataset with `davinci-003` could cost approximately ~200$.
  > 
  > :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI".

  Alternatively, you can generate the dataset for free using 🤗 transformers as described in the section [Supported models](#supported-models).
  </details>
  
* <details><summary> Use one of the open source datasets with assistant interactions </summary>

  Currently, we support:

  - [Anthropic HH RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf): this dataset consists of structured question/answer pairs with an LLM chatbot that includes selected and rejected answers;
  - [Stanford Human Preferences Dataset (SHP)](https://huggingface.co/datasets/stanfordnlp/SHP): this dataset is curated from selected "ask" subreddits, and includes questions that span a wide range of question/answer pairs based on the most upvoted responses. Please note that, unlike HH RLHF, this dataset is not intended to reduce harassment by selecting the ideal chatbot response, but instead weights the most helpful human responses.

  The datasets can be downloaded running the following command:

  ```bash
  python artifacts/download_dataset.py <dataset_name> --path <path_to_folder_for_download> --number_of_samples <N>
  ```

  Where: 

  - `<dataset_name>` could be "SHP" for the StanfordNLP/SHP dataset or "ARLHF" for the Anthropic/hh-rlhf dataset;
  - `<path_to_folder_for_download>` is the folder path to where the datasets are going to be created;
  - `<N>` is the number of samples of which the reward_dataset.json is composed.
  </details>
  
  
* <details><summary> Use 100% personalized dataset </summary>

  The user provides his own personalized full dataset. Datasets must be JSON files with the following format:

  ```
  [
      {
          "user_input": "here the input of the user",
          "completion": "here the model completion"
      }
  ]
  ```

  Where the list contains multiple dictionaries, and each dictionary corresponds to a data sample. We suggest using more than 1000 data samples to run the actor training.
  </details>

* <details><summary> (⚠️WIP) Create the full dataset augmenting few custom data samples </summary>

  The dataset can be generated synthetically from a few prompt+response examples provided by the user (few =>10).
  </details>
</details>

<details>
<summary> Dataset for RLHF </summary>
    
The dataset for RLHF consists just of prompt examples:

```json
[
    {
        "user_input": "here the example of user input"
    }
]
```

It can be provided in 2 different ways:

* <details><summary> Few examples provided by the user and dataset synthetically expanded using LLM </summary>

    You need to add the key `rlhf` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file with the information about the task you want to perform and extra context needed by the LLM for the generation. Here is an example of template:

    ```json
    {
      "rlhf": "Here is the template for generating RLHF prompts. The task we want to perform is ..."
    }
    ```

     *Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json)*
     </details>

* <details><summary> The user provides the full dataset with possible interactions with the model </summary>

    The dataset needs to contain more than 1000 prompt examples:

    ```json
    [
        {
            "user_input": "here the example of user input"
        }
    ]
    ```

    The file must be named `rlhf_training_data.json`.
    </details>
</details>
<details>
<summary><b> Dataset to train the reward model </b></summary>

The `reward_training_data` is a collection of i) prompts, ii) completions and iii) scores of the completions assigned according to the user feedback (the Human Feedback in RLHF).

```json
[{
	"user_input": "...",
	"completion": "...",
	"score": 1
},
	...
]
```

We support 3 different options to prepare the `reward_training_data`: 

- Fully Synthetic Score Generation
    
    In this case the reward dataset can be synthetically scored using a LLM as Human Feedback. We recommend the `reward_training_data` having at least 100 data samples.
    
    ```json
    [{
    	"user_input": "...",
    	"completion": "...",
    	"score": null
    },
    	...
    ]
    ```
    
    A LLM model is used to assign the score to each entry. 
    
    The LLM needs a prompt template containing all the instructions to evaluate the generated text. To do this, you should add the key `reward` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file. Here is an example:
    
    ```json
    {
    	"reward": "Here is the template for the reward model. The rules are:\n\n1.Rule 1\n\n2. Rule 2"
    }
    ```
    
    If no template is provided the default one is used. You can find the default template in `artifacts/generate_rewards.py`. Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json). 
    
    Once you have the unlabelled dataset, you can generate the scores by running the following command:
    
    ```bash
    python artifacts/generate_rewards.py <dataset_path> --model <model_to_use> --temperature <t> --max_tokens <n> --reward_template <path_to_file.json>
    ```
    
    Where:
    
    - `<dataset_path>` path to the reward dataset to be scored;
    - `<model_to_use>` model to use for the reward. Default and suggested text-davinci-003 (More to come);
    - `<temperature>` temperature used to score the model; temperature=0.1;
    - `<max_tokens>` max_tokens of the generation;
    - `<reward_template>` is the path to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file containing the template to be used for generating the reward. If no path is provided, the default template will be used.
- The user provides their personalized full dataset
    
    Datasets must be JSON files in the following format:
    
    ```json
    [
        {
            "user_input": "here type the user input",
            "completion": "here type the completion",
            "score": 4.0
        },
        {
            "user_input": "here type the user input",
            "completion": "random garbage",
            "score": 0.0
        }
    ]
    ```
    
    Note that at least 100 data samples are required in this case. The file must be named `reward_training_data.json`
    
- **(⚠️WIP)** Few examples provided by the user and dataset synthetically expanded using LLM
</details>

# License

See the [LICENSE](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/LICENSE) file.


================================================
FILE: optimization/chatllama/artifacts/config/config.yaml
================================================
---
trainer_config:
  # learning rates
  actor_lr: 0.000005
  critic_lr: 0.000009
  # PPO Hyperparameters
  actor_eps_clip: 0.2
  critic_eps_clip: 0.2
  beta_s: 0.02
  # coefficient for the discounted rewards
  gamma_discounted: 1 
  # path to examples to be sampled (training dataset) see rlhf_dataset.json
  examples_path: "./datasets/rlhf_training_data.json"
  # number of episodes and generation performed for each episode
  # in the train() method
  num_episodes: 100
  max_timesteps: 32
  # number of timesteps after which the learn() method is called 
  # (to update the weights)
  update_timesteps: 32
  # number of examples sampled at each timestep
  num_examples: 1
  # batch and epochs for the training
  batch_size: 1
  epochs: 1
  # number of episodes after which update the checkpoints in RL training
  checkpoint_steps: 1000
  # here specify the name of the actor_rl checkpoint from which resume 
  # during actor RL training. If null load the last one.
  checkpoint_name: null

actor_config:
  model: "facebook/opt-1.3b"
  model_folder: "./models"
  tokenizer_path: "path-to-tokenizer"
  train_dataset_path: "./datasets/actor_training_data.json"
  validation_dataset_path: null
  # freeze the model embeddings during training
  froze_embeddings: True
  # use fairscale layers to build the model instead of vanilla pytorch
  # only for llama
  use_fairscale: False
  # max sequence length for the actor (i.e. prompt + completion) it depends on
  # the model used.
  max_sequence_length: 2048
  # max tokens generated by the actor (completion only)
  max_tokens: 2048
  # minimum number of tokens generated by the actor
  min_tokens: 100
  # additional prompt tokens to be used for template or as safety
  additonal_prompt_tokens: 20
  # temperature for the actor
  temperature: 0.1
  batch_size: 2
  # number of iterations between prints
  iteration_per_print: 1
  lr: 0.000009
  epochs: 1
  # number of backpropagation steps after which a checkpoint is saved
  checkpoint_steps: 5000
  # number of checkpoints to keep while removing the older 
  # (keep memory consumption of checkpoints reasonable)
  n_checkpoints_to_keep: 5
  # here specify the name of the actor checkpoint from which resume 
  # during actor training. If null load the last one.
  checkpoint_name: null
  # deepspeed settings
  deepspeed_enable: False
  deepspeed_config_path: "./artifacts/config/ds_config.json"
  # accelerate settings
  accelerate_enable: False
  # use_peft - the parameters of PEFT can be modified in the peft_config.yaml
  peft_enable: False
  peft_config_path: "./artifacts/config/peft_config.yaml"

reward_config:
  # models to choose from are gpt2-large, bart-base, longformer-base-4096
  # more can be simply added in the reward.py __init__()
  model: "facebook/opt-125m"
  model_folder: "./models"
  # hidden size of the additional ffw head to produce the scores
  model_head_hidden_size: 2048
  max_sequence_length: 2048
  train_dataset_path: "./datasets/reward_training_data.json"
  validation_dataset_path: null
  batch_size: 8
  epochs: 1
  iteration_per_print: 1
  # steps after which the checkpoint are saved
  checkpoint_steps: 10000
  # here specify the name of the reward checkpoint from which resume 
  # during reward training. If null load the last one.
  checkpoint_name: null
  lr: 0.000009
  # deepspeed settings
  deepspeed_enable: False
  deepspeed_config_path: "./artifacts/config/ds_config.json"
  # accelerate settings
  accelerate_enable: False

critic_config:
  # models to choose from are gpt2-large, bart-base, longformer-base-4096
  # more can be simply added in the reward.py __init__()
  model: "facebook/opt-125m"
  # hidden size of the additional ffw head to produce the scores
  model_head_hidden_size: 2048
  max_sequence_length: 2048
  model_folder: "./models"
  # here specify the name of the critic checkpoint from which resume 
  # during critic training. If null load the last one.
  checkpoint_name: null


================================================
FILE: optimization/chatllama/artifacts/config/ds_config.json
================================================
{
    "train_batch_size": 8,
    "gradient_accumulation_steps": 1,
    "optimizer": {
      "type": "Adam",
      "params": {
        "lr": 0.00015
      }
    },
    "fp16": {
      "enabled": false,
      "auto_cast": false,
      "loss_scale": 0,
      "initial_scale_power": 16,
      "loss_scale_window": 1000,
      "hysteresis": 2,
      "min_loss_scale": 1
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients" : true,
    "offload_param": {
      "device": "cpu",
      "nvme_path": "/local_nvme",
      "pin_memory": true,
      "buffer_count": 5,
      "buffer_size": 1e8,
      "max_in_cpu": 1e9
    },
    "offload_optimizer": {
      "device": "cpu",
      "nvme_path": "/local_nvme",
      "pin_memory": true,
      "buffer_count": 4,
      "fast_init": false
    },
    "stage3_max_live_parameters" : 1e9,
    "stage3_max_reuse_distance" : 1e9,
    "stage3_prefetch_bucket_size" : 5e8,
    "stage3_param_persistence_threshold" : 1e6,
    "sub_group_size" : 1e12,
    "elastic_checkpoint" : true,
    "stage3_gather_16bit_weights_on_model_save": true,
    "ignore_unused_parameters": true,
    "round_robin_gradients": true
    }
  }

================================================
FILE: optimization/chatllama/artifacts/config/peft_config.yaml
================================================
---
inference_mode: False
r: 8
lora_alpha: 32
lora_dropout: 0.1


================================================
FILE: optimization/chatllama/artifacts/datasets/actor_dataset.json
================================================
[
    {
        "user_input": "here the input of the user",
        "completion": "here the model completion"
    }
]

================================================
FILE: optimization/chatllama/artifacts/datasets/reward_dataset.json
================================================
[
    {
        "user_input": "here type the user input",
        "completion": "here type the completion",
        "score": 4.0
    },
    {
        "user_input": "here type the user input",
        "completion": "if score is null, it can be evaluated by davinci using reward_trainer.distill()",
        "score": null 
    }
]


================================================
FILE: optimization/chatllama/artifacts/datasets/rlhf_dataset.json
================================================
[
    {
        "user_input": "here the example of user input"
    }
]

================================================
FILE: optimization/chatllama/artifacts/download_dataset.py
================================================
import argparse
import os

from chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset


if __name__ == "__main__":
    # Command line entry point: download one of the supported open source
    # datasets and save `number_of_samples` samples under `path`.

    # Setup argument parser
    parser = argparse.ArgumentParser(
        prog="download_dataset.py",
        description="Download the SHP or Anthropic HH-RLHF dataset "
        "and save it to disk",
    )

    parser.add_argument(
        "dataset_name",
        help="Dataset to download. SHP: stanfordnlp/SHP, "
        "ARLHF: Anthropic/hh-rlhf",
        choices=["SHP", "ARLHF"],
    )
    parser.add_argument(
        "-p",
        "--path",
        help="Specify the path for the dataset",
        default="./datasets",
    )
    parser.add_argument(
        "-n",
        "--number_of_samples",
        help="Specify the number of samples for the reward dataset",
        default=200,
    )

    args = parser.parse_args()
    # makedirs also creates missing parent folders (e.g. ./data/rlhf) and
    # exist_ok avoids failing when the folder is already there.
    os.makedirs(args.path, exist_ok=True)

    try:
        n_samples = int(args.number_of_samples)
    except ValueError as err:
        raise ValueError("Number of samples should be an integer") from err

    if args.dataset_name == "SHP":
        dataset = StanfordNLPSHPDataset()
        dataset.save_dataset(args.path, n_samples)

    elif args.dataset_name == "ARLHF":
        dataset = AnthropicRLHF()
        dataset.save_dataset(
            args.path,
            n_samples,
        )


================================================
FILE: optimization/chatllama/artifacts/extend_rlhf_dataset.py
================================================
import os.path

import numpy as np
from langchain import OpenAI, LLMChain, PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def _get_template_and_variables(prompt: str, with_examples: bool):
    if with_examples:
        template = prompt + "\n\nExample: {example}"
        variables = ["example"]
    else:
        template = prompt
        variables = []
    return template, variables


def use_langchain_model(
    user_prompt: str,
    model_name: str,
    temperature: float = 0.7,
    max_tokens: int = 2048,
    with_examples: bool = False,
) -> LLMChain:
    """Build an ``LLMChain`` backed by an OpenAI model.

    Args:
        user_prompt: Base prompt text used to build the template.
        model_name: Model name forwarded to ``OpenAI``.
        temperature: Temperature forwarded to ``OpenAI``.
        max_tokens: ``max_tokens`` forwarded to ``OpenAI``.
        with_examples: Whether the template gets an ``{example}``
            placeholder.

    Returns:
        The chain combining the LLM and the prompt template.
    """
    openai_llm = OpenAI(
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    template_text, variable_names = _get_template_and_variables(
        user_prompt, with_examples=with_examples
    )
    return LLMChain(
        llm=openai_llm,
        prompt=PromptTemplate(
            template=template_text,
            input_variables=variable_names,
        ),
    )


class HuggingFaceChain:
    """Small wrapper exposing a ``run(**kwargs)`` method over a seq2seq model.

    The model and tokenizer are loaded with ``from_pretrained`` using the
    given model name; the prompt template is built from ``user_prompt``.
    """

    def __init__(
        self, model_name: str, user_prompt: str, with_examples: bool = False
    ):
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        template, variables = _get_template_and_variables(
            user_prompt, with_examples=with_examples
        )
        self.prompt = template
        self.input_variables = variables

    def run(self, **kwargs):
        """Fill the template with ``kwargs`` and return the decoded output."""
        filled_prompt = self.prompt.format(**kwargs)
        encoded = self.tokenizer.encode(filled_prompt, return_tensors="pt")
        generated = self.model.generate(
            encoded,
            max_length=100,
            num_beams=5,
            early_stopping=True,
        )
        # Only the first generated sequence is decoded and returned.
        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)


def use_huggingface_model(
    user_prompt: str,
    model_name: str,
    with_examples: bool = False,
) -> HuggingFaceChain:
    """Create a ``HuggingFaceChain`` for the given model and prompt.

    Args:
        user_prompt: Base prompt text used to build the template.
        model_name: Name of the model handed to ``HuggingFaceChain``.
        with_examples: Whether the template gets an ``{example}``
            placeholder.
    """
    chain = HuggingFaceChain(
        model_name=model_name,
        user_prompt=user_prompt,
        with_examples=with_examples,
    )
    return chain


def main():
    """Extend the RLHF prompt dataset with synthetically generated prompts.

    Reads ``rlhf_training_data.json`` from ``--data_dir``, takes the ``rlhf``
    entry of the ``--templates`` JSON file as the prompt, generates
    ``--num_prompts`` new prompts and writes the extended list back to the
    same file.

    Raises:
        ValueError: If the templates file has no ``rlhf`` entry, or if the
            model name contains neither ``davinci`` nor ``t5``.
    """
    import json
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="Model name.",
        default="google/flan-t5-xl",
    )
    # Both paths are opened unconditionally below, so they are mandatory:
    # failing in argparse is clearer than a TypeError raised on a None path.
    parser.add_argument(
        "--templates", type=str, required=True, help="Path to templates."
    )
    parser.add_argument("--num_prompts", type=int, default=1000)
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Path where data are stored",
    )

    args = parser.parse_args()
    model_name = args.model
    templates_path = args.templates
    dataset_path = os.path.join(args.data_dir, "rlhf_training_data.json")

    with open(dataset_path, "r") as f:
        examples = json.load(f)

    with open(templates_path, "r") as f:
        templates = json.load(f)
    user_prompt = templates.get("rlhf")
    if user_prompt is None:
        raise ValueError("No rlhf template found.")

    if "davinci" in model_name:
        chain = use_langchain_model(
            user_prompt, model_name, with_examples=True
        )
    else:
        if "t5" not in model_name:
            raise ValueError("Only Flan-t5 models are supported for HF.")
        chain = use_huggingface_model(
            user_prompt, model_name, with_examples=True
        )

    for _ in range(args.num_prompts):
        # The pool grows at every iteration, so previously generated
        # prompts can be sampled as seeds too.
        example = np.random.choice(examples)
        new_example = chain.run(example=example["user_input"])
        examples.append({"user_input": new_example})

    with open(dataset_path, "w") as f:
        json.dump(examples, f)


if __name__ == "__main__":
    main()


================================================
FILE: optimization/chatllama/artifacts/generate_actor_dataset.py
================================================
from langchain import OpenAI, LLMChain, PromptTemplate
from langchain.chains.conversation.memory import (
    ConversationBufferWindowMemory,
)

from chatllama.langchain_modules.prompt_templates import (
    PERSON_CHATBOT_TEMPLATE,
    AI_CHATBOT_TEMPLATE,
)


# Number of Human/AI exchanges generated for each conversation.
CONVERSATION_LENGTH = 20


def create_conversation(human_agent: LLMChain, bot_agent: LLMChain):
    """Let the two agents talk to each other and return the transcript.

    Each round the human agent is run on the last chatbot output (an empty
    string on the first round) and the bot agent is run on the human output.

    Returns:
        The transcript as newline-joined ``Human: ...`` / ``AI: ...`` lines.
    """
    transcript = []
    last_bot_reply = ""
    for _ in range(CONVERSATION_LENGTH):
        # Human agent goes first
        human_reply = human_agent.run(chatbot_input=last_bot_reply)
        last_bot_reply = bot_agent.run(human_input=human_reply)
        transcript.extend(
            (f"Human: {human_reply}", f"AI: {last_bot_reply}")
        )
    return "\n".join(transcript)


def build_agents():
    """Create the human and bot chains used to generate a conversation.

    Both chains share the same ``OpenAI`` LLM and each gets its own
    ``ConversationBufferWindowMemory(k=4)``.

    Returns:
        A ``(human_agent, bot_agent)`` tuple of ``LLMChain`` objects.
    """
    # be aware that too long completions will not fit the sequence length
    # of possible critic or reward models ...
    shared_llm = OpenAI(max_tokens=2048, temperature=0.7)

    def _make_agent(template_kwargs):
        # One chain per role, built from that role's prompt template.
        return LLMChain(
            llm=shared_llm,
            prompt=PromptTemplate(**template_kwargs),
            memory=ConversationBufferWindowMemory(k=4),
        )

    human_agent = _make_agent(PERSON_CHATBOT_TEMPLATE)
    bot_agent = _make_agent(AI_CHATBOT_TEMPLATE)
    return human_agent, bot_agent


def get_sub_conversations(conversation: str, system_prompt: str):
    """Split a transcript into one training sample per ``AI:`` turn.

    For every ``AI:`` turn, ``user_input`` is ``system_prompt`` followed by
    everything that precedes that turn, and ``completion`` is the stripped
    text of the turn up to the next ``Human:`` marker.

    Returns:
        A list of ``{"user_input": ..., "completion": ...}`` dictionaries,
        empty when the conversation contains no ``AI:`` marker.
    """
    chunks = conversation.split("AI:")
    return [
        {
            "user_input": system_prompt + "AI:".join(chunks[:turn]),
            "completion": reply.split("Human:")[0].strip(),
        }
        for turn, reply in enumerate(chunks[1:], start=1)
    ]


def main():
    """Generate self-chat conversations and export them as actor data.

    Each conversation is first written to
    ``<output_dir>/conversation_<i>.txt``; afterwards all of them are read
    back, split into sub-conversations and dumped together into
    ``<output_dir>/actor_training_data.json``.
    """
    import json
    import os
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument("--num_conversations", type=int, default=1000)
    cli.add_argument("--output_dir", type=str, default="conversations")
    cli.add_argument("--templates", type=str, default=None)
    args = cli.parse_args()

    # optional system prompt prepended to every training sample
    template = ""
    if args.templates is not None:
        with open(args.templates, "r") as f:
            template = json.load(f)["actor"]

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    def conversation_file(index):
        return os.path.join(args.output_dir, f"conversation_{index}.txt")

    # step 1: generate the raw conversations, one text file each
    for conv in range(args.num_conversations):
        human_agent, bot_agent = build_agents()
        text = create_conversation(human_agent, bot_agent)
        with open(conversation_file(conv), "w") as f:
            f.write(text)

    # step 2: convert the conversations to a single json file
    data = []
    for conv in range(args.num_conversations):
        with open(conversation_file(conv), "r") as f:
            text = f.read()
        data.extend(get_sub_conversations(text, template))

    target = os.path.join(args.output_dir, "actor_training_data.json")
    with open(target, "w") as f:
        json.dump(data, f)


# Run the dataset generation only when the file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: optimization/chatllama/artifacts/generate_rewards.py
================================================
import argparse
import json

from langchain import OpenAI, LLMChain, PromptTemplate


class ScoreGenerator:
    """Assign scores to the samples of a reward dataset using an LLM.

    The generator wraps an OpenAI completion model in a LangChain
    ``LLMChain`` driven by a reward prompt template, and uses it to fill
    the missing ``score`` field of every dataset sample.
    """

    def __init__(
        self,
        llm_model: str,
        llm_temperature: float,
        llm_max_tokens: int,
        reward_template: dict,
    ) -> None:
        """Build the LLM chain used for scoring.

        Args:
            llm_model: Name of the OpenAI model.
            llm_temperature: Sampling temperature passed to the model.
            llm_max_tokens: Max tokens passed to the model; also used as
                the budget against which prompt lengths are checked.
            reward_template: Keyword arguments for ``PromptTemplate``
                (``template`` and ``input_variables``).
        """

        self.llm_max_tokens = llm_max_tokens
        self.llm_temperature = llm_temperature
        self.llm_model = llm_model

        # initialize LLM and LangChain
        openai_llm = OpenAI(
            model_name=llm_model,
            temperature=llm_temperature,
            max_tokens=llm_max_tokens,
        )

        # Customize your own Reward template by changing the
        # prompt_template
        prompt_template = PromptTemplate(**reward_template)
        print(prompt_template)
        self.llm = LLMChain(llm=openai_llm, prompt=prompt_template)

    def distill(
        self,
        dataset_path: str,
    ) -> None:
        """Parse the dataset and assign scores using LLMs
        then save back the dataset with the updated scores.

        Samples that already carry a score are left untouched. Samples
        whose prompt is estimated to be too long, or for which the LLM
        answer cannot be parsed as a float, stay unscored and are removed
        from the dataset that is written back to ``dataset_path``.
        """

        print("Assigning scores to the reward dataset...")

        # load the dataset
        with open(dataset_path, "r") as f:
            train_data = json.load(f)

        # 80% of the max length as safety margin
        max_prompt_len = self.llm_max_tokens * 0.8

        # for each element of the dataset, assign a score.
        for i, data in enumerate(train_data):
            if data.get("score", None) is not None:
                continue

            user_input = data["user_input"]
            completion = data["completion"]
            print(
                f"#### Data {i}:\n"
                f"#### User_input:\n {user_input}\n"
                f"#### Completion:\n {completion}\n"
            )
            # rough token estimate: ~0.75 words per token
            prompt_tokens = user_input + completion + self.llm.prompt.template
            prompt_len = int(len(prompt_tokens.split(" ")) / 0.75)
            if prompt_len > max_prompt_len:
                print(
                    f"The prompt of the data {i} is too long\n"
                    f"tokens: {prompt_len}\n"
                    f"max_tokens: {max_prompt_len}"
                )
                continue
            score = self.llm.run(
                user_input=user_input,
                completion=completion,
            ).strip()
            # TODO: extract from score the float value with a regex
            try:
                score = float(score)
            except ValueError:
                print(
                    f"The score returned by the LLM for the "
                    f"data, {i}, is not a float:\n{score}"
                )
                continue
            data["score"] = score
            print(f"### Score: {score} \n\n")

        # remove all the data that have no score; compare against None so
        # that a legitimate score of 0 is not discarded
        train_data = [
            data for data in train_data if data.get("score", None) is not None
        ]
        # save the dataset back
        print("Writing the updated dataset back to disk ... ")
        with open(dataset_path, "w") as f:
            json.dump(train_data, f)

        print("Score Assignment Completed")


# Command line entry point: score every sample of a reward dataset.
if __name__ == "__main__":
    import os

    # Default prompt used when no custom reward template is supplied.
    # Adjacent string literals are concatenated verbatim, so every
    # fragment must carry its own trailing space / newline.
    REWARD_TEMPLATE = dict(
        template=(
            "You have to evaluate the following chat with a score "
            "between 0 and 5.\n"
            "You MUST evaluate: text quality, content quality and "
            "coherence.\n"
            "You MUST return only the number that represents your "
            "judgment.\n"
            "The input of the user is: {user_input}\n"
            "The output of the chatbot is: {completion}\n"
            "The score is:\n"
        ),
        input_variables=["user_input", "completion"],
    )

    # Setup argument parser
    parser = argparse.ArgumentParser(
        prog="generate_rewards.py",
        description="Generate rewards using LangChain and LLMs",
    )

    parser.add_argument("dataset_path", help="Path to the dataset")
    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to be used",
        default="text-davinci-003",
    )
    # `type=` is required: without it values given on the command line
    # stay strings and break the numeric checks in ScoreGenerator.
    parser.add_argument(
        "-t",
        "--temperature",
        help="Specify the temperature of the score assignment",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-k",
        "--max_tokens",
        help="Specify the max tokens of the score assignment",
        type=int,
        default=2048,
    )
    parser.add_argument(
        "-r",
        "--reward_template",
        help=(
            "Specify the reward template to be used, either as the path "
            "to a JSON file or as an inline JSON string"
        ),
        default=None,
    )

    # parse arguments
    args = parser.parse_args()

    if args.reward_template:
        # A path to a JSON file is accepted in addition to the inline
        # JSON string that was supported originally.
        if os.path.isfile(args.reward_template):
            with open(args.reward_template, "r") as f:
                templates = json.load(f)
        else:
            templates = json.loads(args.reward_template)
        # fall back to the default when the "reward" entry is missing
        if templates.get("reward", None) is None:
            rw_template = REWARD_TEMPLATE
        else:
            rw_template = templates["reward"]
    else:
        rw_template = REWARD_TEMPLATE

    score_generator = ScoreGenerator(
        args.model, args.temperature, args.max_tokens, rw_template
    )

    score_generator.distill(args.dataset_path)


================================================
FILE: optimization/chatllama/artifacts/main.py
================================================
import argparse

from chatllama.rlhf.actor import ActorTrainer
from chatllama.rlhf.config import Config
from chatllama.rlhf.dataset import BaseDataset
from chatllama.rlhf.reward import RewardTrainer
from chatllama.rlhf.trainer import RLTrainer


# Entry script: parse the command line, load the configuration and run the
# requested training stage(s).

# Setup argument parser
parser = argparse.ArgumentParser(
    prog="main.py", description="RLHF Training of ChatBots"
)

parser.add_argument("configfile", help="Path to config.yaml file")
parser.add_argument(
    "-t",
    "--type",
    help=(
        "Specify the training type. "
        "RL: Training of the model using RL. "
        "ACTOR: Training of the actor model. "
        "REWARD: Training of the reward model. "
        "ALL: The whole pipeline with the three training steps"
    ),
    default="ALL",
    choices=["ALL", "RL", "ACTOR", "REWARD"],
)
parser.add_argument(
    "-a", "--actor", help="Specify actor model by name", default=None
)
parser.add_argument(
    "-r", "--reward", help="Specify reward model by name", default=None
)
parser.add_argument(
    "--local_rank",
    help="Local rank parameter for deepspeed",
    default=None,
)

# parse arguments
args = parser.parse_args()

# load config.yaml with all the project informations
config = Config(args.configfile)

# overwrite config if specified differently
if args.actor is not None:
    config.actor.model = args.actor
if args.reward is not None:
    config.reward.model = args.reward

# perform the desired training
if args.type == "RL":
    # the actor sequence length is capped by the shortest of the three
    # models taking part in the RL loop
    max_seq = min(
        config.actor.max_sequence_length,
        config.reward.max_sequence_length,
        config.critic.max_sequence_length,
    )
    config.actor.max_sequence_length = max_seq
    BaseDataset.clean_dataset(config)
    rlhf_trainer = RLTrainer(config)
    rlhf_trainer.train()
elif args.type == "ACTOR":
    BaseDataset.clean_dataset(config.actor)
    actor_trainer = ActorTrainer(config.actor)
    actor_trainer.train()
elif args.type == "REWARD":
    BaseDataset.clean_dataset(config.reward)
    reward_trainer = RewardTrainer(config.reward)
    reward_trainer.train()
elif args.type == "ALL":
    # full pipeline: reward model, then actor, then RL fine-tuning
    reward_trainer = RewardTrainer(config.reward)
    reward_trainer.train()
    actor_trainer = ActorTrainer(config.actor)
    actor_trainer.train()
    rlhf_trainer = RLTrainer(config)
    rlhf_trainer.train()


================================================
FILE: optimization/chatllama/artifacts/templates.json
================================================
{
    "rlhf": "You are an AI assistant used to generate possible prompts instructions for a chatbot, here is an example of conversation."
}

================================================
FILE: optimization/chatllama/chatllama/__init__.py
================================================


================================================
FILE: optimization/chatllama/chatllama/langchain_modules/__init__.py
================================================


================================================
FILE: optimization/chatllama/chatllama/langchain_modules/prompt_templates.py
================================================
# Prompt used to ask an LLM for a 0-5 quality score of a completion.
# Adjacent string literals are concatenated verbatim, so each fragment
# carries its own trailing space / newline.
REWARD_TEMPLATE = dict(
    template=(
        "You have to evaluate the following chat with a score between 0 "
        "and 5.\n"
        "You MUST evaluate: text quality, content quality and "
        "coherence.\n"
        "You MUST return only the number that represents your "
        "judgment.\n"
        "The assignement is:\n{user_input}\n"
        "The completion is:\n{completion}\n"
    ),
    input_variables=["user_input", "completion"],
)


# Prompt text for the simulated assistant: a fixed persona description,
# followed by the conversation history and the latest human message.
_AI_CHATBOT_PROMPT = (
    "Assistant is a large language model trained by Meta and Nebuly.ai\n"
    "Assistant is designed to be able to assist with a wide range of "
    "tasks, from answering simple questions to providing in-depth "
    "explanations and discussions on a wide range of topics. As a "
    "language model, Assistant is able to generate human-like text "
    "based on the input it receives, allowing it to engage in "
    "natural-sounding conversations and provide responses that are "
    "coherent and relevant to the topic at hand.\n\n"
    "Assistant is constantly learning and improving, and its capabilities "
    "are constantly evolving. It is able to process and understand large "
    "amounts of text, and can use this knowledge to provide accurate and "
    "informative responses to a wide range of questions. Additionally, "
    "Assistant is able to generate its own text based on the input it "
    "receives, allowing it to engage in discussions and provide "
    "explanations and descriptions on a wide range of topics.\n\n"
    "Overall, Assistant is a powerful tool that can help with a wide "
    "range of tasks and provide valuable insights and information on a "
    "wide range of topics. Whether you need help with a specific "
    "question or just want to have a conversation about a particular "
    "topic, Assistant is here to assist.\n\n{history}\n\n"
    "Human: {human_input}\n"
    "Assistant:"
)

# Keyword arguments for the prompt template of the chatbot agent.
AI_CHATBOT_TEMPLATE = {
    "template": _AI_CHATBOT_PROMPT,
    "input_variables": ["history", "human_input"],
}


# Prompt for the simulated human user: a role description, followed by the
# conversation history and the latest chatbot message. Adjacent string
# literals are concatenated verbatim, so each fragment carries its own
# trailing space.
PERSON_CHATBOT_TEMPLATE = dict(
    template=(
        "You are a human chatting with a chatbot. The chatbot is a large "
        "language model trained by Meta and Nebuly-ai\n"
        "The chatbot is designed to be able to assist you with a wide range "
        "of tasks, from answering simple questions to providing in-depth "
        "explanations and discussions on a wide range of topics. You are a "
        "human and you are testing the chatbot. Ask the chatbot questions "
        "and see how it responds. You can also ask the chatbot to tell you "
        "a story."
        "\n\n{history}\n\n"
        "Chatbot: {chatbot_input}\n"
        "Human:"
    ),
    input_variables=["history", "chatbot_input"],
)


================================================
FILE: optimization/chatllama/chatllama/llama_model.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms
# of the GNU General Public License version 3.

import json
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple, List, Union, Optional

import deepspeed
import torch
import torch.distributed
import torch.nn.functional as F
import fairscale.nn.model_parallel.initialize as fs_init
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from fairscale.nn.model_parallel.layers import (
    ParallelEmbedding,
    RowParallelLinear,
    ColumnParallelLinear,
)
from torch import nn
from transformers import AutoTokenizer

from llama import Tokenizer
from llama.generation import sample_top_p


class MyTokenizer:
    """Hugging Face tokenizer dressed up to look like the Meta one.

    Only meant for testing: it exposes the ``n_words`` / ``bos_id`` /
    ``eos_id`` / ``pad_id`` attributes and the ``encode`` / ``decode``
    methods, backed by an ``AutoTokenizer``.
    """

    def __init__(self, model_path: Optional[str] = None):
        # fall back to the GPT-2 tokenizer when no path is given
        source = "gpt2" if model_path is None else model_path
        self.sp_model = AutoTokenizer.from_pretrained(source)

        self.n_words = self.sp_model.vocab_size
        self.bos_id = self.sp_model.bos_token_id
        self.eos_id = self.sp_model.eos_token_id
        # the eos token doubles as padding token
        self.pad_id = self.sp_model.eos_token_id

    def encode(
        self,
        s: str,
        bos: bool = True,
        eos: bool = True,
        truncation: bool = True,
    ) -> List[int]:
        """Return the token ids of ``s``, optionally wrapped in bos/eos."""
        ids = list(self.sp_model.encode(s, truncation=truncation))
        prefix = [self.bos_id] if bos else []
        suffix = [self.eos_id] if eos else []
        return prefix + ids + suffix

    def decode(self, t: List[int]) -> str:
        """Decode a list of token ids with the wrapped tokenizer."""
        return self.sp_model.decode(torch.as_tensor(t))


class HFLikeTokenizer:
    """Wrap a LLaMA ``Tokenizer`` behind a Hugging Face-like interface.

    Calling the instance returns a dict with ``input_ids`` and
    ``attention_mask`` tensors, and the ``pad_token_id`` / ``eos_token_id``
    attribute names are exposed next to the original ``pad_id`` /
    ``eos_id`` / ``bos_id`` ones.
    """

    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

        # assign attributes from real tokenizer to masked one
        self.pad_id = self.tokenizer.pad_id
        self.eos_id = self.tokenizer.eos_id
        self.bos_id = self.tokenizer.bos_id

        # mask attribute to be similar to hugging face
        self.eos_token_id = self.tokenizer.eos_id
        self.pad_token_id = self.tokenizer.pad_id

        # to match hugging face attribute
        # (re-assigns the same value that was set just above)
        self.pad_token_id = self.pad_id

    def create_sequence_mask(self, tokens: torch.Tensor) -> torch.Tensor:
        """Return a mask shaped like ``tokens``: 0 on pad/bos/eos ids, 1
        everywhere else.
        """
        mask = torch.where(
            tokens == self.tokenizer.pad_id,
            torch.zeros_like(tokens),
            torch.ones_like(tokens),
        )
        mask = torch.where(
            tokens == self.tokenizer.bos_id, torch.zeros_like(tokens), mask
        )
        mask = torch.where(
            tokens == self.tokenizer.eos_id, torch.zeros_like(tokens), mask
        )
        return mask

    def __call__(self, texts: Union[List[str], str], *args, **kwargs):
        """Tokenize one string or a list of strings.

        A single string is encoded with bos/eos and gets an all-ones mask
        (bos/eos included). A list is encoded element-wise, left-padded
        with ``pad_id`` to the longest element, masked with
        ``create_sequence_mask`` and finally has its ``pad_id`` entries
        replaced by 0. Extra positional/keyword arguments are accepted but
        ignored.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` tensors.
        """
        if isinstance(texts, str):
            text = self.tokenizer.encode(texts, bos=True, eos=True)
            tokens = torch.tensor(text).long()
            # NOTE(review): unlike the list branch, bos/eos are not masked
            # here -- confirm this asymmetry is intended.
            mask = torch.ones_like(tokens)
        else:
            texts = [
                self.tokenizer.encode(text, bos=True, eos=True)
                for text in texts
            ]
            max_len = max(len(text) for text in texts)
            # start from an all-padding matrix and right-align every
            # sequence (i.e. padding ends up on the left)
            tokens = torch.full(
                (len(texts), max_len), self.tokenizer.pad_id
            ).long()
            for i, text in enumerate(texts):
                tokens[i, -len(text) :] = torch.tensor(  # noqa E203
                    text
                ).long()

            # TODO: decide how eos and bos should be handled - i need to mask
            # them? or not?
            mask = self.create_sequence_mask(tokens)
            # write the unmasked tokens of each row into the slots right
            # before the last position.
            # NOTE(review): when special ids only occur as leading pad/bos
            # and trailing eos this looks like a no-op -- confirm intent.
            for i in range(tokens.shape[0]):
                current_tokens = tokens[i, mask[i] == 1]
                tokens[
                    i, -len(current_tokens) - 1 : -1  # noqa E203
                ] = current_tokens
            mask = self.create_sequence_mask(tokens)

            # convert `pad_id` from -1 to 0, otherwise embedding will cause out
            # of bounds.
            tokens = torch.where(
                tokens == self.tokenizer.pad_id,
                torch.zeros_like(tokens),
                tokens,
            )
        output = {
            "input_ids": tokens,
            "attention_mask": mask,
        }
        return output

    def decode(self, tokens):
        """Decode ``tokens`` by delegating to the wrapped tokenizer."""
        return self.tokenizer.decode(tokens)


@dataclass
class ModelArgs:
    """This class is a modification of the ModelArgs class implemented in
    the LLaMA repo. The class has been modified for training, since the
    original one just supports inference.

    It collects the hyper-parameters read by the model classes defined in
    this file (``Attention``, ``FeedForward``, ``TransformerBlock`` and
    ``Transformer``).
    """

    # width of the token embeddings / hidden states
    dim: int = 512
    # number of TransformerBlock layers stacked in the Transformer
    n_layers: int = 8
    # number of attention heads; the per-head size is dim // n_heads
    n_heads: int = 8
    # defined later by tokenizer
    vocab_size: int = -1
    # make SwiGLU hidden layer size multiple of large power of 2
    multiple_of: int = 256
    # epsilon passed to the RMSNorm layers
    norm_eps: float = 1e-5

    # first two dimensions of the key/value cache tensors; max_seq_len
    # also sizes the precomputed rotary frequencies
    max_batch_size: int = 32
    max_seq_len: int = 1024

    # added attributes
    # when True the token embedding parameters get requires_grad=False
    froze_embeddings: bool = True
    # when True fairscale model-parallel layers are used instead of the
    # plain torch.nn ones
    use_fairscale: bool = True


class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalisation with a learnable gain.

    Adapted from the RMSNorm class of the LLaMA repo, which only targeted
    inference, so that it can be used for training.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # per-feature gain, initialised to one
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # scale by the reciprocal RMS computed over the last dimension
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        # normalise in float32, then go back to the input dtype
        normalised = self._norm(x.float())
        return normalised.type_as(x) * self.weight


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the rotary-embedding rotations as unit complex numbers.

    Returns a complex tensor of shape ``(end, dim // 2)`` whose entry
    ``[p, j]`` is ``exp(i * p * theta ** (-2j / dim))``.
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    # unit modulus, phase = position * frequency  (complex64)
    return torch.polar(torch.ones_like(angles), angles)


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """View ``freqs_cis`` so that it broadcasts against ``x``.

    ``freqs_cis`` must have shape ``(x.shape[1], x.shape[-1])``; the
    result keeps those two sizes on axis 1 and on the last axis and has
    size 1 on every other axis of ``x``.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    last_axis = ndim - 1
    target_shape = [
        size if axis in (1, last_axis) else 1
        for axis, size in enumerate(x.shape)
    ]
    return freqs_cis.view(*target_shape)


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate queries and keys with the precomputed rotary frequencies.

    The last dimension of ``xq`` / ``xk`` is interpreted as pairs of
    (real, imaginary) parts, multiplied by ``freqs_cis`` in the complex
    domain and flattened back. Outputs keep the dtype of their inputs.
    """

    def to_complex(t: torch.Tensor) -> torch.Tensor:
        # pair up the last dimension and view each pair as one complex
        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))

    q_complex = to_complex(xq)
    k_complex = to_complex(xk)
    rotation = reshape_for_broadcast(freqs_cis, q_complex)

    def rotate(source: torch.Tensor, as_complex: torch.Tensor):
        rotated = torch.view_as_real(as_complex * rotation).flatten(3)
        return rotated.type_as(source)

    return rotate(xq, q_complex), rotate(xk, k_complex)


class Attention(nn.Module):
    """This class is a modification of the Attention class implemented in
    the LLaMA repo. The class has been modified for training, since the
    original one just supports inference.

    Multi-head self-attention with rotary position embeddings. Key/value
    caches can be handed in by the caller; when they are, the updated
    caches are used for the attention and returned.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()

        # with fairscale the heads are split across model-parallel ranks
        if args.use_fairscale:
            self.n_local_heads = (
                args.n_heads // fs_init.get_model_parallel_world_size()
            )
        else:
            self.n_local_heads = args.n_heads
        self.head_dim = args.dim // args.n_heads

        if args.use_fairscale:
            self.wq = ColumnParallelLinear(
                args.dim,
                args.n_heads * self.head_dim,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )
            self.wk = ColumnParallelLinear(
                args.dim,
                args.n_heads * self.head_dim,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )
            self.wv = ColumnParallelLinear(
                args.dim,
                args.n_heads * self.head_dim,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )
            self.wo = RowParallelLinear(
                args.n_heads * self.head_dim,
                args.dim,
                bias=False,
                input_is_parallel=True,
                init_method=lambda x: x,
            )
        else:
            self.wq = nn.Linear(
                args.dim, args.n_heads * self.head_dim, bias=False
            )
            self.wk = nn.Linear(
                args.dim, args.n_heads * self.head_dim, bias=False
            )
            self.wv = nn.Linear(
                args.dim, args.n_heads * self.head_dim, bias=False
            )
            self.wo = nn.Linear(
                args.n_heads * self.head_dim, args.dim, bias=False
            )

        self.dim_cache = (
            args.max_batch_size,
            args.max_seq_len,
            self.n_local_heads,
            self.head_dim,
        )
        # Module-owned caches, kept for backward compatibility; `forward`
        # works on the caches passed in by the caller.
        self.cache_k = torch.zeros(self.dim_cache).cuda()

        self.cache_v = torch.zeros(self.dim_cache).cuda()

    def forward(
        self,
        x: torch.Tensor,
        kv_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        cache_k: Optional[torch.Tensor] = None,
        cache_v: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Compute self-attention over ``x``.

        Args:
            x: Input of shape ``(bsz, seqlen, dim)``.
            kv_mask: Additive mask summed to the attention scores, or
                ``None``.
            freqs_cis: Rotary frequencies for the current positions.
            cache_k: Optional external key cache.
            cache_v: Optional external value cache.

        Returns:
            ``(output, cache_k, cache_v)``; the caches are ``None`` when
            no cache was passed in.

        Raises:
            ValueError: If only one of ``cache_k`` / ``cache_v`` is given.
        """
        start_pos = 0  # Temporary

        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)

        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        # Modified code to allow training, caching is not good for training
        if (cache_k is None) != (cache_v is None):
            raise ValueError(
                "cache_k and cache_v must be both provided or both None"
            )
        if cache_k is None:
            keys = xk
            values = xv
        else:
            # `Tensor.to` is not in-place: the result has to be re-bound,
            # otherwise the caches stay on their original device.
            cache_k = cache_k.to(device=xk.device, dtype=xk.dtype)
            cache_v = cache_v.to(device=xv.device, dtype=xv.dtype)
            cache_k[:bsz, start_pos : start_pos + seqlen] = xk  # noqa E203
            cache_v[:bsz, start_pos : start_pos + seqlen] = xv  # noqa E203
            # read back from the caches that were just updated (not from
            # the module-level ones, which never receive the new entries)
            keys = cache_k[:bsz, : start_pos + seqlen]  # noqa E203
            values = cache_v[:bsz, : start_pos + seqlen]  # noqa E203

        # (bsz, heads, seqlen, head_dim) layout for the matmuls
        xq = xq.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(
            self.head_dim
        )
        if kv_mask is not None:
            scores = scores + kv_mask
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)
        output = torch.matmul(scores, values)
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
        # both caches are None when no external cache was supplied
        return self.wo(output), cache_k, cache_v


class FeedForward(nn.Module):
    """SwiGLU feed-forward block: ``w2(silu(w1(x)) * w3(x))``.

    Adapted from the FeedForward class of the LLaMA repo, which only
    targeted inference, so that it can be used for training.
    """

    def __init__(
        self, dim: int, hidden_dim: int, multiple_of: int, use_fairscale: bool
    ):
        super().__init__()
        # keep two thirds of the requested width, then round up to the
        # next multiple of `multiple_of`
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = -(-hidden_dim // multiple_of) * multiple_of

        if use_fairscale:

            def expand(n_in, n_out):
                return ColumnParallelLinear(
                    n_in,
                    n_out,
                    bias=False,
                    gather_output=False,
                    init_method=lambda x: x,
                )

            def contract(n_in, n_out):
                return RowParallelLinear(
                    n_in,
                    n_out,
                    bias=False,
                    input_is_parallel=True,
                    init_method=lambda x: x,
                )

        else:

            def expand(n_in, n_out):
                return nn.Linear(n_in, n_out, bias=False)

            contract = expand

        # creation order w1, w2, w3 is kept on purpose
        self.w1 = expand(dim, hidden_dim)
        self.w2 = contract(hidden_dim, dim)
        self.w3 = expand(dim, hidden_dim)

    def forward(self, x):
        gate = F.silu(self.w1(x))
        return self.w2(gate * self.w3(x))


class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention and feed-forward sub-blocks,
    each wrapped in a residual connection.

    Adapted from the TransformerBlock class of the LLaMA repo, which only
    targeted inference, so that it can be used for training.
    """

    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.layer_id = layer_id
        self.dim = args.dim
        self.n_heads = args.n_heads
        self.head_dim = args.dim // args.n_heads
        self.use_fairscale = args.use_fairscale

        # sub-modules (registration order is kept unchanged)
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            use_fairscale=args.use_fairscale,
        )
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        cache_k: Optional[torch.Tensor] = None,
        cache_v: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Run the layer and return ``(hidden, cache_k, cache_v)``.

        The caches are external (modified from the original code): they
        are forwarded to the attention and whatever it returns is handed
        back to the caller.
        """
        # number of heads living on this rank
        if self.use_fairscale:
            local_heads = (
                self.n_heads // fs_init.get_model_parallel_world_size()
            )
        else:
            local_heads = self.n_heads
        # add a head axis to the mask and broadcast it over the heads
        per_head_mask = attention_mask[:, None, :, :].expand(
            -1, local_heads, -1, -1
        )

        attn_out, cache_k, cache_v = self.attention.forward(
            self.attention_norm(x), per_head_mask, freqs_cis, cache_k, cache_v
        )
        hidden = x + attn_out
        hidden = hidden + self.feed_forward.forward(self.ffn_norm(hidden))
        return hidden, cache_k, cache_v


class Transformer(nn.Module):
    """This class is a modification of the Transformer class implemented in
    the LLaMA repo. The class has been modified for training, since the
    original one just supports inference. The generate method was inspired by
    the generate function you can find in `llama.generation`.

    The model embeds the tokens, applies ``n_layers`` transformer blocks
    and a final RMSNorm, and projects to vocabulary logits. Per-layer
    key/value caches are only used when the module is not in training
    mode.
    """

    def __init__(self, params: ModelArgs):
        super().__init__()

        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers
        if params.use_fairscale:
            self.n_local_heads = (
                params.n_heads // fs_init.get_model_parallel_world_size()
            )
        else:
            self.n_local_heads = params.n_heads

        self.head_dim = params.dim // params.n_heads
        dim = (
            params.max_batch_size,
            params.max_seq_len,
            self.n_local_heads,
            self.head_dim,
        )
        # one key cache and one value cache per layer, stored as plain
        # tensors in Python lists
        self.cache_k = [torch.zeros(dim) for _ in range(self.n_layers)]
        self.cache_v = [torch.zeros(dim) for _ in range(self.n_layers)]

        if params.use_fairscale:
            self.tok_embeddings = ParallelEmbedding(
                params.vocab_size, params.dim, init_method=lambda x: x
            )
        else:
            self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)

        if params.froze_embeddings:
            for param in self.tok_embeddings.parameters():
                param.requires_grad = False

        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))

        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        if params.use_fairscale:
            self.output = ColumnParallelLinear(
                params.dim,
                params.vocab_size,
                bias=False,
                init_method=lambda x: x,
            )
        else:
            self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # TODO: How too modify this for training?
        self.freqs_cis = precompute_freqs_cis(
            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
        )

    def forward(
        self, tokens: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Return the logits for ``tokens`` of shape ``(bsz, seqlen)``.

        ``attention_mask`` has the same shape as ``tokens`` and is
        detached before being used.
        """
        attention_mask = attention_mask.detach()
        logits = self._forward(tokens, attention_mask)
        return logits

    def _forward(
        self, tokens: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Core forward pass shared by ``forward`` and ``generate``."""
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        # TEMPORARY FIX, need to understand how to manage the positioning
        # embedding and the batch size with the current padding and masking.
        start_pos = 1
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]  # noqa E203
        # mask has size (bsz, seqlen). It should be transformed in
        # (bsz, seqlen, seqlen)
        # if the mask is a boolean tensor, convert it to int
        if attention_mask.dtype == torch.bool:
            attention_mask = attention_mask.long()
        kv_mask = attention_mask[:, None, :].expand(_bsz, seqlen, seqlen)
        # keep only the causal (lower-triangular) part of the mask ...
        kv_mask = torch.tril(kv_mask, diagonal=0)
        # ... and invert it: 1 now marks the positions to hide
        kv_mask = 1 - kv_mask
        # hidden positions get the smallest int64 value, which is later
        # added to the attention scores
        kv_mask = (
            torch.where(
                kv_mask == 1, kv_mask.new_tensor(-9223372036854775808), kv_mask
            )
            .detach()
            .long()
        )

        for i, layer in enumerate(self.layers):
            if self.training:
                # no caching while training
                h, _, _ = layer(h, kv_mask, freqs_cis)
            else:
                h, cache_k, cache_v = layer(
                    h, kv_mask, freqs_cis, self.cache_k[i], self.cache_v[i]
                )
                self.cache_k[i] = cache_k.detach()
                self.cache_v[i] = cache_v.detach()

        h = self.norm(h)
        output = self.output(h)
        return output

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        max_new_tokens: int,
        temperature: float,
        top_p: float = 1.0,
        no_repeat_ngram_size=None,
    ):
        """Autoregressively extend ``input_ids`` by ``max_new_tokens``.

        Args:
            input_ids: Prompt tokens of shape ``(bsz, seqlen)``.
            attention_mask: Mask with the same shape as ``input_ids``; it
                is extended with ones for every generated token.
            max_new_tokens: Number of tokens to generate.
            temperature: Sampling temperature; values ``<= 0`` select
                greedy (argmax) decoding.
            top_p: Nucleus-sampling threshold used when sampling.
            no_repeat_ngram_size: Accepted for interface compatibility,
                currently unused.

        Returns:
            Tensor of shape ``(bsz, seqlen + max_new_tokens)`` holding the
            prompt followed by the generated tokens.
        """
        for _ in range(max_new_tokens):
            # the whole sequence is re-encoded at every step
            logits = self._forward(input_ids, attention_mask)[:, -1, :]
            if temperature > 0:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits, dim=-1)
            next_token = next_token.reshape(-1)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
            attention_mask = torch.cat(
                [attention_mask, torch.ones_like(next_token).unsqueeze(1)],
                dim=1,
            )
        # `input_ids` already contains every generated token; appending
        # them a second time would duplicate the generation.
        return input_ids


def setup_model_parallel() -> Tuple[int, int]:
    """Start the NCCL process group and the model-parallel groups.

    The rank and world size are read from the ``LOCAL_RANK`` and
    ``WORLD_SIZE`` environment variables (both default to -1 when unset).

    Returns:
        Tuple[int, int]: ``(local_rank, world_size)`` as read from the
            environment.
    """
    env = os.environ
    rank = int(env.get("LOCAL_RANK", -1))
    n_procs = int(env.get("WORLD_SIZE", -1))
    print("local_rank:", rank, "world_size:", n_procs)

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(n_procs)
    torch.cuda.set_device(rank)

    # all processes have to share the same seed
    torch.manual_seed(1)
    return rank, n_procs


def setup_model_deepspeed() -> Tuple[int, int]:
    """Start the DeepSpeed distributed backend and select the CUDA device.

    The rank and world size are read from the ``LOCAL_RANK`` and
    ``WORLD_SIZE`` environment variables (both default to -1 when unset).

    Returns:
        Tuple[int, int]: ``(local_rank, world_size)`` as read from the
            environment.
    """
    env = os.environ
    rank = int(env.get("LOCAL_RANK", -1))
    n_procs = int(env.get("WORLD_SIZE", -1))

    deepspeed.init_distributed()
    torch.cuda.set_device(rank)

    # all processes have to share the same seed
    torch.manual_seed(1)
    return rank, n_procs


def load_checkpoints(
    ckpt_dir: str, local_rank: int, world_size: int
) -> Tuple[dict, dict]:
    """Load the checkpoint shard of this rank and the model parameters.

    The ``*.pth`` files found in ``ckpt_dir`` are sorted by name and the one
    at index ``local_rank`` is loaded on CPU. The model hyper-parameters are
    read from ``params.json`` in the same folder.

    Args:
        ckpt_dir (str): Folder containing the ``*.pth`` shards and
            ``params.json``.
        local_rank (int): Index of the shard to load.
        world_size (int): Number of processes; must equal the number of
            shards in ``ckpt_dir``.

    Returns:
        Tuple[dict, dict]: The loaded checkpoint and the parsed parameters.

    Raises:
        AssertionError: If the number of shards differs from ``world_size``.
    """
    root = Path(ckpt_dir)
    shard_paths = sorted(root.glob("*.pth"))
    n_shards = len(shard_paths)
    assert world_size == n_shards, (
        f"Loading a checkpoint for MP={n_shards} but world "
        f"size is {world_size}"
    )
    shard = shard_paths[local_rank]
    print("Loading")
    checkpoint = torch.load(shard, map_location="cpu")
    with open(root / "params.json", "r") as params_file:
        params = json.load(params_file)
    return checkpoint, params


def load_model(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    froze_embeddings: bool,
    use_fairscale: bool,
    max_batch_size: int = 32,
) -> Tuple[Transformer, HFLikeTokenizer]:
    """Build the Transformer from a checkpoint folder and load its weights.

    Args:
        ckpt_dir (str): Folder with the ``*.pth`` shards and ``params.json``.
        tokenizer_path (str): Path of the tokenizer model file.
        local_rank (int): Rank of this process; selects the shard to load.
        world_size (int): Number of processes / checkpoint shards.
        froze_embeddings (bool): Stored on the model arguments.
        use_fairscale (bool): Stored on the model arguments.
        max_batch_size (int): Maximum batch size passed to ``ModelArgs``.

    Returns:
        Tuple[Transformer, HFLikeTokenizer]: The model with the checkpoint
            weights loaded and the tokenizer wrapped in ``HFLikeTokenizer``.
    """
    checkpoint, params = load_checkpoints(ckpt_dir, local_rank, world_size)
    model_args: ModelArgs = ModelArgs(
        max_seq_len=1024, max_batch_size=max_batch_size, **params
    )
    model_args.froze_embeddings = froze_embeddings
    model_args.use_fairscale = use_fairscale
    tokenizer = Tokenizer(model_path=tokenizer_path)
    # the vocabulary size is taken from the tokenizer
    model_args.vocab_size = tokenizer.n_words

    # Create the model with CUDA half tensors as the default tensor type.
    # The default is process-global, so it is restored even when the model
    # construction fails, otherwise later code would silently keep
    # allocating half-precision CUDA tensors.
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    try:
        model = Transformer(model_args)
    finally:
        torch.set_default_tensor_type(torch.FloatTensor)

    # strict=False: keys that do not match between checkpoint and model
    # are not treated as an error
    model.load_state_dict(checkpoint, strict=False)
    tokenizer = HFLikeTokenizer(tokenizer)

    return model, tokenizer


def load_tokenizer(tokenizer_path: str):
    """Return a ``Tokenizer`` built from the model file ``tokenizer_path``."""
    return Tokenizer(model_path=tokenizer_path)


def load_tokenizer_test(tokenizer_path: Optional[str] = None):
    """Return a ``MyTokenizer`` built with ``tokenizer_path`` (may be None)."""
    return MyTokenizer(model_path=tokenizer_path)


def load_model_test(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    froze_embeddings: bool,
    use_fairscale: bool,
    max_batch_size: int = 32,
) -> Tuple[Transformer, HFLikeTokenizer]:
    """Build a Transformer with default ``ModelArgs`` for testing purposes.

    No checkpoint is loaded: ``ckpt_dir``, ``local_rank``, ``world_size``
    and ``max_batch_size`` are accepted only to mirror the signature of
    ``load_model`` and are not used.

    Args:
        ckpt_dir (str): Unused.
        tokenizer_path (str): Path forwarded to ``MyTokenizer``.
        local_rank (int): Unused.
        world_size (int): Unused.
        froze_embeddings (bool): Stored on the model arguments.
        use_fairscale (bool): Stored on the model arguments.
        max_batch_size (int): Unused.

    Returns:
        Tuple[Transformer, HFLikeTokenizer]: The model moved to CUDA and the
            tokenizer wrapped in ``HFLikeTokenizer``.
    """
    # default arguments, customised with the flags given by the caller
    args = ModelArgs()
    args.froze_embeddings = froze_embeddings
    args.use_fairscale = use_fairscale

    # the vocabulary size comes from the test tokenizer
    raw_tokenizer = MyTokenizer(model_path=tokenizer_path)
    args.vocab_size = raw_tokenizer.n_words

    network = Transformer(args).cuda()
    return network, HFLikeTokenizer(raw_tokenizer)


================================================
FILE: optimization/chatllama/chatllama/rlhf/__init__.py
================================================
"""RLHF implementation inspired to Lucidrains' implementation."""


================================================
FILE: optimization/chatllama/chatllama/rlhf/actor.py
================================================
import json
import yaml
import os
import shutil

import deepspeed
import torch
from accelerate import Accelerator
from beartype import beartype
from beartype.typing import Tuple
from einops import rearrange
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

from chatllama.rlhf.config import ConfigActor
from chatllama.rlhf.model_list import (
    hf_models_causal_lm,
    llama_models,
    hf_models,
)

from chatllama.rlhf.model_loader import ModelLoader
from chatllama.rlhf.utils import TrainingStats


class ActorModel(torch.nn.Module):
    """Actor model that generates the augmented prompt from the initial
    user_input. The aim is to train this model to generate better prompts.

    Attributes:
        model: The underlying language model (a LLaMA model or a HF causal
            language model, depending on ``config.model``)
        tokenizer: The tokenizer associated with the model
        config (ConfigActor): Configuration for the actor model

    Methods:
        load: Load the model from a path
        save: Save the model to a path
        load_tokenizer: Load the tokenizer for the configured model
        parameters: Return the parameters of the wrapped model
        forward: Compute the action logits for a given sequence.
        generate: Generate a sequence from a given prompt
    """

    def __init__(self, config: ConfigActor) -> None:
        """Build model and tokenizer according to ``config.model`` and then
        load previously saved weights, if any are found.

        Args:
            config (ConfigActor): Configuration for the actor model

        Raises:
            ValueError: If ``config.model`` is not supported, or PEFT is
                enabled and the PEFT config file does not exist.
        """
        super().__init__()

        # save config
        self.config = config

        # initialize the self.model
        if config.model in llama_models:
            # llama module might not be present when HF models are used
            from chatllama.llama_model import (
                load_model,
                setup_model_parallel,
            )  # noqa

            local_rank, world_size = setup_model_parallel()

            # use load_model_test for testing
            self.model, self.tokenizer = load_model(
                ckpt_dir=config.model_folder,
                tokenizer_path=config.tokenizer_path,
                local_rank=local_rank,
                world_size=world_size,
                froze_embeddings=config.froze_embeddings,
                use_fairscale=config.use_fairscale,
                max_batch_size=config.batch_size,
            )
        elif config.model in hf_models_causal_lm:
            self.tokenizer = self.load_tokenizer(config)
            self.model = AutoModelForCausalLM.from_pretrained(
                config.model,
            )

            # Setup PEFT model
            if config.peft_enable:

                # check that the peft config exist
                if os.path.exists(config.peft_config_path):
                    # Read the peft config from yaml
                    with open(config.peft_config_path, "r") as c:
                        config_peft = yaml.safe_load(c)
                else:
                    raise ValueError(
                        f"PEFT config {config.peft_config_path} not found"
                    )

                print(config_peft)
                # define lora config for peft
                peft_config = LoraConfig(
                    task_type=TaskType.CAUSAL_LM, **config_peft
                )

                # create peft model
                self.model = get_peft_model(
                    model=self.model,
                    peft_config=peft_config,
                )

            self.model.to(config.device)

        else:
            raise ValueError(f"Model {config.model} not supported")

        # load the model from model_folder
        self.load()

    @beartype
    def load(self) -> None:
        """Load the model weights, if a saved model is found.

        The saved file is expected to be a dict holding the weights under
        the key "state_dict" or, as a fallback, "model".
        """
        # check if there is a model to load
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )

        # if there is a model to load
        if path is not None:

            # load the model
            print("Loading ...")
            model_dict = torch.load(path)
            # "state_dict" is the key written by save(); "model" is
            # accepted as an alternative key
            state_dict = model_dict.get("state_dict") or model_dict.get(
                "model"
            )
            self.model.load_state_dict(state_dict)

    @beartype
    def save(self) -> None:
        """Save the model weights under the key "state_dict" at the path
        given by ``ModelLoader.get_model_path``.
        """
        # get the path to save the model
        model_folder, model_name, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )

        # save the model
        print(f"Saving model to {path} ...")
        torch.save(
            {"state_dict": self.model.state_dict()},
            path,
        )

    @staticmethod
    def load_tokenizer(config: ConfigActor):
        """Load the tokenizer from the model name

        Args:
            config (ConfigActor): Configuration for the actor model

        Returns:
            The tokenizer for ``config.model``.

        Raises:
            ValueError: If ``config.model`` is neither a HF model nor a
                LLaMA model.
        """
        if config.model in hf_models:
            # load the tokenizer from HF
            tokenizer = AutoTokenizer.from_pretrained(
                config.model,
                padding_side="left",
                padding=True,
                truncation=True,
                model_max_length=config.max_sequence_length,
            )

            # add eos token if not present
            if tokenizer.eos_token is None:
                tokenizer.eos_token = "</s>"
                tokenizer.eos_token_id = 2  # OPT eos-token-id

            # add pad token if not present
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
                tokenizer.pad_token_id = tokenizer.eos_token_id
        elif config.model in llama_models:

            # llama module might not be present when HF models are used
            from chatllama.llama_model import (
                load_tokenizer,
            )  # noqa

            tokenizer = load_tokenizer(config.tokenizer_path)
        else:
            # without this branch ``tokenizer`` would be unbound below
            raise ValueError(f"Model {config.model} not supported")
        return tokenizer

    def parameters(self):
        """Return the parameters of the model"""
        return self.model.parameters()

    @beartype
    def forward(
        self, sequences: torch.Tensor, sequences_mask: torch.Tensor
    ) -> torch.Tensor:
        """Generate logits to have probability distribution over the vocabulary
            of the actions

        Args:
            sequences (torch.Tensor): Sequences of states and actions used to
                    compute token logits for the whole list of sequences
            sequences_mask (torch.Tensor): Mask for the sequences attention

        Returns:
            logits (torch.Tensor): Logits for the actions taken
        """
        model_output = self.model.forward(
            sequences, attention_mask=sequences_mask
        )
        # need to return logits for the actions
        # (HF causal LMs wrap them in an output object)
        if self.config.model in hf_models_causal_lm:
            model_output = model_output.logits
        if self.config.debug:
            print("ActorModel.forward")
            print("model_output_logits shape", model_output.shape)
            print("model_output logits", model_output)
        return model_output

    @beartype
    @torch.no_grad()
    def generate(
        self, states: torch.Tensor, state_mask: torch.Tensor
    ) -> Tuple:
        """Generate actions and sequences=[states, actions] from state
            (i.e. input of the prompt generator model)

        Args:
            states (torch.Tensor): the input of the user
            state_mask (torch.Tensor): Mask for the state input (for padding)

        Returns:
            actions (torch.Tensor): Actions generated from the state
            sequences (torch.Tensor): Sequences generated from the
                state as [states, actions]

        Raises:
            ValueError: If fewer than ``config.min_tokens`` tokens can be
                generated without exceeding ``config.max_sequence_length``.
        """
        # temperature for the actor
        temperature = self.config.temperature

        # max sequence length for the actor (i.e. prompt + completion)
        max_sequence_length = self.config.max_sequence_length

        # max and min number of tokens to generate
        max_tokens = self.config.max_tokens
        min_tokens = self.config.min_tokens

        # max generation possible given the state and the max sequence length
        max_generation_possible = max_sequence_length - states.shape[1]
        if max_generation_possible < min_tokens:
            raise ValueError(
                f"The prompt is too long w.r.t the "
                f"model sequence length \n"
                f"max_sequence_length={max_sequence_length}\n"
                f"state_length={states.shape[1]}\n"
                f"min_tokens={min_tokens}\n"
                f"max_tokens={max_tokens}\n"
                f"max_generation_possible={max_generation_possible}\n"
            )

        # take the minimum the max_tokens and the max_generation_possible
        max_completion = min(max_tokens, max_generation_possible)

        sequences = self.model.generate(
            input_ids=states,
            attention_mask=state_mask,
            temperature=temperature,
            max_new_tokens=max_completion,
            no_repeat_ngram_size=3,
        )
        # the actions are what follows the prompt in the sequences
        actions = sequences[:, states.shape[1] :]  # noqa E203
        if self.config.debug:
            print(
                f"input length {states.shape[1]} \n"
                f"max sequence length {max_sequence_length} \n"
                f"max completion {max_completion} \n"
                f"generated sequence {sequences.shape[1]} \n"
            )
            print("ActorModel.generate")
            print("state", states)
            print("state shape", states.shape)
            print("sequence shape", sequences.shape)
            print("sequence", sequences)
            print("actions shape", actions.shape)
            print("actions", actions)
        return actions, sequences


class ActorDataset(Dataset):
    """Dataset used to pre-train the actor model.

    The samples are read from a json file containing a list of objects:
    [
        {
            "user_input": "...",
            "completion": "..."
        },
        ...
    ]
    Each sample returned by the dataset is the string obtained by
    concatenating "user_input" and "completion" of one object.

    Attributes:
        path (str): Path of the json file
        data (list): The concatenated samples
    """

    def __init__(
        self,
        path: str,
    ) -> None:
        self.path = path
        with open(path, "r") as json_file:
            records = json.load(json_file)
        # one training text per record: prompt immediately followed by
        # its completion
        self.data = [
            record["user_input"] + record["completion"] for record in records
        ]

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.data[idx]
        return sample

    def __len__(
        self,
    ):
        """Return the number of samples."""
        n_samples = len(self.data)
        return n_samples


class ActorTrainer:
    """Used to pre-train the actor model to generate better prompts.

    Args:
        config (ConfigActor): Configuration for the actor model

    Attributes:
        config (ConfigActor): Configuration for the actor model
        actor (ActorModel): Actor model
        loss_function (torch.nn.CrossEntropyLoss): Loss function
        optimizer (torch.optim.AdamW): Optimizer
        validation_flag (bool): Flag to indicate if the validation dataset
            is provided
        train_dataset (ActorDataset): Training dataset
        train_dataloader (DataLoader): Training dataloader
        eval_dataset (ActorDataset): Validation dataset (only set when
            validation_flag is True)
        validation_dataloader (DataLoader): Validation dataloader (only set
            when validation_flag is True)
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler
        training_stats (TrainingStats): Training statistics
        model_engine (ModelEngine): Model engine for deepspeed training
            (None when deepspeed is disabled)
        accelerator (Accelerator): Accelerator for accelerate training
            (None when accelerate is disabled)

    Methods:
        train: Train the actor model
        load_checkpoint: Load a checkpoint
        save_checkpoint: Save a checkpoint
        add_eos_token: Append the eos token to a batch of sequences
    """

    def __init__(self, config: ConfigActor) -> None:
        """Create model, optimizer, datasets, scheduler and, when enabled,
        the DeepSpeed engine or the Accelerate wrapper.

        Raises:
            ValueError: If both DeepSpeed and Accelerate are enabled, or
                DeepSpeed is enabled with a missing config path.
        """

        # store config
        self.config = config

        # load the model
        self.actor = ActorModel(config)

        # define loss function
        self.loss_function = torch.nn.CrossEntropyLoss()

        # define optimizer
        self.optimizer = torch.optim.AdamW(
            self.actor.parameters(), lr=config.lr, weight_decay=1e-5
        )

        # check if validation dataset is provided
        self.validation_flag = False
        if config.validation_dataset_path is not None:
            self.validation_flag = True

        # create dataset and dataloaders
        self.train_dataset = ActorDataset(config.train_dataset_path)
        self.train_dataloader = DataLoader(
            self.train_dataset, batch_size=config.batch_size
        )
        if self.validation_flag:
            self.eval_dataset = ActorDataset(config.validation_dataset_path)
            self.validation_dataloader = DataLoader(
                self.eval_dataset, batch_size=config.batch_size
            )

        # define scheduler for the learning rate
        # learning rate is decreased until 10% of the initial value
        # T_0 must be a positive integer: clamp it to 1 for datasets
        # smaller than a single batch
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer,
            T_0=max(1, len(self.train_dataset) // config.batch_size),
            T_mult=1,
            eta_min=config.lr * 0.1,
        )

        # define training statistics
        stat_path = ModelLoader.get_training_stats_path(config)
        self.training_stats = TrainingStats(stat_path)

        # consistency check between accelerate and deepspeed
        if config.accelerate_enable and config.deepspeed_enable:
            raise ValueError(
                "Both DeepSpeed and Accelerate are enabled for the Actor."
                "Please choose one of them."
            )

        # initialize deepspeed
        self.model_engine = None
        if config.deepspeed_enable is True:
            if config.deepspeed_config_path is None:
                raise ValueError(
                    "DeepSpeed config path is None, but deepspeed is enabled"
                )
            if os.path.exists(config.deepspeed_config_path) is False:
                raise ValueError(
                    f"DeepSpeed config path {config.deepspeed_config_path}"
                    f"does not exist"
                )
            # deepspeed replaces the optimizer and the training dataloader
            (
                self.model_engine,
                self.optimizer,
                self.train_dataloader,
                _,
            ) = deepspeed.initialize(
                args=None,
                model=self.actor,
                model_parameters=self.actor.parameters(),
                training_data=self.train_dataset,
                config=self.config.deepspeed_config_path,
            )
            print("Training with DeepSpeed")

        # initialize accelerate
        self.accelerator = None
        if config.accelerate_enable is True:
            self.accelerator = Accelerator()
            (
                self.actor,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            ) = self.accelerator.prepare(
                self.actor,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            )
            print("Training with Accelerate")

    @beartype
    def save_checkpoint(
        self,
        current_epoch: int,
        current_step: int,
        max_epochs: int,
        max_steps: int,
    ) -> None:
        """Save the current checkpoint

        Args:
            current_epoch (int): Current epoch
            current_step (int): Current step
            max_epochs (int): Maximum number of epochs
            max_steps (int): Maximum number of steps
        """

        print(
            f"Saving checkpoint for epoch {current_epoch + 1}, "
            f"step {current_step + 1} ..."
        )
        # look for path to save the checkpoint
        model_folder, model_name, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=current_epoch,
            current_step=current_step,
            max_epochs=max_epochs,
            max_steps=max_steps,
        )

        # remove the checkpoint if it already exists
        # (removed as a directory tree with deepspeed, as a file otherwise)
        if os.path.exists(path):
            if self.config.deepspeed_enable:
                shutil.rmtree(path)
            else:
                os.remove(path)

        if self.config.deepspeed_enable:
            # epoch and step are stored as client state to resume training
            client_state = {
                "epoch": current_epoch,
                "step": current_step,
            }
            self.model_engine.save_checkpoint(path, client_state=client_state)
        else:
            # save the checkpoint
            torch.save(
                {
                    "state_dict": self.actor.model.state_dict(),
                    "optim_state_dict": self.optimizer.state_dict(),
                    "training_stats": self.training_stats,
                    "epoch": current_epoch,
                    "step": current_step,
                },
                path,
            )

        # remove old checkpoints
        n_checkpoints_to_keep = self.config.n_checkpoints_to_keep
        ModelLoader.delete_old_checkpoints(
            model_folder, model_name, n_checkpoints_to_keep
        )

    @beartype
    def load_checkpoint(
        self,
    ) -> Tuple[int, int]:
        """Load a checkpoint from the model folder

        Returns:
            Tuple[int, int]: Current epoch and current step to resume
                training. (0, 0) is returned when no checkpoint is found
                or the checkpoint cannot be loaded.
        """

        print("Looking for checkpoints...")
        # look for a checkpoint
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=None,
        )

        # if there is a checkpoint
        if path is not None:
            print("Loading ...")

            if self.config.deepspeed_enable:
                # try to load the checkpoint
                try:
                    _, client_state = self.model_engine.load_checkpoint(path)
                except Exception:
                    print(
                        "Checkpoint corrupted!"
                        "Try to remove the last checkpoint."
                        "Now Starting from epoch 0, step 0"
                    )
                    return 0, 0
                # load epoch and step to resume loops
                epoch = client_state["epoch"]
                step = client_state["step"]
            else:
                # try to load the checkpoint
                try:
                    checkpoint = torch.load(path)
                except Exception:
                    print(
                        "Checkpoint corrupted!"
                        "Try to remove the last checkpoint."
                        "Now Starting from epoch 0, step 0"
                    )
                    return 0, 0

                # assign the checkpoint to the model
                epoch = checkpoint["epoch"]
                self.actor.model.load_state_dict(checkpoint["state_dict"])
                self.optimizer.load_state_dict(checkpoint["optim_state_dict"])
                self.training_stats = checkpoint["training_stats"]
                step = checkpoint["step"]
            # shared by both branches, so that deepspeed runs resume too
            return epoch, step + 1  # return the next episode to train
        return 0, 0

    def add_eos_token(
        self, tokens: torch.Tensor, mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Append the eos token to every sequence and extend the mask.

        Nothing is appended when the sequences already have
        ``config.max_sequence_length`` (or more) tokens.

        Args:
            tokens (torch.Tensor): Token ids, (batch_size, seq_len)
            mask (torch.Tensor): Attention mask of the tokens

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Updated tokens and mask
        """
        # given tokens and mask, add eos token to the end of each sequence
        # and update the mask
        batch_size, seq_len = tokens.shape
        eos_token = self.actor.tokenizer.eos_token_id

        # see if i can append 1 token
        n_tokens_to_append = min(self.config.max_sequence_length - seq_len, 1)
        n_tokens_to_append = max(n_tokens_to_append, 0)

        # concatenate eos to tokens and mask
        if n_tokens_to_append > 0:
            tokens = torch.cat(
                [
                    tokens,
                    torch.ones(batch_size, n_tokens_to_append)
                    .long()
                    .to(tokens.device)
                    * eos_token,
                ],
                dim=1,
            )
            mask = torch.cat(
                [
                    mask,
                    torch.ones(batch_size, n_tokens_to_append)
                    .long()
                    .to(mask.device),
                ],
                dim=1,
            )
        return tokens, mask

    def train(
        self,
    ) -> None:
        """Train the model

        Training resumes from the last checkpoint when one is found. A
        checkpoint is saved every ``config.checkpoint_steps`` iterations,
        the validation loss is computed at the end of each epoch when a
        validation dataset is provided, and the final model is saved at
        the end.
        """
        print("Start Actor Model Pretraining")

        # get config parameters
        if self.config.deepspeed_enable:
            batch_size = self.train_dataloader.batch_size
        else:
            batch_size = self.config.batch_size
        epochs = self.config.epochs
        device = self.config.device
        checkpoint_steps = self.config.checkpoint_steps

        # compute the number of iterations
        n_iter = int(len(self.train_dataset) / batch_size)

        # load model_checkpoint
        start_epoch, start_step = self.load_checkpoint()

        # fresh training: discard previously collected statistics
        if start_epoch == 0 and start_step == 0:
            self.training_stats.clear()

        # counter for the checkpoint
        cnt_checkpoint = 1

        # training loop
        for epoch in range(start_epoch, epochs):
            self.actor.train()
            for i, input_text in enumerate(self.train_dataloader):

                # skip the first steps if we are resuming training
                if i < start_step:
                    continue

                # tokenize input
                with torch.no_grad():
                    input_tokenized = self.actor.tokenizer(
                        input_text,
                        return_tensors="pt",
                        truncation=True,
                        padding=True,
                    )

                    # split tokens and mask
                    input_tokenized_id = input_tokenized["input_ids"]
                    input_tokenized_mask = input_tokenized["attention_mask"]

                    # add eos token
                    (
                        input_tokenized_id,
                        input_tokenized_mask,
                    ) = self.add_eos_token(
                        input_tokenized_id,
                        input_tokenized_mask,
                    )

                    # split into input and output
                    # (the target is the input shifted by one token)
                    training_output = input_tokenized_id[:, 1:]
                    training_input = input_tokenized_id[:, :-1]
                    attention_mask = input_tokenized_mask[:, :-1]

                    # move to device
                    training_output = training_output.to(device)
                    training_input = training_input.to(device)
                    attention_mask = attention_mask.to(device)

                # forward pass
                if self.config.deepspeed_enable:
                    est_output = self.model_engine(
                        training_input, attention_mask
                    )
                else:
                    est_output = self.actor(training_input, attention_mask)

                # compute loss
                est_output = rearrange(est_output, "b s v -> (b s) v")
                training_output = rearrange(training_output, "b s -> (b s)")
                loss = self.loss_function(est_output, training_output)
                self.training_stats.training_loss.append(loss.item())

                # backward pass
                if self.config.deepspeed_enable:
                    self.model_engine.backward(loss)
                    self.model_engine.step()
                elif self.config.accelerate_enable:
                    self.optimizer.zero_grad()
                    self.accelerator.backward(loss)
                    self.optimizer.step()
                    self.scheduler.step()
                else:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    self.scheduler.step()

                # print progress
                if i % self.config.iteration_per_print == 0:
                    print(
                        f"Epoch: {epoch+1}/{epochs}, "
                        f"Iteration: {i+1}/{n_iter}, "
                        f"Training Loss: {loss}"
                    )

                # save checkpoint periodically
                if cnt_checkpoint % checkpoint_steps == 0:
                    self.save_checkpoint(epoch, i, epochs, n_iter)
                    self.training_stats.save()
                    cnt_checkpoint = 1
                else:
                    cnt_checkpoint += 1

            # Validation
            if self.validation_flag:
                self.actor.eval()
                with torch.no_grad():
                    for i, input_text in enumerate(self.validation_dataloader):

                        # tokenize input
                        input_tokenized = self.actor.tokenizer(
                            input_text, return_tensors="pt", padding=True
                        )
                        input_ids = input_tokenized["input_ids"]
                        # move to device, as done for the training tensors,
                        # so that inputs and model are on the same device
                        validation_output = input_ids[:, 1:].to(device)
                        validation_input = input_ids[:, :-1].to(device)
                        attention_mask = input_tokenized["attention_mask"][
                            :, :-1
                        ].to(device)

                        # forward pass
                        est_output = self.actor.forward(
                            validation_input, attention_mask
                        )
                        validation_output = rearrange(
                            validation_output, "b s -> (b s)"
                        )

                        # compute loss
                        est_output = rearrange(est_output, "b s v -> (b s) v")
                        loss = self.loss_function(
                            est_output, validation_output
                        )
                        self.training_stats.validation_loss.append(loss.item())

                        # print progress
                        if i % self.config.iteration_per_print == 0:
                            print(
                                f"Epoch: {epoch+1}/{epochs}, "
                                f"Iteration: {i+1}/{n_iter}, "
                                f"Validation Loss: {loss}"
                            )
            # reset start_step after training is resumed
            start_step = 0

        # save the model
        self.actor.save()
        print("Training Finished ")


================================================
FILE: optimization/chatllama/chatllama/rlhf/config.py
================================================
import yaml
import os
from dataclasses import dataclass

import torch
from beartype import beartype
from beartype.typing import Optional


@dataclass
class ConfigReward:
    """Config parameters for the reward model (also used, through the
    ConfigCritic alias, for the critic model)

    Attributes:
        device (torch.device): Device to be used for the reward model
        model (str): Model to be used for the reward model
        model_folder (str): Path to the folder where model are stored (used
            to load / store finetuned model or checkpoints)
        model_head_hidden_size (int): Hidden size of the reward model head
        max_sequence_length (int): Max sequence length of the reward model
        train_dataset_path (Optional[str]): Path to the training dataset.
            Default to None. To be specified only for the reward model
            training.
        validation_dataset_path (Optional[str]): Path to the validation
            dataset. Default to None. To be specified only for the reward
            model training.
        batch_size (Optional[int]): Batch size to train the reward model.
            Default to None. To be specified only for the reward model
            training.
        epochs (Optional[int]): Number of epochs to train the reward model.
            Default to None. To be specified only for the reward model
            training.
        iteration_per_print (Optional[int]): Number of iterations to print
            the training loss. Default to None. To be specified only for the
            reward model training.
        checkpoint_steps (Optional[int]): Number of steps (backProp) to
            interleave checkpoints. Default to None. To be specified only for
            the reward model training.
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        lr (Optional[float]): Learning rate for the reward model. Default to
            None.
        llm_enable (Optional[bool]): Enable reward model distillation.
            Default to False. Keep it disabled if you dont have an API key.
        llm_model (Optional[str]): Model to be used for the reward model
            distillation. Default to "text-davinci-003".
        llm_temperature (Optional[float]): Temperature for the reward model
            distillation. Default to 0.9.
        llm_max_tokens (Optional[int]): Max tokens for the reward model
            distillation. Default to 64.
        deepspeed_enable (bool): Enable deepspeed for the reward model
            training. Default to False.
        deepspeed_config_path (Optional[str]): Path to the deepspeed config
            file. Default to None.
        is_reward (bool): True if the model is a reward model, False if it
            is a critic model. Default to True.
        accelerate_enable (bool): Enable accelerate for the reward model.
            Default to False.
        debug (bool): Enable prints for debugging. Default to False.
    """

    device: torch.device
    model: str
    model_folder: str
    model_head_hidden_size: int
    max_sequence_length: int
    train_dataset_path: Optional[str] = None
    validation_dataset_path: Optional[str] = None
    batch_size: Optional[int] = None
    epochs: Optional[int] = None
    iteration_per_print: Optional[int] = None
    checkpoint_steps: Optional[int] = None
    checkpoint_name: Optional[str] = None
    lr: Optional[float] = None
    llm_enable: Optional[bool] = False
    llm_model: Optional[str] = "text-davinci-003"
    llm_temperature: Optional[float] = 0.9
    llm_max_tokens: Optional[int] = 64
    deepspeed_enable: bool = False
    deepspeed_config_path: Optional[str] = None

    # critic specific parameters
    is_reward: bool = True
    accelerate_enable: bool = False

    debug: bool = False


# ConfigCritic is an alias of ConfigReward (the very same class object),
# defined just for naming consistency
ConfigCritic = ConfigReward


@dataclass
class ConfigActor:
    """Config parameters for the actor model

    All the attributes are required, except checkpoint_name and debug that
    have a default value.

    Attributes:
        model (str): Model to be used for the actor
        model_folder (str): Path to the folder where model are stored (used
            to load / store finetuned model or checkpoints)
        tokenizer_path (str): Path to the folder where tokenizer are stored
        train_dataset_path (str): Path to the training dataset
        validation_dataset_path (Optional[str]): Path to the validation dataset
        froze_embeddings (bool): Freeze the embeddings of the actor
        use_fairscale (bool): Use fairscale module for the actor instead of
            pytorch native modules.
        max_sequence_length (int): Max sequence length for the actor
        max_tokens (int): Max tokens for actor generation
        min_tokens (int): Min tokens for actor generation
        additonal_prompt_tokens (int): Number of tokens to be used as safety
            to avoid too large sequences and to add a template to the
            dataset
        temperature (float): Temperature for the actor
        batch_size (int): Batch size to train the actor
        iteration_per_print (int): Number of iterations to print the
            training loss
        lr (float): Learning rate for the actor
        epochs (int): Number of epochs to train the actor
        checkpoint_steps (int): Number of steps (backProp) to interleave
            checkpoints.
        n_checkpoints_to_keep (int): Number of checkpoints to keep
            for the actor.
        deepspeed_enable (bool): Enable deepspeed for the actor.
        deepspeed_config_path (Optional[str]): Path to the deepspeed config
            file.
        accelerate_enable (bool): Enable accelerate for the actor
        device (torch.device): Device to be used for the actor
        peft_enable (bool): Enable peft for the actor
        peft_config_path (str): Path to the peft config file.
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        debug (bool): Enable prints for debugging. Default to False.

    """

    model: str
    model_folder: str
    tokenizer_path: str
    train_dataset_path: str
    validation_dataset_path: Optional[str]
    froze_embeddings: bool
    use_fairscale: bool
    max_sequence_length: int
    max_tokens: int
    min_tokens: int
    additonal_prompt_tokens: int
    temperature: float
    batch_size: int
    iteration_per_print: int
    lr: float
    epochs: int
    checkpoint_steps: int
    n_checkpoints_to_keep: int

    deepspeed_enable: bool
    deepspeed_config_path: Optional[str]

    accelerate_enable: bool

    device: torch.device
    peft_enable: bool
    peft_config_path: str
    checkpoint_name: Optional[str] = None
    debug: bool = False


@dataclass
class ConfigTrainer:
    """Config parameters for the trainer, used to configure the reinforcement
    learning training loop

    Attributes:
        actor_lr (float): Learning rate for the actor when training with
            reinforcement learning
        critic_lr (float): Learning rate for the critic when training with
            reinforcement learning
        actor_eps_clip (float): Epsilon clip for the actor
        critic_eps_clip (float): Epsilon clip for the critic
        beta_s (float): Beta for the actor and critic
        gamma_discounted (float): coefficient for the discounted rewards.
        examples_path (str): Path to the examples dataset
        num_episodes (int): Number of episodes, each episodes consist of
            a number of timesteps that are used to generate examples
            stored in the memory buffer.
        max_timesteps (int): Max timesteps for the actor and critic.
            for each timestep a set of examples are sampled and used to
            generate a completion and a reward.
        update_timesteps (int): Number of timesteps to update the actor and
            critic
        num_examples (int): Number of examples to generate for the actor
            and critic. For each iteration of timestep, num_examples are
            sampled from the prompt dataset, processed and stored in the
            memory buffer.
        batch_size (int): Batch size to train the actor and critic.
            This batch is used to aggregate the memory from the memory buffer
            for the actual training of the actor and critic models.
        epochs (int): Number of epochs to train the actor and critic.
        checkpoint_steps (int): Number of episodes to interleave checkpoints.
        device (torch.device): Device to be used for the actor and critic
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        debug (bool): Enable prints for debugging. Default to False.
    """

    # learning rates are fractional values: annotate them as float (the
    # dataclass does not enforce the annotation, so int values still work)
    actor_lr: float
    critic_lr: float
    actor_eps_clip: float
    critic_eps_clip: float
    beta_s: float
    gamma_discounted: float
    examples_path: str
    num_episodes: int
    max_timesteps: int
    update_timesteps: int
    num_examples: int
    batch_size: int
    epochs: int
    checkpoint_steps: int
    device: torch.device
    checkpoint_name: Optional[str] = None
    debug: bool = False


class Config:
    """Store the config parameters for the whole pipeline

    The parameters are read from a yaml file that must contain the
    sections "trainer_config", "actor_config", "critic_config" and
    "reward_config". The device and the debug flag are injected in every
    section before building the corresponding config object.

    Args:
        path (str): Path to the config.yaml file.
        device (Optional[torch.device]): Device to be used for all the
            models. Default to None. If None, cuda is used when available,
            otherwise a ValueError is raised.
        debug (Optional[bool]): Enable prints for debugging. Default to False.

    Attributes:
        trainer (ConfigTrainer): Config parameters for the trainer
        actor (ConfigActor): Config parameters for the actor
        critic (ConfigCritic): Config parameters for the critic
            (is_reward is forced to False)
        reward (ConfigReward): Config parameters for the reward

    Raises:
        ValueError: if no device is given and cuda is not available, or if
            the path does not point to an existing file.
    """

    @beartype
    def __init__(
        self,
        path: str,
        device: Optional[torch.device] = None,
        debug: Optional[bool] = False,
    ) -> None:

        # no device requested: cuda is the only accepted fallback
        if device is None:
            if not torch.cuda.is_available():
                raise ValueError("No GPU available")
            device = torch.device("cuda")
            print(f"Current device used :{str(device)}")

        if path is None or not os.path.exists(path):
            raise ValueError("Path to the config.yaml is not valid")

        # Read the config from yaml
        with open(path, "r") as yaml_file:
            yaml_content = yaml.safe_load(yaml_file)

        # fetch all the sections before building any config object
        section_names = (
            "trainer_config",
            "actor_config",
            "critic_config",
            "reward_config",
        )
        trainer_kwargs, actor_kwargs, critic_kwargs, reward_kwargs = (
            yaml_content[name] for name in section_names
        )

        def with_runtime_options(section):
            # device and debug are not read from the yaml file
            section["device"] = device
            section["debug"] = debug
            return section

        # Trainer Config
        self.trainer = ConfigTrainer(**with_runtime_options(trainer_kwargs))
        # Actor Config
        self.actor = ConfigActor(**with_runtime_options(actor_kwargs))
        # Critic Config: same class of the reward, flagged as critic
        critic_config = ConfigCritic(**with_runtime_options(critic_kwargs))
        critic_config.is_reward = False
        self.critic = critic_config
        # Reward Config
        self.reward = ConfigReward(**with_runtime_options(reward_kwargs))


================================================
FILE: optimization/chatllama/chatllama/rlhf/dataset.py
================================================
import json
import os

import numpy as np

from beartype.typing import Dict, List, Union
from datasets import load_dataset
from chatllama.rlhf.config import Config, ConfigActor, ConfigReward
from chatllama.rlhf.reward import RewardModel, CriticModel
from chatllama.rlhf.actor import ActorModel


ConfigType = Union[Config, ConfigActor, ConfigReward]


class BaseDataset:
    """Base class with the helpers shared by all the datasets: sorting,
    sampling and cleaning of the conversations
    """

    def __init__(
        self,
    ) -> None:
        pass

    @staticmethod
    def sort_conversation(
        conversations: List[Dict],
        only_input: bool = False,
        reverse: bool = True,
        shuffle: bool = True,
    ) -> List[Dict]:
        """Sort the conversations by length of user_input + completion
        or by length of user_input only

        The length is measured in characters, not in tokens.

        Args:
            conversations (List[Dict]): list of conversations
            only_input (bool, optional): sort by length of user_input only.
                Defaults to False.
            reverse (bool, optional): sort in descending order.
                Defaults to True.
            shuffle (bool, optional): shuffle the dataset leaving only the
                first 10 samples sorted. Defaults to True.

        Returns:
            List[Dict]: sorted list of conversations
        """

        # define the sorting function
        if only_input is True:

            def sort_fun(x):
                return len(x["user_input"])

        else:

            def sort_fun(x):
                return len(x["user_input"]) + len(x["completion"])

        # sort
        conversations = sorted(conversations, key=sort_fun, reverse=reverse)

        # shuffle everything but the first 10 samples
        if shuffle is True:
            head, tail = conversations[:10], conversations[10:]
            shuffled_tail = np.random.choice(
                tail, size=len(tail), replace=False
            ).tolist()
            conversations = head + shuffled_tail

        return conversations

    @staticmethod
    def take_n_samples(
        conversations: List[Dict],
        n: int,
    ) -> List[Dict]:
        """Take N samples from the dataset

        Args:
            conversations (List[Dict]): list of conversations
            n (int): number of samples to take randomly, without
                replacement

        Returns:
            List[Dict]: list of N samples
        """

        # sample N number of index from 0 to len(conversations)
        indexes = np.random.choice(len(conversations), size=n, replace=False)
        # take the samples
        return [conversations[i] for i in indexes]

    @staticmethod
    def clean_dataset(config: ConfigType):
        """Clean the datasets by removing too long examples

        The dataset file is rewritten in place only if at least one example
        is removed.

        The Reward Dataset constraints are:
        - user_input + completion <= Reward model max sequence length
        The Actor Dataset constraints are:
        - user_input + completion + safety tokens
          <= Actor model max sequence length
        The RLHF Training Dataset constraints are:
        - user_input + min_completion + safety tokens
          <= Actor model max sequence length
        - user_input + min_completion + safety tokens
          <= Critic model max sequence length
        - user_input + min_completion + safety tokens
          <= Reward model max sequence length

        Args:
            config (ConfigType): config object; the dataset to be cleaned
                depends on its type (Config: RLHF dataset, ConfigActor:
                actor dataset, ConfigReward: reward dataset)

        Raises:
            TypeError: if the config is not one of the supported types
        """

        # collect, for each kind of dataset:
        # - only_input: True if only the user_input has to fit in the model
        # - reserved_tokens: tokens to be left free in the sequence
        # - limits: (tokenizer, max sequence length) for each model involved
        if isinstance(config, Config):
            print("Start cleaning the dataset for RLHF")
            only_input = True
            dataset_path = config.trainer.examples_path
            reserved_tokens = (
                config.actor.min_tokens + config.actor.additonal_prompt_tokens
            )
            r_tokenizer = RewardModel.load_tokenizer(config.reward)
            a_tokenizer = ActorModel.load_tokenizer(config.actor)
            c_tokenizer = CriticModel.load_tokenizer(config.critic)
            limits = [
                (a_tokenizer, config.actor.max_sequence_length),
                (r_tokenizer, config.reward.max_sequence_length),
                (c_tokenizer, config.critic.max_sequence_length),
            ]

        elif isinstance(config, ConfigActor):
            print("Start cleaning the dataset for Actor")
            only_input = False
            dataset_path = config.train_dataset_path
            reserved_tokens = config.additonal_prompt_tokens
            limits = [
                (
                    ActorModel.load_tokenizer(config),
                    config.max_sequence_length,
                )
            ]

        elif isinstance(config, ConfigReward):
            print("Start cleaning the dataset for Reward")
            only_input = False
            dataset_path = config.train_dataset_path
            reserved_tokens = 0
            limits = [
                (
                    RewardModel.load_tokenizer(config),
                    config.max_sequence_length,
                )
            ]

        else:
            raise TypeError(
                f"Unsupported config type: {type(config).__name__}"
            )

        def is_too_long(conversation: Dict) -> bool:
            # True if the example does not fit in at least one of the models
            text = conversation["user_input"]
            if not only_input:
                text = text + conversation["completion"]
            return any(
                len(tokenizer.encode(text, truncation=False))
                + reserved_tokens
                > max_seq_len
                for tokenizer, max_seq_len in limits
            )

        # if there is no dataset, there is nothing to clean
        # (the path of the reward dataset is optional and may be None)
        if dataset_path is None or not os.path.exists(dataset_path):
            print(
                f"Dataset not found at {dataset_path}"
                f" Skipping cleaning of the dataset"
            )
            return

        # load the dataset
        with open(dataset_path, "r") as f:
            conversations = json.load(f)
        old_len = len(conversations)

        # sort in descending order - longest first.
        # shuffle must be disabled here: the removal below stops at the
        # first example that fits, hence it requires the WHOLE list to be
        # ordered, not only its first 10 elements.
        conversations = BaseDataset.sort_conversation(
            conversations,
            only_input=only_input,
            reverse=True,
            shuffle=False,
        )

        # count the too long examples at the beginning of the list.
        # NOTE: the list is ordered by number of characters, which is used
        # as a proxy of the number of tokens.
        n_removed = 0
        while n_removed < old_len and is_too_long(conversations[n_removed]):
            n_removed += 1

        if n_removed == 0:
            print("Dataset is already clean")
            return

        # drop all the too long examples at once
        conversations = conversations[n_removed:]
        print("Number of examples before cleaning: ", old_len)
        print("Number of examples after cleaning: ", len(conversations))

        # restore the usual layout of the saved datasets:
        # the 10 longest examples first, the remaining ones shuffled
        conversations = BaseDataset.sort_conversation(
            conversations,
            only_input=only_input,
            reverse=True,
        )

        # remove the old dataset
        os.remove(dataset_path)

        # save the new dataset
        with open(dataset_path, "w") as f:
            json.dump(conversations, f, indent=4)


class StanfordNLPSHPDataset(BaseDataset):
    """Class for Stanford NLP SHP dataset from HuggingFace"""

    def __init__(
        self,
    ) -> None:
        print("Download the dataset")
        self.dataset = load_dataset("stanfordnlp/SHP")
        print("Download Completed")

    def reformat_dataset(self, data: List) -> List[Dict]:
        """Reformat the dataset to the format required by RLHF

        For each sample the human reference with the highest score is used
        as completion (reference B in case of equal scores).

        Args:
            data (List): dataset from HuggingFace

        Returns:
            List[Dict]: reformatted dataset, each element has the keys
                "user_input", "completion" and "score" (always None)
        """

        # initialize conversations
        conversations = []

        # loop over the dataset
        for d in data:
            if d["score_A"] > d["score_B"]:
                response = d["human_ref_A"]
            else:
                response = d["human_ref_B"]

            # compose user_input template; the trailing new lines of the
            # history are removed before appending the separator
            history = d["history"].rstrip("\n")
            user_input = "Human: " + history + "\n\n##\n\n"

            # compose completion template
            completion = "Assistant: " + response
            conv = {
                "user_input": user_input,
                "completion": completion,
                "score": None,
            }
            conversations.append(conv)

        return conversations

    def save_dataset(
        self, dataset_folder: str, number_of_samples: int, reverse: bool = True
    ) -> None:
        """Save the dataset in the format required by RLHF

        Three files are written in the dataset folder:
        actor_training_data.json (train + test splits),
        reward_training_data.json (random samples of the actor data) and
        rlhf_training_data.json (validation split).

        Args:
            dataset_folder (str): path to the folder where the dataset
                will be saved
            number_of_samples (int): number of samples to take from the
                dataset
            reverse (bool, optional): sort the dataset in descending order.
                Defaults to True.
        """

        print("Generate datasets for RLHF")

        # take the train and test dataset to create the finetuning dataset
        conversations = self.reformat_dataset(self.dataset["train"])
        conversations.extend(self.reformat_dataset(self.dataset["test"]))

        # sort conversations by length of user_input + completion
        conversations = self.sort_conversation(conversations, reverse=reverse)

        # save actor training data
        with open(f"{dataset_folder}/actor_training_data.json", "w") as f:
            json.dump(conversations, f, indent=4)

        # take N samples and sort them
        conversations = self.take_n_samples(conversations, number_of_samples)
        conversations = self.sort_conversation(conversations, reverse=reverse)

        # save reward training data
        with open(f"{dataset_folder}/reward_training_data.json", "w") as f:
            json.dump(conversations, f, indent=4)

        # take the validation dataset for rlhf
        conversations = self.reformat_dataset(self.dataset["validation"])
        # sort the validation dataset by length of user_input only
        conversations = self.sort_conversation(
            conversations,
            only_input=True,
            reverse=reverse,
        )
        # save rlhf training data
        with open(f"{dataset_folder}/rlhf_training_data.json", "w") as f:
            json.dump(conversations, f, indent=4)

        print("Generation Completed")


class AnthropicRLHF(BaseDataset):
    """Class for the Anthropic hh-rlhf dataset from HuggingFace"""

    def __init__(
        self,
    ) -> None:

        print("Download the dataset")
        self.dataset = load_dataset("Anthropic/hh-rlhf")
        print("Download Completed")

    def reformat_dataset(self, data: List) -> List[Dict]:
        """Reformat the dataset to the format required by RLHF

        The "chosen" conversation of each sample is split at its last
        "Assistant:" marker: what comes before is the user_input, what
        comes after is the completion.

        Args:
            data (List): dataset from HuggingFace

        Returns:
            List[Dict]: reformatted dataset, each element has the keys
                "user_input", "completion" and "score" (always None)
        """

        marker = "Assistant:"
        reformatted = []
        for sample in data:
            pieces = sample["chosen"].split(marker)

            # rebuild the conversation that precedes the last answer,
            # putting back the marker removed by the split
            history = pieces[0] + "".join(
                marker + turn for turn in pieces[1:-1]
            )

            reformatted.append(
                {
                    # trailing new lines are replaced by the separator
                    "user_input": history.rstrip("\n") + "\n\n##\n\n",
                    "completion": "Assistant: " + pieces[-1],
                    "score": None,
                }
            )
        return reformatted

    def save_dataset(
        self, dataset_folder: str, number_of_samples: int, reverse: bool = True
    ) -> None:
        """Save the dataset in the format required by RLHF

        Three files are written in the dataset folder:
        actor_training_data.json (train split), reward_training_data.json
        (random samples of the actor data) and rlhf_training_data.json
        (test split).

        Args:
            dataset_folder (str): path to the folder where the dataset
                will be saved
            number_of_samples (int): number of samples to take from the
                dataset
            reverse (bool, optional): sort the dataset in descending order.
                Defaults to True.
        """

        def dump(file_path, samples):
            # write the samples as an indented json file
            with open(file_path, "w") as f:
                json.dump(samples, f, indent=4)

        print("Generate datasets for RLHF")

        # actor dataset: train split sorted by user_input + completion
        actor_data = self.sort_conversation(
            self.reformat_dataset(self.dataset["train"]), reverse=reverse
        )
        dump(f"{dataset_folder}/actor_training_data.json", actor_data)

        # reward dataset: N random samples of the actor dataset, sorted
        reward_data = self.take_n_samples(actor_data, number_of_samples)
        reward_data = self.sort_conversation(reward_data, reverse=reverse)
        dump(f"{dataset_folder}/reward_training_data.json", reward_data)

        # rlhf dataset: test split sorted by length of user_input only
        rlhf_data = self.sort_conversation(
            self.reformat_dataset(self.dataset["test"]),
            only_input=True,
            reverse=reverse,
        )
        dump(f"{dataset_folder}/rlhf_training_data.json", rlhf_data)

        print("Generation Completed")


================================================
FILE: optimization/chatllama/chatllama/rlhf/model_list.py
================================================
# llama models
llama_models = ["llama-7B", "llama-13B", "llama-33B", "llama-65B"]

# HF Models
# encoder-decoder models TODO: still not supported
hf_models_seq_2_seq = [
    "google/flan-t5-xxl",
    "google/flan-t5-xl",
    "google/flan-t5-large",
    "google/flan-t5-base",
    "google/flan-t5-small",
]

# decoder only TODO: codegen is still broken
# NOTE(review): "facebook/opt-11b" does not look like a published OPT
# checkpoint name - confirm it against the HuggingFace hub.
hf_models_causal_lm = [
    "facebook/opt-125m",
    "facebook/opt-1.3b",
    "facebook/opt-2.7b",
    "facebook/opt-6.7b",
    "facebook/opt-11b",
    "facebook/galactica-125m",
    "facebook/galactica-1.3b",
    "facebook/galactica-6.7b",
    "bigscience/bloom-560m",
    "bigscience/bloomz-560m",
    "bigscience/bloom-1b1",
    "bigscience/bloomz-1b1",
    "bigscience/bloom-1b7",
    "bigscience/bloomz-1b7",
    "bigscience/bloom-3b",
    "bigscience/bloomz-3b",
    "bigscience/bloom-7b1",
    "bigscience/bloomz-7b1",
    "EleutherAI/gpt-neo-1.3B",
    "EleutherAI/gpt-neox-20b",
    "EleutherAI/gpt-j-6B",
    "gpt2",
    "gpt2-large",
    "gpt2-xl",
    "benjamin/gerpt2",
    "benjamin/gerpt2-large",
    "Salesforce/codegen-350M-mono",
    "Salesforce/codegen-2B-mono",
    "Salesforce/codegen-6B-mono",
    "Salesforce/codegen-16B-mono",
]

# create a list of all the models from hf
hf_models = hf_models_seq_2_seq + hf_models_causal_lm


================================================
FILE: optimization/chatllama/chatllama/rlhf/model_loader.py
================================================
import os
import shutil

from beartype.typing import Union, Optional, Tuple

from chatllama.rlhf.config import (
    Config,
    ConfigActor,
    ConfigCritic,
    ConfigReward,
)
from chatllama.rlhf.model_list import hf_models

ConfigType = Union[Config, ConfigActor, ConfigCritic, ConfigReward]


class ModelLoader:
    """Resolve on-disk locations of models and of their training checkpoints.

    Every method is a static helper, so the class mostly acts as a namespace
    and is normally used without being instantiated.
    """

    def __init__(
        self,
    ) -> None:
        pass

    @staticmethod
    def _checkpoint_prefix(model_name: str) -> str:
        """Return the prefix shared by all checkpoints of ``model_name``.

        Only a trailing ``.pt`` is stripped: model names may contain dots
        themselves (e.g. ``opt-1.3b``), so cutting at the first dot would
        make different models collide. The ``_epoch_`` marker is part of the
        prefix so that ``gpt2`` does not match ``gpt2-large_epoch_...``.

        Args:
            model_name (str): the model name, with or without ``.pt``
        """
        stem = model_name[:-3] if model_name.endswith(".pt") else model_name
        return f"{stem}_epoch_"

    @staticmethod
    def _list_checkpoints(model_folder: str, model_name: str) -> list:
        """Return the sorted checkpoint names of ``model_name`` in a folder.

        Args:
            model_folder (str): the folder where the checkpoints are saved
            model_name (str): the name of the model
        """
        prefix = ModelLoader._checkpoint_prefix(model_name)
        return sorted(
            f for f in os.listdir(model_folder) if f.startswith(prefix)
        )

    @staticmethod
    def _parse_checkpoint_indices(
        path: str,
    ) -> Tuple[Optional[int], Optional[int]]:
        """Extract ``(epoch, step)`` from a checkpoint file name.

        Recognised names are ``<model>_epoch_<n>.pt`` and
        ``<model>_epoch_<n>_step_<m>.pt``. ``step`` is None when the name has
        no step part; both values are None when the name does not follow the
        convention (e.g. a checkpoint selected by a custom name).

        Args:
            path (str): path (or bare name) of the checkpoint
        """
        name = os.path.basename(path)
        if name.endswith(".pt"):
            name = name[:-3]
        _, marker, tail = name.rpartition("_epoch_")
        if not marker:
            return None, None
        epoch_txt, step_marker, step_txt = tail.partition("_step_")
        try:
            epoch = int(epoch_txt)
            step = int(step_txt) if step_marker else None
        except ValueError:
            return None, None
        return epoch, step

    @staticmethod
    def get_training_stats_path(config: ConfigType) -> str:
        """Method to get the path to the training stats file. Used when saving

        The file is ``training_stats.json`` inside the checkpoint folder of
        the model described by ``config``.

        Args:
            config (ConfigType): the config object
        """
        model_folder, _, _ = ModelLoader.get_model_path(
            config, is_checkpoint=True
        )
        return os.path.join(model_folder, "training_stats.json")

    @staticmethod
    def look_for_last_checkpoint(
        model_folder: str,
        model_name: str,
    ) -> Optional[str]:
        """Method to look for the last checkpoint in the model folder
        checkpoint are saved as {model_name}_epoch_{current_epoch}.pt

        Names are compared as strings, which is why epoch and step numbers
        are zero-padded when the checkpoint name is built.

        Args:
            model_folder (str): the folder where the checkpoints are saved
            model_name (str): the name of the model

        Returns:
            Optional[str]: the file name (not the full path) of the last
                checkpoint, or None if the folder holds none for this model
        """
        checkpoints = ModelLoader._list_checkpoints(model_folder, model_name)
        return checkpoints[-1] if checkpoints else None

    @staticmethod
    def look_for_checkpoint_by_name(
        model_folder: str,
        checkpoint_name: str,
    ) -> Optional[str]:
        """Method to look for a particular checkpoint in the model folder
        checkpoint are saved as
        {model_name}_epoch_{current_epoch}_steps_{current_steps}.pt

        Args:
            model_folder (str): the folder where the checkpoints are saved
            checkpoint_name (str): the name of the checkpoint

        Returns:
            Optional[str]: ``checkpoint_name`` if it exists in the folder,
                None otherwise
        """
        candidate = os.path.join(model_folder, checkpoint_name)
        return checkpoint_name if os.path.exists(candidate) else None

    @staticmethod
    def get_checkpoint_name(config: ConfigType) -> str:
        """Return the checkpoint name stored in the config.

        For a full ``Config`` the value lives in the ``trainer`` section,
        for the other config types it is a direct attribute.
        """
        if isinstance(config, Config):
            return config.trainer.checkpoint_name
        return config.checkpoint_name

    @staticmethod
    def get_base_model_folder_from_config(config: ConfigType) -> str:
        """Return the root folder under which the models are stored.

        Raises:
            ValueError: if the config type is not recognized
        """
        if isinstance(config, (ConfigActor, ConfigReward)):
            return config.model_folder
        if isinstance(config, Config):
            return config.actor.model_folder
        raise ValueError("Config type not recognized during saving or loading")

    @staticmethod
    def get_model_type_from_config(config: ConfigType) -> str:
        """Return the sub-folder name for the kind of model in ``config``.

        Returns:
            str: one of "reward", "critic", "actor" or "actor_rl"

        Raises:
            ValueError: if the config type is not recognized (previously
                None was returned, which only failed later when the value
                was joined into a path)
        """
        if isinstance(config, ConfigReward):
            # here use ad-hoc flag from config to distinguish between
            #  reward and critic
            return "reward" if config.is_reward else "critic"
        if isinstance(config, ConfigActor):
            return "actor"
        if isinstance(config, Config):
            return "actor_rl"
        raise ValueError("Config type not recognized during saving or loading")

    @staticmethod
    def get_model_name_from_config(config: ConfigType) -> str:
        """Return the model name used to build file names.

        For models listed in ``hf_models`` only the last component of the
        identifier is kept (the part after the organisation prefix).

        Raises:
            ValueError: if no model name can be read from the config
        """
        model_name = None
        if isinstance(config, Config):
            model_name = config.actor.model
        elif isinstance(config, (ConfigReward, ConfigActor)):
            model_name = config.model

        if model_name is None:
            raise ValueError("Model name not found")
        if model_name in hf_models:
            return os.path.split(model_name)[-1]
        return model_name

    @staticmethod
    def delete_old_checkpoints(
        model_folder: str, model_name: str, n_ckp_to_keep: int = 5
    ):
        """Method to discard old checkpoints, keeping only the last
        n_ckp_to_keep

        Args:
            model_folder (str): the folder where the checkpoints are saved
            model_name (str): the name of the model
            n_ckp_to_keep (int): the number of checkpoints to keep.
                Non-positive values disable the pruning.
        """
        if n_ckp_to_keep <= 0:
            return

        checkpoints = ModelLoader._list_checkpoints(model_folder, model_name)
        for stale in checkpoints[:-n_ckp_to_keep]:
            stale_path = os.path.join(model_folder, stale)
            # a checkpoint can be a directory rather than a single file
            # (reward.py removes DeepSpeed checkpoints with shutil.rmtree)
            if os.path.isdir(stale_path):
                shutil.rmtree(stale_path)
            else:
                os.remove(stale_path)

    @staticmethod
    def get_model_path(
        config: ConfigType,
        is_checkpoint: bool = False,
        current_epoch: Optional[int] = None,
        current_step: Optional[int] = None,
        max_epochs: int = 1_000_000_000,
        max_steps: int = 1_000_000_000,
    ) -> Tuple[str, str, Optional[str]]:
        """Method to get the path to the right model file. Used when saving
        the model.
        The hierarchy of the model folder is:
        -- model_folder: here store the models trained, for each type of model
                        there is a dedicated folder
            -- actor
            -- critic
            -- reward
            -- actor_rl
            -- checkpoints: here store the checkpoints during training, for
                            each type of model there is a dedicated folder
                -- actor
                -- critic
                -- reward
                -- actor_rl

        The folder is created if it does not exist yet.

        Args:
            config (ConfigType): the config object, contains info of the model
            is_checkpoint (bool): if True, the path is for a checkpoint
            current_epoch (Optional[int]): the current epoch, used to create
                the checkpoint name. If is_checkpoint is True, and
                current_epoch is None, return just the folder and the simple
                model name for the possible checkpoint.
            current_step (Optional[int]): the current step, used to create
                the checkpoint name.
            max_epochs (int): the maximum number of epochs, used to
                size the zero padding in the checkpoint name.
            max_steps (int): the maximum number of steps, used to
                size the zero padding in the checkpoint name.

        Returns:
            model_folder (str): the folder where the model is saved
            model_name (str): the name of the model
            path (Optional[str]): the path to the model. If is_checkpoint is
                True, and current_epoch is None, return None
        """
        model_folder = ModelLoader.get_base_model_folder_from_config(config)

        # Checkpoints live in their own sub-tree
        if is_checkpoint:
            model_folder = os.path.join(model_folder, "checkpoints")

        # One folder per model type (Actor, Critic, Reward, Actor_RL)
        model_type = ModelLoader.get_model_type_from_config(config)
        model_folder = os.path.join(model_folder, model_type)

        if not os.path.exists(model_folder):
            os.makedirs(model_folder, exist_ok=True)
            print(f"Model folder does not exist. Creating it: {model_folder}")

        base_name = ModelLoader.get_model_name_from_config(config)

        if is_checkpoint and current_epoch is not None:
            # pad epoch and step to a common width (i.e. 00000001) so that
            # sorting the file names also sorts them chronologically
            n_char = max(len(str(max_epochs)), len(str(max_steps)))
            suffix = f"_epoch_{str(current_epoch).zfill(n_char)}"
            if current_step is not None:
                suffix += f"_step_{str(current_step).zfill(n_char)}"
            model_name = f"{base_name}{suffix}.pt"
        else:
            model_name = f"{base_name}.pt"

        # without the epoch the exact checkpoint file cannot be known,
        # only its folder and the plain model name
        if is_checkpoint and current_epoch is None:
            return model_folder, model_name, None
        return model_folder, model_name, os.path.join(model_folder, model_name)

    @staticmethod
    def check_model_path(
        config: ConfigType,
        is_checkpoint: bool = False,
        current_epoch: Optional[int] = None,
        current_step: Optional[int] = None,
    ) -> Optional[str]:
        """Method to check if the model path exists to load models
        or checkpoints.

        Args:
            config (ConfigType): the config object, contains info of the model
            is_checkpoint (bool): if True, the path is for a checkpoint
            current_epoch (Optional[int]): the current epoch.
                If is_checkpoint is True, and current_epoch is None,
                the checkpoint named in the config is looked up or, when no
                name is configured, the last checkpoint of the model.
            current_step (Optional[int]): the current step, forwarded to
                build the checkpoint name together with current_epoch.

        Returns:
            path (Optional[str]): the path to the model or checkpoint if it
                exists on disk, None otherwise. The outcome is also printed.
        """
        model_folder, model_name, path = ModelLoader.get_model_path(
            config,
            is_checkpoint,
            current_epoch,
            current_step,
        )

        # No epoch given: resolve the checkpoint by name or take the last one
        if is_checkpoint and current_epoch is None:
            checkpoint_name = ModelLoader.get_checkpoint_name(config)
            if checkpoint_name is not None:
                checkpoint = ModelLoader.look_for_checkpoint_by_name(
                    model_folder, checkpoint_name
                )
            else:
                checkpoint = ModelLoader.look_for_last_checkpoint(
                    model_folder, model_name
                )
            if checkpoint is not None:
                path = os.path.join(model_folder, checkpoint)

        if path is not None and not os.path.exists(path):
            path = None

        if path is None:
            if not is_checkpoint:
                print(
                    f"No previous model found at "
                    f"{model_folder} for model {model_name}"
                )
                return None
            # read the name through the helper: a full Config keeps it under
            # `trainer`, so `config.checkpoint_name` is not always available
            checkpoint_name = ModelLoader.get_checkpoint_name(config)
            if checkpoint_name is not None:
                print(
                    f"No checkpoint found at {model_folder} "
                    f"with name {checkpoint_name}"
                )
            else:
                print(
                    f"No previous checkpoint found at "
                    f"{model_folder} for {model_name}"
                )
            return None

        if not is_checkpoint:
            print(f"Found model at {path}")
            return path

        # the name is modelname_epoch_00000001_step_00000001.pt
        # or modelname_epoch_00000001.pt; a checkpoint selected by a custom
        # name may follow neither pattern
        epoch, step = ModelLoader._parse_checkpoint_indices(path)
        if epoch is None:
            print(f"Found checkpoint at {path}")
        elif step is None:
            print(f"Found checkpoint for epoch {epoch + 1} ...")
        else:
            print(
                f"Found checkpoint for epoch {epoch + 1},"
                f" step {step + 1}..."
            )
        return path

    @staticmethod
    def init_critic_from_reward(config: ConfigCritic) -> None:
        """Method to initialize the critic from the reward model.

        If no critic model file exists yet, the reward model file (when
        present) is copied to the critic location. ``config.is_reward`` is
        flipped temporarily to resolve the reward path and is always set
        back to False before returning.

        Args:
            config (ConfigCritic): the config of the critic model

        Raises:
            ValueError: if the config is flagged as a reward config
        """
        if config.is_reward is True:
            raise ValueError(
                "The config should work for the Critic model, "
                "but the config seems to be for the Reward model"
            )

        try:
            # check that the critic folder is empty
            existing_critic = ModelLoader.check_model_path(config)
            _, _, critic_path = ModelLoader.get_model_path(config)
            if existing_critic is None:
                print("Initializing Critic from Reward model...")
                config.is_reward = True
                reward_path = ModelLoader.check_model_path(config)
                if reward_path is not None:
                    # copy the file in reward_path to critic_path
                    shutil.copy(reward_path, critic_path)
                else:
                    print("Critic Model remains uninitialized")
        finally:
            # never leave the critic config flagged as a reward config
            config.is_reward = False


================================================
FILE: optimization/chatllama/chatllama/rlhf/reward.py
================================================
import json
import shutil
import os

import deepspeed
import torch
from accelerate import Accelerator
from beartype import beartype
from beartype.typing import Iterable, Tuple
from einops.layers.torch import Rearrange
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModel,
    AutoTokenizer,
)

from chatllama.rlhf.config import ConfigReward
from chatllama.rlhf.model_list import hf_models
from chatllama.rlhf.model_loader import ModelLoader
from chatllama.rlhf.utils import TrainingStats


class RewardModel(torch.nn.Module):
    """Model to be trained to predict the reward for RL.
    or to be used as Critic in RL. It is a Language Model with a head
    that predicts the reward (a scalar) for a given sequence of tokens.

    Attributes:
        model (torch.nn.Module): Language model loaded with
            ``AutoModel.from_pretrained``
        tokenizer: Tokenizer loaded with ``AutoTokenizer.from_pretrained``
        head (torch.nn.Module): MLP mapping each hidden state to a scalar
        config (ConfigReward): Config parameters for the reward model

    Methods:
        load_tokenizer: Load the tokenizer for the reward model
        forward: Forward pass of the model (used by the critic)
        save: Save the model
        load: Load the model
        get_reward: Get the reward for a given input (used by the reward model)
        parameters: Return the parameters of the reward model

    """

    def __init__(self, config: ConfigReward) -> None:
        """Build the language model and its head, then load saved weights.

        Args:
            config (ConfigReward): Config parameters for the reward model

        Raises:
            ValueError: if ``config.model`` is not listed in ``hf_models``
        """
        super().__init__()

        # store config
        self.config = config

        # initialize the self.model
        head_hidden_size = config.model_head_hidden_size
        if config.model not in hf_models:
            raise ValueError(f"Model {config.model} not supported")

        self.tokenizer = self.load_tokenizer(config)
        self.model = AutoModel.from_pretrained(config.model)
        head_dim = self.model.config.hidden_size
        if config.model.startswith("gpt2"):
            head_dim = self.model.config.n_embd
        # the trailing Rearrange drops the size-1 last dimension so that the
        # head yields one scalar per token
        self.head = torch.nn.Sequential(
            torch.nn.Linear(head_dim, head_hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(head_hidden_size, 1),
            Rearrange("... 1 -> ..."),
        )

        # load previously saved weights, if any
        self.load()

        # NOTE: the language model is not frozen here; parameters() yields
        # both its weights and the head's

        # move model to device
        self.model.to(config.device)
        self.head.to(config.device)

    @staticmethod
    def load_tokenizer(config: ConfigReward):
        """Load the tokenizer of ``config.model`` from HF.

        The tokenizer pads on the left and is capped at
        ``config.max_sequence_length``. Missing eos / pad tokens are filled
        in so that batches can always be padded.

        Args:
            config (ConfigReward): Config parameters for the reward model
        """
        tokenizer = AutoTokenizer.from_pretrained(
            config.model,
            padding_side="left",
            padding=True,
            truncation=True,
            model_max_length=config.max_sequence_length,
        )

        # add eos token if not present
        if tokenizer.eos_token is None:
            tokenizer.eos_token = "</s>"
            tokenizer.eos_token_id = 2  # OPT  eos token id

        # add pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    @beartype
    def load(self) -> None:
        """Load the model from the path

        Nothing happens when no saved model is found. The language model
        weights are read from the "state_dict" key or, if absent, from the
        "model" key (the one written by ``save``); the head weights are read
        from "head".

        Raises:
            KeyError: if the file holds neither "state_dict" nor "model"
        """
        # look for a pretrained model
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )
        if path is None:
            return

        print("Loading ...")
        # map to CPU so that a file saved from a GPU can be read on any host;
        # load_state_dict copies the values into the existing parameters and
        # __init__ moves the modules to config.device afterwards
        model_dict = torch.load(path, map_location="cpu")

        backbone_state = model_dict.get("state_dict")
        if backbone_state is None:
            backbone_state = model_dict.get("model")
        if backbone_state is None:
            raise KeyError(
                f"No 'state_dict' or 'model' entry found in {path}"
            )
        self.model.load_state_dict(backbone_state)
        self.head.load_state_dict(model_dict["head"])

    @beartype
    def save(self) -> None:
        """Save the model to the path

        The file stores the language model weights under "model" and the
        head weights under "head".
        """
        # get the path to save the model
        _, _, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )

        # save the model
        print(f"Saving model to {path} ...")
        weights = {
            "model": self.model.state_dict(),
            "head": self.head.state_dict(),
        }
        torch.save(weights, path)

    @beartype
    def parameters(
        self,
    ) -> Iterable[torch.nn.Parameter]:
        """Return the parameters of the reward model

        Yields the language model parameters first, then the head's.
        """
        yield from self.model.parameters()
        yield from self.head.parameters()

    @beartype
    def forward(
        self, output_sequence: torch.Tensor, output_sequence_mask: torch.Tensor
    ) -> torch.Tensor:
        """Generate the sequence of rewards for the given output sequence
        what is the quality of the output sequence tokens?

        Args:
            output_sequence (torch.Tensor): The sequence of tokens to be
                evaluated
            output_sequence_mask (torch.Tensor): Mask for the attention

        Returns:
            torch.Tensor: Rewards for the given output sequence
        """
        output = self.model(
            output_sequence, attention_mask=output_sequence_mask
        )

        # What if the output_sequence is longer than the max context of
        # the model?
        rewards = self.head(output.last_hidden_state)
        if self.config.debug:
            print("RewardModel.forward")
            print("output_sequence.shape", output_sequence.shape)
            print("output_sequence", output_sequence)
            print("reward.shape", rewards.shape)
            print("reward", rewards)
        return rewards

    @beartype
    def get_reward(
        self, output_sequence: torch.Tensor, output_sequence_mask: torch.Tensor
    ) -> torch.Tensor:
        """Get the reward for the given output sequence

        Args:
            output_sequence (torch.Tensor): The concatenation of initial input
                and actor output as tokens
            output_sequence_mask (torch.Tensor): Mask for the attention

        Returns:
            torch.Tensor: the reward taken at the last position of each
                sequence

        Raises:
            ValueError: if the sequence is longer than
                ``config.max_sequence_length``
        """
        sequence_length = output_sequence.shape[1]
        if sequence_length > self.config.max_sequence_length:
            raise ValueError(
                f"Output sequence is too long: {sequence_length}"
                f" > {self.config.max_sequence_length}"
            )
        rewards = self.forward(output_sequence, output_sequence_mask)
        return rewards[:, -1]


# just to keep namings consistent: the critic uses the same class as the
# reward model
CriticModel = RewardModel


class RewardDataset(Dataset):
    """Dataset class for the reward model
    read a json file with the following format:
    [
        {
            "user_input": "...",
            "completion": "...",
            "score": ...
        },
        ...
    ]
    Where:
        user_input: the initial input of the user
        completion: the completion generated by the model
        score: the score given by the user to the completion (or by the LLM)

    Samples whose score is missing, null or blank get ``DEFAULT_SCORE``.
    A score of 0 is a valid score and is kept as is.
    """

    # score used for samples that carry no usable score
    DEFAULT_SCORE = 2.5

    def __init__(self, path: str) -> None:
        """Read all the samples from the json file at ``path``."""
        print(f"Loading dataset from {path}")
        with open(path, "r", encoding="utf-8") as f:
            self.data = list(json.load(f))
        print(f"Loaded {len(self.data)} samples")

    def __getitem__(self, idx: int):
        """Return ``(user_input + completion, score)`` for sample ``idx``."""
        sample = self.data[idx]
        raw_score = sample.get("score")

        # test for "no score" explicitly: a plain truthiness check would
        # also discard a legitimate score of 0
        is_blank = isinstance(raw_score, str) and not raw_score.strip()
        if raw_score is None or is_blank:
            score = self.DEFAULT_SCORE
        else:
            score = float(raw_score)

        return (sample["user_input"] + sample["completion"], score)

    def __len__(
        self,
    ):
        """Return the number of samples."""
        return len(self.data)


class RewardTrainer:
    """Class to train the reward model

    Args:
        config (ConfigReward): Config parameters for the model

    Attributes:
        model (RewardModel): Reward model
        config (ConfigReward): Config parameters for the model
        optimizer (torch.optim): Optimizer for the model
        loss_function (torch.nn): Loss function for the model
        validation_flag (bool): Flag to indicate if the validation dataset
            is available
        train_dataset (RewardDataset): Dataset for training
        validation_dataset (RewardDataset): Dataset for validation
        train_dataloader (DataLoader): Dataloader for training
        validation_dataloader (DataLoader): Dataloader for validation
        scheduler (torch.optim.lr_scheduler): Scheduler for the optimizer
        training_stats (List[Dict]): List of dictionaries with the training
            statistics
        model_engine (ModelEngine): Model engine to train the model
            using deepspeed
        accelerator (Accelerator): Accelerator to train the model using
            accelerate by HF.


    Methods:
        train: Train the reward model
        save_checkpoint: Save a checkpoint of the model
        load_checkpoint: Load the latest checkpoint of the model
    """

    def __init__(self, config: ConfigReward) -> None:
        """Build the model, the data pipeline and the training backend.

        Args:
            config (ConfigReward): Config parameters for the reward model

        Raises:
            ValueError: if both DeepSpeed and Accelerate are enabled, or if
                DeepSpeed is enabled without an existing config file
        """
        # the two backends are alternatives: reject the combination before
        # spending time on loading the model and the datasets
        if config.accelerate_enable and config.deepspeed_enable:
            raise ValueError(
                "Both DeepSpeed and Accelerate are enabled for the Reward. "
                "Please choose one of them."
            )

        # save the config
        self.config = config

        # load the model
        self.reward = RewardModel(config)

        # optimizer
        self.optimizer = torch.optim.AdamW(
            self.reward.parameters(), lr=config.lr
        )

        # loss function
        self.loss_function = torch.nn.MSELoss()

        # the validation set is optional
        self.validation_flag = config.validation_dataset_path is not None

        # create dataset and dataloaders
        self.train_dataset = RewardDataset(config.train_dataset_path)
        self.train_dataloader = DataLoader(
            self.train_dataset, batch_size=config.batch_size
        )
        if self.validation_flag:
            self.eval_dataset = RewardDataset(config.validation_dataset_path)
            self.validation_dataloader = DataLoader(
                self.eval_dataset, batch_size=config.batch_size
            )

        # initialize scheduler - learning rate will drop to 10% of the
        # initial value. T_0 must be a positive integer, so clamp it for
        # datasets smaller than one batch.
        steps_per_cycle = max(
            1, len(self.train_dataset) // config.batch_size
        )
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer,
            T_0=steps_per_cycle,
            T_mult=1,
            eta_min=config.lr * 0.1,
            last_epoch=-1,
        )

        # initialize training stats
        stats_path = ModelLoader.get_training_stats_path(config)
        self.training_stats = TrainingStats(stats_path)

        # initialize deepspeed
        self.model_engine = None
        if config.deepspeed_enable is True:

            if config.deepspeed_config_path is None:
                raise ValueError(
                    "DeepSpeed config path is None, but deepspeed is enabled"
                )
            if not os.path.exists(config.deepspeed_config_path):
                raise ValueError(
                    f"DeepSpeed config path {config.deepspeed_config_path} "
                    f"does not exist"
                )
            # deepspeed hands back its own engine, optimizer, dataloader
            # and scheduler, which replace the ones created above
            (
                self.model_engine,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            ) = deepspeed.initialize(
                args=None,
                model=self.reward,
                model_parameters=self.reward.parameters(),
                training_data=self.train_dataset,
                config=self.config.deepspeed_config_path,
            )
            print("Training with DeepSpeed")

        # initialize accelerate
        self.accelerator = None
        if config.accelerate_enable is True:
            self.accelerator = Accelerator()
            (
                self.reward,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            ) = self.accelerator.prepare(
                self.reward,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            )
            print("Training with Accelerate")

    @beartype
    def save_checkpoint(
        self,
        current_epoch: int,
        current_step: int,
        max_epochs: int,
        max_steps: int,
    ) -> None:
        """Save the checkpoints of the model

        With DeepSpeed the engine writes the checkpoint itself; otherwise a
        single file is written holding the language model weights, the head
        weights, the optimizer and scheduler states, the training stats and
        the current epoch / step.

        Args:
            current_epoch (int): Current epoch
            current_step (int): Current step
            max_epochs (int): Maximum number of epochs
            max_steps (int): Maximum number of steps
        """

        print(
            f"Saving checkpoint for epoch {current_epoch + 1}, "
            f" step {current_step} ..."
        )

        # get the path to save the checkpoint
        _, _, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=current_epoch,
            current_step=current_step,
            max_epochs=max_epochs,
            max_steps=max_steps,
        )

        # remove the checkpoint if it already exists; look at what is
        # actually on disk instead of assuming file vs. directory from the
        # current backend
        if os.path.isdir(path):
            shutil.rmtree(path)
        elif os.path.exists(path):
            os.remove(path)

        # save the checkpoint
        if self.config.deepspeed_enable:
            client_state = {
                "epoch": current_epoch,
                "step": current_step,
            }
            self.model_engine.save_checkpoint(path, client_state=client_state)
            return

        torch.save(
            {
                "state_dict": self.reward.model.state_dict(),
                # the head is trained too: without it a resumed run would
                # restart from a freshly initialized head
                "head": self.reward.head.state_dict(),
                "optim_state_dict": self.optimizer.state_dict(),
                "scheduler_state_dict": self.scheduler.state_dict(),
                "training_stats": self.training_stats,
                "epoch": current_epoch,
                "step": current_step,
            },
            path,
        )

    @beartype
    def load_checkpoint(
        self,
    ) -> Tuple[int, int]:
        """Load the checkpoints of the model

        Looks for a checkpoint through ``ModelLoader.check_model_path`` and
        restores from it. A checkpoint that cannot be read is reported and
        training restarts from scratch.

        Returns:
            Tuple[int, int]: The current epoch and step
                from which you should resume the training
                (``(0, 0)`` when nothing could be loaded)
        """

        print("Looking for checkpoints...")
        # look for the checkpoints
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=None,
        )

        # nothing to resume from
        if path is None:
            return 0, 0

        print("Loading ...")

        if self.config.deepspeed_enable:
            # try to load the checkpoint
            try:
                _, client_state = self.model_engine.load_checkpoint(path)
            except Exception:
                print(
                    "Checkpoint corrupted!"
                    "Try to remove the last checkpoint."
                    "Now Starting from epoch 0, step 0"
                )
                return 0, 0
            # load epoch and step to resume loops
            epoch = client_state["epoch"]
            step = client_state["step"]
        else:
            # try to load the checkpoint
            try:
                checkpoint = torch.load(path)
            except Exception:
                print(
                    "Checkpoint corrupted!"
                    "Try to remove the last checkpoint."
                    "Now Starting from epoch 0, step 0"
                )
                return 0, 0

            # load the model parameters and optimizer parameters
            # from the checkpoint
            epoch = checkpoint["epoch"]
            self.reward.model.load_state_dict(checkpoint["state_dict"])
            # checkpoints may lack the head weights (they were not always
            # stored); restore them only when present
            if "head" in checkpoint:
                self.reward.head.load_state_dict(checkpoint["head"])
            self.optimizer.load_state_dict(checkpoint["optim_state_dict"])
            self.scheduler.load_state_dict(
                checkpoint["scheduler_state_dict"]
            )
            self.training_stats = checkpoint["training_stats"]
            step = checkpoint["step"]

        # resume from the step after the one that was saved
        return epoch, step + 1

    def train(
        self,
    ) -> None:
        """Train the reward model.

        Iterates over ``self.train_dataloader`` for ``config.epochs`` epochs,
        resuming from the epoch and step returned by
        ``self.load_checkpoint()``. Each batch is tokenized with the reward
        model tokenizer, scored by the model, and the loss computed by
        ``self.loss_function`` is back-propagated through DeepSpeed,
        Accelerate or plain PyTorch depending on the config flags.

        A checkpoint is written every ``config.checkpoint_steps`` iterations,
        a validation pass runs after each epoch when ``self.validation_flag``
        is set, and the model is saved with ``self.reward.save()`` once all
        the epochs are done.
        """
        print("Start Training the Reward Model")

        # get config parameters
        # (with deepspeed the batch size is read from the dataloader instead
        # of the config)
        if self.config.deepspeed_enable:
            batch_size = self.train_dataloader.batch_size
        else:
            batch_size = self.config.batch_size
        epochs = self.config.epochs
        device = self.config.device
        iteration_per_print = self.config.iteration_per_print
        checkpoint_steps = self.config.checkpoint_steps

        # compute the number of iterations per epoch (only used in the
        # progress messages and passed to save_checkpoint)
        n_iter = int(len(self.train_dataset) / batch_size)

        # load checkpoint: returns the epoch and the step to resume from
        start_epoch, start_step = self.load_checkpoint()

        # counter for the checkpoint: counts iterations since the last save
        # and is reset to 1 right after a checkpoint is written
        cnt_checkpoints = 1

        # training loop
        for epoch in range(start_epoch, epochs):
            self.reward.train()
            for i, inputs in enumerate(self.train_dataloader):

                # skip the steps if resuming from a checkpoint
                # (start_step is reset to 0 at the end of the first epoch)
                if i < start_step:
                    continue

                # get the inputs: each batch is indexed as (texts, scores)
                input_text = inputs[0]
                score = inputs[1]

                # tokenize the input and build the target tensor;
                # no gradient is needed for this preprocessing
                with torch.no_grad():
                    input_tokens = self.reward.tokenizer(
                        input_text,
                        return_tensors="pt",
                        truncation=True,
                        padding=True,
                    )
                    output = torch.as_tensor(
                        score, dtype=torch.float32, device=device
                    )

                # forward pass
                if self.config.deepspeed_enable:
                    # only the last position along dim 1 of the engine
                    # output is kept
                    # NOTE(review): presumably the reward of the last token,
                    # matching what get_reward returns - confirm
                    est_output = self.model_engine(
                        input_tokens["input_ids"].to(device),
                        input_tokens["attention_mask"].to(device),
                    )[:, -1]
                else:
                    est_output = self.reward.get_reward(
                        input_tokens["input_ids"].to(device),
                        input_tokens["attention_mask"].to(device),
                    )

                # compute the loss
                loss = self.loss_function(est_output, output)
                self.training_stats.training_loss.append(loss.item())

                # backward pass
                if self.config.deepspeed_enable:
                    # the engine performs both the backward and the update;
                    # self.optimizer / self.scheduler are not stepped here
                    self.model_engine.backward(loss)
                    self.model_engine.step()
                elif self.config.accelerate_enable:
                    self.optimizer.zero_grad()
                    self.accelerator.backward(loss)
                    self.optimizer.step()
                    self.scheduler.step()
                else:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    self.scheduler.step()

                # print progress
                if i % iteration_per_print == 0:
                    print(
                        f"Epoch: {epoch+1}/{epochs}, "
                        f"Iteration: {i+1}/{n_iter}, "
                        f"Training Loss: {loss.item()}"
                    )
                    # predictions are rounded to one decimal only for display
                    printed_est_output = [
                        round(float(x), 1) for x in est_output.cpu().tolist()
                    ]
                    print(
                        "prediction",
                        printed_est_output,
                        "target",
                        score.cpu().tolist(),
                    )

                # checkpoints saving
                if cnt_checkpoints % checkpoint_steps == 0:
                    self.save_checkpoint(epoch, i, epochs, n_iter)
                    cnt_checkpoints = 1
                else:
                    cnt_checkpoints += 1

            # Validation
            if self.validation_flag:
                self.reward.eval()
                with torch.no_grad():
                    for i, (text, score) in enumerate(
                        self.validation_dataloader
                    ):

                        # tokenize inputs
                        # (unlike training, no truncation is requested here)
                        input_tokens = self.reward.tokenizer(
                            text, return_tensors="pt", padding=True
                        )
                        input_tokens = input_tokens.to(device)
                        # TODO: check on the length of the input tokens if
                        # they are too many it can create problems
                        output = torch.tensor(score, dtype=torch.float32).to(
                            device
                        )

                        # forward pass
                        est_output = self.reward.get_reward(
                            input_tokens["input_ids"],
                            input_tokens["attention_mask"],
                        )

                        # compute loss
                        loss = self.loss_function(est_output, output)
                        self.training_stats.validation_loss.append(loss.item())

                        # print progress
                        # NOTE(review): n_iter is computed from the training
                        # set, so the denominator printed below is the number
                        # of training iterations, not validation ones
                        if i % iteration_per_print == 0:
                            print(
                                f"Epoch: {epoch+1}/{epochs}, "
                                f"Iteration: {i+1}/{n_iter}, "
                                f"Validation Loss: {loss.item()}"
                            )
            # reset start_step after training is resumed, so that only the
            # first resumed epoch skips already-trained steps
            start_step = 0

        # save the model at the end of the training
        self.reward.save()


================================================
FILE: optimization/chatllama/chatllama/rlhf/trainer.py
================================================
import json
import os
import random
from collections import deque, namedtuple

import deepspeed
import torch
import torch.distributed as dist
from accelerate import Accelerator
from beartype import beartype
from beartype.typing import Deque, List, Tuple, Union
from deepspeed.runtime.engine import DeepSpeedEngine
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

from chatllama.rlhf.actor import ActorModel
from chatllama.rlhf.config import (
    Config,
    ConfigActor,
    ConfigCritic,
    ConfigReward,
)
from chatllama.rlhf.model_list import hf_models
from chatllama.rlhf.model_loader import ModelLoader
from chatllama.rlhf.reward import RewardModel, CriticModel
from chatllama.rlhf.utils import TrainingStats, ConversationLog


"""
train()
┌─────────────────────────────┐
│                             │◄─────────────────────────┐
│                             │                          │
│      ┌─────────────┐        │                          │
│      │ user input  │        │                          │ learn()
│      └─────┬───────┘        │             ┌────────────┴─────────────┐
│            │                │             │                          │
│            │                │             │       ┌────────┐         │
│            │                │             │   ┌───│ Update │──┐      │
│            │                │             │   │   └────▲───┘  │      │
│   ┌────────▼────────────┐   │             │   │        │      │      │
│   │  Actor (LLM Model)  │   │             │   │     ┌──┴───┐  │      │
│   └────────┬────────────┘   │             │   │     │ PPO  │  │      │
│            │                │             │   │     └▲────▲┘  │      │
│            │                │             │   │      │    │   │      │
│            │                │             │   │      │    │   │      │
│    ┌───────▼──────┐         │             │ ┌─▼──────┴┐ ┌─┴───▼──┐   │
│    │ Reward Model │         │             │ │  Actor  │ │ Critic │   │
│    └──────────────┘         │             │ └─────────┘ └────────┘   │
│                             │             │                          │
│                             │ x Episodes  └─────────────▲────────────┘
└───────────────┬─────────────┘                           │   x Epochs
                │ store N Examples per Timestep           │  
         ┌──────▼──────┐                                  │
         │             │                                  │
         │  Memories   ├──────────────────────────────────┘
         │             │ (update timesteps x N Examples)
         └─────────────┘
"""  # noqa W291


def change_tokenization(tokens, tokenizer1, tokenizer2):
    """Re-encode token sequences from one tokenizer to another.

    Each sequence is decoded to text with ``tokenizer1``, the pad and eos
    token strings of ``tokenizer1`` are stripped from that text, and the
    cleaned texts are encoded as a single padded batch with ``tokenizer2``.

    Args:
        tokens (torch.Tensor): Batch of token sequences encoded with
            ``tokenizer1``; it is iterated one sequence at a time.
        tokenizer1 (transformers.PreTrainedTokenizer): Tokenizer the tokens
            are currently encoded with.
        tokenizer2 (transformers.PreTrainedTokenizer): Tokenizer used to
            re-encode the decoded text.

    Returns:
        encoded_tokens: Output of calling ``tokenizer2`` on the cleaned texts
            with ``return_tensors="pt"``, ``padding=True`` and
            ``truncation=True``.
    """
    # A tokenizer may define no pad/eos token, in which case the attribute is
    # None and str.replace(None, "") would raise a TypeError: only strip the
    # special tokens that are actually set.
    special_tokens = [
        special
        for special in (tokenizer1.pad_token, tokenizer1.eos_token)
        if special
    ]

    with torch.no_grad():
        decoded_texts = []
        for sequence in tokens:
            text = tokenizer1.decode(sequence)
            # remove the pad tokens first, then the eos tokens
            for special in special_tokens:
                text = text.replace(special, "")
            decoded_texts.append(text)

        # encode the cleaned texts with the target tokenizer
        encoded_tokens = tokenizer2(
            decoded_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

    return encoded_tokens


# Alias for "any per-model config": accepted wherever the actor, the reward
# or the critic configuration can be passed interchangeably.
ConfigType = Union[ConfigActor, ConfigReward, ConfigCritic]


@beartype
def check_model_family(config1: ConfigType, config2: ConfigType) -> bool:
    """Tell whether two configs refer to models of the same family.

    The family is derived from ``config.model``: for Hugging Face models
    (names listed in ``hf_models``) it is the part of the model name before
    the first "-", after dropping the organisation prefix that precedes "/".
    Two models that are both outside ``hf_models`` are considered to be of
    the same family.

    Args:
        config1 (ConfigType): First config
        config2 (ConfigType): Second config

    Returns:
        bool: True if the model family is the same, False otherwise
    """
    first_is_hf = config1.model in hf_models
    second_is_hf = config2.model in hf_models

    # one hugging face model and one non hugging face model never match
    if first_is_hf != second_is_hf:
        return False

    # neither is an hugging face model:
    # for now they could be only LLaMA models
    if not first_is_hf:
        return True

    # both are hugging face models: compare the family prefixes
    families = []
    for name in (config1.model, config2.model):
        # if there is a "/" keep only what follows it
        if "/" in name:
            name = name.split("/")[1]
        families.append(name.split("-")[0])

    return families[0] == families[1]


class ActorCritic(torch.nn.Module):
    """Container for the actor and the critic models used while training
    the actor: it generates actions and values for given sequences.

    Attributes:
        config (Config): Global configuration
        actor (ActorModel): Actor model
        critic (CriticModel): Critic model
        debug (bool): enable prints for Debugging
        use_same_tokenizer (bool): if True the actor and critic use the same
            tokenizer

    Methods:
        forward: given a sequence returns action logits and values (used
            to evaluate the actor during training)
        generate: given a sequence returns action, action logits, values
            sequences and sequences masks (used to generate new sequences
            during acting phase)
        save: save actor and critic weights with torch.save
        save_deepspeed: save a deepspeed engine checkpoint
    """

    def __init__(self, config: Config) -> None:
        super().__init__()
        self.config = config

        self.actor = ActorModel(config.actor)

        # check if critic must be initialized from reward model
        ModelLoader.init_critic_from_reward(config.critic)
        self.critic = CriticModel(config.critic)

        # True when the actor and the critic use the same tokenizer;
        # defaults to False and is overwritten by RLTrainer.__init__
        self.use_same_tokenizer = False

        # debug flag
        self.debug = config.actor.debug

    @beartype
    def load(self) -> None:
        """Load the model from the path.

        Intentionally a no-op: the loading is delegated to the __init__
        methods of the actor and of the critic, which load from their
        respective paths.
        """

    @beartype
    def save(self) -> None:
        """Save the actor and the critic weights.

        The actor path is resolved from the global config (and not from
        ``config.actor``) so that the result of RLHF is stored in the
        actor_rl folder instead of the actor folder used by actor.save().
        The critic is saved as a dict with its "model" and "head" state
        dicts at the path resolved from ``config.critic``.
        """
        # get the path to save the actor
        _, _, actor_path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=False,
        )

        # save the actor
        print(f"Saving model to {actor_path} ...")
        torch.save(
            {"state_dict": self.actor.model.state_dict()},
            actor_path,
        )

        # get the path to save the critic model
        _, _, critic_path = ModelLoader.get_model_path(
            config=self.config.critic,
            is_checkpoint=False,
        )

        # save the critic
        print(f"Saving model to {critic_path} ...")
        torch.save(
            {
                "model": self.critic.model.state_dict(),
                "head": self.critic.head.state_dict(),
            },
            critic_path,
        )

    def save_deepspeed(
        self,
        model_engine: DeepSpeedEngine,
        config: ConfigType,
        client_state: dict = None,
    ):
        """Save a deepspeed model_engine at the path resolved from config.

        Deepspeed counterpart of save(): the destination is resolved with
        ModelLoader.get_model_path from the given config and the engine
        writes its own checkpoint there.

        Args:
            model_engine (DeepSpeedEngine): Engine to be saved
            config (ConfigType): Config used to resolve the destination path
            client_state (dict, optional): Extra state stored along with the
                checkpoint. An empty dict is used when it is not provided.
        """
        # get the path to save the model
        _, _, path = ModelLoader.get_model_path(
            config=config,
            is_checkpoint=False,
        )

        # save the model
        print(f"Saving model to {path} ...")
        model_engine.save_checkpoint(
            save_dir=path, client_state=client_state or {}
        )

    @beartype
    def forward(
        self,
        sequences_actor: torch.Tensor,
        sequences_mask_actor: torch.Tensor,
        sequences_critic: torch.Tensor,
        sequences_mask_critic: torch.Tensor,
        action_len_actor: int,
        action_len_critic: int,
    ) -> Tuple:
        """Given the whole sequences, use the actor forward to get the logits
            for each token in the sequence and the critic forward to get the
            values for each generation step.

        Args:
            sequences_actor (torch.Tensor): Sequences composed of
                [states, actions] for the actor
            sequences_mask_actor (torch.Tensor): Mask for the sequences
                of the actor
            sequences_critic (torch.Tensor): Sequences composed of
                [states, actions] for the critic
            sequences_mask_critic (torch.Tensor): Mask for the sequences
                of the critic
            action_len_actor (int): Length of the actions in the sequences
                for the actor
            action_len_critic (int): Length of the actions in the sequences
                for the critic

        Returns:
            real_actions_logits (torch.Tensor): Actor logits restricted to
                the last ``action_len_actor`` positions of the sequences
            real_values (torch.Tensor): Critic values restricted to the last
                ``action_len_critic`` positions of the sequences
        """

        # use a single forward on the whole sequence
        # to get pi(y | x) and ignore predicted output
        actions_logits = self.actor.forward(
            sequences_actor, sequences_mask_actor
        )

        # use the critic forward to get the values for the actions
        values = self.critic.forward(sequences_critic, sequences_mask_critic)

        # return only logits and values for the actions taken, i.e. the
        # trailing positions of each sequence
        real_actions_logits = actions_logits[:, -action_len_actor:, :]
        real_values = values[:, -action_len_critic:]

        if self.debug:
            print("ActorCritic.forward")
            print("action_len_actor", action_len_actor)
            print("action_len_critic", action_len_critic)
            print("sequences_actor.shape", sequences_actor.shape)
            print("sequences_actor", sequences_actor)
            print("sequences_critic.shape", sequences_critic.shape)
            print("sequences_critic", sequences_critic)
            # print the sliced tensors that are actually returned, so that
            # the output matches its "real_" labels
            print("real_action_logits.shape", real_actions_logits.shape)
            print("real_action_logits", real_actions_logits)
            print("real_values.shape", real_values.shape)
            print("real_values", real_values)

        return (
            real_actions_logits,
            real_values,
        )

    @torch.no_grad()
    @beartype
    def generate(
        self,
        states_actor: torch.Tensor,
        states_mask_actor: torch.Tensor,
        states_critic: torch.Tensor,
    ) -> Tuple:
        """Generate actions, actions_logits, values and sequences from states

        Args:
            states_actor (torch.Tensor): States for the actor
            states_mask_actor (torch.Tensor): Mask for the states for the
                actor
            states_critic (torch.Tensor): States for the critic

        Returns:
            actions (torch.Tensor): Actions generated from the states
            actions_logits (torch.Tensor): Logits for the actions generated
                from the states (i.e. pi(y | x))
            values (torch.Tensor): Values generated by the critic model
                for the actions generated by the actor (i.e. V(x))
            sequences_actor (torch.Tensor): Sequences generated from the
                states as [states, actions], in the actor tokenization
            sequences_mask_actor (torch.Tensor): Mask that is 0 on the pad
                tokens of sequences_actor and 1 elsewhere
            sequences_critic (torch.Tensor): Same sequences in the critic
                tokenization
            sequences_mask_critic (torch.Tensor): Mask for sequences_critic
            action_len_actor (int): Length of the actions for the actor
            action_len_critic (int): Length of the actions for the critic
        """

        # generate action sequence from the actor
        actions, sequences_actor = self.actor.generate(
            states_actor, states_mask_actor
        )

        # create mask for the actor sequences: 1 on real tokens, 0 on pads
        sequences_mask_actor = (
            (sequences_actor != self.actor.tokenizer.pad_token_id)
            .to(sequences_actor.device)
            .long()
            .detach()
        )

        # get the length of the actions
        action_len_actor = actions.shape[1]

        # check if different encoding is needed for the critic
        if self.use_same_tokenizer:
            sequences_critic = sequences_actor
            sequences_mask_critic = sequences_mask_actor
            action_len_critic = action_len_actor
        else:
            encoded_critic = change_tokenization(
                sequences_actor,
                self.actor.tokenizer,
                self.critic.tokenizer,
            )
            # split the encoded_critic in tokens and mask
            sequences_critic = encoded_critic["input_ids"].to(
                sequences_actor.device,
            )
            sequences_mask_critic = (
                encoded_critic["attention_mask"]
                .to(sequences_actor.device)
                .long()
                .detach()
            )

            # compute len of actions for the critic tokenizer
            # NOTE(review): this is the length of the critic *states*, not
            # of the generated actions (which would rather be
            # sequences_critic.shape[1] - states_critic.shape[1]) - confirm
            # that this is the intended value before relying on it
            action_len_critic = states_critic.shape[1]

        # generate actions_logits and values
        actions_logits, values = self.forward(
            sequences_actor,
            sequences_mask_actor,
            sequences_critic,
            sequences_mask_critic,
            action_len_actor,
            action_len_critic,
        )
        if self.debug:
            print("ActorCritic.generate")
            print("actions shape", actions.shape)
            print("actions", actions)
            print("sequence shape", sequences_actor.shape)
            print("sequence", sequences_actor)
            print("actions_logits shape", actions_logits.shape)
            print("actions_logits", actions_logits)
            print("values shape", values.shape)
            print("values", values)

        return (
            actions,
            actions_logits,
            values,
            sequences_actor,
            sequences_mask_actor,
            sequences_critic,
            sequences_mask_critic,
            action_len_actor,
            action_len_critic,
        )


# Record type holding everything stored for a single experience; the fields
# are read back, in this same order, by ExperienceDataset.__getitem__
Memory = namedtuple(
    "Memory",
    field_names=(
        "states_actor",
        "actions",
        "values",
        "rewards",
        "actions_log_probs",
        # full sequences and masks, once per tokenization
        "sequences_actor",
        "sequences_mask_actor",
        "sequences_critic",
        "sequences_mask_critic",
        # lengths of the action part of the sequences
        "action_len_actor",
        "action_len_critic",
    ),
)


class ExperienceDataset(Dataset):
    """Dataset wrapping the collected memories to train the actor-critic
    models.

    The memories are copied into a list at construction time. The ``device``
    argument is accepted but not used: the items are returned exactly as
    they were stored.
    """

    def __init__(
        self,
        memories: Deque[Memory],
        device: torch.device,
    ) -> None:
        super().__init__()
        # snapshot of the deque, to allow access by index
        self.data = list(memories)

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx) -> Tuple:
        # unpack the idx-th memory into a plain tuple, casting the two
        # action lengths to int
        memory = self.data[idx]
        return (
            memory.states_actor,
            memory.actions,
            memory.values,
            memory.rewards,
            memory.actions_log_probs,
            memory.sequences_actor,
            memory.sequences_mask_actor,
            memory.sequences_critic,
            memory.sequences_mask_critic,
            int(memory.action_len_actor),
            int(memory.action_len_critic),
        )


class ExamplesSampler:
    """Store the prompts to be sampled to generate the examples.

    The prompts are read from a json file with the following format:
    [
        {
            "user_input" : "",
        } ,
        ...
    ]
    Where:
        user_input: is the input of the user or directly the input of the user
            with the memory preappended (i.e. user_input + memory)

    Attributes:
        path (str): Path of the json file the prompts were read from
        data (List): The "user_input" value of every entry of the file
    """

    def __init__(
        self,
        path: str,
    ) -> None:
        self.path = path
        # JSON is UTF-8 encoded: do not depend on the platform default
        # encoding, which would garble non-ASCII prompts on some systems
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.data = [d["user_input"] for d in data]

    def sample(self, n: int) -> List:
        """Sample n examples from the data, without replacement

        Args:
            n (int): Number of examples to sample

        Returns:
            List: n distinct entries of the data, in random order

        Raises:
            ValueError: if n is negative or larger than the number of
                examples available (raised by random.sample)
        """
        return random.sample(self.data, n)


class RLTrainer:
    """Train the actor-critic model using RL

    Attributes:
        config (Config): Configuration of the trainer
        debug (bool): Debug mode
        actorcritic (ActorCritic): Actor-critic model
        actor_optimizer (torch.optim): Optimizer for the actor
        critic_optimizer (torch.optim): Optimizer for the critic
        actor_scheduler (torch.optim.lr_scheduler): Scheduler for the actor
        critic_scheduler (torch.optim.lr_scheduler): Scheduler for the critic
        reward (RewardModel): Reward model
        training_stats (TrainingStats): Class to store training stats
        conversation_log (ConversationLog): Class to store the conversation
        example_sampler (ExamplesSampler): Class to sample examples
        eps (float): small epsilon to avoid division by zero

    Methods:
        train: the training loop that calls the learn function after generating
            the experiences.
        learn: Learn from a batch of experiences and update the actor and the
            critic model.
        load_checkpoint: Load the checkpoint of the actor-critic model
        save_checkpoint: Save the checkpoint of the actor-critic model
    """

    def __init__(
        self,
        config: Config,
    ) -> None:
        """Build the models, the optimizers, the loggers and, when enabled,
        the deepspeed engines used for the RL training.

        Args:
            config (Config): Global configuration; its ``actor``, ``critic``,
                ``reward`` and ``trainer`` sections are read here.
        """
        self.config = config
        self.debug = config.trainer.debug

        # actor-critic pair and one Adam optimizer for each of the two models
        self.actorcritic = ActorCritic(config)
        self.actor_optimizer = torch.optim.Adam(
            self.actorcritic.actor.parameters(), lr=config.trainer.actor_lr
        )
        self.critic_optimizer = torch.optim.Adam(
            self.actorcritic.critic.parameters(), lr=config.trainer.critic_lr
        )

        # the schedulers need the dataset size, so they are only created
        # inside the learn() method
        self.actor_scheduler = None
        self.critic_scheduler = None

        # reward model
        self.reward = RewardModel(config.reward)

        # training stats and conversation log; the latter is stored in the
        # folder of the checkpoints
        stats_path = ModelLoader.get_training_stats_path(config)
        self.training_stats = TrainingStats(stats_path)
        checkpoint_folder, _, _ = ModelLoader.get_model_path(
            config,
            is_checkpoint=True,
        )
        log_path = os.path.join(checkpoint_folder, "conversations_log.json")
        self.conversation_log = ConversationLog(log_path)

        # sampler of the prompts used to generate the experiences
        self.example_sampler = ExamplesSampler(config.trainer.examples_path)

        # tokenizer compatibility: actor vs critic, and actor vs reward
        self.actorcritic.use_same_tokenizer = check_model_family(
            config.actor, config.critic
        )
        self.use_same_tokenizer = check_model_family(
            config.actor, config.reward
        )

        # small epsilon to avoid division by zero
        self.eps = 1e-8

        # deepspeed state: the engines stay None unless they are enabled
        self.actor_model_engine = None
        self.critic_model_engine = None

        # NOTE(review): the flag only depends on the actor and on the critic;
        # a reward model using deepspeed alone does not set it. This looks
        # like it may be an oversight, but learn() picks its engine from the
        # actor/critic engines only, so confirm before extending it.
        self.is_deepspeed_init = bool(
            config.actor.deepspeed_enable or config.critic.deepspeed_enable
        )
        if self.is_deepspeed_init:
            deepspeed.init_distributed("nccl")
            os.environ["TOKENIZERS_PARALLELISM"] = "False"

        # wrap every model that asks for it in a deepspeed engine; the
        # module and the optimizer are replaced by the ones of the engine
        if config.actor.deepspeed_enable:
            engine, module, optimizer = self.initialize_deepspeed_model(
                config=config.actor, model=self.actorcritic.actor
            )
            self.actor_model_engine = engine
            self.actorcritic.actor = module
            self.actor_optimizer = optimizer

        if config.critic.deepspeed_enable:
            engine, module, optimizer = self.initialize_deepspeed_model(
                config=config.critic, model=self.actorcritic.critic
            )
            self.critic_model_engine = engine
            self.actorcritic.critic = module
            self.critic_optimizer = optimizer

        if config.reward.deepspeed_enable:
            # the reward model is not trained here: only the module is kept
            _, self.reward, _ = self.initialize_deepspeed_model(
                config=config.reward, model=self.reward
            )

    @staticmethod
    def initialize_deepspeed_model(
        config: Union[ConfigActor, ConfigCritic, ConfigReward],
        model: torch.nn.Module,
    ):
        """Wrap a model in a deepspeed engine.

        Args:
            config: Config of the model; its ``deepspeed_config_path`` must
                point to an existing deepspeed configuration file.
            model (torch.nn.Module): Model to be wrapped

        Returns:
            tuple: (model_engine, model_engine.module, optimizer), where the
                optimizer is the one returned by deepspeed.initialize

        Raises:
            ValueError: if the deepspeed config path is None or does not
                exist
        """
        config_path = config.deepspeed_config_path
        if config_path is None:
            raise ValueError(
                "DeepSpeed config path is None, but deepspeed is enabled"
            )
        if not os.path.exists(config_path):
            # keep the path separated from the surrounding words, otherwise
            # the message is hard to read
            raise ValueError(
                f"DeepSpeed config path {config_path} does not exist"
            )

        model_engine, ds_optimizer, _, _ = deepspeed.initialize(
            args=None,
            model=model,
            model_parameters=model.parameters(),
            config=config_path,
        )
        # model_engine.module has to be returned to make custom methods
        # of Module accessible
        return model_engine, model_engine.module, ds_optimizer

    @beartype
    def save_checkpoint(
        self,
        current_episode: int,
        max_episode: int,
    ) -> None:
        """Save a checkpoint of the critic and a checkpoint of the actor.

        Each checkpoint holds the episode, the model state dict and the
        optimizer state dict; the actor one also holds the training stats.
        A model that uses deepspeed is saved through its own engine, with
        the checkpoint dict passed as client state; otherwise the dict is
        written with torch.save.

        Args:
            current_episode (int): Episode that has just been completed
            max_episode (int): Total number of episodes, used to build the
                checkpoint path
        """

        print(f"Saving checkpoint for episode {current_episode+1}..")

        # ---- critic ----
        # get the path to save the checkpoint for the critic
        _, _, path = ModelLoader.get_model_path(
            config=self.config.critic,
            is_checkpoint=True,
            current_epoch=current_episode,
            max_epochs=max_episode,
            max_steps=0,
        )

        # if the checkpoint already exists remove it.
        # Deepspeed checkpoints are already directories and will be overwritten
        if os.path.exists(path) and not self.is_deepspeed_init:
            os.remove(path)

        critic_checkpoint_dict = {
            "episode": current_episode,
            "critic_state_dict": self.actorcritic.critic.state_dict(),
            "critic_optim_state_dict": self.critic_optimizer.state_dict(),
        }

        # the critic checkpoint must be gated on the critic flag and written
        # by the critic engine (the actor engine is None unless the actor
        # itself uses deepspeed, and holds the wrong model anyway)
        if self.config.critic.deepspeed_enable:
            # The model and optimizer state dicts are actually already saved
            # In the deepspeed model engine. But to make sure no depending
            # methods fail, the states are included in critic_checkpoint_dict.
            # ATTENTION: If you use deepspeed zero optimization, the
            # client_state will not be saved
            self.critic_model_engine.save_checkpoint(
                save_dir=path, client_state=critic_checkpoint_dict
            )
        else:
            torch.save(critic_checkpoint_dict, path)

        # ---- actor ----
        # get the path to save the checkpoint for the actor
        _, _, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=current_episode,
            max_epochs=max_episode,
            max_steps=0,
        )

        # if the checkpoint already exists remove it.
        # Deepspeed checkpoints are already directories and will be overwritten
        if os.path.exists(path) and not self.is_deepspeed_init:
            os.remove(path)

        actor_checkpoint_dict = {
            "episode": current_episode,
            "actor_state_dict": self.actorcritic.actor.state_dict(),
            "actor_optim_state_dict": self.actor_optimizer.state_dict(),
            "training_stats": self.training_stats,
        }

        if self.config.actor.deepspeed_enable:
            # The model and optimizer state dicts are actually already saved
            # In the deepspeed model engine. But to make sure no depending
            # methods fail, the states are included in actor_checkpoint_dict.
            # ATTENTION: If you use deepspeed zero optimization, the
            # client_state will not be saved
            self.actor_model_engine.save_checkpoint(
                save_dir=path, client_state=actor_checkpoint_dict
            )
        else:
            torch.save(actor_checkpoint_dict, path)

    @beartype
    def load_checkpoint(
        self,
    ) -> int:
        """Restore the critic and the actor from their latest checkpoints.

        For each of the two models the checkpoint is looked up with
        ModelLoader.check_model_path; when one is found, the model and the
        optimizer state dicts are loaded (and the training stats too, for
        the actor). A checkpoint that cannot be read aborts the loading.

        Returns:
            int: Episode to resume the training from: 0 when nothing was
                found or a checkpoint could not be read, otherwise the
                episode after the oldest of the two loaded checkpoints.
        """
        # -1 stands for "no checkpoint found", so a fresh run resumes at 0
        last_critic_episode = -1
        last_actor_episode = -1

        # ---- critic ----
        print("Looking for checkpoints...")
        critic_path = ModelLoader.check_model_path(
            config=self.config.critic,
            is_checkpoint=True,
            current_epoch=None,
        )
        if critic_path is not None:
            print("Loading ...")
            try:
                critic_state = torch.load(critic_path)
            except Exception:
                print(
                    "Checkpoint of critic corrupted!"
                    "Try to remove the last checkpoint."
                    "Now Starting from episode 0"
                )
                return 0

            # restore the critic and its optimizer
            last_critic_episode = critic_state["episode"]
            self.actorcritic.critic.load_state_dict(
                critic_state["critic_state_dict"]
            )
            self.critic_optimizer.load_state_dict(
                critic_state["critic_optim_state_dict"]
            )

        # ---- actor ----
        print("Looking for checkpoints...")
        actor_path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=None,
        )
        if actor_path is not None:
            print("Loading ...")
            try:
                actor_state = torch.load(actor_path)
            except Exception:
                print(
                    "Checkpoint of actor corrupted!"
                    "Try to remove the last checkpoint."
                    "Now Starting from episode 0"
                )
                return 0

            # restore the actor, its optimizer and the training stats
            last_actor_episode = actor_state["episode"]
            self.actorcritic.actor.load_state_dict(
                actor_state["actor_state_dict"]
            )
            self.actor_optimizer.load_state_dict(
                actor_state["actor_optim_state_dict"]
            )
            self.training_stats = actor_state["training_stats"]

        # when the two checkpoints disagree, warn and restart from the
        # episode after the oldest one
        if last_critic_episode != last_actor_episode:
            print(
                f"There are some discrepancies between the checkpoints"
                f"of actor and critic \nactor episode: {last_actor_episode}"
                f"\n critic episode: {last_critic_episode}\n"
            )
            return min(last_critic_episode, last_actor_episode) + 1

        # all ok start from next episode
        return last_critic_episode + 1

    @beartype
    def learn(self, memories: Deque[Memory]) -> None:
        """Train the actor-critic model on the collected memories using PPO.

        For every epoch and every batch built from the memories:
        - run the actor-critic forward pass to get new action logits and
            values
        - compare the new action log-probs with the stored ones and the
            values with the rewards to build the clipped policy loss (plus
            the entropy and KL terms) and the clipped value loss
        - update the actor with the policy loss and the critic with the
            value loss, through deepspeed, accelerate or plain PyTorch
            depending on the configuration
        - append both losses to the training stats

        Args:
            memories (Deque[Memory]): Experiences collected by train().

        Raises:
            ValueError: If the actor loss or the value loss is NaN.
        """
        print("Start to Learn...")

        # get parameters
        epochs = self.config.trainer.epochs
        actor_eps_clip = self.config.trainer.actor_eps_clip
        critic_eps_clip = self.config.trainer.critic_eps_clip
        beta_s = self.config.trainer.beta_s
        batch_size = self.config.trainer.batch_size
        device = (
            torch.device(f"cuda:{dist.get_rank()}")
            if self.is_deepspeed_init
            else self.config.trainer.device
        )

        # create dataset from memories
        dataset = ExperienceDataset(memories, device)
        if self.is_deepspeed_init:
            engine = self.actor_model_engine or self.critic_model_engine
            dataloader = engine.deepspeed_io(dataset)
        else:
            dataloader = DataLoader(dataset, batch_size=batch_size)

        # len(dataloader) is already the number of batches per epoch
        num_steps = len(dataloader)

        # initialize scheduler for actor
        actor_lr = self.config.trainer.actor_lr
        # This lr_scheduler is not available in deepspeed
        # see https://deepspeed.readthedocs.io/en/latest/schedulers.html
        if not self.is_deepspeed_init:
            self.actor_scheduler = CosineAnnealingWarmRestarts(
                self.actor_optimizer, T_0=len(dataset), eta_min=actor_lr * 0.1
            )

        # initialize scheduler for critic
        critic_lr = self.config.trainer.critic_lr
        # This lr_scheduler is not available in deepspeed
        # see https://deepspeed.readthedocs.io/en/latest/schedulers.html
        if not self.is_deepspeed_init:
            self.critic_scheduler = CosineAnnealingWarmRestarts(
                self.critic_optimizer, T_0=len(dataset), eta_min=critic_lr * 0.1
            )

        # initialize actor accelerate
        if self.config.actor.accelerate_enable is True:
            actor_accelerator = Accelerator()
            (
                actor_model,
                self.actor_optimizer,
                self.train_dataloader,
                self.actor_scheduler,
            ) = actor_accelerator.prepare(
                self.actorcritic.actor,
                self.actor_optimizer,
                self.train_dataloader,
                self.actor_scheduler,
            )
            self.actorcritic.actor = actor_model

        # initialize critic accelerate
        if self.config.critic.accelerate_enable is True:
            critic_accelerator = Accelerator()
            (
                critic_model,
                self.critic_optimizer,
                self.critic_scheduler,
            ) = critic_accelerator.prepare(
                self.actorcritic.critic,
                self.critic_optimizer,
                self.critic_scheduler,
            )
            self.actorcritic.critic = critic_model

        # train agent-critic
        self.actorcritic.train()
        for epoch in range(epochs):
            for k, batch in enumerate(dataloader):

                (
                    states_actor,
                    old_actions,
                    old_values,
                    rewards,
                    old_actions_log_probs,
                    sequences_actor,
                    sequences_mask_actor,
                    sequences_critic,
                    sequences_mask_critic,
                    action_len_actor,
                    action_len_critic,
                ) = [tensor.to(device) for tensor in batch]

                if self.debug:
                    print(
                        f"#########################################"
                        f" batch from memories {k} \n "
                        f"#########################################"
                        f"states_actor {states_actor.shape} \n"
                        f"old_actions {old_actions.shape} \n"
                        f"old_values {old_values.shape} \n"
                        f"rewards {rewards.shape} \n"
                        f"old_actions_log_probs "
                        f"{old_actions_log_probs.shape}\n"
                        f"sequences_actor {sequences_actor.shape} \n"
                        f"sequences_mask_actor "
                        f"{sequences_mask_actor.shape} \n"
                        f"sequences_critic {sequences_critic.shape} \n"
                        f"sequences_mask_critic "
                        f"{sequences_mask_critic.shape} \n"
                        f"action_len_actor {action_len_actor} \n"
                        f"action_len_critic {action_len_critic} \n"
                        f"#########################################"
                    )

                # get actor critic new probabilities and values
                # NOTE: .item() requires the action lengths to hold a single
                # element each
                actions_logits, values = self.actorcritic.forward(
                    sequences_actor,
                    sequences_mask_actor,
                    sequences_critic,
                    sequences_mask_critic,
                    action_len_actor.item(),
                    action_len_critic.item(),
                )

                # get action log prob (probability of the most likely token)
                actions_prob = (
                    torch.softmax(actions_logits, dim=-1).max(dim=-1).values
                )
                actions_log_prob = torch.log(actions_prob + self.eps)

                # compute entropy
                # NOTE(review): this is sum(p * log p), i.e. the negative of
                # the usual entropy definition - confirm that the sign of the
                # beta_s term below is the intended one
                entropies = (actions_prob * actions_log_prob).sum(dim=-1)

                # compute KL divergence
                kl_div_loss = (
                    (actions_prob * (old_actions_log_probs - actions_log_prob))
                    .sum(dim=-1)
                    .mean()
                )

                # compute ratios
                ratios = (actions_log_prob - old_actions_log_probs).exp()

                # compute PPO loss
                if check_model_family(self.config.actor, self.config.critic):
                    # compute discounted rewards as in TRL
                    gamma = self.config.trainer.gamma_discounted
                    discounted_rewards = torch.zeros_like(old_values)
                    for i in range(discounted_rewards.shape[1]):
                        for j in range(i, discounted_rewards.shape[1]):
                            discounted_rewards[:, i] += (
                                gamma ** (j - i) * rewards[:, j]
                            )

                    advantages = (
                        discounted_rewards - old_values
                    )  # TRL has opposite sign for old values
                    # keepdim=True keeps the per-sequence mean broadcastable
                    # against advantages for any batch size (without it the
                    # subtraction is only correct for a batch of one)
                    advantages = (
                        advantages - advantages.mean(dim=-1, keepdim=True)
                    ) / (advantages.std() + self.eps)

                    surr1 = advantages * ratios
                else:
                    advantages = rewards - old_values[:, -1]
                    surr1 = advantages * ratios

                surr2 = (
                    torch.clamp(ratios, 1 - actor_eps_clip, 1 + actor_eps_clip)
                    * advantages
                )

                policy_loss = -torch.min(surr1, surr2) - beta_s * entropies
                policy_loss = policy_loss.mean()
                loss = policy_loss + kl_div_loss

                # check if loss item is NaN
                if torch.isnan(loss):
                    raise ValueError("Loss is nan")

                # update actor with loss
                if self.config.actor.deepspeed_enable:
                    self.actor_model_engine.backward(loss)
                    self.actor_model_engine.step()
                elif self.config.actor.accelerate_enable:
                    self.actor_optimizer.zero_grad()
                    actor_accelerator.backward(loss)
                    self.actor_optimizer.step()
                    self.actor_scheduler.step()
                else:
                    self.actor_optimizer.zero_grad()
                    loss.backward()
                    self.actor_optimizer.step()
                    self.actor_scheduler.step()

                # compute value loss
                # the loss is the distance between the rewards and the values;
                # this distance should be small so that the values are
                # representative of the rewards, which is why the maximum of
                # the clipped and unclipped squared errors is taken.
                # The clip limits how far the new values can move away from
                # the old ones in a single update
                value_loss_clipped = old_values + (values - old_values).clamp(
                    -critic_eps_clip, critic_eps_clip
                )
                value_loss1 = (value_loss_clipped - rewards) ** 2
                value_loss2 = (values - rewards) ** 2
                value_loss = torch.max(value_loss1, value_loss2).mean()

                if torch.isnan(value_loss):
                    raise ValueError("Value loss is nan")

                # update critic
                if self.config.critic.deepspeed_enable:
                    self.critic_model_engine.backward(value_loss)
                    self.critic_model_engine.step()
                elif self.config.critic.accelerate_enable:
                    self.critic_optimizer.zero_grad()
                    # the critic must be trained on its own value loss, not
                    # on the actor loss
                    critic_accelerator.backward(value_loss)
                    self.critic_optimizer.step()
                    self.critic_scheduler.step()
                else:
                    self.critic_optimizer.zero_grad()
                    value_loss.backward()
                    self.critic_optimizer.step()
                    self.critic_scheduler.step()

                # append the losses to the training stats
                loss_value = loss.detach().cpu().item()
                value_loss_value = value_loss.detach().cpu().item()
                self.training_stats.training_loss.append(loss_value)
                self.training_stats.value_loss.append(value_loss_value)

                # print iteration info
                print(
                    f"Epoch {epoch+1}/{epochs}",
                    f"Step {k+1}/{num_steps}",
                    f"Loss {loss_value:.4f}",
                    f"Value Loss {value_loss_value:.4f}",
                )

        self.actorcritic.eval()
        print("End Learning")

    def train(
        self,
    ) -> None:
        """Run the RL training loop.

        For every episode and timestep: sample prompts, let the actor-critic
        generate completions and values, score the sequences with the reward
        model, store the experience as memories and log the conversations.
        Every `update_timesteps` timesteps learn() is called on the collected
        memories, which are then discarded. A checkpoint is written when the
        episode index is a non-zero multiple of `checkpoint_steps`, and the
        models are saved once all episodes are done.
        """
        print("Start RL Training")

        # read the trainer settings
        trainer_cfg = self.config.trainer
        n_episodes = trainer_cfg.num_episodes
        steps_per_episode = trainer_cfg.max_timesteps
        n_examples = trainer_cfg.num_examples
        learn_every = trainer_cfg.update_timesteps
        batch_size = trainer_cfg.batch_size
        checkpoint_every = trainer_cfg.checkpoint_steps
        if self.is_deepspeed_init:
            device = torch.device(f"cuda:{dist.get_rank()}")
        else:
            device = trainer_cfg.device

        # memories available at each learn() call; they must split evenly
        # into batches
        memories_per_learn = n_examples * learn_every
        assert (
            memories_per_learn % batch_size == 0
        ), "The number of memories must be a multiple of the batch size"

        # the timesteps done over the whole training must be a multiple of
        # the update timesteps
        assert (n_episodes * steps_per_episode) % learn_every == 0, (
            "The number of timesteps (num_episodes*max_timesteps)"
            "must be a multiple of the update_timesteps"
        )

        def save_conversation_log() -> None:
            # with deepspeed only the rank-0 process writes the log
            if not self.is_deepspeed_init or (dist.get_rank() == 0):
                self.conversation_log.save()

        def to_cpu(tensor):
            # detach from the graph and move to the CPU before storing
            return tensor.detach().cpu()

        # experience buffer filled between two learn() calls
        memories = deque()

        # resume from the checkpoints, if any
        first_episode = self.load_checkpoint()

        # a training that starts from scratch also restarts the log
        if first_episode == 0:
            self.conversation_log.clear()

        # timesteps since the last learn() and number of learn() calls so far
        steps_since_learn = 0
        learn_iterations = 0

        # loop over episodes and timesteps
        self.actorcritic.eval()
        for episode in range(first_episode, n_episodes):
            for timestep in range(steps_per_episode):

                # print the iteration info
                print(
                    f"Episode: {episode + 1}/{n_episodes}, "
                    f"Timestep: {timestep + 1}/{steps_per_episode}",
                    f"Learning Cnt: {steps_since_learn + 1}/{learn_every}",
                )
                steps_since_learn += 1

                # sample the prompts for this timestep
                prompts = self.example_sampler.sample(n_examples)

                # tokenize the prompts for the actor
                actor_tokens = self.actorcritic.actor.tokenizer(
                    prompts, padding=True, return_tensors="pt", truncation=True
                )
                states_actor = actor_tokens["input_ids"].to(device)
                states_mask_actor = actor_tokens["attention_mask"].to(device)

                # tokenize the prompts for the critic
                critic_tokens = self.actorcritic.critic.tokenizer(
                    prompts, padding=True, return_tensors="pt", truncation=True
                )
                states_critic = critic_tokens["input_ids"].to(device)

                # generate sequences of actions and values
                generated = self.actorcritic.generate(
                    states_actor, states_mask_actor, states_critic
                )
                (
                    actions,
                    actions_logits,
                    values,
                    sequences_actor,
                    sequences_mask_actor,
                    sequences_critic,
                    sequences_mask_critic,
                    action_len_actor,
                    action_len_critic,
                ) = generated

                # log-probability of the most likely token at each position
                top_probs = (
                    torch.softmax(actions_logits, dim=-1).max(dim=-1).values
                )
                actions_log_probs = torch.log(top_probs + self.eps)

                # pick the tokenized sequences to feed to the reward model
                if self.use_same_tokenizer:
                    reward_sequence = sequences_actor
                    reward_mask = sequences_mask_actor
                elif check_model_family(
                    self.config.critic, self.config.reward
                ):
                    reward_sequence = sequences_critic
                    reward_mask = sequences_mask_critic
                else:
                    # re-tokenize the actor sequences with the reward
                    # tokenizer
                    retokenized = change_tokenization(
                        sequences_actor,
                        self.actorcritic.actor.tokenizer,
                        self.reward.tokenizer,
                    )
                    reward_sequence = retokenized["input_ids"].to(device)
                    reward_mask = retokenized["attention_mask"].to(device)

                # compute rewards and keep the last action_len_critic columns
                rewards = self.reward.forward(
                    reward_sequence,
                    reward_mask,
                )
                rewards = rewards[:, -action_len_critic:]
                reward = rewards[:, -1]

                # store one memory per sampled example
                for idx in range(states_actor.shape[0]):
                    memories.append(
                        Memory(
                            to_cpu(states_actor[idx, :]),
                            to_cpu(actions[idx, :]),
                            to_cpu(values[idx, :]),
                            to_cpu(rewards[idx, :]),
                            to_cpu(actions_log_probs[idx, :]),
                            to_cpu(sequences_actor[idx, :]),
                            to_cpu(sequences_mask_actor[idx, :]),
                            to_cpu(sequences_critic[idx, :]),
                            to_cpu(sequences_mask_critic[idx, :]),
                            int(action_len_actor),
                            int(action_len_critic),
                        )
                    )

                # decode the completions and strip pad / eos tokens so they
                # can be written to the conversation log
                actor_tokenizer = self.actorcritic.actor.tokenizer
                completions = []
                for action in actions:
                    text = actor_tokenizer.decode(action)
                    text = text.replace(actor_tokenizer.pad_token, "")
                    text = text.replace(actor_tokenizer.eos_token, "")
                    # the literal "<pad>" is removed explicitly as well
                    completions.append(text.replace("<pad>", ""))

                # log the conversations of this timestep
                for idx in range(states_actor.shape[0]):
                    self.conversation_log.append(
                        prompts[idx],
                        completions[idx],
                        reward[idx].detach().cpu().item(),
                        learn_iterations,
                    )

                # learn only once enough timesteps have been collected
                time_to_learn = (steps_since_learn % learn_every == 0) and (
                    steps_since_learn != 0
                )
                if not time_to_learn:
                    continue

                print("len memories", len(memories))
                save_conversation_log()
                self.learn(memories)
                mean_reward = sum(m.rewards[-1] for m in memories) / len(
                    memories
                )
                print(f"Mean Reward: {mean_reward}")
                memories.clear()
                steps_since_learn = 0
                learn_iterations += 1
                save_conversation_log()

            # save checkpoints
            if (episode % checkpoint_every == 0) and (episode != 0):
                self.save_checkpoint(
                    current_episode=episode, max_episode=n_episodes
                )
                save_conversation_log()

        # save the models
        if not self.is_deepspeed_init:
            self.actorcritic.save()
        else:
            # NOTE(review): the actor is saved with the whole config while
            # the critic gets self.config.critic - confirm this asymmetry
            # against save_deepspeed's signature
            self.actorcritic.save_deepspeed(self.actor_model_engine, self.config)
            self.actorcritic.save_deepspeed(
                self.critic_model_engine, self.config.critic
            )
        print("End RL Training")


================================================
FILE: optimization/chatllama/chatllama/rlhf/utils.py
================================================
import json
import os
from beartype import beartype
from plotly import graph_objects as go


class TrainingStats:
    """Metric histories collected during training, persisted as JSON.

    Attributes:
        training_loss (List): List of training losses
        training_accuracy (List): List of training accuracies
        value_loss (List): List of value losses
        validation_loss (List): List of validation losses
        validation_accuracy (List): List of validation accuracies
        path (str): Path of the JSON file used by save/load/clear
    """

    def __init__(self, path: str):
        """Create empty statistics bound to a file path

        Args:
            path (str): Path to save the stats
        """
        self.path = path
        self.training_loss = []
        self.training_accuracy = []
        self.value_loss = []
        self.validation_loss = []
        self.validation_accuracy = []

    def plot(self):
        """Show every non-empty metric as a trace of a plotly figure"""
        figure = go.Figure()
        labelled_series = (
            (self.training_loss, "Training loss"),
            (self.training_accuracy, "Training accuracy"),
            (self.value_loss, "Value loss"),
            (self.validation_loss, "Validation loss"),
            (self.validation_accuracy, "Validation accuracy"),
        )
        for values, label in labelled_series:
            # metrics that were never recorded are left out of the figure
            if len(values) > 0:
                figure.add_trace(go.Scatter(y=values, name=label))
        figure.update_layout(
            showlegend=True, xaxis_type="log", xaxis_title="steps"
        )
        figure.show()

    def save(
        self,
    ):
        """Write the stats to `path`.

        If the file already exists, the in-memory values are appended to
        the lists stored in it; otherwise a new file is created.
        """
        current = {
            "training_loss": self.training_loss,
            "training_accuracy": self.training_accuracy,
            "value_loss": self.value_loss,
            "validation_loss": self.validation_loss,
            "validation_accuracy": self.validation_accuracy,
        }
        if not os.path.exists(self.path):
            stats = current
        else:
            with open(self.path, "r") as f:
                stats = json.load(f)
            for key, values in current.items():
                stats[key].extend(values)
        with open(self.path, "w") as f:
            json.dump(stats, f, indent=4)

    def load(
        self,
    ):
        """Replace the in-memory stats with the content of `path`"""
        with open(self.path, "r") as f:
            stored = json.load(f)
        for key in (
            "training_loss",
            "training_accuracy",
            "value_loss",
            "validation_loss",
            "validation_accuracy",
        ):
            setattr(self, key, stored[key])

    def clear(
        self,
    ):
        """Reset every metric and delete the stats file if it exists"""
        self.training_loss = []
        self.training_accuracy = []
        self.value_loss = []
        self.validation_loss = []
        self.validation_accuracy = []
        if os.path.exists(self.path):
            os.remove(self.path)


class ConversationLog:
    """Save the conversation:
    (user input, model output, rewards and learn_counter)
    during the RL training loop.

    Attributes:
        conversation (list): Logged conversations, one dict per entry with
            the keys "user_input", "model_output", "reward" and
            "learn_counter"
        path (str): Path of the JSON file used by save/load/clear
    """

    def __init__(self, path: str):
        """Create an empty log bound to a file path

        Args:
            path (str): Path of the JSON log file; when None the default
                "./convesation_log.json" is used
        """
        self.conversation = []
        self.path = path
        if self.path is None:
            self.path = "./convesation_log.json"
        # True once the in-memory log already contains the file content,
        # so that repeated save() calls do not merge the file again
        self._merged_with_file = False

    @beartype
    def append(
        self,
        user_input: str,
        model_output: str,
        reward: float,
        learn_counter: int,
    ):
        """Add a conversation to the log

        Args:
            user_input (str): User input / initial prompt
            model_output (str): Completion of the LLM model
            reward (float): Reward of the reward model assigned to the output
            learn_counter (int): Number of the learning iteration to
                distinguish the conversations that happens at different
                points of the training loop
        """
        self.conversation.append(
            {
                "user_input": user_input,
                "model_output": model_output,
                "reward": reward,
                "learn_counter": learn_counter,
            }
        )

    def save(self):
        """Write the log to `path`, sorted by learn_counter.

        The first save of an instance (or the first one after clear())
        merges the conversations already stored in the file into the
        in-memory log. Later saves only rewrite the file: the in-memory log
        is then already a superset of the file, and merging it again would
        duplicate every entry on each call.
        """
        print("Saving conversations log")
        # getattr keeps instances created without __init__ working
        already_merged = getattr(self, "_merged_with_file", False)
        if not already_merged and os.path.exists(self.path):
            with open(self.path, "r") as f:
                conversation = json.load(f)
            self.conversation.extend(conversation)
        self.conversation = sorted(
            self.conversation, key=lambda x: float(x["learn_counter"])
        )
        with open(self.path, "w") as f:
            json.dump(self.conversation, f, indent=4)
        self._merged_with_file = True

    def load(self):
        """Replace the in-memory log with the content of `path`"""
        with open(self.path, "r") as f:
            self.conversation = json.load(f)
        # memory now mirrors the file, nothing left to merge on save()
        self._merged_with_file = True

    def clear(self):
        """Empty the in-memory log and delete the log file if it exists"""
        print("Clearing conversations log")
        self.conversation = []
        # remove the file in path exists
        if os.path.exists(self.path):
            os.remove(self.path)
        # a file created later by someone else must be merged again
        self._merged_with_file = False

    def show(self, current_iteration: int = None):
        """Show the conversation log

        Args:
            current_iteration (int): Current iteration of the training loop,
                if not None, print only the conversations that happened at
                <current_iteration>
        """
        for i, c in enumerate(self.conversation):
            if (
                current_iteration is not None
                and current_iteration != c["learn_counter"]
            ):
                # entry belongs to another learning iteration
                continue
            print(
                f"##########################################\n"
                f"Conversation {i} at learn_counter "
                f"{c['learn_counter']}\n"
                f"##########################################\n"
                f"## User Input:\n\n{c['user_input']}\n\n"
                f"## Model Output:\n\n{c['model_output']}\n\n"
                f"## Reward: {c['reward']}\n\n"
            )


================================================
FILE: optimization/chatllama/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages


# Runtime dependencies passed to setup() as install_requires.
REQUIREMENTS = [
    "accelerate",
    "beartype",
    "deepspeed",
    "einops",
    "fairscale",
    "langchain>=0.0.103",
    "torch",
    "tqdm",
    "transformers",
    "datasets",
    "openai",
    "plotly",
    "peft",
]

# The README next to this file is used as the package long description.
readme_path = Path(__file__).parent / "README.md"
long_description = readme_path.read_text(encoding="utf8")

setup(
    name="chatllama-py",
    version="0.0.4",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIREMENTS,
)


================================================
FILE: optimization/cloud_surfer/README.md
================================================
# 🏄 CloudSurfer (WIP)
Automatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models.

If you like this module, give us a star to show your support for the project ⭐

## 📚 Description
The CloudSurfer module allows users to automatically compare the inference performance of their deep learning model across hardware and cloud providers. It leverages state-of-the-art optimization techniques to custom-accelerate the models on each platform, providing the user with an accurate benchmark of their model's performance in terms of speed, accuracy, and cost.

With CloudSurfer, users can input their model in their preferred deep learning framework and express their preferences for accuracy and performance. The library will then automatically test the model on a range of hardware and cloud platforms, using optimization techniques to ensure that the results are accurate and representative of the model's performance.

Users can then compare the results side-by-side, seeing the performance of their model on different hardware and cloud providers. This is key to making informed decisions about which platform (cloud and hardware type) to pick, without having to guess or rely on outdated information.

Overall, CloudSurfer provides a powerful and easy-to-use tool to optimize deep learning models and to choose the best inference hardware and cloud platform. Try it out today, and reach out if you have any feedback!


================================================
FILE: optimization/forward_forward/README.md
================================================
# Forward-Forward Algorithm

This module implements a complete open-source version of [Geoffrey Hinton's Forward Forward](https://www.cs.toronto.edu/~hinton/FFA13.pdf) Algorithm, an alternative approach to backpropagation.

The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes, one with positive (i.e., real) data and the other with negative data that could be generated by the network itself.

Unlike the backpropagation approach, Forward-Forward does not require calculating the gradient of the loss function with respect to the network parameters. Instead, each optimization step can be performed locally and the weights of each layer can be updated immediately after the layer has performed its forward pass.

If you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers)

<img width="1012" alt="Screenshot 2022-12-20 at 14 45 22" src="https://user-images.githubusercontent.com/83510798/208681462-2d8fc8f8-b24e-41a3-978a-72101f7f6392.png">

## Installation

The forward-forward module is built on top of nebullvm, a framework for efficiency-based modules. The library can be easily installed from source code. First you have to clone the repository and navigate to the app directory:

```bash
git clone https://github.com/nebuly-ai/nebullvm.git
cd nebullvm/apps/accelerate/forward_forward
```

Then install the module:

```bash
pip install .
```
This process will just install the minimum requirements for running the module. If you want to run the module on a GPU you have to install the CUDA version of PyTorch. You can find the instructions on the official PyTorch website.

## Usage
At the current stage, this implementation supports the main architectures discussed by Hinton in his paper. Each architecture can be trained with the following command:

```python
from forward_forward import train_with_forward_forward_algorithm
import os
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

trained_model = train_with_forward_forward_algorithm(
    model_type="progressive",
    n_layers=3,
    hidden_size=2000,
    lr=0.03,
    device=device,
    epochs=100,
    batch_size=5000,
    theta=2.,
)
```

Three architectures are currently supported:
* `progressive`: the simplest architecture described in the paper. It has a pipeline-like structure and each layer can be trained independently of the following ones. Our implementation differs from the original one in that the labels are injected into the image by concatenating them to the flattened tensor, instead of replacing the first `n_classes` pixel values with a one-hot representation of the label.

* `recurrent`: the recurrent architecture described in the paper. It has a recurrent-like structure and it is based on the `GLOM` architecture proposed by Hinton.

* `nlp`: A simple network which can be used as a language model.

The recurrent and nlp network architectures are better explained below.

## Recurrent Architecture
The recurrent architecture is based on the `GLOM` architecture for videos, proposed by Hinton in the paper [How to represent part-whole hierarchies in a neural network](https://arxiv.org/pdf/2102.12627.pdf). Its application to the forward-forward algorithm aims at enabling each layer to learn not just from the previous layer's output, but from the following layers as well. This is done by concatenating the outputs of the previous layer and of the following layers computed at the previous time-step. A learned representation of the label (positive or negative) is given as input to the last layer. The following figure shows the structure of the network:

<p align="center">
    <img width="500" alt="recurrent_net" src="https://user-images.githubusercontent.com/38586138/208651417-498c4bd4-f2dc-4613-a376-0b69317c73d4.png">
</p>

## NLP Architecture
The forward-forward architecture developed for NLP is a simple network which can be used as a language model. The network is composed of a few normalized fully connected layers followed by a ReLU activation. All hidden representations are then concatenated together and given as input to the softmax for predicting the next token. The network can be trained in a progressive way, i.e. each layer can be trained sequentially, separately from the following ones. The following figure shows the structure of the network:

<p align="center">
    <img width="500" class="center" alt="nlp_net" src="https://user-images.githubusercontent.com/38586138/208651624-c159b230-f903-4e13-aaa7-b39a0d1c52fc.png">
</p>

## What is missing
This app implements the main architectures presented by Hinton in his paper. However, there are still some features that are not implemented yet. In particular, the following features are missing:

* [ ] Implementation of unsupervised training.
* [ ] Implementation of the `progressive` architecture using local receptive fields instead of fully connected layers.
* [ ] Training on CIFAR-10 for CV-based architectures.

And don't forget to [leave a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers) if you appreciate the project!
If you have any questions about the implementation, [open an issue](https://github.com/nebuly-ai/nebullvm/issues) or contact us in the [community chat](https://discord.gg/RbeQMu886J).

## Contributing

We welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the linked page for more information on how to get involved.

A special thanks to [Additi Pandey](https://github.com/cyclotomicextension) for her amazing contribution to the Forward-Forward module.


================================================
FILE: optimization/forward_forward/forward_forward/__init__.py
================================================
from forward_forward.api.functions import (  # noqa F401
    train_with_forward_forward_algorithm,
)


================================================
FILE: optimization/forward_forward/forward_forward/api/__init__.py
================================================


================================================
FILE: optimization/forward_forward/forward_forward/api/functions.py
================================================
from torchvision import datasets

from forward_forward.root_op import (
    ForwardForwardRootOp,
    ForwardForwardModelType,
)


def train_with_forward_forward_algorithm(
    n_layers: int = 2,
    model_type: str = "progressive",
    device: str = "cpu",
    hidden_size: int = 2000,
    lr: float = 0.03,
    epochs: int = 100,
    batch_size: int = 5000,
    theta: float = 2.0,
    shuffle: bool = True,
    **kwargs,
):
    """Build and train a network with the Forward-Forward algorithm.

    Args:
        n_layers: Number of hidden layers of the network.
        model_type: One of the ``ForwardForwardModelType`` values
            (``"progressive"``, ``"recurrent"`` or ``"nlp"``).
        device: Device the model is moved to before training.
        hidden_size: Width of every hidden layer.
        lr: Learning rate handed to the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Batch size of the training data loader.
        theta: Goodness threshold used by the loss function.
        shuffle: Whether the training data is shuffled.
        **kwargs: Extra, trainer-specific arguments forwarded to the root
            operation. The NLP model requires ``predicted_tokens``.

    Returns:
        Whatever the root operation exposes through ``get_result()``.

    Raises:
        ValueError: If ``model_type`` is not a valid model type.
        AssertionError: If the NLP model is requested without
            ``predicted_tokens``.
    """
    model_type = ForwardForwardModelType(model_type)
    root_op = ForwardForwardRootOp(model_type)

    output_size = None
    if model_type is ForwardForwardModelType.PROGRESSIVE:
        # Flattened 28x28 image with the one-hot label appended to it.
        input_size = 28 * 28 + len(datasets.MNIST.classes)
    elif model_type is ForwardForwardModelType.RECURRENT:
        input_size = 28 * 28
        output_size = len(datasets.MNIST.classes)
    else:  # model_type is ForwardForwardModelType.NLP
        input_size = 10  # number of characters
        output_size = 30  # length of vocabulary
        assert (
            kwargs.get("predicted_tokens") is not None
        ), "predicted_tokens must be specified for NLP model"

    root_op.execute(
        input_size=input_size,
        n_layers=n_layers,
        hidden_size=hidden_size,
        optimizer_name="Adam",
        optimizer_params={"lr": lr},
        loss_fn_name="alternative_loss_fn",
        batch_size=batch_size,
        epochs=epochs,
        device=device,
        shuffle=shuffle,
        theta=theta,
        output_size=output_size,
        # The extra arguments must reach the trainer: the NLP trainer takes
        # ``predicted_tokens`` as a required parameter.
        **kwargs,
    )

    return root_op.get_result()


================================================
FILE: optimization/forward_forward/forward_forward/app.py
================================================
from nebullvm.apps.base import App

from forward_forward.root_op import ForwardForwardRootOp


class ForwardForwardApp(App):
    """App wrapper that delegates execution to a ForwardForwardRootOp."""

    def __init__(self, model_type="progressive"):
        """Create the app for the requested architecture.

        Args:
            model_type: A ``ForwardForwardModelType`` member or its string
                value. Defaults to ``"progressive"`` so that the app can
                still be created without arguments.
        """
        # Imported locally because the module only imports the root op.
        from forward_forward.root_op import ForwardForwardModelType

        super().__init__()
        # The root op requires a model type; calling it with no argument
        # raised a TypeError.
        self.root_op = ForwardForwardRootOp(
            ForwardForwardModelType(model_type)
        )

    def execute(self, *args, **kwargs):
        """Forward every argument to the root operation's ``execute``."""
        return self.root_op.execute(*args, **kwargs)


================================================
FILE: optimization/forward_forward/forward_forward/operations/__init__.py
================================================


================================================
FILE: optimization/forward_forward/forward_forward/operations/build_models.py
================================================
from abc import ABC, abstractmethod

import torch

from nebullvm.operations.base import Operation

from forward_forward.utils.modules import (
    FCNetFFProgressive,
    RecurrentFCNetFF,
    LMFFNet,
)


class BaseModelBuildOperation(Operation, ABC):
    """Abstract base for the operations that build a Forward-Forward model.

    Concrete subclasses implement ``execute`` and store the network they
    build in ``self.model``; ``get_result`` returns that attribute.
    """

    def __init__(self):
        super().__init__()
        # Built model; stays None until an ``execute`` call assigns it.
        self.model = None

    @abstractmethod
    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Build the model from the given hyper-parameters.

        How each argument is used is decided by the concrete subclass.
        """
        raise NotImplementedError

    def get_result(self):
        """Return the built model, or None if nothing has been built yet."""
        return self.model


class FCNetFFProgressiveBuildOperation(BaseModelBuildOperation):
    """Build an ``FCNetFFProgressive`` network, optionally with a head."""

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Create the progressive network and store it in ``self.model``.

        When ``output_size`` is given, a linear layer mapping the last
        hidden layer to ``output_size`` is chained after the network with
        ``torch.nn.Sequential``.
        """
        sizes = [input_size]
        sizes.extend(hidden_size for _ in range(n_layers))

        # epochs=-1 is only a placeholder passed at construction time.
        network = FCNetFFProgressive(
            layer_sizes=sizes,
            optimizer_name=optimizer_name,
            optimizer_kwargs=optimizer_params,
            loss_fn_name=loss_fn_name,
            epochs=-1,
        )
        if output_size is None:
            self.model = network
            return

        head = torch.nn.Linear(sizes[-1], output_size)
        self.model = torch.nn.Sequential(network, head)


class RecurrentFCNetFFBuildOperation(BaseModelBuildOperation):
    """Build a ``RecurrentFCNetFF`` network."""

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Create the recurrent network and store it in ``self.model``.

        The layer sizes are the input size, ``n_layers`` copies of
        ``hidden_size`` and finally ``output_size``.
        """
        hidden_sizes = [hidden_size] * n_layers
        sizes = [input_size, *hidden_sizes, output_size]
        self.model = RecurrentFCNetFF(
            layer_sizes=sizes,
            optimizer_name=optimizer_name,
            optimizer_kwargs=optimizer_params,
            loss_fn_name=loss_fn_name,
        )


class LMFFNetBuildOperation(BaseModelBuildOperation):
    """Build the ``LMFFNet`` language model."""

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Create the language model and store it in ``self.model``.

        ``input_size`` is passed as the sequence length and ``output_size``
        as the number of tokens.
        """
        # -1 is a placeholder for both values at construction time.
        placeholder = -1
        network_kwargs = dict(
            token_num=output_size,
            hidden_size=hidden_size,
            n_layers=n_layers,
            seq_len=input_size,
            optimizer_name=optimizer_name,
            optimizer_kwargs=optimizer_params,
            loss_fn_name=loss_fn_name,
            epochs=placeholder,
            predicted_tokens=placeholder,
        )
        self.model = LMFFNet(**network_kwargs)


================================================
FILE: optimization/forward_forward/forward_forward/operations/data.py
================================================
import urllib.request
from typing import Any

import torch
import torch.utils.data
from nebullvm.operations.base import Operation
from torchvision import datasets, transforms


class MNISTDataLoaderOperation(Operation):
    """Operation creating the MNIST train and test data loaders."""

    def __init__(self):
        super().__init__()
        self.train_data = None
        self.test_data = None

    def get_result(self) -> Any:
        """Return ``(train_loader, test_loader)`` or None if not executed."""
        if self.train_data is None:
            return None
        return self.train_data, self.test_data

    def execute(self, batch_size: int, shuffle: bool):
        """Create both loaders; only the train loader uses the arguments.

        The training split is downloaded into ``data`` if missing. The test
        loader always uses batches of 1000 samples and never shuffles.
        """

        def make_transform():
            # Tensor conversion followed by a fixed mean/std normalization.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,)),
                ]
            )

        train_set = datasets.MNIST(
            "data", train=True, download=True, transform=make_transform()
        )
        train_loader = torch.utils.data.DataLoader(
            train_set, batch_size=batch_size, shuffle=shuffle
        )

        test_set = datasets.MNIST(
            "data", train=False, transform=make_transform()
        )
        test_loader = torch.utils.data.DataLoader(
            test_set, batch_size=1000, shuffle=False
        )

        # Publish the loaders only once both have been created.
        self.train_data, self.test_data = train_loader, test_loader


def download_fables():
    """Download the Aesop's fables text file and return it as a string."""
    url = "http://classics.mit.edu/Aesop/fab.mb.txt"
    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()
    text = raw_bytes.decode("utf-8")
    return text


def get_fables():
    """Return the downloaded fables as a list of paragraphs.

    Only the text between the "SECTION 1" and "THE END" markers is kept;
    it is split on blank lines and paragraphs shorter than 100 characters
    are discarded.
    """
    text = download_fables()
    after_header = text.split("SECTION 1")[1]
    body = after_header.split("THE END")[0]
    paragraphs = body.split("\n\n")
    return [paragraph for paragraph in paragraphs if len(paragraph) >= 100]


# Character-level vocabulary: maps each supported character (space, three
# punctuation marks and the lowercase letters) to its token id, assigned in
# the order in which the characters are listed.
VOCABULARY = {
    char: token_id
    for token_id, char in enumerate(" !,.abcdefghijklmnopqrstuvwxyz")
}


def tokenize(fable, max_len=100):
    """Convert a text into a list of at most ``max_len`` token ids.

    The text is lowercased and characters missing from ``VOCABULARY`` are
    dropped before the result is truncated.
    """
    token_ids = []
    for char in fable.lower():
        if char in VOCABULARY:
            token_ids.append(VOCABULARY[char])
    return token_ids[:max_len]


def get_tokenized_fables():
    """Return the tokenized fables stacked into a single tensor.

    Fables whose token sequence is not exactly 100 tokens long are
    discarded, so every row of the result has length 100.
    """
    all_token_lists = [tokenize(fable) for fable in get_fables()]
    rows = []
    for token_list in all_token_lists:
        if len(token_list) == 100:
            rows.append(torch.tensor(token_list))
    return torch.stack(rows)


def get_dataloader(batch_size=32, test_size=0.2, shuffle=True):
    """Create the train and test data loaders for the tokenized fables.

    Args:
        batch_size: Batch size of the training loader.
        test_size: Fraction of the fables (taken from the start of the
            dataset) that goes into the test set.
        shuffle: Whether the training loader shuffles its samples.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The test loader yields the
        whole test set as a single batch.
    """
    tokenized_fables = get_tokenized_fables()
    n_test = int(len(tokenized_fables) * test_size)
    test_set = torch.utils.data.TensorDataset(tokenized_fables[:n_test])
    train_set = torch.utils.data.TensorDataset(tokenized_fables[n_test:])
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=shuffle
    )
    # DataLoader rejects a batch size of 0, which is what an empty test
    # split used to produce; fall back to 1 so that an empty loader is
    # returned instead of raising.
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=max(n_test, 1), shuffle=False
    )
    return train_loader, test_loader


class AesopFablesDataLoaderOperation(Operation):
    """Operation creating the Aesop's fables train and test data loaders."""

    def __init__(self):
        super().__init__()
        self.train_data = None
        self.test_data = None

    def get_result(self) -> Any:
        """Return ``(train_loader, test_loader)`` or None if not executed."""
        if self.train_data is None:
            return None
        return self.train_data, self.test_data

    def execute(self, batch_size: int, shuffle: bool):
        """Build both loaders, holding out 20% of the fables for testing."""
        loaders = get_dataloader(
            batch_size=batch_size, test_size=0.2, shuffle=shuffle
        )
        self.train_data, self.test_data = loaders


================================================
FILE: optimization/forward_forward/forward_forward/operations/fetch_operations.py
================================================
from typing import Any

from nebullvm.operations.base import Operation
from torch.utils.data import DataLoader


class FetchTrainingDataFromLocal(Operation):
    """Keep references to in-memory train/test loaders in ``self.state``."""

    def get_result(self) -> Any:
        # Returns None: the loaders are read through ``get_train_data`` and
        # ``get_test_data`` instead.
        pass

    def execute(self, train_data: DataLoader, test_data: DataLoader):
        """Store both loaders in the operation state."""
        self.state["train_data"] = train_data
        self.state["test_data"] = test_data

    def get_train_data(self) -> DataLoader:
        """Return the stored train loader, or None if none was stored."""
        return self.state.get("train_data")

    def get_test_data(self) -> DataLoader:
        """Return the stored test loader, or None if none was stored."""
        return self.state.get("test_data")


================================================
FILE: optimization/forward_forward/forward_forward/operations/trainers.py
================================================
from abc import ABC, abstractmethod

import torch
from nebullvm.operations.base import Operation
from nebullvm.operations.fetch_operations.local import FetchModelFromLocal
from torch.utils.data import DataLoader
from torchvision import datasets

from forward_forward.operations.data import VOCABULARY
from forward_forward.operations.fetch_operations import (
    FetchTrainingDataFromLocal,
)
from forward_forward.utils.labels import LabelsInjector
from forward_forward.utils.modules import FCNetFFProgressive
from forward_forward.utils.utils import (
    ProgressiveTrainingDataset,
    compute_perplexity,
)


class BaseForwardForwardTrainer(Operation, ABC):
    """Common workflow of the Forward-Forward training operations.

    ``execute`` fetches the model and the data loaders through two helper
    operations and then delegates the training to ``_train``, which every
    subclass implements.
    """

    def __init__(self):
        super().__init__()
        self.model = None
        self.train_data = None
        self.test_data = None

        self.fetch_model_op = FetchModelFromLocal()
        self.fetch_data_op = FetchTrainingDataFromLocal()

    def get_result(self):
        """Return the model once trained, None before that."""
        if self.state.get("model_is_trained"):
            return self.model
        return None

    def execute(
        self,
        model: FCNetFFProgressive,
        train_data: DataLoader,
        test_data: DataLoader,
        epochs: int,
        theta: float,
        device: str,
        **kwargs,
    ):
        """Fetch model and data, then train.

        Args:
            model: Network to train.
            train_data: Loader of the training samples.
            test_data: Loader of the evaluation samples.
            epochs: Number of training epochs.
            theta: Goodness threshold.
            device: Device used for training.
            **kwargs: Extra arguments forwarded to ``_train``.
        """
        if self.fetch_model_op.get_model() is None:
            self.fetch_model_op.execute(model)

        if self.fetch_data_op.get_train_data() is None:
            self.fetch_data_op.execute(train_data, test_data)

        self.model = self.fetch_model_op.get_model()
        self.train_data = self.fetch_data_op.get_train_data()
        self.test_data = self.fetch_data_op.get_test_data()

        if (
            self.model is not None
            and self.train_data is not None
            and self.test_data is not None
        ):
            self._train(epochs, theta, device, **kwargs)
            # get_result() only exposes the model when this flag is set;
            # it was never set, so the trained model was never returned.
            self.state["model_is_trained"] = True

    @abstractmethod
    def _train(self, *args, **kwargs):
        """Run the actual training; implemented by the subclasses."""
        raise NotImplementedError


class ForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer of the progressive network on label-injected MNIST images."""

    def _train(self, epochs: int, theta: float, device: str, **kwargs):
        """Train progressively, then log the accuracy on the test data."""
        net = self.model.to(device)
        net.epochs = epochs

        # TODO: SELECT THE N_CLASSES OUTSIDE THE OPERATION
        injector = LabelsInjector(datasets.MNIST.classes)

        # Every training batch becomes positive + negative samples, hence
        # the doubled batch size of the progressive loader.
        injected_batches = (
            injector.inject_train(images, labels)
            for images, labels in self.train_data
        )
        train_loader = torch.utils.data.DataLoader(
            ProgressiveTrainingDataset(injected_batches),
            batch_size=2 * self.train_data.batch_size,
            shuffle=False,
        )

        net.train()
        net.progressive_train(train_loader, theta)

        net.eval()
        n_correct = 0
        with torch.no_grad():
            for images, targets in self.test_data:
                # One candidate per label for every image.
                candidates = injector.inject_eval(images).to(device)
                targets = targets.to(device)
                leading_shape = candidates.shape[:-1]
                flat_candidates = candidates.reshape(
                    -1, candidates.shape[-1]
                )
                _, scores = net.positive_eval(flat_candidates, theta)
                # The label with the highest score is the prediction.
                predictions = scores.reshape(*leading_shape).argmax(dim=1)
                n_correct += (predictions == targets).float().sum().item()

        n_total = len(self.test_data.dataset)
        self.logger.info(
            "Test set: Accuracy: {}/{} ({:.0f}%)".format(
                n_correct,
                n_total,
                100.0 * n_correct / n_total,
            )
        )


class RecurrentForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer of the recurrent network on MNIST."""

    def _train(self, epochs: int, theta: float, device: str, **kwargs):
        """Train for ``epochs`` epochs, evaluating after every epoch.

        Args:
            epochs: Number of passes over the training data.
            theta: Goodness threshold.
            device: Device used for training and evaluation.
            **kwargs: Unused.
        """
        model = self.model.to(device)
        n_classes = len(datasets.MNIST.classes)
        # Size of the evaluation set, read from the loader instead of
        # assuming the 10000 samples of the MNIST test split.
        n_test = len(self.test_data.dataset)

        for epoch in range(epochs):
            accumulated_goodness = None
            model.train()
            for data, target in self.train_data:
                # TODO: THE IMAGE SHAPE SHOULD NOT BE DEFINED HERE
                data = data.to(device).reshape(-1, 28 * 28)
                target = torch.nn.functional.one_hot(
                    target.to(device),
                    num_classes=n_classes,
                )
                _, goodness = model.ff_train(data, target, theta)
                if accumulated_goodness is None:
                    # Copy so that accumulating does not mutate the
                    # object returned by the model.
                    accumulated_goodness = list(goodness)
                else:
                    accumulated_goodness[0] += goodness[0]
                    accumulated_goodness[1] += goodness[1]
            goodness_ratio = (
                accumulated_goodness[0] - accumulated_goodness[1]
            ) / abs(max(accumulated_goodness))
            self.logger.info(f"Epoch {epoch + 1}")
            self.logger.info(f"Accumulated goodness: {accumulated_goodness}")
            self.logger.info(f"Goodness ratio: {goodness_ratio}")
            model.eval()
            correct = 0
            with torch.no_grad():
                for data, target in self.test_data:
                    data = data.to(device).reshape(-1, 28 * 28)
                    target = target.to(device)
                    pred, _ = model.positive_eval(data, theta)
                    correct += pred.eq(target.view_as(pred)).sum().item()
            accuracy = correct / n_test * 100 if n_test else 0.0
            self.logger.info(
                f"Test accuracy: {correct} / {n_test} ({accuracy}%)"
            )


class NLPForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer of the language model on the tokenized fables."""

    def _train(
        self,
        epochs: int,
        theta: float,
        device: str,
        predicted_tokens: int,
        **kwargs,
    ):
        """Train on every batch, then log the perplexity on the test data.

        ``epochs`` and ``predicted_tokens`` are written on the model before
        training starts.
        """
        net = self.model.to(device)
        self.model.epochs = epochs
        self.model.predicted_tokens = predicted_tokens
        vocab_size = len(VOCABULARY)
        seq_len = self.model.seq_len

        def to_one_hot(batch):
            # Loader batches are tuples; the tokens are the first element.
            tokens = batch[0].to(device)
            encoded = torch.nn.functional.one_hot(
                tokens, num_classes=vocab_size
            )
            return encoded.float()

        for train_batch in self.train_data:
            goodness = net.LM_ff_train(to_one_hot(train_batch), theta=theta)
            ratio = (goodness[0] - goodness[1]) / abs(max(goodness))
            self.logger.info("Trained on batch")
            self.logger.info(f"Accumulated goodness: {goodness}")
            self.logger.info(f"Accumulated goodness ratio: {ratio}")

        for test_batch in self.test_data:
            flat_input = to_one_hot(test_batch).reshape(
                -1, vocab_size * seq_len
            )
            predictions, _ = net.positive_eval(flat_input, theta)
            perplexity = compute_perplexity(predictions)
            self.logger.info(f"Perplexity: {perplexity}")


================================================
FILE: optimization/forward_forward/forward_forward/root_op.py
================================================
from enum import Enum

from nebullvm.operations.base import Operation

from forward_forward.operations.build_models import (
    FCNetFFProgressiveBuildOperation,
    RecurrentFCNetFFBuildOperation,
    LMFFNetBuildOperation,
)
from forward_forward.operations.data import (
    MNISTDataLoaderOperation,
    AesopFablesDataLoaderOperation,
)
from forward_forward.operations.trainers import (
    ForwardForwardTrainer,
    RecurrentForwardForwardTrainer,
    NLPForwardForwardTrainer,
)


class ForwardForwardModelType(Enum):
    """Architectures that ``ForwardForwardRootOp`` can build and train."""

    # Progressive network, trained on MNIST.
    PROGRESSIVE = "progressive"
    # Recurrent network, trained on MNIST.
    RECURRENT = "recurrent"
    # Language model, trained on the Aesop's fables data.
    NLP = "nlp"


class ForwardForwardRootOp(Operation):
    """Root operation chaining model building, data loading and training."""

    def __init__(self, model_type: ForwardForwardModelType):
        """Select the build, train and data operations for ``model_type``.

        Args:
            model_type: A ``ForwardForwardModelType`` member or its string
                value.

        Raises:
            ValueError: If ``model_type`` is not a supported model type.
        """
        super().__init__()

        # Also accept the raw string value. Previously any value that was
        # not an enum member left the operation without its sub-operations
        # and failed later with an AttributeError.
        model_type = ForwardForwardModelType(model_type)

        if model_type is ForwardForwardModelType.PROGRESSIVE:
            self.build_model = FCNetFFProgressiveBuildOperation()
            self.train_model = ForwardForwardTrainer()
            self.load_data = MNISTDataLoaderOperation()
        elif model_type is ForwardForwardModelType.RECURRENT:
            self.build_model = RecurrentFCNetFFBuildOperation()
            self.train_model = RecurrentForwardForwardTrainer()
            self.load_data = MNISTDataLoaderOperation()
        elif model_type is ForwardForwardModelType.NLP:
            self.build_model = LMFFNetBuildOperation()
            self.train_model = NLPForwardForwardTrainer()
            self.load_data = AesopFablesDataLoaderOperation()
        else:
            # Guards against enum members added without a matching branch.
            raise ValueError(f"Unsupported model type: {model_type!r}")

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        batch_size: int,
        epochs: int,
        shuffle: bool,
        theta: float,
        device: str,
        output_size: int = None,
        **kwargs,
    ):
        """Build the model, load the data and train.

        Each step is skipped when its operation already holds a result.

        Args:
            input_size: Input size handed to the build operation.
            n_layers: Number of hidden layers.
            hidden_size: Width of the hidden layers.
            optimizer_name: Name of the optimizer.
            optimizer_params: Keyword arguments of the optimizer.
            loss_fn_name: Name of the loss function.
            batch_size: Batch size of the training loader.
            epochs: Number of training epochs.
            shuffle: Whether the training data is shuffled.
            theta: Goodness threshold.
            device: Device used for training.
            output_size: Output size handed to the build operation.
            **kwargs: Extra arguments forwarded to the trainer.
        """
        if self.build_model.get_result() is None:
            self.build_model.execute(
                input_size=input_size,
                n_layers=n_layers,
                hidden_size=hidden_size,
                optimizer_name=optimizer_name,
                optimizer_params=optimizer_params,
                loss_fn_name=loss_fn_name,
                output_size=output_size,
            )

        if self.load_data.get_result() is None:
            self.load_data.execute(batch_size=batch_size, shuffle=shuffle)

        if (
            self.build_model.get_result() is not None
            and self.load_data.get_result() is not None
        ):
            if self.train_model.get_result() is None:
                train_loader, test_loader = self.load_data.get_result()
                self.train_model.execute(
                    model=self.build_model.get_result(),
                    train_data=train_loader,
                    test_data=test_loader,
                    epochs=epochs,
                    theta=theta,
                    device=device,
                    **kwargs,
                )
            if self.train_model.get_result() is not None:
                self.state["model"] = self.train_model.get_result()

    def get_result(self):
        """Return the trained model, or None if it is not available."""
        return self.state.get("model")


================================================
FILE: optimization/forward_forward/forward_forward/utils/__init__.py
================================================


================================================
FILE: optimization/forward_forward/forward_forward/utils/labels.py
================================================
from typing import List

import torch


class LabelsInjector:
    """Append one-hot label encodings to flattened input images."""

    def __init__(self, labels: List):
        # Keep one one-hot vector of size len(labels) per class, stored in
        #  a list so that the class index selects its encoding.
        n_classes = len(labels)
        self.label_names = labels
        self.labels = []
        for class_index in range(n_classes):
            encoding = torch.nn.functional.one_hot(
                torch.tensor([class_index]), n_classes
            )
            self.labels.append(encoding.reshape(-1))

    @torch.no_grad()
    def inject_train(self, input_image: torch.Tensor, labels: torch.Tensor):
        """Build positive and negative training samples for a batch.

        Returns the flattened images with the true label appended, followed
        by the same images with another label appended, together with a
        sign vector holding 1 for the first half and -1 for the second.
        """
        batch_size = input_image.shape[0]
        flat_images = input_image.reshape(batch_size, -1)

        true_codes = torch.stack([self.labels[label] for label in labels])
        other_labels = select_random_different_label(
            labels, len(self.labels)
        )
        other_codes = torch.stack(
            [self.labels[label] for label in other_labels]
        )

        positives = torch.cat([flat_images, true_codes], dim=1)
        negatives = torch.cat([flat_images, other_codes], dim=1)
        images = torch.cat([positives, negatives], dim=0)
        signs = torch.cat(
            [torch.ones(batch_size), -torch.ones(batch_size)], dim=0
        )
        return images, signs

    @torch.no_grad()
    def inject_eval(self, input_image: torch.Tensor):
        """Pair every image of the batch with every label encoding.

        The result has shape (batch, n_labels, features + n_labels).
        """
        # TODO: FIX THIS BEHAVIOUR
        batch_size = input_image.shape[0]
        n_labels = len(self.labels)

        label_codes = torch.stack(self.labels).unsqueeze(0)
        label_codes = label_codes.repeat(batch_size, 1, 1)

        flat_images = input_image.reshape(batch_size, -1).unsqueeze(1)
        tiled_images = flat_images.repeat(1, n_labels, 1)

        return torch.cat([tiled_images, label_codes], dim=2)


def select_random_different_label(labels: torch.Tensor, n_classes: int):
    """Yield, for every label, a random class different from that label.

    The previous implementation iterated over ``enumerate(labels)`` and
    compared the sample against an ``(index, label)`` tuple, which never
    matches, so the true label could be returned as a "different" one.

    Args:
        labels: Integer class indices in ``[0, n_classes)``.
        n_classes: Total number of classes; must be at least 2.

    Yields:
        One 0-dimensional integer tensor per label, uniformly distributed
        over the classes other than that label.

    Raises:
        ValueError: If ``n_classes`` is smaller than 2, since no different
            class exists in that case.
    """
    if n_classes < 2:
        raise ValueError(
            "n_classes must be at least 2 to select a different label"
        )
    labels = torch.as_tensor(labels).to(torch.int64)
    # Adding an offset in [1, n_classes - 1] modulo n_classes reaches every
    # class except the original one, each with the same probability, and
    # needs no rejection loop.
    offsets = torch.randint(
        1, n_classes, labels.shape, device=labels.device
    )
    yield from (labels + offsets) % n_classes


================================================
FILE: optimization/forward_forward/forward_forward/utils/modules.py
================================================
from abc import ABC, abstractmethod
from typing import List

import torch
import torch.utils.data

from forward_forward.utils.utils import ProgressiveTrainingDataset


def loss_fn(y, theta, sign):
    """Linear Forward-Forward loss.

    The per-sample logit is the mean squared activation minus ``theta``.
    Returns the mean of ``-logit * sign`` and the mean logit as a float.
    """
    goodness = torch.square(y).mean(dim=1)
    margins = goodness - theta
    mean_margin = margins.detach().mean().item()
    return (-margins * sign).mean(), mean_margin


def probabilistic_loss_fn(y, theta, sign):
    """Log-probability Forward-Forward loss.

    The per-sample logit is the mean squared activation minus ``theta``;
    its sigmoid is read as a probability. Returns the mean of
    ``-log(prob + 1e-6) * sign`` and the mean logit as a float.
    """
    margins = torch.square(y).mean(dim=1) - theta
    mean_margin = margins.detach().mean().item()
    # The small constant keeps the logarithm finite for a zero probability.
    log_prob = torch.log(torch.sigmoid(margins) + 1e-6)
    return (-log_prob * sign).mean(), mean_margin


def alternative_loss_fn(y, theta, sign):
    """Softplus Forward-Forward loss, ``log(1 + exp(-sign * logit))``.

    The per-sample logit is the mean squared activation minus ``theta``.

    The loss is computed with ``softplus``, which is the same function but
    numerically stable. The previous ``log(1 + nan_to_num(exp(x)))`` form
    saturated once ``exp`` overflowed and lost its gradient there, i.e.
    exactly on the samples that violate the objective the most. NaN logits
    now propagate instead of being silently mapped to a loss of 0.

    Args:
        y: Activations of shape (batch, features).
        theta: Goodness threshold.
        sign: 1 for positive samples, -1 for negative ones.

    Returns:
        The mean loss (a tensor) and the mean logit (a float).
    """
    logits = y.pow(2).mean(dim=1) - theta
    accumulated_logits = logits.detach().mean().item()
    loss = torch.nn.functional.softplus(-logits * sign).mean()
    return loss, accumulated_logits


class BaseFFLayer(torch.nn.Module, ABC):
    """Interface shared by the layers used in Forward-Forward networks."""

    @property
    def requires_training(self):
        """Whether the layer has to be trained; True unless overridden."""
        return True

    @abstractmethod
    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Run one Forward-Forward training step on the given batch."""
        raise NotImplementedError

    @abstractmethod
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the layer on the given batch."""
        raise NotImplementedError


class FFLayer(BaseFFLayer):
    """Layer wrapper for efficient forward-forward layers."""

    def __init__(
        self,
        layer,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Wrap ``layer`` with its own optimizer and loss function.

        Args:
            layer: Module to train locally.
            optimizer_name: Name of a ``torch.optim`` optimizer class.
            optimizer_kwargs: Keyword arguments of the optimizer.
            loss_fn_name: ``"loss_fn"``, ``"alternative_loss_fn"`` or
                ``"probabilistic_loss_fn"``.

        Raises:
            ValueError: If ``loss_fn_name`` is not one of the names above.
        """
        super().__init__()
        self.layer = layer
        self.optimizer = getattr(torch.optim, optimizer_name)(
            layer.parameters(), **optimizer_kwargs
        )
        available_losses = {
            "loss_fn": loss_fn,
            "alternative_loss_fn": alternative_loss_fn,
            "probabilistic_loss_fn": probabilistic_loss_fn,
        }
        # Fail at construction time: an unknown name used to leave
        # ``self.loss_fn`` undefined until the first training step.
        if loss_fn_name not in available_losses:
            raise ValueError(
                f"Unknown loss function {loss_fn_name!r}; expected one of "
                f"{sorted(available_losses)}"
            )
        self.loss_fn = available_losses[loss_fn_name]

    def forward(self, x):
        return self.layer(x)

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Run one local optimization step.

        Args:
            input_tensor: Batch of inputs; detached before the forward pass
                so that no gradient reaches the previous layers.
            signs: Per-sample marker, 1 for positive and -1 for negative.
            theta: Goodness threshold.

        Returns:
            The detached activations computed before the weight update
            (rows whose sign is neither 1 nor -1 are zero) and the list
            ``[positive mean logit, negative mean logit]``.
        """
        y = self(input_tensor.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        y_pos = y[positive_mask]
        y_neg = y[negative_mask]

        loss_pos, cumulated_logits_pos = self.loss_fn(y_pos, theta, sign=1)
        loss_neg, cumulated_logits_neg = self.loss_fn(y_neg, theta, sign=-1)
        self.optimizer.zero_grad()
        loss = loss_pos + loss_neg
        loss.backward()
        self.optimizer.step()
        separation = [cumulated_logits_pos, cumulated_logits_neg]

        # Same dtype as the activations, so that the assignments below
        # also work when the layer does not run in the default dtype.
        output = torch.zeros(
            input_tensor.shape[0],
            *y_pos.shape[1:],
            device=input_tensor.device,
            dtype=y.dtype,
        )
        output[positive_mask] = y_pos.detach()
        output[negative_mask] = y_neg.detach()
        return output, separation

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the layer with the given input and theta."""
        y = self(input_tensor)
        return y, torch.square(y).mean(dim=1) - theta


class FFNormalization(BaseFFLayer):
    """Parameter-free layer dividing every sample by its L2 norm."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # The norm is computed over all the features of a sample; the small
        # constant avoids a division by zero for all-zero samples.
        l2_norm = (
            torch.norm(x.reshape(x.shape[0], -1), p=2, dim=1, keepdim=True)
            + 1e-8
        )
        return x / l2_norm

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Normalize the input; there is nothing to train.

        Returns the normalized input and None in place of the separation.
        """
        with torch.no_grad():
            # The input was previously not passed (``self()``), which
            # raised a TypeError on every call.
            output = self(input_tensor)
        return output, None

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Return the normalized input and a zero score per sample."""
        output = self(input_tensor)
        return output, torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )

    @property
    def requires_training(self):
        """False: the layer has no parameters to train."""
        return False


class LinearReLU(torch.nn.Module):
    """Affine projection (with bias) followed by a ReLU activation."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(
            in_features=in_features,
            out_features=out_features,
            bias=True,
        )
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return ``relu(linear(x))``."""
        projected = self.linear(x)
        activated = self.relu(projected)
        return activated


class FCNetFFProgressive(BaseFFLayer):
    """FCNet trained using forward-forward algorithm. The network is trained
    in a progressive manner, i.e. the first layer is trained, then the
    second layer, and so on.

    The network is an alternation of ``FFNormalization`` and ``FFLayer``
    (wrapping a ``LinearReLU``) modules, one pair per consecutive pair of
    entries in ``layer_sizes``.
    """

    def __init__(
        self,
        layer_sizes: list,
        optimizer_name: str,
        optimizer_kwargs: dict,
        epochs: int,
        loss_fn_name: str = "loss_fn",
    ):
        """Build the stack of normalization / trainable layers.

        Args:
            layer_sizes: Feature sizes; entry ``i`` and ``i + 1`` are the
                input and output size of the i-th linear layer.
            optimizer_name: Forwarded to every ``FFLayer``.
            optimizer_kwargs: Forwarded to every ``FFLayer``.
            epochs: Number of passes over the data per layer in
                ``progressive_train``.
            loss_fn_name: Forwarded to every ``FFLayer``.
        """
        super().__init__()
        self.epochs = epochs
        self.layers = torch.nn.ModuleList()
        for i in range(len(layer_sizes) - 1):
            self.layers.append(FFNormalization())
            self.layers.append(
                FFLayer(
                    LinearReLU(layer_sizes[i], layer_sizes[i + 1]),
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )

    def forward(self, x):
        """Apply every layer in order and return the final output."""
        for layer in self.layers:
            x = layer(x)
        return x

    def progressive_train(self, dl: torch.utils.data.DataLoader, theta: float):
        """Train the network in a progressive manner.

        Each layer reporting ``requires_training`` is trained for
        ``self.epochs`` passes over ``dl``. After every layer (trainable or
        not) the data is pushed through that layer and wrapped in a new,
        unshuffled data loader that feeds the next layer.

        Args:
            dl: Data loader yielding ``(data, signs)`` batches.
            theta: Threshold forwarded to each layer's ``ff_train``.
        """
        print("Training the network in a progressive manner.")
        for i, layer in enumerate(self.layers):
            if layer.requires_training:
                for epoch in range(self.epochs):
                    accumulated_separation = None
                    for j, (data, signs) in enumerate(dl):
                        data = data.to(self.device)
                        signs = signs.to(self.device)
                        _, separation = layer.ff_train(data, signs, theta)
                        if accumulated_separation is None:
                            accumulated_separation = separation
                        else:
                            accumulated_separation[0] += separation[0]
                            accumulated_separation[1] += separation[1]
                        if j % 100 == 0:
                            print(f"Epoch: {epoch}, Batch: {j}, Layer: {i}")
                    print(f"Epoch {epoch} of layer {i} done.")
                    # Summed per-batch statistics are divided by the number
                    # of samples in the dataset.
                    accumulated_separation[0] /= len(dl.dataset)
                    accumulated_separation[1] /= len(dl.dataset)
                    separation_ratio = (
                        accumulated_separation[0] - accumulated_separation[1]
                    ) / abs(max(accumulated_separation))
                    print("Goodness: ", accumulated_separation)
                    print(f"Accumulated separation: {separation_ratio}")
                print(f"Finished training layer {i} / {len(self.layers)}.")
            # create a new dataloader for the next layer
            dataset = ProgressiveTrainingDataset(
                (
                    (layer(x.to(self.device)), sign.to(self.device))
                    for x, sign in dl
                )
            )
            batch_size = dl.batch_size
            dl = torch.utils.data.DataLoader(
                dataset, batch_size=batch_size, shuffle=False
            )
        print("Finished training the network.")

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Train every layer once on the given batch, bottom to top.

        Args:
            input_tensor: Batch fed to the first layer.
            signs: Per-sample sign tensor forwarded to each layer.
            theta: Threshold forwarded to each layer.

        Returns:
            ``(output, accumulated_separation)`` where ``output`` is the
            output of the last layer and ``accumulated_separation`` is the
            element-wise sum of the two-element separation lists returned by
            the layers, or ``None`` if no layer reported one.
        """
        accumulated_separation = None
        for layer in self.layers:
            input_tensor, separation = layer.ff_train(
                input_tensor, signs, theta
            )
            if separation is None:
                # Layers without trainable state (FFNormalization) return
                # None; indexing it would raise once the accumulator is set.
                continue
            if accumulated_separation is None:
                accumulated_separation = separation
            else:
                accumulated_separation[0] += separation[0]
                accumulated_separation[1] += separation[1]
        return input_tensor, accumulated_separation

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the network with the given input and theta.

        Returns:
            ``(output, accumulated_goodness)``: the output of the last layer
            and the per-sample sum of the goodness values of all layers with
            index greater than 1 (the first normalization / trainable pair
            does not contribute).
        """
        accumulated_goodness = torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )
        for i, layer in enumerate(self.layers):
            input_tensor, goodness = layer.positive_eval(input_tensor, theta)
            if i > 1:
                accumulated_goodness += goodness
        return input_tensor, accumulated_goodness

    @property
    def device(self):
        """Device of the first parameter of the network."""
        return next(self.parameters()).device


class NormLinearReLU(torch.nn.Module):
    """``FFNormalization`` followed by a ``LinearReLU`` block."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.norm = FFNormalization()
        self.linear_relu = LinearReLU(in_features, out_features)

    def forward(self, x):
        """Normalize ``x`` and pass it through the linear + ReLU block."""
        normalized = self.norm(x)
        return self.linear_relu(normalized)


class RecurrentFFLayer(BaseFFLayer):
    """Hidden layer of the recurrent forward-forward network.

    A new state is computed from the concatenation of the two neighbouring
    states and blended with the layer's own previous state.
    """

    def __init__(
        self,
        hidden_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        # The inner block consumes two concatenated states of ``hidden_size``.
        self.layer = NormLinearReLU(2 * hidden_size, hidden_size)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.layer.parameters(), **optimizer_kwargs
        )
        # NOTE(review): ``eval`` resolves the loss function from its name;
        # confirm ``loss_fn_name`` never comes from untrusted input.
        self.loss_fn = eval(loss_fn_name)

    def forward(self, x_prev, x_same, x_next):
        """Return ``0.3 * x_same + 0.7 * layer(cat(x_prev, x_next))``."""
        neighbours = torch.cat((x_prev, x_next), dim=1)
        return 0.3 * x_same + 0.7 * self.layer(neighbours)

    def ff_train(
        self,
        x_prev: torch.Tensor,
        x_same: torch.Tensor,
        x_next: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimizer step on the batch.

        The three states are detached before the forward pass. Samples with
        sign ``1`` and ``-1`` are scored separately by the loss function and
        the two losses are summed.

        Returns:
            ``(new_x, [goodness_pos, goodness_neg])``.
        """
        new_x = self(x_prev.detach(), x_same.detach(), x_next.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        loss_pos, goodness_pos = self.loss_fn(new_x[positive_mask], theta, 1)
        loss_neg, goodness_neg = self.loss_fn(new_x[negative_mask], theta, -1)
        total_loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        return new_x, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(
        self,
        x_prev: torch.Tensor,
        x_same: torch.Tensor,
        x_next: torch.Tensor,
        theta: float,
    ):
        """Return the new state and its mean squared activation minus theta."""
        new_x = self(x_prev, x_same, x_next)
        return new_x, torch.square(new_x).mean(dim=1) - theta


class RecurrentProjectionFFLayer(BaseFFLayer):
    """Forward-forward layer projecting ``input_size`` to ``output_size``.

    The projection is a ``NormLinearReLU`` block with its own optimizer.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        self.layer = NormLinearReLU(input_size, output_size)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.layer.parameters(), **optimizer_kwargs
        )
        # NOTE(review): ``eval`` resolves the loss function from its name;
        # confirm ``loss_fn_name`` never comes from untrusted input.
        self.loss_fn = eval(loss_fn_name)

    def forward(self, x: torch.Tensor):
        """Apply the wrapped projection block to ``x``."""
        projected = self.layer(x)
        return projected

    def ff_train(
        self,
        x: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimizer step on the batch.

        The input is detached before the forward pass. Samples with sign
        ``1`` and ``-1`` are scored separately by the loss function and the
        two losses are summed.

        Returns:
            ``(new_x, [goodness_pos, goodness_neg])``.
        """
        new_x = self(x.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        loss_pos, goodness_pos = self.loss_fn(new_x[positive_mask], theta, 1)
        loss_neg, goodness_neg = self.loss_fn(new_x[negative_mask], theta, -1)
        total_loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        return new_x, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor, theta: float):
        """Return the projection and its mean squared activation minus theta."""
        new_x = self(x)
        return new_x, torch.square(new_x).mean(dim=1) - theta


class RecurrentProjectedSoftmaxFFLayer(BaseFFLayer):
    """Normalization, linear projection and softmax over dim 1.

    Only the linear projection is handed to the optimizer.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        # NOTE(review): ``eval`` resolves the loss function from its name;
        # confirm ``loss_fn_name`` never comes from untrusted input.
        self.loss_fn = eval(loss_fn_name)
        self.norm = FFNormalization()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.linear.parameters(), **optimizer_kwargs
        )

    def forward(self, x: torch.Tensor):
        """Return ``softmax(linear(norm(x)))``."""
        return self.softmax(self.linear(self.norm(x)))

    def ff_train(
        self,
        x: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimizer step on the batch.

        The input is detached before the forward pass. Samples with sign
        ``1`` and ``-1`` are scored separately by the loss function and the
        two losses are summed.

        Returns:
            ``(new_x, [goodness_pos, goodness_neg])``.
        """
        new_x = self(x.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        loss_pos, goodness_pos = self.loss_fn(new_x[positive_mask], theta, 1)
        loss_neg, goodness_neg = self.loss_fn(new_x[negative_mask], theta, -1)
        total_loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        return new_x, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor, theta: float):
        """Return the softmax output and its mean squared value minus theta."""
        new_x = self(x)
        return new_x, torch.square(new_x).mean(dim=1) - theta


class RecurrentFCNetFF(BaseFFLayer):
    """Recurrent FCNet trained using forward-forward algorithm.

    The network keeps a list of states: the projected input, one state per
    recurrent layer and, last, the (one-hot) label. At every time step each
    recurrent layer is updated from the state below it, its own state and
    the state above it; for the top layer the state above is the label
    projected back to the hidden size by ``proj_y``.
    """

    def __init__(
        self,
        layer_sizes: list,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Build projector, recurrent layers and label projections.

        Args:
            layer_sizes: ``layer_sizes[0]`` is the input size,
                ``layer_sizes[-1]`` the number of labels and the entries in
                between the hidden sizes.
                NOTE(review): the recurrent layers concatenate neighbouring
                states into ``2 * hidden_size`` features, so the hidden sizes
                presumably have to be equal -- confirm.
            optimizer_name: Forwarded to every sub-layer.
            optimizer_kwargs: Forwarded to every sub-layer.
            loss_fn_name: Forwarded to every sub-layer.
        """
        super().__init__()
        # Number of recurrent iterations performed by ff_train.
        self.time_steps = 8
        # Number of recurrent iterations performed by positive_eval.
        self.test_time_steps = 8
        # Time-step indices whose goodness positive_eval accumulates. The
        # evaluation loop counts from 0, so step 0 is not accumulated.
        self.storable_time_steps = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        # self.storable_time_steps = [3, 4, 5]
        # Not read or written by any other method of this class.
        self.states = []
        self.layers = torch.nn.ModuleList()
        # Maps the raw input to the first hidden size.
        self.projector = RecurrentProjectionFFLayer(
            layer_sizes[0],
            layer_sizes[1],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        for i in range(1, len(layer_sizes) - 1):
            self.layers.append(
                RecurrentFFLayer(
                    layer_sizes[i],
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )
        # Maps a label vector back to the last hidden size.
        self.proj_y = RecurrentProjectionFFLayer(
            layer_sizes[-1],
            layer_sizes[-2],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        # Maps the last hidden state to a distribution over the labels.
        self.softmax = RecurrentProjectedSoftmaxFFLayer(
            layer_sizes[-2],
            layer_sizes[-1],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        self.num_labels = layer_sizes[-1]

    @property
    def device(self):
        """Device of the first parameter of the network."""
        return next(self.parameters()).device

    @torch.no_grad()
    def bottom_up(self, x: torch.Tensor, y: torch.Tensor):
        """Initialise the states with one bottom-up pass and sample negatives.

        Args:
            x: Input batch.
            y: Label batch; ``argmax`` over dim 1 is taken as the class index
                (assumes one-hot rows -- TODO confirm).

        Returns:
            ``(states, neg_samples)``: ``states`` holds the projected input,
            the output of every recurrent layer and finally ``y``;
            ``neg_samples`` is a one-hot tensor of randomly sampled labels.
        """
        states = []
        x_proj = self.projector(x)

        for layer in self.layers:
            states.append(x_proj)
            # No same-level or top-down information exists yet, so zeros are
            # passed for ``x_same`` and ``x_next``.
            x_proj = layer(
                x_proj,
                torch.zeros_like(x_proj, device=self.device),
                torch.zeros_like(x_proj, device=self.device),
            )
        states.append(x_proj)
        states.append(y)
        y_arg = torch.argmax(y, dim=1)
        # Work on a copy: the entry of the top state at the label index is
        # overwritten with a large negative value before the softmax layer.
        x_proj_ = x_proj.clone()
        x_proj_[torch.arange(x_proj.shape[0]), y_arg] = -1e6
        neg_prob = self.softmax(x_proj_)
        # Sample one label per row: compare the cumulative distribution with
        # a uniform draw and take the argmax of the resulting 0/1 mask.
        cumulative_neg_prob = torch.cumsum(neg_prob, dim=1)
        neg_samples = torch.argmax(
            1.0
            * (
                cumulative_neg_prob > torch.rand(x.shape[0], 1).to(self.device)
            ),
            dim=1,
        )
        neg_samples = torch.functional.F.one_hot(
            neg_samples, num_classes=self.num_labels
        )
        return states, neg_samples

    def forward(self, x: torch.Tensor, prev_states: List[torch.Tensor]):
        """Compute the next list of states from ``prev_states``.

        Args:
            x: Input batch.
            prev_states: States of the previous time step, laid out as
                returned by ``bottom_up`` (last entry is the label).

        Returns:
            The new list of states; its last entry is the output of the
            softmax layer applied to the top hidden state.
        """
        x_proj = self.projector(x)
        new_states = []
        for i, layer in enumerate(self.layers):
            if i < len(self.layers) - 1:
                next_state = prev_states[i + 2]
            else:
                # Top layer: the state above is the label, projected back to
                # the hidden size.
                next_state = self.proj_y(prev_states[i + 2].float())
            new_states.append(x_proj)
            x_proj = layer(prev_states[i], prev_states[i + 1], next_state)
        new_states.append(x_proj)
        y = self.softmax(x_proj)
        new_states.append(y)
        return new_states

    def ff_train(
        self, input_tensor: torch.Tensor, labels: torch.Tensor, theta: float
    ):
        """Train the network with the given target.

        Positive states are built from ``labels`` and negative states from
        labels sampled by ``bottom_up``; both are concatenated along the
        batch dimension. The projector is trained once, then every recurrent
        layer is trained at each of ``self.time_steps`` iterations, and the
        negative labels are re-sampled after every iteration.

        Returns:
            ``(states, accumulated_goodness)``: the final states restricted
            to the positive half of the batch, and the two-element goodness
            list divided by the number of layer updates
            (``time_steps * len(layers) + 1``).
        """
        with torch.no_grad():
            states, neg_samples = self.bottom_up(input_tensor, labels)
            neg_states, _ = self.bottom_up(input_tensor, neg_samples)
            # First half of the batch: positive samples, second half:
            # negative samples.
            states = [
                torch.cat([s, ns], dim=0) for s, ns in zip(states, neg_states)
            ]
            signs = torch.cat(
                [
                    torch.ones(input_tensor.shape[0], device=self.device),
                    -torch.ones(input_tensor.shape[0], device=self.device),
                ],
                dim=0,
            )
            input_tensor = torch.cat([input_tensor, input_tensor], dim=0)
        # states have been created, now we can train the network
        x_proj, accumulated_goodness = self.projector.ff_train(
            input_tensor, signs, theta
        )
        for _ in range(self.time_steps):
            new_states = []
            x = x_proj
            for j, layer in enumerate(self.layers):
                if j < len(self.layers) - 1:
                    next_state = states[j + 2]
                else:
                    # Top layer: use the projected label as the state above.
                    next_state = self.proj_y(states[j + 2].float())
                new_states.append(x)
                # Every layer is updated from the states of the previous
                # time step, not from the ones computed in this iteration.
                x, goodnesses = layer.ff_train(
                    states[j], states[j + 1], next_state, signs, theta
                )
                accumulated_goodness[0] += goodnesses[0]
                accumulated_goodness[1] += goodnesses[1]
            new_states.append(x)
            with torch.no_grad():
                # Re-sample the negative labels from the previous top state
                # of the negative samples, excluding the true label.
                x_ = states[-2][torch.where(signs == -1)]
                real_y = states[-1][torch.where(signs == 1)]
                x_[
                    torch.arange(x_.shape[0]), torch.argmax(real_y, dim=1)
                ] = -1e6
                y = self.softmax(x_)
                cumulative_y = torch.cumsum(y, dim=1)
                neg_samples = torch.argmax(
                    1.0
                    * (
                        cumulative_y
                        > torch.rand(x_.shape[0], 1).to(self.device)
                    ),
                    dim=1,
                )
                neg_samples = torch.functional.F.one_hot(
                    neg_samples, num_classes=self.num_labels
                )
                # replace just negative samples
                next_labels = states[-1].clone()
                next_labels[torch.where(signs == -1)] = neg_samples
                new_states.append(next_labels)
            states = new_states
        # One projector update plus one update per layer and time step.
        accumulated_goodness[0] /= self.time_steps * len(self.layers) + 1
        accumulated_goodness[1] /= self.time_steps * len(self.layers) + 1
        with torch.no_grad():
            # ``input_tensor`` was doubled above, so this keeps the positive
            # half of every state.
            states = [t[: input_tensor.shape[0] // 2] for t in states]
        return states, accumulated_goodness

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the network with the given input and theta.

        Every input is paired with each of the ``num_labels`` one-hot labels
        and the goodness of each pair is accumulated over the time steps
        listed in ``storable_time_steps``.

        Args:
            input_tensor: Input batch (assumes shape ``(batch, features)``
                -- TODO confirm).
            theta: Threshold forwarded to each layer's ``positive_eval``.

        Returns:
            ``(prediction, accumulated_goodness)``: the index of the label
            with the highest accumulated goodness per input, and the
            goodness reshaped to ``(batch, num_labels)``.
        """
        labels = torch.arange(0, self.num_labels, device=self.device)
        labels = torch.functional.F.one_hot(
            labels, num_classes=self.num_labels
        )
        original_bs = input_tensor.shape[0]
        # Repeat each input once per label ...
        input_tensor = (
            input_tensor.unsqueeze(1)
            .repeat(1, self.num_labels, 1)
            .reshape(-1, input_tensor.shape[-1])
        )
        # ... and tile the full set of labels once per input.
        labels = (
            labels.unsqueeze(0)
            .repeat(original_bs, 1, 1)
            .reshape(-1, labels.shape[-1])
        )

        states, _ = self.bottom_up(input_tensor, labels)
        x_proj, goodness = self.projector.positive_eval(input_tensor, theta)
        accumulated_goodness = goodness

        for time_step in range(self.test_time_steps):
            new_states = []
            x = x_proj
            for j, layer in enumerate(self.layers):
                if j < len(self.layers) - 1:
                    next_state = states[j + 2]
                else:
                    next_state = self.proj_y(states[j + 2].float())
                new_states.append(x)
                x, goodnesses = layer.positive_eval(
                    states[j], states[j + 1], next_state, theta
                )
                if time_step in self.storable_time_steps:
                    accumulated_goodness += goodnesses
            new_states.append(x)
            if time_step in self.storable_time_steps:
                _, goodness = self.softmax.positive_eval(x, theta)
                accumulated_goodness += goodness
            # The candidate labels stay fixed across time steps.
            new_states.append(states[-1])
            states = new_states
        accumulated_goodness = accumulated_goodness.reshape(
            original_bs, self.num_labels
        )
        prediction = torch.argmax(accumulated_goodness, dim=1)
        return prediction, accumulated_goodness


class LMFFLinearSoftmax(BaseFFLayer):
    """Token prediction head: normalization, linear layer and softmax.

    Unlike the other forward-forward layers, this head is trained with a
    negative log-likelihood loss on the positive samples only.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
    ):
        """Create the head and its optimizer.

        Args:
            input_size: Number of input features.
            output_size: Number of classes (tokens).
            optimizer_name: Name of a class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments for the optimizer.
        """
        super().__init__()
        self.loss_fn = torch.nn.NLLLoss()
        self.norm = FFNormalization()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.parameters(), **optimizer_kwargs
        )

    def forward(self, x: torch.Tensor):
        """Return the class probabilities ``softmax(linear(norm(x)))``."""
        x = self.norm(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x

    def ff_train(
        self,
        input_tensor: torch.Tensor,
        labels: torch.Tensor,
        signs: torch.Tensor,
    ):
        """Run one optimizer step using the positive samples.

        Args:
            input_tensor: Batch containing positive and negative samples.
            labels: Per-sample label vectors; ``argmax`` over dim 1 is used
                as the target class.
            signs: ``1`` marks positive samples, ``-1`` negative ones.

        Returns:
            ``(new_x, loss)``: the class probabilities for the whole batch
            (positives computed before the update, negatives after it) and
            the loss as a Python float.
        """
        positive_idx = torch.where(signs == 1)
        negative_idx = torch.where(signs == -1)
        x = self(input_tensor[positive_idx])
        y = labels[positive_idx]
        # NLLLoss expects log-probabilities, while ``forward`` returns
        # probabilities. The clamp keeps the logarithm finite when the
        # softmax underflows to exactly zero.
        log_probs = torch.log(x.clamp_min(1e-12))
        loss = self.loss_fn(log_probs, torch.argmax(y, dim=1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        with torch.no_grad():
            new_y_neg = self(input_tensor[negative_idx])
            new_x = torch.zeros(
                len(input_tensor), *x.shape[1:], device=input_tensor.device
            )
            new_x[positive_idx] = x
            new_x[negative_idx] = new_y_neg
        return new_x, loss.item()

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor):
        """Return the class probabilities for ``x`` without gradients."""
        pred = self(x)
        return pred


class LMFFNet(BaseFFLayer):
    """Language model assembled from forward-forward layers.

    A flattened window of ``seq_len`` token vectors of size ``token_num`` is
    embedded by ``token2emb``, passed through ``n_layers`` hidden layers and
    the concatenation of all hidden outputs is mapped to token probabilities
    by ``emb2token``.
    """

    def __init__(
        self,
        token_num: int,
        hidden_size: int,
        n_layers: int,
        seq_len: int,
        predicted_tokens: int,
        epochs: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        super().__init__()
        self.token_num = token_num
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.predicted_tokens = predicted_tokens
        self.epochs = epochs
        self.token2emb = RecurrentProjectionFFLayer(
            token_num * seq_len,
            hidden_size,
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        self.layers = torch.nn.ModuleList()
        for _ in range(n_layers):
            self.layers.append(
                FFLayer(
                    NormLinearReLU(hidden_size, hidden_size),
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )
        # The head sees the outputs of all hidden layers side by side.
        self.emb2token = LMFFLinearSoftmax(
            n_layers * hidden_size, token_num, optimizer_name, optimizer_kwargs
        )

    def forward(self, input_tensor: torch.Tensor):
        """Return the token probabilities predicted for ``input_tensor``."""
        hidden = self.token2emb(input_tensor)
        layer_outputs = []
        for layer in self.layers:
            hidden = layer(hidden)
            layer_outputs.append(hidden)
        return self.emb2token(torch.cat(layer_outputs, dim=1))

    def ff_train(
        self,
        input_tensor: torch.Tensor,
        prev_pred: torch.Tensor,
        labels: torch.Tensor,
        theta: float,
    ):
        """Train embedding, hidden layers and head, one after the other.

        ``input_tensor`` provides the positive samples and ``prev_pred`` the
        negative ones. Each component is trained for ``self.epochs`` steps on
        the same batch before the next component is trained on its output.

        Returns:
            ``(next_input, accumulated_goodness)``: the positive windows
            shifted left by one token with the predicted token written into
            the last slot, and the summed two-element goodness list.
        """
        batch_size = input_tensor.shape[0]
        device = input_tensor.device
        signs = torch.cat(
            [
                torch.ones(batch_size, device=device),
                -torch.ones(batch_size, device=device),
            ]
        )
        input_tensor = torch.cat([input_tensor, prev_pred], dim=0)
        labels = torch.cat([labels, labels], dim=0)

        # Stage 1: the embedding layer.
        for idx in range(self.epochs):
            hidden, goodness = self.token2emb.ff_train(
                input_tensor, signs, theta
            )
            if idx % 20 == 0:
                print(f"Epoch {idx}: {goodness}")
        accumulated_goodness = goodness

        # Stage 2: the hidden layers; each one is trained on the last output
        # of the layer below it.
        layer_outputs = []
        for layer in self.layers:
            for epoch in range(self.epochs):
                layer_out, goodness = layer.ff_train(hidden, signs, theta)
                if epoch % 20 == 0:
                    print(f"Epoch {epoch}: {goodness}")
            hidden = layer_out
            layer_outputs.append(hidden)
            accumulated_goodness[0] += goodness[0]
            accumulated_goodness[1] += goodness[1]

        # Stage 3: the token head on the concatenated hidden outputs.
        features = torch.cat(layer_outputs, dim=1)
        for epoch in range(self.epochs):
            token_probs, loss = self.emb2token.ff_train(
                features, labels, signs
            )
            if epoch % 20 == 0 or epoch < 20:
                print(f"Epoch {epoch}: {loss}")

        positive = signs == 1
        last_token = slice(-self.token_num, None)
        next_input = input_tensor[positive].roll(-self.token_num, dims=1)
        predicted_ids = torch.argmax(token_probs[positive], dim=1)
        next_input[:, last_token] = torch.nn.functional.one_hot(
            predicted_ids, num_classes=self.token_num
        )
        return next_input, accumulated_goodness

    def LM_ff_train(self, input_tensor: torch.Tensor, theta: float):
        """Build labels and negative samples from the data, then train.

        The input is reshaped to rows of ``token_num * seq_len`` values. The
        label of a row is the last token of the following row. Negative
        samples are a rolled copy of the input whose last token is replaced
        by the model's current prediction.

        Returns:
            The accumulated goodness returned by ``ff_train``.
        """
        with torch.no_grad():
            window_size = self.token_num * self.seq_len
            last_token = slice(-self.token_num, None)
            input_tensor = input_tensor.reshape(-1, window_size)
            labels = input_tensor[:, last_token].roll(-1, dims=0)
            temp = torch.argmax(labels, dim=1)
            print(temp.shape, torch.sum(temp == 0))
            predicted_ids = torch.argmax(self(input_tensor), dim=1)
            new_char = torch.nn.functional.one_hot(
                predicted_ids, num_classes=self.token_num
            )
            prev_pred = input_tensor.clone().roll(1)
            prev_pred[:, last_token] = new_char
        result = self.ff_train(input_tensor, prev_pred, labels, theta)
        return result[1]

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Predict ``self.predicted_tokens`` tokens autoregressively.

        After every prediction the window is shifted left by one token and
        the predicted token is written into the last slot.

        Returns:
            ``(prediction, cumulated_goodness)``: the token probabilities of
            every step, shaped ``(batch, predicted_tokens, token_num)``, and
            the goodness summed over all layers and steps divided by
            ``predicted_tokens``.
        """
        batch_size = input_tensor.shape[0]
        device = input_tensor.device
        cumulated_goodness = torch.zeros(batch_size, device=device)
        prediction = torch.zeros(
            batch_size, self.predicted_tokens, self.token_num, device=device
        )
        last_token = slice(-self.token_num, None)
        for step in range(self.predicted_tokens):
            hidden, goodness = self.token2emb.positive_eval(
                input_tensor, theta
            )
            cumulated_goodness += goodness
            layer_outputs = []
            for layer in self.layers:
                hidden, goodness = layer.positive_eval(hidden, theta)
                layer_outputs.append(hidden)
                cumulated_goodness += goodness
            token_probs = self.emb2token.positive_eval(
                torch.cat(layer_outputs, dim=1)
            )
            prediction[:, step] = token_probs
            input_tensor = input_tensor.roll(-self.token_num, dims=1)
            input_tensor[:, last_token] = torch.nn.functional.one_hot(
                torch.argmax(token_probs, dim=1), num_classes=self.token_num
            )
        cumulated_goodness /= self.predicted_tokens
        return prediction, cumulated_goodness


================================================
FILE: optimization/forward_forward/forward_forward/utils/utils.py
================================================
# The ABCs live in ``collections.abc``; the aliases in ``collections`` were
# removed in Python 3.10, where the old import raises ImportError.
from collections.abc import Generator

import torch.utils.data


class ProgressiveTrainingDataset(torch.utils.data.Dataset):
    """Dataset for progressive training.

    The batches produced by ``dataset_generator`` are consumed eagerly and
    split into individual ``(sample, sign)`` pairs that are kept in memory.
    """

    def __init__(self, dataset_generator: Generator):
        """Materialize the dataset.

        Args:
            dataset_generator: Iterable yielding ``(data, sign)`` batches;
                both elements are iterated in lockstep along their first
                dimension.
        """
        # The generator is consumed inside ``no_grad`` so that any lazy
        # computation it performs does not record an autograd graph.
        with torch.no_grad():
            self.internal_dataset = [
                sample
                for data, sign in dataset_generator
                for sample in zip(data, sign)
            ]

    def __getitem__(self, index):
        """Return the ``(sample, sign)`` pair stored at ``index``."""
        return self.internal_dataset[index]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.internal_dataset)


def compute_perplexity(tensor: torch.Tensor):
    """Compute perplexity of a tensor. The tensor has shape (batch_size,
    sequence_length, vocab_size).
    The softmax has already been computed over the vocab dimension.

    The value returned is the exponential of the entropy of each
    distribution along the last dimension, averaged over all remaining
    dimensions.
    """
    # xlogy(p, p) equals p * log(p) but is defined as 0 where p == 0, so
    # distributions containing exact zeros (e.g. one-hot rows) do not
    # produce NaN.
    entropy = -torch.xlogy(tensor, tensor).sum(dim=-1)
    return torch.exp(entropy).mean()


================================================
FILE: optimization/forward_forward/requirements.txt
================================================
torch>=1.9
torchvision>=0.10
nebullvm>=0.6


================================================
FILE: optimization/forward_forward/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages


# Runtime dependencies handed to setuptools as ``install_requires``.
REQUIREMENTS = [
    "torch>=1.9",
    "torchvision>=0.10",
    "nebullvm>=0.6",
]

# The README sitting next to this file is used as the long description.
this_directory = Path(__file__).parent
readme_path = this_directory / "README.md"
long_description = readme_path.read_text(encoding="utf8")

setup(
    name="forward_forward",
    version="0.0.1",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIREMENTS,
)


================================================
FILE: optimization/large_speedster/README.md
================================================
# ⚡ LargeSpeedster App (WIP)
Automatically apply SOTA optimization techniques on large AI models to achieve the maximum acceleration on your hardware.

If you like this App, give us a star to show your support for the project ⭐

## 📚 Description
The LargeSpeedster App is a powerful tool to optimize large AI models (LMs). Leveraging state-of-the-art open-source optimization tools, LargeSpeedster enables the acceleration of large models, i.e. models with a number of parameters in excess of what could be stored on a single GPU. The workflow consists of three steps: select, search, and serve.

In the select step, users input their large model in their preferred deep learning framework and express their preferences regarding the maximum acceptable accuracy loss. This information is used to guide the optimization process and ensure that the resulting model meets the user's needs.

In the search step, the App automatically tests multiple LMs-specific optimization techniques across the software-to-hardware stack, such as SmoothQuant quantization, FlashAttention, and inference-specific kernels. The App also tunes the optimal parallelization strategy and its configuration parameters, allowing it to find the optimal configuration of techniques for accelerating the model.

Finally, in the serve step, the App returns an accelerated version of the user's model in the DL framework of choice, providing a significant boost in performance.

Overall, LargeSpeedster is an easy-to-use tool that allows users to optimize their large AI models and get the most out of their software-to-hardware stack. Try it out today, and reach out if you have any feedback!


================================================
FILE: optimization/nebullvm/.pre-commit-config.yaml
================================================
repos:
  - repo: https://github.com/ambv/black
    rev: 22.3.0
    hooks:
      - id: black
        args: [--line-length=79]

  - repo: https://github.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        args: [--exclude=nebullvm/tools/diffusers.py]


================================================
FILE: optimization/nebullvm/CONTRIBUTING.md
================================================
# Guidelines for Contributing to Nebullvm 🚀

Hello coder 👋

We are very happy that you have decided to contribute to the library and we thank you for your efforts. Here you can find guidelines on how to standardize your code with the style we adopted for `nebullvm`. But remember, there are various ways to help the community other than submitting code contributions: answering questions and improving the documentation are also very valuable.

It also helps us if you mention our library in your blog posts to show off the cool things it's made possible, or just give the repository a ⭐️ to show us that you appreciate the project.

This guide was inspired by the awesome [Transformers](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) guide to contributing.

We hope to come across your pull request soon!

Happy coding 💫 The nebullvm Team


## How to submit an issue
Did you spot a bug? Did you come up with a cool idea that you think should be implemented in nebullvm? Well, GitHub issues are the best way to let us know!

We don't have a strict policy on issue generation: just use a meaningful title and describe the problem or your proposal in the first comment. Then, you can use GitHub labels to let us know what kind of proposal you are making, for example `bug` if you are reporting a bug or `enhancement` if you are proposing a library improvement.

## How to contribute to solve an issue
We are always delighted to welcome other people to the contributors section of nebullvm! We are looking forward to welcoming you to the community, here are some guidelines to follow:
1. Please [fork](https://github.com/nebuly-ai/nebullvm/fork) the [library](https://github.com/nebuly-ai/nebullvm) by clicking on the Fork button on the repository's page. This will create a copy of the repository in your GitHub account.
2. Clone your fork to your local machine, and add the base repository as a remote:
    ```bash
    $ git clone git@github.com:<your Github handle>/nebullvm.git
    $ cd nebullvm
    $ git remote add upstream https://github.com/nebuly-ai/nebullvm.git
    ```
3. Install the library in editable mode with the following command:
    ```bash
    $ pip install -e .
    ```
4. Work on your fork to develop the feature you have in mind.
5. Nebullvm relies on `black` to format its source code consistently. To use the formatting style defined for nebullvm, run the following commands:
    ```bash
    $ pip install pre-commit black autoflake
    $ pre-commit install
    # the following command is optional, but needed if you have already 
    # committed some files to your forked repo.
    $ pre-commit run --all-files
    ```
    As for the naming convention, we follow [PEP 8](https://peps.python.org/pep-0008/) for code and a slight variation of [Google convention](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for docstrings. For docstrings we redundantly express the input type in both the function definition and the function docstring.
6. Once you're happy with your changes, add changed files with git add and commit your code:
    ```bash
    $ git add edited_file.py
    $ git commit -m "Add a cool feature"
    ```
7. Push your changes to your repo:
    ```bash
    $ git push
    ```
8. Now you can go to the repo you have forked on your github profile and press on **Pull Request** to open a pull request. In the pull request specify which problems it is solving. For instance, if the pull request solves `Issue #1`, the comment should be `Closes #1`. Also make the title of the pull request meaningful and self-explanatory.
---

See you soon in the list of nebullvm contributors 🌈


================================================
FILE: optimization/nebullvm/Dockerfile
================================================
# Image used to build and run nebullvm. The base image defaults to the
# NVIDIA TensorRT container and can be overridden through the STARTING_IMAGE
# build argument.
ARG STARTING_IMAGE=nvcr.io/nvidia/tensorrt:23.03-py3
FROM ${STARTING_IMAGE}

WORKDIR /

# Set frontend as non-interactive
ARG DEBIAN_FRONTEND=noninteractive

# Refresh the package index and upgrade the packages shipped with the base
# image.
RUN apt-get -y update && apt-get -y upgrade

# Install ffmpeg and the libsm6 / libxext6 system libraries.
RUN apt-get install ffmpeg libsm6 libxext6  -y

# Install sudo and wget
RUN apt-get install -y sudo wget

# Upgrade pip, then install the Python packages: torch and torchvision (the
# cu118 PyTorch wheel index is added as an extra index), tensorflow,
# xformers, accelerate and the tensorrt Python package.
RUN python3 -m pip install --upgrade pip \
    && pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118  \
    && pip install --no-cache-dir tensorflow \
    && pip install --no-cache-dir xformers \
    && pip install --no-cache-dir accelerate \
    && python3 -m pip install --no-cache-dir --upgrade tensorrt

# Copy the working dir to the container
# NOTE(review): the source path points above the build context; confirm that
# it resolves as intended with the build command actually used.
COPY ../.. /nebullvm

# Install nebullvm
# With NEBULLVM_VERSION set to "latest" (the default), nebullvm and speedster
# are installed from the source tree copied above, which is then removed from
# the image. Any other value installs that pinned nebullvm release with pip;
# note that the copied /nebullvm tree is not removed in that branch.
ARG NEBULLVM_VERSION=latest
RUN if [ "$NEBULLVM_VERSION" = "latest" ] ; then \
        cd nebullvm ; \
        pip install . ; \
        cd apps/accelerate/speedster ; \
        pip install . ; \
        cd ../../../.. ; \
        rm -rf nebullvm ; \
    else \
        pip install --no-cache-dir nebullvm==${NEBULLVM_VERSION} ; \
    fi

# Install required python modules
RUN pip install --no-cache-dir cmake

# Install default deep learning compilers
# COMPILER selects what the nebullvm auto installer sets up: "all" or one of
# "tensorrt", "openvino", "onnxruntime". The value "tvm" matches none of the
# branches below, so this step does nothing for it; TVM is handled by the
# next step.
ARG COMPILER=all
RUN if [ "$COMPILER" = "all" ] ; then \
        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers all ; \
    elif [ "$COMPILER" = "tensorrt" ] ; then \
        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers tensorrt ; \
    elif [ "$COMPILER" = "openvino" ] ; then \
        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers openvino ; \
    elif [ "$COMPILER" = "onnxruntime" ] ; then \
        python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers onnxruntime ; \
    fi

# Install TVM
# Runs when COMPILER is "all" or "tvm". The wheel is a prebuilt tlcpack
# release whose file name pins it to cu116 and cp38. The final python3 -c
# import fails the build if the installed TVM cannot be imported.
RUN if [ "$COMPILER" = "all" ] || [ "$COMPILER" = "tvm" ] ; then \
        pip install --no-cache-dir https://github.com/tlc-pack/tlcpack/releases/download/v0.11.1/tlcpack_cu116-0.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ; \
        pip install --no-cache-dir xgboost ; \
        python3 -c "from tvm.runtime import Module" ; \
    fi

# Runtime environment. The LD_LIBRARY_PATH entry hard-codes the python3.8
# dist-packages location of the tensorrt package.
ENV SIGOPT_PROJECT="tmp"
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/tensorrt
ENV CUDA_MODULE_LOADING="LAZY"


================================================
FILE: optimization/nebullvm/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: optimization/nebullvm/MANIFEST.in
================================================
recursive-include nebullvm/installers/tvm_installers *.cmake
recursive-include nebullvm/installers *.sh

================================================
FILE: optimization/nebullvm/README.md
================================================
<p align="center">
<br><br><br>
<a href="https://docs.nebuly.com/welcome/quick-start"><img src="https://user-images.githubusercontent.com/83510798/208247207-861541f0-b968-484c-8a0c-0fb110399c16.png" width="400px"></a>
<br><br><br>
</p>

<p align="center">
<b>A framework for building optimization modules to boost the performances of your AI systems</b>
</p>

<p align=center>
<a href="https://pypi.org/project/nebullvm/"><img src="https://badge.fury.io/py/nebullvm.svg"></a>
<a href="https://pypistats.org/packages/nebullvm"><img src="https://pepy.tech/badge/nebullvm"></a>
<a href="https://discord.gg/77d5kGSa8e"><img src="https://img.shields.io/badge/Discord-1.1k-blueviolet?logo=discord&amp;logoColor=white&style=round"></a>
<a href="https://twitter.com/nebuly_ai"><img src="https://img.shields.io/twitter/url.svg?label=Follow%20%40nebuly_ai&style=social&url=https%3A%2F%2Ftwitter.com-nebuly_ai"></a>
</p>
  
---

**Documentation**: <a href="https://docs.nebuly.com/" target="_blank"> docs.nebuly.com/ </a>

---

`Nebullvm` is a framework for building the optimization modules needed to optimize the performances of your AI systems. The optimization modules are stack-agnostic and work with any library. They are designed to be easily integrated into your system, providing a quick and seamless boost to its performance. Simply plug and play to start realizing the benefits of optimized performance right away.

If you like the idea, give us a star to show your support for the project ⭐


## **What can this help with?**

We currently provide several modules built on top of the framework:

✅ [Speedster](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster): Automatically apply the best set of SOTA optimization techniques to achieve the maximum inference speed-up on your hardware.

✅ [OpenAlphaTensor](https://github.com/nebuly-ai/nebuly/tree/main/optimization/open_alpha_tensor): Increase the computational performances of an AI model with custom-generated matrix multiplication algorithm fine-tuned for your specific hardware.

✅ [Forward-Forward](https://github.com/nebuly-ai/nebuly/tree/main/optimization/forward_forward): The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes.

## Next modules and roadmap
We are actively working on incorporating the following modules, as requested by members of our community, in upcoming releases:

- [ ]  [CloudSurfer](https://github.com/nebuly-ai/nebuly/blob/main/optimization/cloud_surfer): Automatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models.
- [ ]  [OptiMate](https://github.com/nebuly-ai/nebuly/blob/main/optimization/optimate): Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.

## Contributing
As an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved.

---

<p align="center">
  <a href="https://discord.gg/RbeQMu886J">Join the community</a> |
  <a href="https://docs.nebuly.com/contributions/">Contribute to the library</a>
</p>


================================================
FILE: optimization/nebullvm/azure-pipelines.yml
================================================
trigger:
  branches:
    include:
      - main
  paths:
    exclude:
      - .github/*
      - docs/**
      - README.md
      - notebooks/*

pool:
  name: gpu-t4-pool

variables:
  imageName: 'nebulydocker/nebullvm'

steps:

  - script: |
      nvidia-smi
    displayName: 'Ensure cuda is installed correctly'

  - script: |
      pip uninstall -y nebullvm
      pip install .
    displayName: 'Install nebullvm'

  - script: |
      cd apps/accelerate/speedster
      pip uninstall -y speedster
      pip install .
      cd ../../..
    displayName: 'Install speedster'

  - script: python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
    displayName: 'Install PyTorch'

  - script: |
      export PATH=$PATH:/home/AzDevOps/.local/bin
      python -m nebullvm.installers.auto_installer --compilers all
    displayName: 'Install deep learning compilers'

  - script: |
      python -m pip install -r "requirements-dev.txt"
      pip install pytest-azurepipelines
    displayName: 'Install requirements for testing'

  - script: |
      res=$(python -c "from nebullvm.tools.utils import check_device; print(check_device().type.name == 'GPU')")
      if [ "$res" = "False" ]; then
          echo "GPU is not available"
          exit 1
      fi
      echo "GPU is available: $res"
      res=$(python -c "import torch; print(torch.cuda.is_available())")
      if [ "$res" = "False" ]; then
          echo "CUDA is not available for PyTorch"
          exit 1
      fi
      echo "CUDA is available for PyTorch: $res"
      res=$(python -c "import torch; num_devices = torch.cuda.device_count(); print(num_devices is not None and isinstance(num_devices, int) and num_devices > 0)")
      if [ "$res" = "False" ]; then
          echo "No CUDA devices found"
          exit 1
      fi
      echo "CUDA devices found: $res"
    displayName: 'Check GPU is available'

  - script: |
      export SPEEDSTER_DISABLE_TELEMETRY=1
      export PATH=$PATH:/home/AzDevOps/.local/bin
      cd apps/accelerate/speedster
      pytest
      cd ../../..
    displayName: 'Run api tests'
    failOnStderr: true

  - script: |
      export PATH=$PATH:/home/AzDevOps/.local/bin
      cd nebullvm
      pytest
      cd ../
    displayName: 'Run components tests'
    failOnStderr: true


================================================
FILE: optimization/nebullvm/docker_build.sh
================================================
# Common repository and tag prefix shared by every image built below.
IMAGE_PREFIX="nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1"

# Build the image that ships all the compilers (the Dockerfile default).
docker build -t "${IMAGE_PREFIX}-allcompilers" .

# Build one image per single compiler, selected through the COMPILER build arg.
for compiler in onnxruntime openvino tvm tensorrt; do
    docker build -t "${IMAGE_PREFIX}-${compiler}" . --build-arg COMPILER="${compiler}"
done


================================================
FILE: optimization/nebullvm/docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: optimization/nebullvm/docs/README.md
================================================
# Documentation
Nebullvm documentation is built using Sphinx and Furo. Follow the guide below to build it locally.
## Build the docs:

1. Install nebullvm according to [README.md](../../../README.md#step-1-installation-of-nebullvm-library).
2. Install additional libraries required to build docs:
```
pip install -r requirements-docs.txt
```
3. Run `make html` from this directory.


================================================
FILE: optimization/nebullvm/docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# flake8: noqa

import os
import sys

sys.path.insert(0, os.path.abspath("../../../"))

# import sphinx_rtd_theme

# -- Project information -----------------------------------------------------

project = "nebullvm"
copyright = "2022, nebuly"
author = "nebuly"

# The full version, including alpha/beta/rc tags
# release = "0.3.0"


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.todo",
    "sphinx.ext.coverage",
    "sphinx.ext.mathjax",
    "sphinx.ext.viewcode",
    "sphinx.ext.githubpages",
]

# -- Configurations for plugins ------------
# Napoleon: parse Google-style docstrings only (NumPy style is switched off).
napoleon_google_docstring = True
# Include __init__ and other special methods when they carry a docstring.
napoleon_include_init_with_doc = True
napoleon_include_special_with_doc = True
napoleon_numpy_docstring = False
napoleon_use_rtype = False
# Autodoc: do not inherit docstrings from parents; list members in the order
# they appear in the source file.
autodoc_inherit_docstrings = False
autodoc_member_order = "bysource"


# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#

# html_theme = "sphinx_rtd_theme"
html_theme = "furo"

html_theme_options = {
    "light_css_variables": {
        "color-brand-primary": "#dark",
        "color-brand-content": "#dark",
        "color-admonition-background": "#dark",
        "font-stack": "Montserrat, sans-serif",
        "font-stack--monospace": "Courier, monospace",
    },
    "footer_icons": [
        {
            "name": "GitHub",
            "url": "https://github.com/nebuly-ai/nebullvm",
            "html": """
                <svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
                    <path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"></path>
                </svg>
            """,
            "class": "",
        },
    ],
    "light_logo": "Logo_azure.svg",
    "dark_logo": "Logo_azure.svg",
}


html_static_path = ["_static"]
html_title = ""


# html_theme_options = {
#    "announcement": "<em>Important</em> announcement!",
# }

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']


================================================
FILE: optimization/nebullvm/docs/index.rst
================================================
Welcome to nebullvm's documentation!
======================================

.. toctree::
   :maxdepth: 2

   modules/index


================================================
FILE: optimization/nebullvm/docs/modules/api.rst
================================================
nebullvm.api
=============

.. automodule:: nebullvm
    :members:
    
.. automodule:: nebullvm.api.frontend.huggingface
    :members:
    

================================================
FILE: optimization/nebullvm/docs/modules/converters.rst
================================================
nebullvm.converters
===================

.. automodule:: nebullvm.converters
    :members:


================================================
FILE: optimization/nebullvm/docs/modules/index.rst
================================================
API Documentation
==================

.. toctree::
    
    api
    converters
    inference_learners
    installers
    optimizers


================================================
FILE: optimization/nebullvm/docs/modules/inference_learners.rst
================================================
nebullvm.inference_learners
===========================

.. automodule:: nebullvm.inference_learners
    :members:


================================================
FILE: optimization/nebullvm/docs/modules/installers.rst
================================================
nebullvm.installers
===================

.. automodule:: nebullvm.installers
    :members:


================================================
FILE: optimization/nebullvm/docs/modules/optimizers.rst
================================================
nebullvm.optimizers
===================

.. automodule:: nebullvm.optimizers
    :members:


================================================
FILE: optimization/nebullvm/docs/requirements-docs.txt
================================================
Sphinx==4.5.0
coloredlogs
sympy
furo

================================================
FILE: optimization/nebullvm/nebullvm/__init__.py
================================================
# Import torch first on purpose: with CUDA 11.8, importing torch after
# tensorflow crashes the process with a core dump, so the import order
# below must be kept.
from nebullvm.optional_modules.torch import torch  # noqa F401
from nebullvm.tools.logger import setup_logger

# Runs the logger setup as a side effect of importing the package.
setup_logger()

# Export every non-underscore name bound in this module so far.
__all__ = [k for k in globals().keys() if not k.startswith("_")]


================================================
FILE: optimization/nebullvm/nebullvm/api/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/apps/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/apps/base.py
================================================
import abc


class App(abc.ABC):
    """Abstract base class for apps.

    Concrete subclasses must override ``execute``; the class itself cannot
    be instantiated.
    """

    def __init__(self):
        super().__init__()

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the app with the given keyword arguments.

        Raises:
            NotImplementedError: when the base implementation is reached
                (e.g. through ``super().execute()``).
        """
        raise NotImplementedError


================================================
FILE: optimization/nebullvm/nebullvm/config.py
================================================
from nebullvm.optional_modules.torch import torch


# Package version string.
VERSION = "0.10.0"
# File names for the learner metadata and for the debug dump.
LEARNER_METADATA_FILENAME = "metadata.json"
ONNX_OPSET_VERSION = 13
NEBULLVM_DEBUG_FILE = "nebullvm_debug.json"

# AutoTVM tuning settings (presumably forwarded to the TVM tuner - confirm
# against the TVM compiler code).
# NOTE(review): "early_stopping" (100) is larger than "trials" (10); verify
# that this is intended.
AUTO_TVM_TUNING_OPTION = {
    "tuner": "xgb",
    "trials": 10,
    "early_stopping": 100,
}
# TODO: remove the min_repeat_ms key
AUTO_TVM_PARAMS = {
    "number": 10,
    "repeat": 1,
    "min_repeat_ms": 0,  # since we're tuning on a CPU, can be set to 0
    "timeout": 10,  # in seconds
}

# Per-backend file names; the "metadata" entries reuse
# LEARNER_METADATA_FILENAME.
NVIDIA_FILENAMES = {
    "engine": "tensor_rt.engine",
    "metadata": LEARNER_METADATA_FILENAME,
}

TVM_FILENAMES = {"engine": "compiled_lib.so"}

ONNX_FILENAMES = {"model_name": "model.onnx"}
# Execution provider names keyed by device string, listed in the order they
# appear here (the "cuda" list ends with the CPU provider).
ONNX_PROVIDERS = {
    "cuda": [
        "TensorrtExecutionProvider",
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
    "cpu": [
        "CPUExecutionProvider",
    ],
}

OPENVINO_FILENAMES = {
    "metadata": LEARNER_METADATA_FILENAME,
    "description_file": "description.xml",
    "weights": "weights.bin",
}

TENSORFLOW_BACKEND_FILENAMES = {
    "tflite_model": "tf_model.tflite",
    "tf_model": "tf_model.h5",
}

# Maps a precision name to a set of torch dtypes; each lower precision
# entry also contains the dtypes of the higher precision entries.
TORCH_TENSORRT_PRECISIONS = {
    "torch.float32": {torch.float},
    "torch.float16": {torch.float, torch.half},
    "torch.int8": {torch.float, torch.half, torch.int8},
}

# Numeric thresholds and ratios; their consumers are outside this file.
MIN_DIM_INPUT_DATA = 100
QUANTIZATION_DATA_NUM = 300
CONSTRAINED_METRIC_DROP_THS = 1e-2
TRAIN_TEST_SPLIT_RATIO = 0.8

# Names of the known compilers.
COMPILER_LIST = [
    "deepsparse",
    "tensor_rt",
    "torchscript",
    "onnxruntime",
    "tflite",
    "xla",
    "tvm",
    "openvino",
    "bladedisc",
    "intel_neural_compressor",
    "torch_neuron",
    "torch_xla",
    "torch_dynamo",
    "faster_transformer",
]

# Names of the known compressors.
COMPRESSOR_LIST = [
    "sparseml",
    "intel_pruning",
]

# Per-framework module name lists (three of them are currently empty).
ONNX_MODULES = ["openvino", "tensor_rt"]

TORCH_MODULES = [
    "deepsparse",
    "intel_neural_compressor",
    "tensor_rt",
    "torch_tensor_rt",
    "faster_transformer",
]

TENSORFLOW_MODULES = []
HUGGING_FACE_MODULES = []
DIFFUSERS_MODULES = []

# Presumably the libraries that require a GPU - TODO confirm with callers.
LIBRARIES_GPU = ["tensor_rt", "torch_tensor_rt", "faster_transformer"]

MIN_NUMBER = 1e-4
DEFAULT_METRIC_DROP_THS = 1e-2
ACTIVATION_METRIC_DROP_THS = 2e-2


================================================
FILE: optimization/nebullvm/nebullvm/core/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/core/models.py
================================================
import subprocess
from dataclasses import dataclass
from enum import Enum
from functools import cached_property
from typing import Optional, Any, Union, Tuple, List, Dict

import numpy as np

from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch


class DeepLearningFramework(Enum):
    """Frameworks known to nebullvm, identified by their string name."""

    PYTORCH = "torch"
    TENSORFLOW = "tensorflow"
    NUMPY = "numpy"


class QuantizationType(Enum):
    """Quantization kinds; each member's value equals its own name."""

    DYNAMIC = "DYNAMIC"
    STATIC = "STATIC"
    HALF = "HALF"


class Status(Enum):
    """Two-state outcome flag: ``OK`` or ``ERROR``."""

    OK = "OK"
    ERROR = "ERROR"


class DeviceType(Enum):
    """Kinds of device a ``Device`` can refer to."""

    CPU = "cpu"
    GPU = "gpu"
    TPU = "tpu"
    NEURON = "neuron"


class DataType(str, Enum):
    """Framework-independent tensor data types.

    Members are also ``str`` instances, so a member compares equal to its
    plain string value (e.g. ``DataType.FLOAT32 == "float32"``).
    """

    FLOAT16 = "float16"
    FLOAT32 = "float32"
    INT32 = "int32"
    INT64 = "int64"

    @classmethod
    def from_framework_format(
        cls, dtype: Union[torch.dtype, tf.dtypes.DType, np.dtype]
    ):
        """Convert a torch / tensorflow / numpy dtype into a ``DataType``.

        Anything that is neither a torch nor a tensorflow dtype is treated
        as a numpy dtype specification.

        Args:
            dtype: The framework dtype. For numpy, a ``np.dtype`` instance,
                a scalar type such as ``np.float32`` or a dtype name such
                as ``"float32"`` are all accepted.

        Returns:
            The matching ``DataType`` member.

        Raises:
            KeyError: If the dtype has no entry in the conversion table.
        """
        if isinstance(dtype, torch.dtype):
            framework = "torch"
        elif isinstance(dtype, tf.dtypes.DType):
            framework = "tensorflow"
        else:
            framework = "numpy"
            # np.dtype() normalises dtype instances, scalar types and names
            # alike, so ``.type`` is always available; the conversion table
            # is keyed by the numpy scalar types.
            dtype = np.dtype(dtype).type
        return FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[framework][dtype]

    def _to_framework_format(self, framework: str):
        """Reverse lookup of ``self`` in the table of ``framework``.

        Returns the first framework dtype mapped to this member, or
        ``None`` when the table has no such entry.
        """
        conversion_table = FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[framework]
        for framework_dtype, data_type in conversion_table.items():
            if data_type == self:
                return framework_dtype
        return None

    def to_torch_format(self):
        """Return the torch dtype for this member (``None`` if unmapped)."""
        return self._to_framework_format("torch")

    def to_tf_format(self):
        """Return the tensorflow dtype for this member (or ``None``)."""
        return self._to_framework_format("tensorflow")

    def to_numpy_format(self):
        """Return the numpy scalar type for this member (or ``None``)."""
        return self._to_framework_format("numpy")


class ModelCompiler(Enum):
    """Identifiers of the model compilers, by their string name.

    Some compilers have several entries: a generic one plus variants whose
    value carries an ``onnx_`` or ``torch_`` prefix.
    """

    TENSOR_RT = "tensor_rt"
    TENSOR_RT_ONNX = "onnx_tensor_rt"
    TENSOR_RT_TORCH = "torch_tensor_rt"
    OPENVINO = "openvino"
    APACHE_TVM = "tvm"
    APACHE_TVM_TORCH = "torch_tvm"
    APACHE_TVM_ONNX = "onnx_tvm"
    ONNX_RUNTIME = "onnxruntime"
    DEEPSPARSE = "deepsparse"
    TORCHSCRIPT = "torchscript"
    XLA = "xla"
    TFLITE = "tflite"
    BLADEDISC = "bladedisc"
    INTEL_NEURAL_COMPRESSOR = "intel_neural_compressor"
    TORCH_NEURON = "torch_neuron"
    TORCH_XLA = "torch_xla"
    TORCH_DYNAMO = "torch_dynamo"
    FASTER_TRANSFORMER = "faster_transformer"


class ModelCompressor(Enum):
    """Identifiers of the model compressors, by their string name."""

    SPARSE_ML = "sparseml"
    INTEL_PRUNING = "intel_pruning"


class OptimizationTime(Enum):
    """Optimization time modes: ``constrained`` or ``unconstrained``."""

    CONSTRAINED = "constrained"
    UNCONSTRAINED = "unconstrained"


@dataclass
class HardwareSetup:
    """Plain data container describing a machine.

    NOTE(review): field meanings are taken from their names; how the values
    are collected is not visible here.
    """

    cpu: str
    operating_system: str
    memory_gb: int  # GB, per the field name
    accelerator: Optional[str] = None  # defaults to None when not provided


@dataclass
class OptimizedModel:
    """Plain data container for an optimized model and its figures.

    NOTE(review): units are taken from the field names (seconds, MB); the
    unit of ``throughput`` is not visible here.
    """

    inference_learner: Any
    latency_seconds: float
    metric_drop: float
    technique: str
    compiler: str
    throughput: float
    size_mb: float


@dataclass
class OriginalModel:
    """Plain data container for the original model and its figures.

    NOTE(review): units are taken from the field names (seconds, MB); the
    unit of ``throughput`` is not visible here.
    """

    model: Any
    latency_seconds: float
    throughput: float
    name: str
    size_mb: float
    framework: DeepLearningFramework


@dataclass
class BenchmarkOriginalModelResult:
    """The result of the LatencyOriginalModelMeasureOp.

    Holds the measured latency (seconds, per the field name) together with
    the model outputs.
    """

    latency_seconds: float
    model_outputs: Any


@dataclass
class OptimizeInferenceResult:
    """The result of the OptimizeInferenceOp.

    Bundles the original model, the hardware description and the optimized
    model (``None`` when no optimized model is available) and derives the
    improvement rates between the two models. Every derived value is
    ``None`` when there is no optimized model.
    """

    original_model: OriginalModel
    hardware_setup: HardwareSetup
    optimized_model: Optional[OptimizedModel]

    @property
    def metric_drop(self) -> Optional[float]:
        """Metric drop of the optimized model."""
        optimized = self.optimized_model
        return None if optimized is None else optimized.metric_drop

    @cached_property
    def latency_improvement_rate(self) -> Optional[float]:
        """Original latency divided by optimized latency.

        Returns -1 when the optimized latency is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        optimized_latency = optimized.latency_seconds
        if optimized_latency == 0:
            return -1
        return self.original_model.latency_seconds / optimized_latency

    @cached_property
    def throughput_improvement_rate(self) -> Optional[float]:
        """Optimized throughput divided by original throughput.

        Returns -1 when the original throughput is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        original_throughput = self.original_model.throughput
        if original_throughput == 0:
            return -1
        return optimized.throughput / original_throughput

    @cached_property
    def size_improvement_rate(self) -> Optional[float]:
        """Original size divided by optimized size.

        Returns 1 when the optimized size is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        optimized_size = optimized.size_mb
        if optimized_size == 0:
            return 1
        return self.original_model.size_mb / optimized_size


class InputInfo:
    """Class for storing all the information needed for creating an input
    tensor for AI models.

    Any attribute that was never set reads as ``None`` instead of raising,
    so optional extras such as ``min_value`` can be accessed directly.

    Attributes:
        size (tuple): Tuple with the input size (batch size excluded)
        dtype (str): Data type of the tensor.
        min_value (int or float, optional): Min value the tensor elements can
            have.
        max_value (int or float, optional): Max value the tensor elements can
            have.
    """

    def __init__(self, size: Tuple[int, ...], dtype: str, **extra_info):
        self.dtype = DataType(dtype)
        self.size = size
        # Extra keyword arguments become plain instance attributes.
        self.__dict__.update(extra_info)

    def __getattr__(self, item):
        """Return ``None`` for unset attributes, except special names.

        Only called when normal attribute lookup fails. Dunder names must
        raise ``AttributeError``: ``copy`` and ``pickle`` probe the instance
        for hooks such as ``__setstate__`` and would otherwise receive
        ``None`` and try to call it.
        """
        if item.startswith("__") and item.endswith("__"):
            raise AttributeError(item)
        return self.__dict__.get(item)

    def dict(self):
        """Return the public (non-underscore) attributes as a dict."""
        return {
            k: v for k, v in self.__dict__.items() if not k.startswith("_")
        }


@dataclass
class DynamicAxisInfo:
    """Dynamic axes of the model inputs and outputs.

    ``inputs`` and ``outputs`` hold one dict per tensor, mapping an axis
    index to the tag of that axis. An input tag may also be a dict whose
    ``"name"`` entry carries the tag.
    """

    inputs: List[Dict[int, str]]
    outputs: List[Dict[int, str]]

    def dict(self):
        """Return the public (non-underscore) attributes as a dict."""
        public_attrs = {}
        for attr_name, attr_value in self.__dict__.items():
            if attr_name.startswith("_"):
                continue
            public_attrs[attr_name] = attr_value
        return public_attrs

    def retrieve_output_dim(
        self,
        input_shapes: List[Tuple[int, ...]],
        output_idx: int,
        dimension_idx: int,
        default_output_value: int,
    ) -> int:
        """Resolve the size of a dynamic output axis from the input shapes.

        Looks up the tag of axis ``dimension_idx`` of output ``output_idx``
        and returns the size of the first input axis carrying the same tag,
        or ``default_output_value`` when no input axis matches.
        """
        wanted_tag = self.outputs[output_idx][dimension_idx]
        for axes, shape in zip(self.inputs, input_shapes):
            for axis_idx, axis_tag in axes.items():
                matches_by_name = (
                    isinstance(axis_tag, dict)
                    and axis_tag.get("name") == wanted_tag
                )
                if matches_by_name or axis_tag == wanted_tag:
                    return shape[axis_idx]
        return default_output_value


@dataclass
class ModelParams:
    """Batch size plus input/output descriptions of a model.

    After construction, ``input_infos`` given as dicts are turned into
    ``InputInfo`` objects, ``output_types`` into ``DataType`` members and a
    ``dynamic_info`` dict into a ``DynamicAxisInfo``.
    """

    batch_size: int
    input_infos: List[InputInfo]
    output_sizes: List[Tuple[int, ...]]
    output_types: List[DataType]
    dynamic_info: Union[DynamicAxisInfo, Dict] = None

    def __post_init__(self):
        if isinstance(self.dynamic_info, dict):
            self.dynamic_info = DynamicAxisInfo(**self.dynamic_info)

        normalized_infos = []
        for info in self.input_infos:
            if isinstance(info, dict):
                info = InputInfo(**info)
            normalized_infos.append(info)
        self.input_infos = normalized_infos

        self.output_types = [
            DataType(output_type) for output_type in self.output_types
        ]

    def dict(self):
        """Return the public attributes as a dict.

        Lists are walked recursively and any value exposing a ``dict``
        attribute is replaced by the result of calling it.
        """

        def to_plain(value):
            if isinstance(value, list):
                return [to_plain(item) for item in value]
            if hasattr(value, "dict"):
                return value.dict()
            return value

        plain = {}
        for attr_name, attr_value in self.__dict__.items():
            if not attr_name.startswith("_"):
                plain[attr_name] = to_plain(attr_value)
        return plain

    @property
    def input_sizes(self):
        """Lazily yield the ``size`` of each entry in ``input_infos``."""
        for info in self.input_infos:
            yield info.size


class Device:
    """A compute device identified by its type and index.

    Attributes:
        type (DeviceType): Kind of device.
        idx (int): Index of the device, 0 by default.
    """

    def __init__(self, type: DeviceType, idx: int = 0):
        self.type = type
        self.idx = idx

    @classmethod
    def from_str(cls, string: str) -> "Device":
        """Build a device from a string such as ``"cuda:1"`` or ``"tpu"``.

        Strings starting with ``cuda`` or ``gpu`` give a GPU device and
        strings starting with ``tpu`` a TPU device; the index is the integer
        following the first ``:`` (0 when there is no ``:``). Any other
        string gives the CPU device.

        Raises:
            ValueError: If the text after ``:`` is not an integer.
        """
        if string.startswith(("cuda", "gpu")):
            device_type = DeviceType.GPU
        elif string.startswith("tpu"):
            device_type = DeviceType.TPU
        else:
            return cls(DeviceType.CPU)

        idx = int(string.split(":")[1]) if ":" in string else 0
        return cls(device_type, idx)

    def to_torch_format(self) -> str:
        """Return the torch device string (``cuda:N``, ``xla:N``, ``cpu``)."""
        if self.type is DeviceType.GPU:
            return f"cuda:{self.idx}"
        elif self.type is DeviceType.TPU:
            return f"xla:{self.idx}"

        return "cpu"

    def to_tf_format(self) -> str:
        """Return the tensorflow device string (``GPU:N`` or ``CPU``)."""
        if self.type is DeviceType.GPU:
            return f"GPU:{self.idx}"

        return "CPU"

    def _query_gpu_memory(self, field: str, description: str) -> int:
        """Query one memory figure of this GPU through nvidia-smi.

        Args:
            field: nvidia-smi ``--query-gpu`` field, e.g. ``memory.total``.
            description: Word used in the error message (``total``/``free``).

        Returns:
            The value reported for GPU ``self.idx`` multiplied by
            1024 * 1024 (nvidia-smi reports MiB), i.e. bytes.

        Raises:
            Exception: If the device is not a GPU, or if nvidia-smi cannot
                be run or its output cannot be parsed.
        """
        if self.type is not DeviceType.GPU:
            raise Exception("Device type must be GPU")

        try:
            # An argument list avoids going through a shell.
            raw_output = subprocess.check_output(
                [
                    "nvidia-smi",
                    f"--query-gpu={field}",
                    "--format=csv,nounits,noheader",
                ]
            )
            # One value per GPU, in index order.
            value = raw_output.decode("utf-8").split()[self.idx]
            return int(value) * 1024 * 1024
        except Exception as err:
            raise Exception(
                f"Unable to get {description} memory of device. "
                "Please make sure nvidia-smi is available."
            ) from err

    def get_total_memory(self) -> int:
        """Return the total memory of this GPU in bytes, via nvidia-smi."""
        return self._query_gpu_memory("memory.total", "total")

    def get_free_memory(self) -> int:
        """Return the free memory of this GPU in bytes, via nvidia-smi."""
        return self._query_gpu_memory("memory.free", "free")


# Maps each framework name to a {framework dtype: DataType} table.
# DataType.from_framework_format reads it by key; the DataType.to_*_format
# methods search it by value. It is defined after DataType because its
# values are DataType members.
FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT = {
    "torch": {
        torch.float16: DataType.FLOAT16,
        torch.float32: DataType.FLOAT32,
        torch.int32: DataType.INT32,
        torch.int64: DataType.INT64,
    },
    "tensorflow": {
        tf.float16: DataType.FLOAT16,
        tf.float32: DataType.FLOAT32,
        tf.int32: DataType.INT32,
        tf.int64: DataType.INT64,
    },
    "numpy": {
        np.float16: DataType.FLOAT16,
        np.float32: DataType.FLOAT32,
        np.int32: DataType.INT32,
        np.int64: DataType.INT64,
    },
}


================================================
FILE: optimization/nebullvm/nebullvm/core/tests/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/core/tests/test_models.py
================================================
import unittest
from unittest.mock import MagicMock

from nebullvm.core.models import OptimizeInferenceResult


class TestOptimizeInferenceResult(unittest.TestCase):
    """Tests for the derived metrics of ``OptimizeInferenceResult``."""

    @staticmethod
    def _make_result(original=None, optimized=None):
        """Build a result around mocked models.

        ``original`` and ``optimized`` are dicts of attributes to set on the
        mocked models; ``optimized=None`` means "no optimized model".
        """
        optimized_model = None
        if optimized is not None:
            optimized_model = MagicMock(**optimized)
        return OptimizeInferenceResult(
            original_model=MagicMock(**(original or {})),
            hardware_setup=MagicMock(),
            optimized_model=optimized_model,
        )

    def _latency_rate(self, original_latency, optimized_latency):
        res = self._make_result(
            original={"latency_seconds": original_latency},
            optimized={"latency_seconds": optimized_latency},
        )
        return res.latency_improvement_rate

    def _throughput_rate(self, original_th, optimized_th):
        res = self._make_result(
            original={"throughput": original_th},
            optimized={"throughput": optimized_th},
        )
        return res.throughput_improvement_rate

    def _size_rate(self, original_size, optimized_size):
        res = self._make_result(
            original={"size_mb": original_size},
            optimized={"size_mb": optimized_size},
        )
        return res.size_improvement_rate

    # -- latency ----------------------------------------------------------

    def test_latency_improvement_rate__optimized_model_is_none(self):
        self.assertIsNone(self._make_result().latency_improvement_rate)

    def test_latency_improvement_rate__optimized_latency_is_zero(self):
        self.assertEqual(-1, self._latency_rate(1.0, 0.0))

    def test_latency_improvement_rate__original_latency_is_zero(self):
        self.assertEqual(0, self._latency_rate(0.0, 1.0))

    def test_latency_improvement_rate__rate_gt_1(self):
        self.assertGreater(self._latency_rate(1.0, 0.5), 1)

    def test_latency_improvement_rate__rate_lt_1(self):
        self.assertLess(self._latency_rate(0.5, 1.0), 1)

    # -- throughput -------------------------------------------------------

    def test_th_improvement_rate__optimized_model_is_none(self):
        self.assertIsNone(self._make_result().throughput_improvement_rate)

    def test_th_improvement_rate__optimized_th_is_zero(self):
        self.assertEqual(0, self._throughput_rate(1.0, 0.0))

    def test_th_improvement_rate__original_th_is_zero(self):
        self.assertEqual(-1, self._throughput_rate(0.0, 1.0))

    def test_th_improvement_rate__rate_gt_1(self):
        self.assertGreater(self._throughput_rate(0.5, 1), 1)

    def test_th_improvement_rate__rate_lt_1(self):
        self.assertLess(self._throughput_rate(1.0, 0.5), 1)

    # -- size -------------------------------------------------------------

    def test_size_improvement_rate__optimized_model_is_none(self):
        self.assertIsNone(self._make_result().size_improvement_rate)

    def test_size_improvement_rate__optimized_size_is_zero(self):
        self.assertEqual(1, self._size_rate(1.0, 0.0))

    def test_size_improvement_rate__original_size_is_zero(self):
        self.assertEqual(0, self._size_rate(0.0, 1.0))

    def test_size_improvement_rate__rate_gt_1(self):
        self.assertGreater(self._size_rate(1, 0.5), 1)

    def test_size_improvement_rate__rate_lt_1(self):
        self.assertLess(self._size_rate(0.5, 1), 1)

    # -- metric drop ------------------------------------------------------

    def test_metric_drop__optimized_model_is_none(self):
        self.assertIsNone(self._make_result().metric_drop)

    def test_metric_drop(self):
        metric_drop = 0.1
        res = self._make_result(optimized={"metric_drop": metric_drop})
        self.assertEqual(metric_drop, res.metric_drop)


================================================
FILE: optimization/nebullvm/nebullvm/core/types.py
================================================
from typing import Union, Iterable, Sequence

from nebullvm.tools.data import DataManager

# Type alias for input data: any iterable or sequence, or a DataManager.
InputData = Union[Iterable, Sequence, DataManager]


================================================
FILE: optimization/nebullvm/nebullvm/installers/__init__.py
================================================
# flake8: noqa

# Nothing public is imported or defined above this line, so the list below
# evaluates to [] (module globals hold only underscore-prefixed names here).
__all__ = [k for k in globals().keys() if not k.startswith("_")]


================================================
FILE: optimization/nebullvm/nebullvm/installers/auto_installer.py
================================================
import argparse
from typing import List, Union

from loguru import logger

from nebullvm.config import (
    ONNX_MODULES,
    TENSORFLOW_MODULES,
    TORCH_MODULES,
    HUGGING_FACE_MODULES,
    DIFFUSERS_MODULES,
)
from nebullvm.installers.installers import (
    ONNXInstaller,
    PytorchInstaller,
    TensorflowInstaller,
    HuggingFaceInstaller,
    DiffusersInstaller,
)


# For each framework, the extra frameworks that may be installed alongside
# it as backends. Every name used here is also a key of INSTALLERS.
SUPPORTED_BACKENDS_DICT = {
    "torch": ["onnx"],
    "tensorflow": ["onnx"],
    "huggingface": ["torch", "tensorflow", "onnx"],
    "diffusers": ["torch", "onnx"],
    "onnx": [],
}

# Installer class for each framework; its keys define the set of supported
# frameworks.
INSTALLERS = {
    "onnx": ONNXInstaller,
    "torch": PytorchInstaller,
    "tensorflow": TensorflowInstaller,
    "huggingface": HuggingFaceInstaller,
    "diffusers": DiffusersInstaller,
}

# Module (compiler) names per framework, taken from nebullvm.config; this is
# the list handed to the installer class of the same key.
MODULES = {
    "onnx": ONNX_MODULES,
    "torch": TORCH_MODULES,
    "tensorflow": TENSORFLOW_MODULES,
    "huggingface": HUGGING_FACE_MODULES,
    "diffusers": DIFFUSERS_MODULES,
}


def select_frameworks_to_install(
    include_frameworks: Union[List[str], str],
    include_backends: Union[List[str], str],
) -> List[str]:
    """Resolve which frameworks (and backend frameworks) to install.

    Args:
        include_frameworks: ``"all"`` or a list of framework names.
            Unsupported names are skipped with a warning.
        include_backends: ``"all"`` or a list of backend names. Ignored when
            ``include_frameworks`` is ``"all"``, since every framework is
            selected anyway. With ``"all"``, the backends of every selected
            framework are added, including backends of backends. With a
            list, a backend is added only if a framework selected so far
            supports it; otherwise a warning is logged.

    Returns:
        The selected framework names, de-duplicated and sorted.

    Raises:
        ValueError: If ``include_frameworks`` is neither ``"all"`` nor a
            list, if none of the requested frameworks is supported, or if
            ``include_backends`` is neither ``"all"`` nor a list.
    """
    supported_frameworks = list(INSTALLERS.keys())

    if isinstance(include_frameworks, str) and include_frameworks == "all":
        return sorted(supported_frameworks)
    if not isinstance(include_frameworks, list):
        raise ValueError("Invalid frameworks list")

    requested = []
    for framework in include_frameworks:
        if framework in supported_frameworks:
            requested.append(framework)
        else:
            logger.warning(f"Framework {framework} not supported")
    if not requested:
        raise ValueError("No supported frameworks selected")

    # A set replaces the list that used to be extended while iterating it.
    selected = set(requested)

    if isinstance(include_backends, str) and include_backends == "all":
        # Worklist expansion; a backend already selected is not revisited.
        pending = list(requested)
        while pending:
            for backend in SUPPORTED_BACKENDS_DICT[pending.pop()]:
                if backend not in selected:
                    selected.add(backend)
                    pending.append(backend)
    elif isinstance(include_backends, list):
        for backend in include_backends:
            if backend not in supported_frameworks:
                logger.warning(f"Backend {backend} not supported")
            elif any(
                backend in SUPPORTED_BACKENDS_DICT[framework]
                for framework in selected
            ):
                selected.add(backend)
            else:
                logger.warning(
                    f"Backend {backend} not supported for selected "
                    f"frameworks"
                )
    else:
        raise ValueError("Invalid backends list")

    return sorted(selected)


def select_compilers_to_install(
    include_compilers: Union[List[str], str], framework_list: List[str]
) -> List[str]:
    """Resolve which compilers to install for the given frameworks.

    With ``include_compilers == "all"`` every compiler listed in ``MODULES``
    for a framework of ``framework_list`` is selected. Otherwise each
    requested compiler is kept only if it is known and listed for at least
    one of the frameworks; the others are reported with a warning.

    Returns:
        The selected compiler names, de-duplicated and sorted.
    """
    if isinstance(include_compilers, str) and include_compilers == "all":
        return sorted(
            {
                compiler
                for (framework, compilers) in MODULES.items()
                for compiler in compilers
                if framework in framework_list
            }
        )

    known_compilers = list(
        {
            compiler
            for compilers in MODULES.values()
            for compiler in compilers
        }
    )

    selected = set()
    for compiler in include_compilers:
        if compiler not in known_compilers:
            logger.warning(f"Compiler {compiler} not supported")
            continue

        if any(
            compiler in MODULES[framework] for framework in framework_list
        ):
            selected.add(compiler)
        else:
            logger.warning(
                f"Compiler {compiler} not supported for selected "
                f"frameworks"
            )

    return sorted(selected)


def auto_install_libraries(
    include_frameworks: Union[List[str], str] = "all",
    include_backends: Union[List[str], str] = "all",
    include_compilers: Union[List[str], str] = "all",
):
    """Install the selected frameworks, their dependencies and compilers.

    The frameworks and compilers are first resolved with
    ``select_frameworks_to_install`` and ``select_compilers_to_install``.
    Then, for each framework, its installer installs the framework itself
    (only when ``check_framework()`` reports it missing), followed by its
    dependencies and the compilers.
    """
    logger.info("Running auto install of nebullvm dependencies")

    frameworks = select_frameworks_to_install(
        include_frameworks, include_backends
    )
    compilers = select_compilers_to_install(include_compilers, frameworks)

    for name in frameworks:
        installer = INSTALLERS[name](MODULES[name])
        already_installed = installer.check_framework()
        if not already_installed:
            installer.install_framework()
        installer.install_dependencies(frameworks)
        installer.install_compilers(compilers)


def main():
    """Command line entry point for the auto installer.

    Parses the ``--frameworks``, ``--extra-backends`` and ``--compilers``
    options and forwards them to ``auto_install_libraries``. A single
    "all" value is collapsed to the string "all"; for the backends a single
    "none" value is turned into an empty list.
    """
    parser = argparse.ArgumentParser(
        description="Auto install dl frameworks and dependencies"
    )
    parser.add_argument(
        "-f",
        "--frameworks",
        nargs="+",
        default="all",
        help="The base dl frameworks to be installed",
    )
    parser.add_argument(
        "-b",
        "--extra-backends",
        nargs="+",
        default="all",
        help="additional dl frameworks to be installed to "
        "gain the optimal speedup",
    )
    parser.add_argument(
        "-c",
        "--compilers",
        nargs="+",
        default="all",
        help="Compilers to be installed",
    )
    cli_options = vars(parser.parse_args())

    frameworks = cli_options["frameworks"]
    if len(frameworks) == 1 and frameworks[0] == "all":
        frameworks = "all"

    backends = cli_options["extra_backends"]
    if len(backends) == 1:
        if backends[0] == "all":
            backends = "all"
        elif backends[0] == "none":
            backends = []

    compilers = cli_options["compilers"]
    if len(compilers) == 1 and compilers[0] == "all":
        compilers = "all"

    auto_install_libraries(frameworks, backends, compilers)


# Run the command line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: optimization/nebullvm/nebullvm/installers/install_bladedisc.sh
================================================
#!/bin/bash
#
# Clone BladeDISC (if it is not already present in the current directory),
# install bazel and a JDK through apt, then build pytorch_blade.
#
# Usage: install_bladedisc.sh <has_cuda>
#   has_cuda  "true" runs the build without overriding any TORCH_BLADE_*
#             variable; any other value (or no argument) disables CUDA
#             support and pins the torch version used for the build.

# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive

if [ ! -d "BladeDISC" ]
then
  git clone https://github.com/alibaba/BladeDISC.git
fi

cd BladeDISC && git submodule update --init --recursive

# Install bazel
# -y is required: the script runs unattended, so apt must not wait for a
# confirmation on stdin.
sudo apt install -y apt-transport-https curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
sudo apt update && sudo apt install -y bazel
sudo apt install -y default-jdk

# The argument is quoted and defaulted to an empty string so that a missing
# $1 is treated as "no CUDA" instead of producing a malformed test.
if [ "${1:-}" = "true" ]
then
  cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh
else
  if [[ $OSTYPE == "darwin"* ]]
  then
    export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF
    export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.10.0+aarch64
    cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh
  else
    export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF
    export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.8.1+cpu
    cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh
  fi
fi

# Go back to the directory the script was started from
cd ../..


================================================
FILE: optimization/nebullvm/nebullvm/installers/install_fastertransformer.sh
================================================
#!/bin/bash
#
# Clone (if needed) and build FasterTransformer with the PyTorch bindings.
# The target GPU architecture is read from the COMPUTE_CAPABILITY
# environment variable. A marker file is created next to the checkout when
# the whole build chain succeeds.

# TODO: check requirements
# https://github.com/NVIDIA/FasterTransformer/blob/main/docs/bert_guide.md
# Requirements
#CMake >= 3.8 for Tensorflow, CMake >= 3.13 for PyTorch
#CUDA 11.0 or newer version
#Python: Only verify on python 3
#Tensorflow: Verify on 1.15, 1.13 and 1.14 should work.
#PyTorch: Verify on 1.8.0, >= 1.5.0 should work.


# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive

# Stop immediately on macOS
if [[ $OSTYPE == "darwin"* ]]; then
  echo "MacOS is not supported for FasterTransformer"
  exit 1
fi

# Reuse an existing checkout when there is one
if [[ ! -d "FasterTransformer" ]]; then
  git clone --recursive https://github.com/NVIDIA/FasterTransformer FasterTransformer
fi

# TODO: checkout to latest release

# Every step only runs if the previous one succeeded, so the exit status of
# the script reflects the first failing step.
cd FasterTransformer \
  && mkdir -p build \
  && cd build \
  && cmake -DSM=$COMPUTE_CAPABILITY -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON  .. \
  && make -j8 \
  && touch ../../FasterTransformer_build_success  # create a file to indicate that the build was successful

# TODO: enable multi gpu if possible
#-DBUILD_MULTI_GPU=OFF

================================================
FILE: optimization/nebullvm/nebullvm/installers/install_tensor_rt.sh
================================================
#!/bin/bash
#
# Install TensorRT and polygraphy with pip. When the pip-installed TensorRT
# cannot be imported or cannot create a Builder, fall back to the TensorRT
# packages of the distribution (yum on centos-like systems, apt-get
# otherwise).

if [[ "$(grep '^ID_LIKE' /etc/os-release)" == *"centos"* ]]
then
  # Installation for centos type linux distribution
  # Try installation with pip, if it fails install the system packages
  pip3 install --upgrade "setuptools<=65.7.0" pip
  # If cuda version is less than 12.0 then install tensorrt<=8.5.3.1
  if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' -f 1) -lt 12 ]]
  then
    python3 -m pip install --upgrade "tensorrt<=8.5.3.1"
  else
    python3 -m pip install --upgrade "tensorrt<=8.6.1"
  fi
  pip3 install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com

  # The command substitution yields exactly "1" only when the python check
  # fails before printing anything.
  if [[ $(python3 -c "import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())" || echo 1) == 1 ]]
  then
    # Uninstall previous version (-y: never wait for a confirmation)
    pip3 uninstall -y nvidia-tensorrt
    # install pre-requisites
    pip3 install numpy
    yum -y update && \
      yum -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \
      libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \
      yum clean all
  fi
else
  # Try installation with pip, if it fails install the system packages
  pip install --upgrade "setuptools<=65.7.0" pip
  # If cuda version is less than 12.0 then install tensorrt<=8.5.3.1
  if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' -f 1) -lt 12 ]]
  then
    python3 -m pip install --upgrade "tensorrt<=8.5.3.1"
  else
    python3 -m pip install --upgrade "tensorrt<=8.6.1"
  fi

  pip install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com

  # The command substitution yields exactly "1" only when the python check
  # fails before printing anything.
  if [[ $(python3 -c "import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())" || echo 1) == 1 ]]
  then
    # Uninstall previous version (-y: never wait for a confirmation)
    pip uninstall -y nvidia-tensorrt
    # install pre-requisites
    pip install numpy
    apt-get update && \
      apt-get -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \
      libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \
      rm -rf /var/lib/apt/lists/*
  fi
fi


================================================
FILE: optimization/nebullvm/nebullvm/installers/install_tvm.sh
================================================
#!/bin/bash
#
# Clone (if needed) and build Apache TVM, then install its python package.
# CONFIG_PATH must point to the config.cmake file to use for the build.

# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive

if [ ! -d "tvm" ]
then
  git clone --recursive https://github.com/apache/tvm tvm
fi

# Abort if the checkout is not usable: continuing would run cmake/make in
# whatever directory the script was started from.
cd tvm || exit 1
mkdir -p build
# Quoted so that a path containing spaces is copied as a single file
cp "$CONFIG_PATH" build/
cd build || exit 1
cmake ..
make -j8
if [[ $OSTYPE == "darwin"* ]]
then
  pip install tornado
  brew install openblas gfortran
  pip install pybind11 cython pythran
  conda install -y scipy
  pip install xgboost decorator
  export MACOSX_DEPLOYMENT_TARGET=10.9
else
  pip3 install decorator attrs tornado psutil xgboost cloudpickle
fi
cd ../python
python3 setup.py install --user
cd ../..


================================================
FILE: optimization/nebullvm/nebullvm/installers/install_tvm_prerequisites.sh
================================================
#!/bin/bash
#
# Install the system packages needed to build TVM. The package manager is
# picked from the platform: brew/conda on macOS, yum on centos-like
# distributions, apt-get otherwise.

# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive

if [[ $OSTYPE == "darwin"* ]]; then
  brew install gcc git cmake
  #brew install llvm
  conda install -y -c conda-forge clangdev
elif [[ "$(grep '^ID_LIKE' /etc/os-release)" == *"centos"* ]]; then
  sudo yum update -y && sudo yum install -y gcc gcc-c++ llvm-devel cmake3 git
  if [[ -f "/usr/bin/cmake" ]]; then
    # A cmake binary already exists: register both it and cmake3 as
    # alternatives, giving cmake3 the higher priority (20 vs 10).
    sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 \
      --slave /usr/local/bin/ctest ctest /usr/bin/ctest \
      --slave /usr/local/bin/cpack cpack /usr/bin/cpack \
      --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake \
      --family cmake
    sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 \
      --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 \
      --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 \
      --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 \
      --family cmake
  else
    # No cmake binary yet: expose cmake3 under the plain cmake name
    sudo ln -s /usr/bin/cmake3 /usr/bin/cmake
  fi
else
  sudo apt-get update && sudo apt-get install -y libpython3.8 gcc libtinfo-dev zlib1g-dev \
    build-essential cmake libedit-dev libxml2-dev llvm-12
fi


================================================
FILE: optimization/nebullvm/nebullvm/installers/installers.py
================================================
import os
import platform
import subprocess
import sys
from abc import ABC
from pathlib import Path
from typing import List

import cpuinfo
from loguru import logger

from nebullvm.config import LIBRARIES_GPU
from nebullvm.operations.optimizations.compilers.utils import (
    deepsparse_is_available,
    get_faster_transformer_repo_path,
    intel_neural_compressor_is_available,
    openvino_is_available,
    tensorrt_is_available,
    torch_tensorrt_is_available,
)
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.utils import check_module_version, gpu_is_available


def get_cpu_arch():
    """Return a coarse CPU architecture label, either "x86" or "arm".

    The architecture string reported by ``cpuinfo`` is lower-cased; every
    value that does not contain "x86" is reported as "arm".
    """
    reported_arch = cpuinfo.get_cpu_info()["arch"].lower()
    return "x86" if "x86" in reported_arch else "arm"


def _get_os():
    """Return the operating system name given by ``platform.system()``."""
    system_name = platform.system()
    return system_name


def install_tvm(
    working_dir: str = None,
):
    """Helper function for installing ApacheTVM.

    This function needs some prerequisites for running, as a valid `git`
    installation and having MacOS or a Linux-distribution as OS.

    Args:
        working_dir (str, optional): The directory where the tvm repo will be
            cloned and installed. When not given, the user home directory
            is used.

    Returns:
        bool: True if `tvm` can be imported after the installation scripts
            have run, False otherwise.
    """
    path = Path(__file__).parent
    # install pre-requisites
    installation_file_prerequisites = str(
        path / "install_tvm_prerequisites.sh"
    )
    subprocess.run(
        ["bash", installation_file_prerequisites],
        cwd=working_dir or Path.home(),
    )
    installation_file = str(path / "install_tvm.sh")
    # The cmake configuration is selected by cpu architecture, with a
    # "_cuda" suffix when a gpu is available.
    hardware_config = get_cpu_arch()
    if gpu_is_available():
        hardware_config = f"{hardware_config}_cuda"
    env_dict = {
        "CONFIG_PATH": str(
            path / f"tvm_installers/{hardware_config}/config.cmake"
        ),
        # Unpacked last: a CONFIG_PATH already set in the environment takes
        # precedence over the one computed above.
        **dict(os.environ.copy()),
    }
    subprocess.run(
        ["bash", installation_file],
        cwd=working_dir or Path.home(),
        env=env_dict,
    )

    try:
        import tvm  # noqa F401
    except ImportError:
        # The installation did not produce an importable package: report
        # the failure like the other install helpers do.
        return False

    return True


def install_bladedisc():
    """Helper function for installing BladeDisc.

    Runs `install_bladedisc.sh`, passing "true" or "false" depending on
    whether a gpu is available, and waits for it to finish.

    Returns:
        bool: True if `torch_blade` can be imported once the installation
            script has completed, False otherwise.
    """
    has_cuda = bool(gpu_is_available())

    path = Path(__file__).parent
    installation_file = str(path / "install_bladedisc.sh")
    # subprocess.run blocks until the script exits; a non-blocking Popen
    # would let the import check below run before anything is installed.
    subprocess.run(["bash", installation_file, str(has_cuda).lower()])

    try:
        import torch_blade  # noqa F401
    except ImportError:
        return False

    return True


def install_torch_tensor_rt():
    """Install Torch-TensorRT with pip.

    TensorRT is installed first when it cannot be imported. After the
    Torch-TensorRT installation, the CUDA major version is read from the
    `nvidia-smi` output and, when it is 12 or higher, an additional
    `tensorrt` 8.6 requirement is installed.

    Returns:
        bool: False when the PyTorch version check fails or when
            `torch_tensorrt` cannot be imported afterwards, True otherwise.

    Raises:
        RuntimeError: If no gpu is available.
    """
    if not gpu_is_available():
        raise RuntimeError(
            "Torch-TensorRT can run just on Nvidia machines. "
            "No available cuda driver has been found."
        )

    torch_version_ok = check_module_version(
        torch, min_version="1.12.0", max_version="1.13.1+cu117"
    )
    if not torch_version_ok:
        logger.warning(
            "Torch-TensorRT can be installed only for "
            "'PyTorch>=1.12, <=1.13.1'. Please update your Pytorch "
            "version accordingly if you want to use Torch-TensorRT."
        )
        return False

    # Verify that TensorRT is installed, otherwise install it
    try:
        import tensorrt  # noqa F401
    except ImportError:
        install_tensor_rt()

    subprocess.run(
        [
            "pip3",
            "install",
            "torch-tensorrt",
            "--find-links",
            "https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0",  # noqa E501
        ]
    )

    # The CUDA version is taken from the last ":"-separated value of the
    # second-to-last "|" field on the third line of the nvidia-smi output.
    smi_report = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
    header_line = smi_report.split("\n")[2]
    cuda_field = header_line.split("|")[-2]
    cuda_release = cuda_field.split(":")[-1].strip()
    cuda_major = int(cuda_release.split(".")[0])
    if cuda_major >= 12:
        subprocess.run(["pip3", "install", "tensorrt>=8.6.0,<=8.6.1"])

    try:
        import torch_tensorrt  # noqa F401
    except ImportError:
        return False

    return True


def install_tf2onnx():
    """Install tf2onnx, with conda on Darwin/arm and with pip elsewhere.

    Returns:
        bool: False if `tf2onnx` cannot be imported after the installation,
            True otherwise (including when the import raises an
            AttributeError).
    """
    darwin_on_arm = _get_os() == "Darwin" and get_cpu_arch() == "arm"
    if darwin_on_arm:
        commands = [["conda", "install", "-y", "tf2onnx>=1.8.4"]]
    else:
        commands = [
            ["pip3", "install", "--user", "protobuf<4,>=3.20.2"],
            ["pip3", "install", "tf2onnx>=1.8.4"],
        ]
    for command in commands:
        subprocess.run(command)

    try:
        import tf2onnx  # noqa F401
    except ImportError:
        return False
    except AttributeError:
        # Sometimes the import could raise an attribute error
        # if installation fails
        pass

    return True


def install_tensor_rt():
    """Install TensorRT by running the `install_tensor_rt.sh` script.

    Returns:
        bool: True if both `polygraphy` and `tensorrt` can be imported
            after the script has run, False otherwise.

    Raises:
        RuntimeError: If no gpu is available.
    """
    if not gpu_is_available():
        raise RuntimeError(
            "TensorRT can run just on Nvidia machines. "
            "No available cuda driver has been found."
        )

    script = str(Path(__file__).parent / "install_tensor_rt.sh")
    subprocess.run(["bash", script])

    try:
        import polygraphy  # noqa F401
        import tensorrt  # noqa F401
    except ImportError:
        return False
    else:
        return True


def install_openvino(with_optimization: bool = True):
    """Install the OpenVino compiler with pip.

    Args:
        with_optimization (bool): When True the `openvino-dev` distribution
            is installed, otherwise the `openvino` one.

    Returns:
        bool: True if the `openvino.runtime` names can be imported after
            the installation, False otherwise.

    Raises:
        RuntimeError: If the cpu brand reported by `cpuinfo` does not
            contain "intel".
    """
    cpu_brand = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" not in cpu_brand:
        raise RuntimeError(
            f"Openvino can run just on Intel machines. "
            f"You are trying to install it on {cpu_brand}"
        )

    package = "openvino-dev" if with_optimization else "openvino"
    pip_install = ["pip3", "install"]
    # On windows the package is installed in the user site
    if _get_os() == "Windows":
        pip_install.append("--user")
    subprocess.run(pip_install + [f"{package}>=2022.1.0"])

    subprocess.run(["pip3", "install", "scipy>=1.7.3"])

    try:
        from openvino.runtime import (  # noqa F401
            CompiledModel,
            Core,
            InferRequest,
            Model,
        )
    except ImportError:
        return False
    else:
        return True


def install_onnxruntime():
    """Install onnxruntime, using the gpu distribution when a gpu is found.

    conda is used on Darwin/arm machines and pip everywhere else. The
    `coloredlogs` and `sympy` packages are installed with pip afterwards.

    Returns:
        bool: True if `onnxruntime` can be imported after the installation,
            False otherwise.
    """
    package = "onnxruntime-gpu" if gpu_is_available() else "onnxruntime"

    darwin_on_arm = _get_os() == "Darwin" and get_cpu_arch() == "arm"
    installer_prefix = (
        ["conda", "install", "-y"] if darwin_on_arm else ["pip3", "install"]
    )
    subprocess.run(installer_prefix + [package])

    # install requirements for onnxruntime.transformers
    subprocess.run(["pip3", "install", "coloredlogs", "sympy"])

    try:
        import onnxruntime  # noqa F401
    except ImportError:
        return False
    else:
        return True


def install_deepsparse():
    """Install DeepSparse with pip.

    Before the installation, the `python3.X-venv` apt package matching the
    running interpreter is requested; afterwards a numpy version in the
    range `>=1.22.0,<1.24.0` is requested. Exceptions raised while
    launching either of those two commands are ignored.

    Returns:
        bool: True if `deepsparse` can be imported after the installation,
            False otherwise.

    Raises:
        RuntimeError: On Darwin, on Windows and on arm cpus.
    """
    unsupported_platform = (
        platform.system() in ["Darwin", "Windows"] or get_cpu_arch() == "arm"
    )
    if unsupported_platform:
        raise RuntimeError(
            "DeepSparse is not supported on this platform. "
            "It won't be installed."
        )

    venv_package = f"python3.{sys.version_info.minor}-venv"
    try:
        subprocess.run(["apt-get", "install", venv_package])
    except Exception:
        pass

    subprocess.run(["pip3", "install", "deepsparse"])

    try:
        subprocess.run(["pip3", "install", "numpy>=1.22.0,<1.24.0"])
    except Exception:
        # For python 3.7 numpy 1.22.0 is not available
        pass

    try:
        from deepsparse import compile_model, cpu  # noqa F401
    except ImportError:
        return False
    else:
        return True


def install_intel_neural_compressor():
    """Install Intel Neural Compressor with pip (user site).

    Returns:
        bool: True if `MixedPrecision` and `Quantization` can be imported
            from `neural_compressor.experimental` afterwards, False
            otherwise.

    Raises:
        RuntimeError: If the cpu brand reported by `cpuinfo` does not
            contain "intel".
    """
    cpu_brand = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" not in cpu_brand:
        raise RuntimeError(
            f"Intel Neural Compressor can run just on Intel machines. "
            f"You are trying to install it on {cpu_brand}"
        )

    subprocess.run(["pip3", "install", "--user", "neural-compressor"])

    try:
        from neural_compressor.experimental import (  # noqa F401
            MixedPrecision,
            Quantization,
        )
    except ImportError:
        return False
    else:
        return True


def install_onnx_simplifier():
    """Install ONNX simplifier with pip, except on arm cpus.

    On arm nothing is installed and only the import check is performed.

    Returns:
        bool: True if `onnxsim` can be imported, False otherwise.
    """
    running_on_arm = get_cpu_arch() == "arm"
    if not running_on_arm:
        # Install onnx simplifier
        subprocess.run(["pip3", "install", "onnxsim"])

    try:
        import onnxsim  # noqa F401
    except ImportError:
        return False
    else:
        return True


def install_faster_transformer(
    working_dir: str = None,
):
    """Build FasterTransformer through `install_fastertransformer.sh`.
    https://github.com/NVIDIA/FasterTransformer

    The script is run from the parent directory of the path returned by
    `get_faster_transformer_repo_path()`, with the gpu compute capability
    exposed through the `COMPUTE_CAPABILITY` environment variable.

    Args:
        working_dir (str, optional): Not used by the current implementation.
            Default: None

    Returns:
        bool: True if the script exits with status 0. False if no gpu is
            available, if torch cannot be imported, if the device capability
            does not have two components, or if the script fails.
    """
    if not gpu_is_available():
        return False

    # read the compute capability of the gpu
    try:
        import torch

        capability = torch.cuda.get_device_capability()
        assert len(capability) == 2
    except (ImportError, AssertionError):
        return False

    script = str(Path(__file__).parent / "install_fastertransformer.sh")
    build_env = {
        "COMPUTE_CAPABILITY": f"{capability[0]}{capability[1]}",
        **dict(os.environ.copy()),
    }

    completed = subprocess.run(
        ["bash", script],
        cwd=get_faster_transformer_repo_path().parent,
        env=build_env,
    )
    # a non-zero exit status means the build failed
    return completed.returncode == 0


class BaseInstaller(ABC):
    """Base class of the framework installers.

    Subclasses provide `install_dependencies`, `check_framework` and
    `install_framework`; the compiler installation loop is shared.
    """

    def __init__(self, module_list: List[str]):
        # names of the compilers handled by this installer
        self.modules = module_list

    def install_compilers(
        self,
        include_libraries: List[str],
    ):
        """Try to install every handled compiler that has been requested.

        A compiler is skipped when `include_libraries` is a list that does
        not contain it, or when it is listed in `LIBRARIES_GPU` and no gpu
        is available. Any exception raised while checking or installing a
        compiler is treated as a failed installation.

        Args:
            include_libraries: Names of the compilers to install.
        """
        for library in self.modules:
            not_requested = (
                isinstance(include_libraries, List)
                and library not in include_libraries
            )
            if not_requested or (
                not gpu_is_available() and library in LIBRARIES_GPU
            ):
                continue

            logger.info(f"Trying to install {library} on the platform...")

            try:
                # nothing to install when the compiler is already available
                if COMPILERS_AVAILABLE[library]():
                    install_ok = True
                else:
                    install_ok = COMPILER_INSTALLERS[library]()
            except Exception:
                install_ok = False

            if install_ok:
                logger.info(f"{library} installed successfully!")
            else:
                logger.warning(
                    f"Unable to install {library} on this platform. "
                    f"The compiler will be skipped. "
                )

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install the extra packages of the framework (to be overridden)."""
        raise NotImplementedError

    @staticmethod
    def check_framework():
        """Tell whether the framework is usable (to be overridden)."""
        raise NotImplementedError

    @staticmethod
    def install_framework():
        """Install the framework itself (to be overridden)."""
        raise NotImplementedError


class PytorchInstaller(BaseInstaller):
    """Installer for PyTorch."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """No extra dependency is installed for PyTorch."""
        return None

    @staticmethod
    def check_framework():
        """Check that PyTorch is importable.

        An unsupported version only produces a warning.

        Returns:
            bool: Always True when torch can be imported.

        Raises:
            ImportError: If torch cannot be imported.
        """
        try:
            import torch  # noqa F401
        except ImportError:
            raise ImportError(
                "No PyTorch found in your python environment. Please install "
                "it from https://pytorch.org/get-started/locally/."
            )

        version_supported = check_module_version(
            torch, min_version="1.12.0", max_version="2.0.1+cu118"
        )
        if not version_supported:
            logger.warning(
                "PyTorch version is not supported. Please install "
                "PyTorch >= 1.12.0 and <= 2.0.1."
            )

        return True

    @staticmethod
    def install_framework():
        """Install PyTorch with pip and report whether it can be imported."""
        subprocess.run(["pip3", "install", "torch>=1.12.0, <=2.0.1"])

        try:
            import torch  # noqa F401
        except ImportError:
            return False
        else:
            return True


class TensorflowInstaller(BaseInstaller):
    """Installer for TensorFlow."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install tf2onnx when "onnx" is among the selected frameworks.

        Args:
            include_framework: Names of all the frameworks being installed.
        """
        if "onnx" in include_framework:
            install_tf2onnx()

    @staticmethod
    def check_framework():
        """Check that a supported TensorFlow version is importable.

        Returns:
            bool: False if tensorflow cannot be imported or if its version
                fails the 2.7.0 - 2.12.0 check, True otherwise.
        """
        try:
            import tensorflow  # noqa F401
        except ImportError:
            return False

        if not check_module_version(
            tensorflow, min_version="2.7.0", max_version="2.12.0"
        ):
            logger.warning(
                "TensorFlow version is not supported. Please install "
                "TensorFlow >= 2.7.0 and <= 2.12.0."
            )
            return False

        return True

    @staticmethod
    def install_framework():
        """Install TensorFlow, with conda on Darwin/arm and pip elsewhere.

        Returns:
            bool: True if tensorflow can be imported after the installation,
                False otherwise.
        """
        if _get_os() == "Darwin" and get_cpu_arch() == "arm":
            cmd = [
                "conda",
                "install",
                "-y",
                # Same range as the pip requirement below; the upper bound
                # needs its own "<=" operator to be read as a bound.
                "tensorflow>=2.7.0,<=2.12.0",
                "numpy<1.24",
            ]
            subprocess.run(cmd)
        else:
            cmd = ["pip3", "install", "--user", "tensorflow>=2.7.0, <=2.12.0"]
            subprocess.run(cmd)

        try:
            import tensorflow  # noqa F401
        except ImportError:
            return False

        return True


class ONNXInstaller(BaseInstaller):
    """Installer for ONNX."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install onnxruntime, onnxmltools and the ONNX simplifier."""
        install_onnxruntime()
        subprocess.run(["pip3", "install", "onnxmltools>=1.11.0"])
        install_onnx_simplifier()

    @staticmethod
    def check_framework():
        """Check that a supported ONNX version is importable.

        Returns:
            bool: False if onnx cannot be imported or if its version fails
                the 1.10.0 - 1.14.0 check, True otherwise.
        """
        try:
            import onnx  # noqa F401
        except ImportError:
            return False

        version_supported = check_module_version(
            onnx, min_version="1.10.0", max_version="1.14.0"
        )
        if version_supported:
            return True

        logger.warning(
            "ONNX version is not supported. Please install "
            "ONNX >= 1.10.0 and <= 1.14.0."
        )
        return False

    @staticmethod
    def install_framework():
        """Install ONNX with pip and report whether it can be imported.

        On Darwin/arm, cmake is installed with pip beforehand.
        """
        darwin_on_arm = _get_os() == "Darwin" and get_cpu_arch() == "arm"
        if darwin_on_arm:
            subprocess.run(["pip3", "install", "cmake"])

        subprocess.run(["pip3", "install", "onnx>=1.10.0, <=1.14.0"])

        try:
            import onnx  # noqa F401
        except ImportError:
            return False
        else:
            return True


class HuggingFaceInstaller(BaseInstaller):
    """Installer for the HuggingFace `transformers` package."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """No extra dependency is installed for HuggingFace."""
        return None

    @staticmethod
    def check_framework():
        """Return True if `transformers` can be imported, False otherwise."""
        try:
            import transformers  # noqa F401
        except ImportError:
            return False
        else:
            return True

    @staticmethod
    def install_framework():
        """Install `transformers` with pip and report if it is importable."""
        subprocess.run(["pip3", "install", "transformers<=4.28.0"])

        try:
            import transformers  # noqa F401
        except ImportError:
            return False
        else:
            return True


class DiffusersInstaller(BaseInstaller):
    """Installer for the `diffusers` package."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install `transformers` and, when a gpu is available, the gpu extras.

        The gpu extras are `cuda-python`, `onnx` and `onnx_graphsurgeon`.
        """
        subprocess.run(["pip3", "install", "transformers<=4.28.0"])

        if not gpu_is_available():
            return

        gpu_commands = (
            ["pip3", "install", "cuda-python"],
            ["pip3", "install", "onnx>=1.10.0, <=1.14.0"],
            [
                "pip3",
                "install",
                "onnx_graphsurgeon",
                "--index-url",
                "https://pypi.ngc.nvidia.com",
            ],
        )
        for command in gpu_commands:
            subprocess.run(command)

    @staticmethod
    def check_framework():
        """Check that `diffusers` is importable and passes the version check.

        Returns:
            bool: False if diffusers cannot be imported or if the check
                against the minimum version 0.13.0 fails, True otherwise.
        """
        try:
            import diffusers  # noqa F401
        except ImportError:
            return False

        if check_module_version(diffusers, min_version="0.13.0"):
            return True
        return False

    @staticmethod
    def install_framework():
        """Install `diffusers` with pip and report if it is importable."""
        subprocess.run(["pip3", "install", "diffusers>=0.13.0, <=0.15.0"])

        try:
            import diffusers  # noqa F401
        except ImportError:
            return False
        else:
            return True


# Maps a compiler name to the function that installs it. The functions are
# called without arguments by BaseInstaller.install_compilers.
COMPILER_INSTALLERS = {
    "openvino": install_openvino,
    "tensor_rt": install_tensor_rt,
    "torch_tensor_rt": install_torch_tensor_rt,
    "deepsparse": install_deepsparse,
    "intel_neural_compressor": install_intel_neural_compressor,
    # "faster_transformer": install_faster_transformer,
}

# Maps a compiler name to the function telling whether the compiler is
# already available; BaseInstaller.install_compilers calls it before
# running the matching installer.
COMPILERS_AVAILABLE = {
    "openvino": openvino_is_available,
    "tensor_rt": tensorrt_is_available,
    "torch_tensor_rt": torch_tensorrt_is_available,
    "deepsparse": deepsparse_is_available,
    "intel_neural_compressor": intel_neural_compressor_is_available,
    # "faster_transformer": faster_transformer_is_available,
}


================================================
FILE: optimization/nebullvm/nebullvm/installers/tests/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/installers/tests/test_install_frameworks.py
================================================
from nebullvm.installers.auto_installer import (
    select_frameworks_to_install,
    select_compilers_to_install,
)


def test_install_default_option():
    """Frameworks "all" and backends "all" select every framework."""
    selected = select_frameworks_to_install("all", "all")

    expected = ["diffusers", "huggingface", "onnx", "tensorflow", "torch"]
    assert selected == expected


def test_install_torch_full():
    """torch with all the backends selects onnx and torch."""
    selected = select_frameworks_to_install(["torch"], "all")

    assert selected == ["onnx", "torch"]


def test_install_torch_base():
    """torch without backends selects torch only."""
    selected = select_frameworks_to_install(["torch"], [])

    assert selected == ["torch"]


def test_install_tensorflow_full():
    """tensorflow with all the backends selects onnx and tensorflow."""
    selected = select_frameworks_to_install(["tensorflow"], "all")

    assert selected == ["onnx", "tensorflow"]


def test_install_tensorflow_base():
    """tensorflow without backends selects tensorflow only."""
    selected = select_frameworks_to_install(["tensorflow"], [])

    assert selected == ["tensorflow"]


def test_install_onnx_full():
    """onnx with all the backends selects onnx only."""
    selected = select_frameworks_to_install(["onnx"], "all")

    assert selected == ["onnx"]


def test_install_onnx_base():
    """onnx without backends selects onnx only."""
    selected = select_frameworks_to_install(["onnx"], [])

    assert selected == ["onnx"]


def test_install_diffusers_full():
    """diffusers with all the backends selects diffusers, onnx and torch."""
    selected = select_frameworks_to_install(["diffusers"], "all")

    assert selected == ["diffusers", "onnx", "torch"]


def test_install_huggingface_full():
    """huggingface with all the backends pulls in onnx, tensorflow, torch."""
    selected = select_frameworks_to_install(["huggingface"], "all")

    expected = ["huggingface", "onnx", "tensorflow", "torch"]
    assert selected == expected


def test_install_huggingface_full_tf():
    """huggingface with the onnx and tensorflow backends."""
    backends = ["onnx", "tensorflow"]

    selected = select_frameworks_to_install(["huggingface"], backends)

    assert selected == ["huggingface", "onnx", "tensorflow"]


def test_install_huggingface_full_torch():
    """huggingface with the onnx and torch backends."""
    backends = ["onnx", "torch"]

    selected = select_frameworks_to_install(["huggingface"], backends)

    assert selected == ["huggingface", "onnx", "torch"]


def test_install_huggingface_tf():
    """huggingface with the tensorflow backend only."""
    selected = select_frameworks_to_install(["huggingface"], ["tensorflow"])

    assert selected == ["huggingface", "tensorflow"]


def test_install_huggingface_torch():
    """huggingface with the torch backend only."""
    selected = select_frameworks_to_install(["huggingface"], ["torch"])

    assert selected == ["huggingface", "torch"]


def test_install_huggingface_compilers_all():
    """No compiler is selected for huggingface alone."""
    selected = select_compilers_to_install("all", ["huggingface"])

    assert selected == []


def test_install_huggingface_torch_compilers_all():
    """All the compilers for huggingface plus torch."""
    selected = select_compilers_to_install("all", ["huggingface", "torch"])

    expected = [
        "deepsparse",
        "faster_transformer",
        "intel_neural_compressor",
        "tensor_rt",
        "torch_tensor_rt",
    ]
    assert selected == expected


def test_install_torch_compilers_all():
    """All the compilers for torch."""
    selected = select_compilers_to_install("all", ["torch"])

    expected = [
        "deepsparse",
        "faster_transformer",
        "intel_neural_compressor",
        "tensor_rt",
        "torch_tensor_rt",
    ]
    assert selected == expected


def test_install_torch_compilers_deepsparse():
    """An explicitly requested compiler is selected for torch."""
    selected = select_compilers_to_install(["deepsparse"], ["torch"])

    assert selected == ["deepsparse"]


def test_install_torch_compilers_invalid():
    """An unknown compiler name yields an empty selection."""
    selected = select_compilers_to_install(["best_compiler"], ["torch"])

    assert selected == []


def test_install_torch_onnx_compilers_all():
    """All the compilers for torch plus onnx."""
    selected = select_compilers_to_install("all", ["torch", "onnx"])

    expected = [
        "deepsparse",
        "faster_transformer",
        "intel_neural_compressor",
        "openvino",
        "tensor_rt",
        "torch_tensor_rt",
    ]
    assert selected == expected


def test_install_tensorflow_compilers_all():
    """Requesting "all" compilers for tensorflow alone selects none."""
    expected_compilers = []

    selected = select_compilers_to_install("all", ["tensorflow"])

    assert selected == expected_compilers


================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/arm/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#--------------------------------------------------------------------
#  Template custom cmake configuration for compiling
#
#  This file is used to override the build options in build.
#  If you want to change the configuration, please use the following
#  steps. Assume you are in the root directory. First copy this
#  file so that any local changes will be ignored by git
#
#  $ mkdir build
#  $ cp cmake/config.cmake build
#
#  Next modify the corresponding entries, and then compile by
#
#  $ cd build
#  $ cmake ..
#
#  Then build in parallel with 8 threads
#
#  $ make -j8
#--------------------------------------------------------------------

#---------------------------------------------
# Backend runtimes.
#---------------------------------------------

# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA OFF)

# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)

# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)

# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)

# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)

# Whether enable Metal runtime
set(USE_METAL OFF)

# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)

# Whether enable OpenGL runtime
set(USE_OPENGL OFF)

# Whether enable MicroTVM runtime
set(USE_MICRO OFF)

# Whether enable RPC runtime
set(USE_RPC ON)

# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)

# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)

# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)

# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)

# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)

# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)

# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)

# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)

# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
#        which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)

#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)

# Whether use BLAS, choices: openblas, atlas, apple
set(USE_BLAS none)

# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`
set(USE_MKL OFF)

# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)

# Whether use OpenMP thread pool, choices: gnu, intel
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)

# Whether use contrib.random in runtime
set(USE_RANDOM ON)

# Whether use NNPack
set(USE_NNPACK OFF)

# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)

# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)

# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)

# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)

# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)

# Whether use cuBLAS
set(USE_CUBLAS OFF)

# Whether use MIOpen
set(USE_MIOPEN OFF)

# Whether use MPS
set(USE_MPS OFF)

# Whether use rocBlas
set(USE_ROCBLAS OFF)

# Whether use contrib sort
set(USE_SORT ON)

# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)

# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
#                       operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
#                                     runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)

# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
#   Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, set USE_ETHOSN_HW to ON if Ethos-N hardware is available on this machine;
# otherwise set USE_ETHOSN_HW to OFF to use the software test infrastructure
set(USE_ETHOSN_HW OFF)

# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)

# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
#                        offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
#                        TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)

# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)

# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)

# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)

# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)

# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)

# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)

# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)

# Whether use Thrust
set(USE_THRUST OFF)

# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)

# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)

# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)

# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
set(USE_HEXAGON_SDK /path/to/sdk)

# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)

# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")

# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)

# Whether enable BNNS runtime
set(USE_BNNS OFF)

# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)

# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)


# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)

# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)

# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
#   to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
#   be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)

# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)


================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/arm_cuda/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#--------------------------------------------------------------------
#  Template custom cmake configuration for compiling
#
#  This file is used to override the build options in build.
#  If you want to change the configuration, please use the following
#  steps. Assume you are in the root directory. First copy this
#  file so that any local changes will be ignored by git
#
#  $ mkdir build
#  $ cp cmake/config.cmake build
#
#  Next modify the corresponding entries, and then compile by
#
#  $ cd build
#  $ cmake ..
#
#  Then build in parallel with 8 threads
#
#  $ make -j8
#--------------------------------------------------------------------

#---------------------------------------------
# Backend runtimes.
#---------------------------------------------

# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA ON)

# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)

# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)

# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)

# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)

# Whether enable Metal runtime
set(USE_METAL OFF)

# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)

# Whether enable OpenGL runtime
set(USE_OPENGL OFF)

# Whether enable MicroTVM runtime
set(USE_MICRO OFF)

# Whether enable RPC runtime
set(USE_RPC ON)

# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)

# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)

# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)

# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)

# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)

# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)

# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)

# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)

# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
#        which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)

#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)

# Whether use BLAS, choices: openblas, atlas, apple
set(USE_BLAS none)

# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`
set(USE_MKL OFF)

# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)

# Whether use OpenMP thread pool, choices: gnu, intel
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)

# Whether use contrib.random in runtime
set(USE_RANDOM ON)

# Whether use NNPack
set(USE_NNPACK OFF)

# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)

# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)

# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)

# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)

# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)

# Whether use cuBLAS
set(USE_CUBLAS OFF)

# Whether use MIOpen
set(USE_MIOPEN OFF)

# Whether use MPS
set(USE_MPS OFF)

# Whether use rocBlas
set(USE_ROCBLAS OFF)

# Whether use contrib sort
set(USE_SORT ON)

# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)

# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
#                       operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
#                                     runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)

# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
#   Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, set USE_ETHOSN_HW to ON if Ethos-N hardware is available on this machine;
# otherwise set USE_ETHOSN_HW to OFF to use the software test infrastructure
set(USE_ETHOSN_HW OFF)

# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)

# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
#                        offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
#                        TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)

# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)

# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)

# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)

# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)

# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)

# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)

# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)

# Whether use Thrust
set(USE_THRUST OFF)

# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)

# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)

# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)

# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
set(USE_HEXAGON_SDK /path/to/sdk)

# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)

# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")

# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)

# Whether enable BNNS runtime
set(USE_BNNS OFF)

# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)

# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)


# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)

# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)

# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
#   to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
#   be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)

# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)


================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/x86/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#--------------------------------------------------------------------
#  Template custom cmake configuration for compiling
#
#  This file is used to override the build options in build.
#  If you want to change the configuration, please use the following
#  steps. Assume you are in the root directory. First copy this
#  file so that any local changes will be ignored by git
#
#  $ mkdir build
#  $ cp cmake/config.cmake build
#
#  Next modify the corresponding entries, and then compile by
#
#  $ cd build
#  $ cmake ..
#
#  Then build in parallel with 8 threads
#
#  $ make -j8
#--------------------------------------------------------------------

#---------------------------------------------
# Backend runtimes.
#---------------------------------------------

# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA OFF)

# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)

# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)

# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)

# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)

# Whether enable Metal runtime
set(USE_METAL OFF)

# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)

# Whether enable OpenGL runtime
set(USE_OPENGL OFF)

# Whether enable MicroTVM runtime
set(USE_MICRO OFF)

# Whether enable RPC runtime
set(USE_RPC ON)

# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)

# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)

# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)

# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)

# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)

# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)

# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)

# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)

# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
#        which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)

#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)

# Whether use BLAS, choices: openblas, atlas, apple
set(USE_BLAS none)

# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`
set(USE_MKL OFF)

# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)

# Whether use OpenMP thread pool, choices: gnu, intel
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)

# Whether use contrib.random in runtime
set(USE_RANDOM ON)

# Whether use NNPack
set(USE_NNPACK OFF)

# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)

# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)

# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)

# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)

# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)

# Whether use cuBLAS
set(USE_CUBLAS OFF)

# Whether use MIOpen
set(USE_MIOPEN OFF)

# Whether use MPS
set(USE_MPS OFF)

# Whether use rocBlas
set(USE_ROCBLAS OFF)

# Whether use contrib sort
set(USE_SORT ON)

# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)

# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
#                       operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
#                                     runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)

# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
#   Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, set USE_ETHOSN_HW to ON if Ethos-N hardware is available on this machine;
# otherwise set USE_ETHOSN_HW to OFF to use the software test infrastructure
set(USE_ETHOSN_HW OFF)

# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)

# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
#                        offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
#                        TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)

# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)

# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)

# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)

# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)

# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)

# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)

# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)

# Whether use Thrust
set(USE_THRUST OFF)

# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)

# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)

# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)

# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
set(USE_HEXAGON_SDK /path/to/sdk)

# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)

# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")

# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)

# Whether enable BNNS runtime
set(USE_BNNS OFF)

# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)

# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)


# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)

# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)

# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
#   to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
#   be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)

# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)


================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/x86_cuda/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#--------------------------------------------------------------------
#  Template custom cmake configuration for compiling
#
#  This file is used to override the build options in build.
#  If you want to change the configuration, please use the following
#  steps. Assume you are in the root directory. First copy this
#  file so that any local changes will be ignored by git
#
#  $ mkdir build
#  $ cp cmake/config.cmake build
#
#  Next modify the corresponding entries, and then compile by
#
#  $ cd build
#  $ cmake ..
#
#  Then build in parallel with 8 threads
#
#  $ make -j8
#--------------------------------------------------------------------

#---------------------------------------------
# Backend runtimes.
#---------------------------------------------

# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA ON)

# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)

# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)

# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)

# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)

# Whether enable Metal runtime
set(USE_METAL OFF)

# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)

# Whether enable OpenGL runtime
set(USE_OPENGL OFF)

# Whether enable MicroTVM runtime
set(USE_MICRO OFF)

# Whether enable RPC runtime
set(USE_RPC ON)

# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)

# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)

# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)

# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)

# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)

# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)

# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)

# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)

# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
#        which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)

#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)

# Whether use BLAS, choices: openblas, atlas, apple
set(USE_BLAS none)

# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL <path to venv or site-packages directory>) if using `pip install mkl`
set(USE_MKL OFF)

# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)

# Whether use OpenMP thread pool, choices: gnu, intel
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)

# Whether use contrib.random in runtime
set(USE_RANDOM ON)

# Whether use NNPack
set(USE_NNPACK OFF)

# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)

# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)

# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)

# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)

# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)

# Whether use cuBLAS
set(USE_CUBLAS OFF)

# Whether use MIOpen
set(USE_MIOPEN OFF)

# Whether use MPS
set(USE_MPS OFF)

# Whether use rocBlas
set(USE_ROCBLAS OFF)

# Whether use contrib sort
set(USE_SORT ON)

# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)

# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
#                       operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
#                                     runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)

# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
#   Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine
# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure
set(USE_ETHOSN_HW OFF)

# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)

# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
#                        offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
#                        TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)

# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)

# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)

# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)

# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)

# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)

# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)

# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)

# Whether use Thrust
set(USE_THRUST OFF)

# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)

# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)

# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)

# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
set(USE_HEXAGON_SDK /path/to/sdk)

# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)

# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")

# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)

# Whether enable BNNS runtime
set(USE_BNNS OFF)

# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)

# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)


# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)

# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)

# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
#   to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
#   be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)

# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)


================================================
FILE: optimization/nebullvm/nebullvm/operations/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/base.py
================================================
import abc
from typing import Any, Dict, Union

from loguru import logger

from nebullvm.core.models import Device, DeviceType
from nebullvm.tools.feedback_collector import FeedbackCollector
from nebullvm.tools.utils import check_device


class Operation(abc.ABC):
    """Abstract base class for operations.

    Every instance starts with an empty state dictionary, a CPU ``Device``,
    an ``execute_count`` of 0 (this class never modifies it), the shared
    loguru ``logger`` and no feedback collector. Subclasses must implement
    ``execute``.
    """

    def __init__(self):
        self._state = {}
        self.device = Device(DeviceType.CPU)
        self.execute_count = 0
        self.logger = logger
        self.feedback_collector = None

    def set_feedback_collector(self, feedback_collector: FeedbackCollector):
        """Attach a feedback collector to this operation and its children.

        Any instance attribute that is itself an ``Operation`` receives the
        same collector recursively.

        Args:
            feedback_collector (FeedbackCollector): Collector to attach.
        """
        self.feedback_collector = feedback_collector
        for value in self.__dict__.values():
            if isinstance(value, Operation):
                value.set_feedback_collector(feedback_collector)

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the operation. Must be overridden by subclasses."""
        raise NotImplementedError()

    @property
    def state(self) -> Dict[str, Any]:
        """Dictionary holding the state of the operation."""
        return self._state

    def to(self, device: Union[str, Device]):
        """Select the device used by the operation.

        Args:
            device (Union[str, Device]): Either a ``Device`` instance, stored
                as is, or a string, which is resolved through
                ``check_device``.

        Returns:
            Operation: ``self``, so that calls can be chained.
        """
        if isinstance(device, str):
            self.device = check_device(device)
        else:
            self.device = device
        return self


================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/converters.py
================================================
import abc
from pathlib import Path
from typing import Optional, List, Union

from nebullvm.core.models import DeviceType, DeepLearningFramework, ModelParams
from nebullvm.operations.base import Operation
from nebullvm.operations.conversions.pytorch import convert_torch_to_onnx
from nebullvm.operations.conversions.tensorflow import convert_tf_to_onnx
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager


class Converter(Operation, abc.ABC):
    """Common base for the operations converting a model to other formats.

    Holds the source model, the data associated with it and the list of
    converted models produced by ``execute``.
    """

    ONNX_EXTENSION = ".onnx"
    TORCH_EXTENSION = ".pt"
    TF_EXTENSION = ".pb"
    SUPPORTED_DEVICES = [DeviceType.GPU, DeviceType.CPU]

    def __init__(self, model_name: Optional[str] = None):
        super().__init__()
        # Base name of the files written by the conversions.
        self.model_name = model_name or "temp"
        self.model = None
        self.data = None
        self.model_params = None
        self.converted_models = None
        # The default device set by the base class is dropped here.
        # NOTE(review): presumably callers select one through ``to()``
        # before running ``execute`` -- confirm.
        self.device = None

    def set_state(
        self, model: Union[torch.nn.Module, tf.Module, str], data: DataManager
    ):
        """Store the model to convert and its data, returning ``self``."""
        self.model, self.data = model, data
        return self

    def get_result(self) -> List:
        """Return the converted models, skipping the ``None`` entries."""
        available = []
        for converted in self.converted_models:
            if converted is not None:
                available.append(converted)
        return available


class PytorchConverter(Converter):
    """Converter for PyTorch models.

    The original model is always kept as first converted model; an ONNX
    export is attempted for the ``NUMPY`` destination framework.
    """

    DEST_FRAMEWORKS = [DeepLearningFramework.NUMPY]

    def execute(
        self,
        save_path: Path,
        model_params: ModelParams,
    ):
        """Run every conversion listed in ``DEST_FRAMEWORKS``.

        Args:
            save_path (Path): Directory where converted models are written.
            model_params (ModelParams): Parameters of the model, forwarded
                to the conversion function.

        Raises:
            NotImplementedError: If a destination framework other than
                ``NUMPY`` is listed.
        """
        self.converted_models = [self.model]

        # Nothing else to do when the selected device is not supported.
        if self.device.type not in self.SUPPORTED_DEVICES:
            return

        for framework in self.DEST_FRAMEWORKS:
            if framework is not DeepLearningFramework.NUMPY:
                raise NotImplementedError()
            self.onnx_conversion(save_path, model_params)

    def onnx_conversion(self, save_path, model_params):
        """Export the model to ONNX and record the conversion result."""
        target_file = save_path / f"{self.model_name}{self.ONNX_EXTENSION}"
        exported = convert_torch_to_onnx(
            torch_model=self.model,
            input_data=self.data,
            model_params=model_params,
            output_file_path=target_file,
            device=self.device,
        )
        if self.converted_models is None:
            self.converted_models = []
        self.converted_models.append(exported)

    def tensorflow_conversion(self):
        # TODO: Implement conversion from Pytorch to Tensorflow
        raise NotImplementedError()


class TensorflowConverter(Converter):
    """Converter for TensorFlow models.

    The original model is always kept as first converted model; an ONNX
    export is attempted for the ``NUMPY`` destination framework.
    """

    DEST_FRAMEWORKS = [DeepLearningFramework.NUMPY]

    def execute(
        self,
        save_path: Path,
        model_params: ModelParams,
    ):
        """Run every conversion listed in ``DEST_FRAMEWORKS``.

        Args:
            save_path (Path): Directory where converted models are written.
            model_params (ModelParams): Parameters of the model, forwarded
                to the conversion function.

        Raises:
            NotImplementedError: If a destination framework other than
                ``NUMPY`` is listed.
        """
        self.converted_models = [self.model]

        # Nothing else to do when the selected device is not supported.
        if self.device.type not in self.SUPPORTED_DEVICES:
            return

        for framework in self.DEST_FRAMEWORKS:
            if framework is not DeepLearningFramework.NUMPY:
                raise NotImplementedError()
            self.onnx_conversion(save_path, model_params)

    def onnx_conversion(self, save_path, model_params):
        """Export the model to ONNX and record the conversion result."""
        target_file = save_path / f"{self.model_name}{self.ONNX_EXTENSION}"
        exported = convert_tf_to_onnx(
            model=self.model,
            model_params=model_params,
            output_file_path=target_file,
        )
        if self.converted_models is None:
            self.converted_models = []
        self.converted_models.append(exported)

    def pytorch_conversion(self):
        # TODO: Implement conversion from Tensorflow to Pytorch
        raise NotImplementedError()


class ONNXConverter(Converter):
    """Converter for models already provided as an ONNX file.

    The model is loaded from the given path and saved again inside
    ``save_path``; no conversion to other frameworks is implemented.
    """

    DEST_FRAMEWORKS = []

    def execute(self, save_path, model_params):
        """Copy the ONNX model into ``save_path``.

        On success ``converted_models`` holds the path of the saved copy
        (as a string). If the model cannot be loaded or saved, the error is
        logged and ``converted_models`` is left empty.

        Args:
            save_path: Directory where the model copy is written; it must
                support the ``/`` operator (e.g. a ``Path``).
            model_params: Unused by this converter.
        """
        onnx_path = save_path / f"{self.model_name}{self.ONNX_EXTENSION}"
        try:
            model_onnx = onnx.load(str(self.model))
            onnx.save(model_onnx, str(onnx_path))
        except Exception:
            self.logger.error(
                "The provided onnx model path is invalid. Please provide"
                " a valid path to a model in order to use Nebullvm."
            )
            self.converted_models = []
            # Stop here: falling through would overwrite the empty result
            # with the path of a file that was never written.
            return

        self.converted_models = [str(onnx_path)]

    def tensorflow_conversion(self):
        # TODO: Implement conversion from ONNX to Tensorflow
        raise NotImplementedError()

    def pytorch_conversion(self):
        # TODO: Implement conversion from ONNX to Pytorch
        raise NotImplementedError()


================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/huggingface.py
================================================
from typing import (
    List,
    Dict,
    Sequence,
    Optional,
)

import numpy as np

from nebullvm.core.models import Device
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.huggingface import (
    PreTrainedTokenizer,
    PreTrainedModel,
)
from nebullvm.tools.huggingface import (
    get_output_structure_from_dict,
    get_output_structure_from_text,
    PyTorchTransformerWrapper,
    TensorFlowTransformerWrapper,
)
from nebullvm.tools.utils import is_dict_type


class _HFTextDataset(Sequence):
    """Sequence of tokenized mini-batches built lazily from raw texts.

    Item ``i`` is a pair ``(inputs, labels)`` where ``inputs`` is a tuple
    with the tokenizer outputs selected by ``keywords`` for the ``i``-th
    slice of ``batch_size`` texts, and ``labels`` is the matching slice of
    ``ys`` (``None`` when no labels were given).
    """

    def __init__(
        self,
        input_texts: List,
        ys: Optional[List],
        keywords: List[str],
        batch_size: int,
        tokenizer: PreTrainedTokenizer,
        tokenizer_args: Dict,
    ):
        self._input_texts = input_texts
        self._ys = ys
        self._bs = batch_size
        self._keys = keywords
        self._tokenizer = tokenizer
        # Padding is enabled by default below, so a pad token is needed:
        # fall back to the EOS token when the tokenizer defines none.
        if self._tokenizer.pad_token is None:
            self._tokenizer.pad_token = self._tokenizer.eos_token
        # User-provided arguments take precedence over the defaults.
        _tokenizer_args = {"truncation": True, "padding": True}
        _tokenizer_args.update(tokenizer_args)
        self._tokenizer_args = _tokenizer_args

    def __getitem__(self, item: int):
        pointer = self._bs * item
        if pointer >= len(self._input_texts):
            raise IndexError
        mini_batch = self._input_texts[
            pointer : pointer + self._bs  # noqa E203
        ]
        if self._ys is not None:
            mini_batch_y = self._ys[pointer : pointer + self._bs]  # noqa E203
        else:
            mini_batch_y = None
        encoded_inputs = self._tokenizer(mini_batch, **self._tokenizer_args)
        return tuple(encoded_inputs[key] for key in self._keys), mini_batch_y

    def __len__(self):
        # Round up: ``__getitem__`` also serves the trailing partial batch,
        # so the length must count it to stay consistent with iteration.
        return (len(self._input_texts) + self._bs - 1) // self._bs


class _HFDictDataset(Sequence):
    """Sequence view over already tokenized samples stored as dictionaries.

    Item ``i`` is a pair ``(inputs, label)`` where ``inputs`` is a tuple
    with the values of the ``i``-th dictionary selected by ``keywords`` and
    ``label`` is the ``i``-th element of ``ys`` (``None`` without labels).
    """

    def __init__(
        self,
        input_data: List,
        ys: Optional[List],
        keywords: List[str],
    ):
        self._input_data = input_data
        self._ys = ys
        self._keys = keywords

    def __getitem__(self, item: int):
        if item >= len(self._input_data):
            raise IndexError
        sample = self._input_data[item]
        label = None if self._ys is None else self._ys[item]
        features = tuple(self._concatenate(sample, key) for key in self._keys)
        return features, label

    def __len__(self):
        return len(self._input_data)

    @staticmethod
    def _concatenate(mini_batch, key):
        """Concatenate the single value stored under ``key``.

        The concatenation function is chosen from the value type: torch
        for torch tensors, tensorflow for tf tensors, numpy otherwise.
        """
        value = mini_batch[key]
        if isinstance(value, torch.Tensor):
            return torch.concat([value])
        if isinstance(value, tf.Tensor):
            return tf.concat([value], 0)
        return np.concatenate([value])


def convert_hf_model(
    model: PreTrainedModel,
    input_data: List,
    device: Device,
    tokenizer: Optional[PreTrainedTokenizer] = None,
    tokenizer_args: Optional[Dict] = None,
    batch_size: int = 1,
    **kwargs,
):
    """Wrap a HuggingFace model and its data for the conversion pipeline.

    Args:
        model (PreTrainedModel): HuggingFace model to wrap.
        input_data (List): Either already tokenized samples (dict-like
            objects, optionally holding a "labels" entry), raw strings, or
            items whose first element is the text and whose second element
            is the label.
        device (Device): Device forwarded to the helpers inferring the
            output structure.
        tokenizer (PreTrainedTokenizer, optional): Tokenizer, required when
            the input data is given as text.
        tokenizer_args (Dict, optional): Extra arguments for the tokenizer.
        batch_size (int): Number of texts per mini-batch when the input
            data is given as text. Default: 1.
        **kwargs: Accepted and ignored.

    Returns:
        Tuple: the wrapped model, the dataset built on the input data, the
        list of the wrapper input names, the output structure and the
        output type.
    """
    # Local import: only needed by the already-tokenized branch.
    import copy

    if is_dict_type(input_data[0]):
        # already tokenized data
        if "labels" in input_data[0]:
            # Pop the labels from shallow copies so that the samples owned
            # by the caller are left untouched.
            input_data = [copy.copy(data) for data in input_data]
            labels = [data.pop("labels") for data in input_data]
        else:
            labels = None
        input_example = input_data[0]
        output_structure, output_type = get_output_structure_from_dict(
            input_example=input_example,
            model=model,
            device=device,
        )
        input_data = _HFDictDataset(
            input_data=input_data,
            ys=labels,
            keywords=list(input_example.keys()),
        )

    else:
        assert tokenizer is not None, (
            "Tokenizer is needed when passing data in string format. Please "
            "provide the tokenizer as keyword argument."
        )
        if tokenizer_args is None:
            tokenizer_args = {}
        if not isinstance(input_data[0], str):
            # Items are (text, label) pairs: split them in two lists.
            ys = [data[1] for data in input_data]
            input_data = [data[0] for data in input_data]
        else:
            ys = None
        output_structure, output_type = get_output_structure_from_text(
            text=input_data[0],
            model=model,
            tokenizer=tokenizer,
            tokenizer_args=tokenizer_args,
            device=device,
        )
        # The encoded example provides the names of the model inputs.
        input_example = tokenizer(input_data, **tokenizer_args)
        input_data = _HFTextDataset(
            input_texts=input_data,
            ys=ys,
            keywords=list(input_example.keys()),
            batch_size=batch_size,
            tokenizer=tokenizer,
            tokenizer_args=tokenizer_args,
        )
    if isinstance(model, torch.nn.Module):
        wrapper_model = PyTorchTransformerWrapper(
            core_model=model, encoded_input=input_example
        )
    else:
        wrapper_model = TensorFlowTransformerWrapper(
            core_model=model, encoded_input=input_example
        )

    return (
        wrapper_model,
        input_data,
        list(wrapper_model.inputs_types.keys()),
        output_structure,
        output_type,
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/pytorch.py
================================================
from contextlib import nullcontext
from pathlib import Path

from loguru import logger

from nebullvm.config import ONNX_OPSET_VERSION
from nebullvm.core.models import ModelParams, Device, DeviceType, DataType
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.pytorch import (
    create_model_inputs_torch,
)


def _export_torch_model_to_onnx(
    torch_model,
    input_tensors,
    output_file_path,
    input_names,
    output_names,
    dynamic_axes,
):
    """Call ``torch.onnx.export`` with the settings shared by all attempts."""
    torch.onnx.export(
        torch_model,  # model being run
        tuple(input_tensors),  # model input (or a tuple for multiple inputs)
        str(output_file_path),
        # where to save the model (can be a file or file-like object)
        export_params=True,
        # store the trained parameter weights inside the model file
        opset_version=ONNX_OPSET_VERSION,
        # the ONNX version to export the model to
        do_constant_folding=True,
        # whether to execute constant folding for optimization
        input_names=input_names,
        # the model's input names
        output_names=output_names,
        # the model's output names
        dynamic_axes=dynamic_axes,
    )


@torch.inference_mode()
def convert_torch_to_onnx(
    torch_model: Module,
    input_data: DataManager,
    model_params: ModelParams,
    output_file_path: Path,
    device: Device,
):
    """Function importing a custom model in pytorch and converting it in ONNX

    The export is first attempted with model and inputs on CPU. When the
    target device is a GPU and that attempt fails, the export is retried
    with model and inputs on the GPU.

    Args:
        torch_model (Module): Pytorch model.
        input_data (DataManager): Custom data provided by user to be
            used as input for the converter. When None, the inputs are
            created from ``model_params.input_infos``.
        model_params (ModelParams): Model Parameters as input sizes and
            dynamic axis information.
        output_file_path (str or Path): Path where storing the output
            ONNX file.
        device (Device): Device where the model will be run.

    Returns:
        The ``output_file_path`` received as argument if the export
        succeeded, None otherwise.
    """

    if input_data is not None:
        input_tensors = list(input_data.get_list(1)[0])
    else:
        input_tensors = create_model_inputs_torch(model_params.input_infos)

    output_sizes = model_params.output_sizes
    output_types = model_params.output_types

    input_names = [f"input_{i}" for i in range(len(input_tensors))]
    output_names = [f"output_{i}" for i in range(len(output_sizes))]
    dynamic_info = model_params.dynamic_info

    if dynamic_info is not None:
        # Axis descriptions are either plain names (format used by previous
        # versions of nebullvm) or dicts holding the name under "name".
        # The format is checked entry by entry, so inputs without dynamic
        # axes (empty dicts) are supported as well.
        onnx_format_inputs = [
            {
                axis: (info if isinstance(info, str) else info["name"])
                for axis, info in axes.items()
            }
            for axes in dynamic_info.inputs
        ]

        assert len(dynamic_info.outputs) == len(output_names), (
            f"The number of dynamic outputs provided in the dynamic info "
            f"dict ({len(dynamic_info.outputs)}) is not equal to the number "
            f"of outputs of the model ({len(output_names)}), Detected model "
            f"output shapes are: {output_sizes} "
        )

        dynamic_info = dict(
            zip(
                input_names + output_names,
                onnx_format_inputs + dynamic_info.outputs,
            )
        )

    on_gpu = device.type is DeviceType.GPU

    try:
        # try conversion with model on cpu
        if on_gpu:
            input_tensors = [x.cpu() for x in input_tensors]
            torch_model.cpu()

        _export_torch_model_to_onnx(
            torch_model,
            input_tensors,
            output_file_path,
            input_names,
            output_names,
            dynamic_info,
        )

        # Put again model on gpu
        if on_gpu:
            torch_model.to(device.to_torch_format())

        return output_file_path
    except Exception as cpu_error:
        logger.debug("Torch to onnx conversion on cpu failed: {}", cpu_error)
        if not on_gpu:
            logger.warning(
                "Exception raised during conversion from torch"
                " to onnx model. ONNX pipeline will be unavailable."
            )
            return None

    # try conversion with model on gpu
    input_tensors = [x.to(device.to_torch_format()) for x in input_tensors]
    torch_model.to(device.to_torch_format())

    try:
        # Autocast is enabled only when the first model output is float16.
        if output_types[0] is DataType.FLOAT16:
            export_context = torch.autocast("cuda")
        else:
            export_context = nullcontext()
        with export_context:
            _export_torch_model_to_onnx(
                torch_model,
                input_tensors,
                output_file_path,
                input_names,
                output_names,
                dynamic_info,
            )

        return output_file_path
    except Exception as gpu_error:
        logger.debug("Torch to onnx conversion on gpu failed: {}", gpu_error)
        logger.warning(
            "Exception raised during conversion from torch"
            " to onnx model. ONNX pipeline will be unavailable."
        )
        return None


================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/tensorflow.py
================================================
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Union

from loguru import logger

from nebullvm.config import ONNX_OPSET_VERSION
from nebullvm.core.models import ModelParams
from nebullvm.optional_modules.tensorflow import tensorflow as tf, tf2onnx
from nebullvm.optional_modules.onnx import onnx
from nebullvm.tools.huggingface import TensorFlowTransformerWrapper


def convert_tf_to_onnx(
    model: Union[tf.Module, tf.keras.Model],
    model_params: ModelParams,
    output_file_path: Union[str, Path],
):
    """Export a TensorFlow model to ONNX, choosing the exporter by type.

    Keras models (either passed directly or wrapped in a
    TensorFlowTransformerWrapper whose core model is a Keras model) go
    through the Keras exporter; everything else goes through the
    saved-model exporter.

    Args:
        model (Union[tf.Module, tf.keras.Model]): TF model.
        model_params (ModelParams): Info about model parameters.
        output_file_path (Path): Path where storing the output file.

    Returns:
        The value returned by the selected exporter, or None when the
        conversion raised an exception (a warning is logged in that case).
    """

    try:
        keras_like = isinstance(model, tf.keras.Model)
        if not keras_like and isinstance(model, TensorFlowTransformerWrapper):
            keras_like = isinstance(model.core_model, tf.keras.Model)

        if keras_like:
            return convert_keras_to_onnx(model, model_params, output_file_path)
        return convert_tf_saved_model_to_onnx(model, output_file_path)
    except Exception:
        # Best-effort conversion: report and let the caller carry on
        # without the ONNX pipeline.
        logger.warning(
            "Something went wrong during conversion from tensorflow"
            " to onnx model. ONNX pipeline will be unavailable."
        )
        return None


def convert_tf_saved_model_to_onnx(
    model: tf.Module, output_file_path: Union[str, Path]
):
    """Convert TF models into ONNX.

    The model is first written as a SavedModel into a temporary directory,
    then `tf2onnx.convert` is run on it in a subprocess. The produced file
    is loaded once with `onnx.load` as a sanity check.

    Args:
        model (tf.Module): TF model.
        output_file_path (Path): Path where storing the output file.

    Returns:
        The `output_file_path` received as input.

    Raises:
        subprocess.CalledProcessError: If the tf2onnx subprocess exits with
            a non-zero status.
    """
    with TemporaryDirectory() as temp_dir:
        tf.saved_model.save(model, export_dir=temp_dir)

        try:
            subprocess.check_output(["python3", "--version"])
            python_cmd = "python3"
        except (subprocess.CalledProcessError, OSError):
            # OSError covers FileNotFoundError, which is what is raised when
            # no `python3` executable exists on PATH.
            python_cmd = "python"

        onnx_cmd = [
            python_cmd,
            "-m",
            "tf2onnx.convert",
            "--saved-model",
            f"{temp_dir}",
            "--output",
            f"{output_file_path}",
            "--opset",
            f"{ONNX_OPSET_VERSION}",
        ]
        # check=True: fail loudly when the converter fails instead of going
        # on to load whatever (possibly stale) file sits at the output path.
        subprocess.run(onnx_cmd, check=True)
        onnx.load(output_file_path)

    return output_file_path


def convert_keras_to_onnx(
    model: tf.keras.Model,
    model_params: ModelParams,
    output_file_path: Union[str, Path],
):
    """Export a keras model to an ONNX file through tf2onnx.

    Args:
        model (tf.keras.Model): keras model.
        model_params (ModelParams): Model Parameters as input sizes and
            dynamic axis information.
        output_file_path (Path): Path where storing the output file.

    Returns:
        The `output_file_path` received as input.
    """
    input_infos = model_params.input_infos

    # Per-input dtype values and integer shapes
    dtypes = [info.dtype.value for info in input_infos]
    shapes = [[int(size) for size in info.size] for info in input_infos]

    # Input names come from the wrapper when available, otherwise they are
    # generated positionally.
    if isinstance(model, TensorFlowTransformerWrapper):
        names = list(model.inputs_types.keys())
    else:
        names = [f"input_{pos}" for pos, _ in enumerate(input_infos)]

    specs = []
    for idx, (dims, dtype, name) in enumerate(zip(shapes, dtypes, names)):
        # Axes listed as dynamic for this input are left unspecified (None)
        spec_shape = (
            None
            if model_params.dynamic_info is not None
            and axis in model_params.dynamic_info.inputs[idx]
            else dims[axis]
            for axis in range(len(dims))
        )
        specs.append(tf.TensorSpec(spec_shape, dtype, name=name))
    input_signature = tuple(specs)

    # The converter also writes the model to `output_path`; the returned
    # objects are not used here.
    _model_proto, _ = tf2onnx.convert.from_keras(
        model,
        input_signature,
        opset=ONNX_OPSET_VERSION,
        output_path=output_file_path,
    )

    return output_file_path


================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/utils.py
================================================
from nebullvm.core.models import DeepLearningFramework
from nebullvm.operations.conversions.converters import (
    PytorchConverter,
    TensorflowConverter,
    ONNXConverter,
    Converter,
)


def get_conversion_op(framework: DeepLearningFramework) -> Converter:
    """Return a new converter operation for the given source framework.

    PyTorch and TensorFlow get their dedicated converters; any other value
    falls back to the ONNX converter.
    """
    if framework == DeepLearningFramework.PYTORCH:
        return PytorchConverter()
    if framework == DeepLearningFramework.TENSORFLOW:
        return TensorflowConverter()
    return ONNXConverter()


================================================
FILE: optimization/nebullvm/nebullvm/operations/fetch_operations/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/fetch_operations/local.py
================================================
from typing import Any, Union, Iterable, Sequence

from nebullvm.operations.base import Operation


class FetchModelFromLocal(Operation):
    """Operation that keeps an in-memory model in its state."""

    def execute(self, model: Any):
        """Store `model` in the operation state under the "model" key."""
        self.state["model"] = model

    def get_model(self) -> Any:
        """Return the stored model, or None if nothing has been stored."""
        return self.state.get("model")

    def get_result(self) -> Any:
        """No result is exposed through this accessor; returns None."""
        pass


class FetchDataFromLocal(Operation):
    """Operation that keeps in-memory input data in its state."""

    def execute(self, data: Union[Iterable, Sequence]):
        """Store `data` in the operation state under the "data" key."""
        self.state["data"] = data

    def get_data(self) -> Any:
        """Return the stored data, or None if nothing has been stored."""
        return self.state.get("data")

    def get_result(self) -> Any:
        """No result is exposed through this accessor; returns None."""
        pass


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/base.py
================================================
import json
import os
import shutil
from abc import ABC, abstractmethod
from dataclasses import dataclass, InitVar
from pathlib import Path
from tempfile import mkdtemp, TemporaryDirectory
from typing import Union, Dict, Any, List, Optional

import numpy as np

from nebullvm.config import LEARNER_METADATA_FILENAME
from nebullvm.core.models import ModelParams, Device, QuantizationType
from nebullvm.operations.base import Operation
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.onnx import create_model_inputs_onnx
from nebullvm.tools.pytorch import (
    create_model_inputs_torch,
    get_torch_model_size,
)
from nebullvm.tools.tf import create_model_inputs_tf
from nebullvm.tools.transformations import MultiStageTransformation


class BuildInferenceLearner(Operation, ABC):
    """Abstract operation whose result is an inference learner.

    Subclasses implement `execute` and are expected to assign the learner
    they build to `self.inference_learner`.
    """

    def __init__(self):
        super().__init__()
        # Filled in by the concrete `execute` implementation
        self.inference_learner = None

    @abstractmethod
    def execute(self, **kwargs):
        """Build the inference learner (implemented by subclasses)."""
        raise NotImplementedError()

    def get_result(self) -> Any:
        """Return the learner built so far (None before `execute`)."""
        learner = self.inference_learner
        return learner


@dataclass
class BaseInferenceLearner(ABC):
    """Base class for Inference Learners.

    Calling the learner applies `input_tfms` (when set) to every positional
    input and then delegates to `run`, which concrete subclasses implement.
    """

    # Parameters describing the wrapped model
    network_parameters: ModelParams
    # Optional transformation applied to each positional input in __call__
    input_tfms: Optional[MultiStageTransformation] = None
    # Init-only value, kept as `self._input_data` by __post_init__
    input_data: InitVar[List[Any]] = None
    device: Device = None
    quantization_type: QuantizationType = None

    @property
    @abstractmethod
    def name(self) -> str:
        """The name of the InferenceLearner"""

    def __post_init__(self, input_data):
        # An empty transformation pipeline is the same as no pipeline at
        # all. (The previous `len(...) < 0` test could never be true.)
        if self.input_tfms is not None and len(self.input_tfms) == 0:
            self.input_tfms = None
        # Scratch directory owned by this instance, removed in __del__
        self._tmp_folder = Path(mkdtemp())
        self._input_data = input_data

    def _store_file(self, file_path: Union[str, Path]):
        """Copy a file into the instance temporary folder."""
        return shutil.copy(str(file_path), str(self._tmp_folder))

    def _store_dir(self, dir_path: Union[str, Path]):
        """Copy a directory tree into the instance temporary folder."""
        try:
            # For python >= 3.8
            return shutil.copytree(
                str(dir_path), str(self._tmp_folder), dirs_exist_ok=True
            )
        except TypeError:
            # For python <=3.7: `dirs_exist_ok` is not accepted, so the
            # destination must be removed before copying.
            if os.path.isdir(self._tmp_folder):
                shutil.rmtree(str(self._tmp_folder))
            return shutil.copytree(str(dir_path), str(self._tmp_folder))

    def __del__(self, shutil=shutil):
        # `shutil` is bound as a default argument so the module reference
        # is still reachable from here whenever the finalizer runs.
        try:
            shutil.rmtree(self._tmp_folder, ignore_errors=True)
        except Exception:
            # Deliberate best-effort cleanup (e.g. `_tmp_folder` was never
            # set because __post_init__ did not complete).
            pass

    def predict_from_files(
        self, input_files: List[str], output_files: List[str]
    ):
        """Get a model prediction from file.

        Each input file is read, the learner is called on the resulting
        tensors and every prediction is written to the corresponding
        output file.

        Args:
            input_files (List[str]): List of paths to the input file.
            output_files (List[str]): List of paths to the file storing
                the prediction.
        """
        inputs = (self._read_file(input_file) for input_file in input_files)
        preds = self(*inputs)
        for pred, output_file in zip(preds, output_files):
            self._save_file(pred, output_file)

    def predict_from_listified_tensors(self, *listified_tensors: List):
        """Predict from listified tensor.

        Method useful to be used in services receiving the input tensor
        from an HTTP call.

        Args:
            listified_tensors (List): List of list-like version of the
                input tensors. Note that each element of the external list is
                a listified input tensor.

        Returns:
            List: List of list-like predictions.
        """
        inputs = (
            self.list2tensor(listified_tensor)
            for listified_tensor in listified_tensors
        )
        # NOTE(review): `predict` goes through `__call__`, which applies
        # `input_tfms` as well, so with the default `predict` the
        # transformations run twice on this path - confirm this is intended.
        if self.input_tfms is not None:
            inputs = (self.input_tfms(_input) for _input in inputs)
        preds = self.predict(*inputs)
        return [self.tensor2list(pred) for pred in preds]

    def list2tensor(self, listified_tensor: List) -> Any:
        """Convert list to tensor.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            Any: Tensor for the prediction.
        """
        raise NotImplementedError()

    def tensor2list(self, tensor: Any) -> List:
        """Convert tensor to list.

        Args:
            tensor (any): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        raise NotImplementedError()

    def _read_file(self, input_file: str) -> Any:
        """Read tensor from file.

        Args:
            input_file (str): Path to the file containing the input tensor.

        Returns:
            Any: Tensor read from the file.
        """
        raise NotImplementedError()

    def _save_file(self, prediction: Any, output_file: str):
        """Save prediction in the appropriate format.

        Args:
            prediction (any): The predicted tensor.
            output_file (str): Path to the file where storing the prediction.
        """
        raise NotImplementedError

    def predict(self, *args, **kwargs) -> Any:
        """Take as input a tensor and returns a prediction"""
        out = self(*args, **kwargs)

        # TensorFlow predict method must return a np array
        if isinstance(out[0], tf.Tensor):
            out = tuple(t.numpy() for t in out)

        return out

    @abstractmethod
    def run(self, *args, **kwargs) -> Any:
        """Abstract method implementing the prediction code."""
        raise NotImplementedError()

    def forward(self, *args, **kwargs):
        """Alternative method to the predict one."""
        return self(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Apply the input transformations (if any) and call `run`."""
        if self.input_tfms is not None:
            args = (self.input_tfms(_input) for _input in args)
        return self.run(*args, **kwargs)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path): Path to the directory where saving the model.
        """
        raise NotImplementedError()

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path): Path to the directory where the model is stored.

        Returns:
            BaseInferenceLearner: Loaded model.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_size(self):
        """The function returns the size of the optimized model."""
        raise NotImplementedError()

    @abstractmethod
    def free_gpu_memory(self):
        """The function cleans the gpu occupied by the inference learner."""
        raise NotImplementedError

    @abstractmethod
    def get_inputs_example(self):
        """The function returns an example of the input for the optimized
        model predict method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def output_format(self):
        """File extension used for stored predictions."""
        return ".txt"

    @property
    @abstractmethod
    def input_format(self):
        """File extension used for stored inputs."""
        return ".txt"


class LearnerMetadata:
    """Class for storing all the metadata about a model.

    The stored information can be used for loading the appropriate model.

    Attributes:
        class_name (str): Name of the model class. For instance, for the model
            object `CustomModel()`, the class name is 'CustomModel'.
        module_name (str): Path to the python module where the model class
            is defined.
        network_parameters (Dict): Dictionary containing the network
            parameters, i.e. batch_size, input_size and output_size.
        kwargs: External attributes that will be stored in the Metadata file.
    """

    NAME: str = LEARNER_METADATA_FILENAME
    class_name: str
    module_name: str
    device: str
    quantization_type: str

    def __init__(
        self,
        class_name: str,
        module_name: str,
        network_parameters: Union[ModelParams, Dict],
        input_tfms: Union[MultiStageTransformation, Dict] = None,
        **kwargs,
    ):
        self.class_name = class_name
        self.module_name = module_name
        # Rich objects are stored in their dictionary form; values that
        # are already plain dictionaries are kept as they are.
        self.network_parameters = (
            network_parameters.dict()
            if isinstance(network_parameters, ModelParams)
            else network_parameters
        )
        self.input_tfms = (
            input_tfms.to_dict()
            if isinstance(input_tfms, MultiStageTransformation)
            else input_tfms
        )
        # Any extra keyword becomes an attribute of the metadata object
        self.__dict__.update(**kwargs)

    def __getitem__(self, item):
        """Dictionary-style access to the public attributes.

        Returns None when the attribute does not exist.

        Raises:
            TypeError: If `item` is not a string.
            ValueError: If `item` names a private attribute.
        """
        if not isinstance(item, str):
            raise TypeError(
                f"Error in key type. Expected str got {type(item)}"
            )
        elif item.startswith("_"):
            raise ValueError("Trying to access a private attribute.")
        return self.__dict__.get(item)

    @classmethod
    def from_model(cls, model: BaseInferenceLearner, **kwargs):
        """Create the metadata from the Inference Learner.

        Args:
            model (BaseInferenceLearner): Model from which extract the
                metadata.
            kwargs: External attributes that will be stored in the Metadata
                file.

        Returns:
            LearnerMetadata: Metadata associated with the model.
        """
        return cls(
            class_name=model.__class__.__name__,
            module_name=model.__module__,
            network_parameters=model.network_parameters,
            input_tfms=model.input_tfms,
            device=model.device.type.value
            if model.device is not None
            else None,
            quantization_type=model.quantization_type.value
            if model.quantization_type is not None
            else None,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, dictionary: Dict):
        """Create the metadata file from a dictionary.

        This method is the reverse one of `to_dict`.

        Args:
            dictionary (Dict): Dictionary containing the metadata.

        Returns:
            LearnerMetadata: Metadata associated with the model.

        Raises:
            ValueError: If one of the `class_name`, `module_name` or
                `network_parameters` keys is missing.
        """
        if any(
            key not in dictionary
            for key in ("class_name", "module_name", "network_parameters")
        ):
            raise ValueError(
                "The input dictionary should contain both the model class "
                "name and module."
            )
        return cls(**dictionary)

    def to_dict(self) -> Dict:
        """Method for converting the LearnerMetadata in a python dictionary.

        Only instance attributes whose name starts with a lowercase letter
        and whose value is not None are included.

        Returns:
            Dict: Dictionary containing the metadata.
        """
        return {
            key: value
            for key, value in self.__dict__.items()
            if (
                len(key) > 0
                and key[0].islower()
                and not key.startswith("_")
                and value is not None
            )
        }

    @classmethod
    def read(cls, path: Union[Path, str]):
        """Read the metadata file and store it into a LearnerMetadata object.

        Args:
            path (Path): Path to the directory containing the metadata file.

        Returns:
            LearnerMetadata: Metadata associated with the model.
        """
        path = Path(path)
        with open(path / cls.NAME, "r") as fin:
            metadata_dict = json.load(fin)
        return cls(**metadata_dict)

    def save(self, path: Union[Path, str]):
        """Save the metadata of the model in a file.

        The target directory (and any missing parent) is created when it
        does not exist yet.

        Args:
            path (Path): Path to the directory where saving the model metadata.
        """
        path = Path(path)
        path.mkdir(exist_ok=True, parents=True)
        metadata_dict = self.to_dict()
        with open(path / self.NAME, "w") as fout:
            json.dump(metadata_dict, fout)

    def load_model(
        self, path: Union[Path, str], **kwargs
    ) -> BaseInferenceLearner:
        """Method for loading the InferenceLearner from its metadata.

        The ModelMetadata file contains all the information necessary for
        loading the Learner, as it contains both the module where the model
        is defined and the class name of the model object. This method calls
        the appropriate class method of the Model object, thus the actual
        model loading is delegate to its methods.

        Args:
            path (Path): Path to the directory containing the files where
                the model optimization is saved.
            kwargs: Dictionary containing the arguments for the model's load
                function.

        Returns:
            BaseInferenceLearner: The value returned by the `load` class
                method of the resolved class.
        """
        import importlib

        # Resolve the class through the import machinery instead of
        # exec()/eval() on strings: names injected by exec() inside a
        # function are not visible to a later eval() on Python 3.13+, and
        # evaluating text read from a metadata file allows code injection.
        # NOTE: importing `module_name` still runs that module's top-level
        # code, so metadata files must come from a trusted source.
        module = importlib.import_module(self.module_name)
        learner_class = getattr(module, self.class_name)
        model = learner_class.load(path=path, **kwargs)
        return model


class PytorchBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base learner whose inputs and predictions are torch tensors."""

    @property
    def input_format(self):
        return ".pt"

    @property
    def output_format(self):
        return ".pt"

    def list2tensor(self, listified_tensor: List) -> torch.Tensor:
        """Build a torch tensor out of its list representation.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            torch.Tensor: Tensor for the prediction.
        """
        return torch.tensor(listified_tensor)

    def tensor2list(self, tensor: torch.Tensor) -> List:
        """Turn a torch tensor into nested python lists.

        Args:
            tensor (torch.Tensor): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        as_numpy = tensor.cpu().detach().numpy()
        return as_numpy.tolist()

    def free_gpu_memory(self):
        """Move the model to the cpu and mark the gpu as not ready."""
        self.model.cpu()
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """Move the model to the learner device and mark it as ready."""
        target_device = self.device.to_torch_format()
        self.model.to(target_device)
        self._is_gpu_ready = True

    def _read_file(self, input_file: Union[str, Path]) -> torch.Tensor:
        """Load a tensor previously stored with `torch.save`."""
        return torch.load(input_file)

    def _save_file(
        self, prediction: torch.Tensor, output_file: Union[str, Path]
    ):
        """Store the prediction with `torch.save`."""
        torch.save(prediction, output_file)

    def get_inputs_example(self, random=False):
        """Return example inputs for the learner.

        The data given at construction time is returned when available,
        unless `random` is truthy; otherwise new inputs are generated from
        the input infos of the network parameters.
        """
        if self._input_data is not None and not random:
            return self._input_data
        generated = create_model_inputs_torch(
            input_infos=self.network_parameters.input_infos,
        )
        return tuple(generated)

    def get_size(self):
        """Return the size of the model.

        The size is computed from the torch module itself (or from its
        `core_model` attribute when present). If that raises a
        RuntimeError, the learner is saved into a temporary directory and
        the sizes of the files written at its top level are summed.
        """
        try:
            target = (
                self.model.core_model
                if hasattr(self.model, "core_model")
                else self.model  # Normal torch model
            )
            return get_torch_model_size(target)
        except RuntimeError:
            with TemporaryDirectory() as tmp_dir:
                self.save(tmp_dir)
                folder = Path(tmp_dir)
                total = 0
                for entry in os.listdir(folder):
                    candidate = folder / entry
                    if os.path.isfile(candidate):
                        total += os.path.getsize(candidate)
                return total


class TensorflowBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base learner whose inputs and predictions are TensorFlow tensors.

    Tensors are exchanged with the file system as `.npy` files.
    """

    @property
    def input_format(self):
        return ".npy"

    @property
    def output_format(self):
        return ".npy"

    def free_gpu_memory(self):
        """Clear the keras session and mark the gpu as not ready."""
        tf.keras.backend.clear_session()
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """Mark the gpu as ready (no explicit model transfer is done)."""
        self._is_gpu_ready = True

    def list2tensor(self, listified_tensor: List) -> tf.Tensor:
        """Convert list to tensor.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            tf.Tensor: Tensor ready to be used for prediction.
        """
        return tf.convert_to_tensor(listified_tensor)

    def tensor2list(self, tensor: tf.Tensor) -> List:
        """Convert tensor to list.

        Args:
            tensor (tf.Tensor): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        return tensor.numpy().tolist()

    def _read_file(self, input_file: Union[str, Path]) -> tf.Tensor:
        """Load a `.npy` file and convert its content to a tf tensor."""
        numpy_array = np.load(input_file)
        input_tensor = tf.convert_to_tensor(numpy_array)
        return input_tensor

    def _save_file(self, prediction: tf.Tensor, output_file: Union[str, Path]):
        """Store the prediction as a numpy file.

        Args:
            prediction (tf.Tensor): The predicted tensor.
            output_file (Path): Path to the file where storing the prediction.
        """
        # numpy arrays have no `save` method: the array must be handed to
        # the module-level `np.save` function.
        np.save(output_file, prediction.numpy())

    def get_inputs_example(self, random=False):
        """Return example inputs for the learner.

        The data given at construction time is returned when available,
        unless `random` is truthy; otherwise new inputs are generated from
        the input infos of the network parameters.
        """
        if self._input_data is None or random:
            return tuple(
                create_model_inputs_tf(
                    input_infos=self.network_parameters.input_infos,
                )
            )
        else:
            return self._input_data


class NumpyBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base learner whose inputs and predictions are numpy arrays."""

    @property
    def input_format(self):
        return ".npy"

    @property
    def output_format(self):
        return ".npy"

    def list2tensor(self, listified_tensor: List) -> np.ndarray:
        """Build a numpy array out of its list representation.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            np.ndarray: Array ready to be used for prediction.
        """
        return np.array(listified_tensor)

    def tensor2list(self, tensor: np.ndarray) -> List:
        """Turn a numpy array into nested python lists.

        Args:
            tensor (np.ndarray): Input array.

        Returns:
            List: Listified version of the array.
        """
        return tensor.tolist()

    def _read_file(self, input_file: Union[str, Path]) -> np.ndarray:
        """Load an array from a numpy file."""
        return np.load(input_file)

    def _save_file(
        self, prediction: np.ndarray, output_file: Union[str, Path]
    ):
        """Store the prediction with `np.save`."""
        np.save(output_file, prediction)

    def get_inputs_example(self, random=False):
        """Return example inputs for the learner.

        The data given at construction time is returned when available,
        unless `random` is truthy; otherwise new inputs are generated from
        the input infos of the network parameters.
        """
        if self._input_data is not None and not random:
            return self._input_data
        generated = create_model_inputs_onnx(
            input_infos=self.network_parameters.input_infos,
        )
        return tuple(generated)


class InferenceLearnerWrapper(BaseInferenceLearner, ABC):
    """Abstract wrapper built around another InferenceLearner.

    Tensor conversion, file handling, gpu clean-up, example inputs and the
    input/output formats are all delegated to the wrapped learner. The
    wrapper itself takes care of saving and loading, relying on a set of
    hooks that concrete subclasses have to provide.

    Attributes:
        network_parameters (ModelParams): Model parameters.
        core_inference_learner (BaseInferenceLearner): Inference Learner.
    """

    # Sub-directory of the save path that holds the wrapped learner
    CORE_MODEL_SAVE_DIR = "core_model"

    def __init__(self, core_inference_learner: BaseInferenceLearner):
        params = core_inference_learner.network_parameters
        super().__init__(network_parameters=params)
        self.core_inference_learner = core_inference_learner

    def list2tensor(self, listified_tensor: List) -> Any:
        """Delegate the conversion to the wrapped learner."""
        return self.core_inference_learner.list2tensor(listified_tensor)

    def tensor2list(self, tensor: Any) -> List:
        """Delegate the conversion to the wrapped learner."""
        return self.core_inference_learner.tensor2list(tensor)

    def _read_file(self, input_file: str) -> Any:
        """Read the input through the wrapped learner."""
        return self.core_inference_learner._read_file(input_file)

    def _save_file(self, prediction: Any, output_file: str):
        """Write the prediction through the wrapped learner."""
        self.core_inference_learner._save_file(prediction, output_file)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the wrapped learner, then the wrapper metadata and extras.

        Args:
            path (Path): Directory where the wrapper is saved. The wrapped
                learner goes into its `CORE_MODEL_SAVE_DIR` sub-directory.
            kwargs: Forwarded to the `save` method of the wrapped learner.
        """
        core_dir = Path(path) / self.CORE_MODEL_SAVE_DIR
        core_dir.mkdir(exist_ok=True, parents=True)
        self.core_inference_learner.save(core_dir, **kwargs)

        extra_kwargs = self._get_extra_metadata_kwargs()
        LearnerMetadata.from_model(self, **extra_kwargs).save(path)
        self._save_wrapper_extra_info()

    def _get_extra_metadata_kwargs(self) -> Dict:
        """Hook: extra keyword arguments stored in the wrapper metadata."""
        raise NotImplementedError

    def _save_wrapper_extra_info(self):
        """Hook: called at the end of `save`."""
        raise NotImplementedError

    @staticmethod
    def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict:
        """Hook: turn the wrapper metadata into constructor arguments."""
        raise NotImplementedError

    @staticmethod
    def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:
        """Hook: receives the constructor arguments built so far and
        returns the ones to use."""
        raise NotImplementedError

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the wrapped learner first, then rebuild the wrapper.

        Args:
            path (Path): Directory where the wrapper was saved.
            kwargs: Forwarded to the loader of the wrapped learner.

        Returns:
            An instance of `cls` built from the stored metadata and the
            loaded core learner.
        """
        core_dir = Path(path) / cls.CORE_MODEL_SAVE_DIR
        core_metadata = LearnerMetadata.read(core_dir)
        core_learner = core_metadata.load_model(core_dir, **kwargs)

        wrapper_metadata = LearnerMetadata.read(path)
        builder_inputs = cls._convert_metadata_to_inputs(wrapper_metadata)
        builder_inputs = cls._load_wrapper_extra_info(builder_inputs)
        builder_inputs.update({"core_inference_learner": core_learner})
        return cls(**builder_inputs)

    def free_gpu_memory(self):
        """Free the gpu memory held by the wrapped learner."""
        return self.core_inference_learner.free_gpu_memory()

    def get_inputs_example(self):
        """Return the example inputs of the wrapped learner."""
        return self.core_inference_learner.get_inputs_example()

    @property
    def output_format(self):
        return self.core_inference_learner.output_format

    @property
    def input_format(self):
        return self.core_inference_learner.input_format


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/blade_disc.py
================================================
from typing import Optional

from nebullvm.core.models import ModelParams, Device
from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)
from nebullvm.optional_modules.torch import ScriptModule
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class BladeDISCInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript-based inference learner labelled as "BladeDISC"."""

    name = "BladeDISC"

    @classmethod
    def from_torch_model(
        cls,
        model: ScriptModule,
        network_parameters: ModelParams,
        device: Device,
        input_tfms: Optional[MultiStageTransformation] = None,
        input_data: DataManager = None,
    ):
        """Alternate constructor forwarding its arguments to the class.

        `model` is handed over as the `torch_model` keyword; every other
        argument is passed through under its own name.
        """
        init_kwargs = {
            "torch_model": model,
            "network_parameters": network_parameters,
            "input_tfms": input_tfms,
            "input_data": input_data,
            "device": device,
        }
        return cls(**init_kwargs)


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/builders.py
================================================
from pathlib import Path
from typing import Any, Union

from nebullvm.core.models import (
    ModelParams,
    DeepLearningFramework,
    QuantizationType,
    DeviceType,
)
from nebullvm.operations.inference_learners.base import BuildInferenceLearner
from nebullvm.operations.inference_learners.deepsparse import (
    PytorchDeepSparseInferenceLearner,
)
from nebullvm.operations.inference_learners.faster_transformer import (
    FasterTransformerInferenceLearner,
)
from nebullvm.operations.inference_learners.neural_compressor import (
    PytorchNeuralCompressorInferenceLearner,
)
from nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS
from nebullvm.operations.inference_learners.openvino import (
    OPENVINO_INFERENCE_LEARNERS,
)
from nebullvm.operations.inference_learners.tensor_rt import (
    TENSOR_RT_INFERENCE_LEARNERS,
    PytorchTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.tensorflow import (
    TensorflowBackendInferenceLearner,
    TFLiteBackendInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_dynamo import (
    TorchDynamoInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_neuron import (
    TorchNeuronInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_xla import (
    TorchXLAInferenceLearner,
)
from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
    APACHE_TVM_INFERENCE_LEARNERS,
    PytorchApacheTVMInferenceLearner,
)
from nebullvm.optional_modules.tensor_rt import tensorrt as trt
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import (
    ScriptModule,
    Module,
    GraphModule,
    torch,
)
from nebullvm.optional_modules.tvm import tvm, ExecutorFactoryModule
from nebullvm.tools.onnx import get_input_names, get_output_names
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    VerifyContiguity,
)


class TorchScriptBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a TorchScriptInferenceLearner."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Build the learner and keep it as the operation result.

        Extra keyword arguments are accepted and ignored.
        """
        learner = TorchScriptInferenceLearner(
            torch_model=model,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = learner


class TorchXLABuildInferenceLearner(BuildInferenceLearner):
    """Operation building a TorchXLAInferenceLearner."""

    def execute(
        self,
        model: torch.nn.Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Build the learner and keep it as the operation result.

        Extra keyword arguments are accepted and ignored.
        """
        learner = TorchXLAInferenceLearner(
            torch_model=model,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = learner


class TorchNeuronBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a TorchNeuronInferenceLearner."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Build the learner and keep it as the operation result.

        Extra keyword arguments are accepted and ignored.
        """
        learner = TorchNeuronInferenceLearner(
            torch_model=model,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = learner


class TorchDynamoBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a TorchDynamoInferenceLearner."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Build the learner and keep it as the operation result.

        Extra keyword arguments are accepted and ignored.
        """
        learner = TorchDynamoInferenceLearner(
            torch_model=model,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = learner


class TensorflowBuildInferenceLearner(BuildInferenceLearner):
    """Build step that wraps a model in a TensorflowBackendInferenceLearner."""

    def execute(
        self,
        model: tf.Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (tf.Module): Model passed as the first positional argument.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            kwargs: Accepted for interface compatibility; not used here.
        """
        learner_kwargs = dict(
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = TensorflowBackendInferenceLearner(
            model, **learner_kwargs
        )


class TFLiteBuildInferenceLearner(BuildInferenceLearner):
    """Build step that wraps a model in a TFLiteBackendInferenceLearner."""

    def execute(
        self,
        model: bytes,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (bytes): Model passed as the first positional argument.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            kwargs: Accepted for interface compatibility; not used here.
        """
        learner = TFLiteBackendInferenceLearner(
            model,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = learner


class DeepSparseBuildInferenceLearner(BuildInferenceLearner):
    """Build step that wraps an ONNX file in a DeepSparse learner."""

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Input and output names are read from the ONNX file before the
        learner is instantiated.

        Args:
            model (str or Path): Path to the ONNX model.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            kwargs: Accepted for interface compatibility; not used here.
        """
        onnx_file = str(model)
        learner = PytorchDeepSparseInferenceLearner(
            onnx_path=model,
            network_parameters=model_params,
            input_names=get_input_names(onnx_file),
            output_names=get_output_names(onnx_file),
            device=self.device,
        )
        self.inference_learner = learner


class ONNXBuildInferenceLearner(BuildInferenceLearner):
    """Build step that wraps an ONNX file in an ONNX inference learner."""

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        quantization_type: QuantizationType,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        The learner class is looked up in ``ONNX_INFERENCE_LEARNERS`` using
        ``source_dl_framework`` as key.

        Args:
            model (str or Path): Path to the ONNX model.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            source_dl_framework (DeepLearningFramework): Key selecting the
                learner class.
            quantization_type (QuantizationType): Forwarded unchanged.
            kwargs: Accepted for interface compatibility; not used here.
        """
        onnx_file = str(model)
        names_in = get_input_names(onnx_file)
        names_out = get_output_names(onnx_file)

        learner_cls = ONNX_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_cls(
            onnx_path=model,
            network_parameters=model_params,
            input_names=names_in,
            output_names=names_out,
            input_tfms=input_tfms,
            device=self.device,
            quantization_type=quantization_type,
        )


class OpenVINOBuildInferenceLearner(BuildInferenceLearner):
    """Build step that creates an OpenVINO inference learner."""

    def execute(
        self,
        model: str,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        The learner class is looked up in ``OPENVINO_INFERENCE_LEARNERS`` and
        built through its ``from_model_name`` constructor.

        Args:
            model (str): Common path prefix; ``".xml"`` and ``".bin"`` are
                appended to obtain the two file names passed to the learner.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            source_dl_framework (DeepLearningFramework): Key selecting the
                learner class.
            kwargs: Accepted for interface compatibility; not used here.
        """
        learner_cls = OPENVINO_INFERENCE_LEARNERS[source_dl_framework]
        build_learner = learner_cls.from_model_name
        self.inference_learner = build_learner(
            model_name=model + ".xml",
            model_weights=model + ".bin",
            input_tfms=input_tfms,
            network_parameters=model_params,
            device=self.device,
        )


class PyTorchTensorRTBuildInferenceLearner(BuildInferenceLearner):
    """Build step that wraps a model in a PytorchTensorRTInferenceLearner."""

    def execute(
        self,
        model: ScriptModule,
        input_tfms: MultiStageTransformation,
        model_params: ModelParams,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model forwarded as ``torch_model``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            kwargs: Accepted for interface compatibility; not used here.
        """
        learner = PytorchTensorRTInferenceLearner(
            torch_model=model,
            input_tfms=input_tfms,
            network_parameters=model_params,
            device=self.device,
        )
        self.inference_learner = learner


class ONNXTensorRTBuildInferenceLearner(BuildInferenceLearner):
    """Build step that creates a TensorRT learner from a serialized engine."""

    def execute(
        self,
        model: Any,
        model_orig: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (Any): Serialized engine handed to
                ``runtime.deserialize_cuda_engine``.
            model_orig (str or Path): Path to the ONNX model from which input
                and output names are read.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Transformations; a
                ``VerifyContiguity`` step is appended to it in place.
            source_dl_framework (DeepLearningFramework): Key selecting the
                learner class in ``TENSOR_RT_INFERENCE_LEARNERS``.
            kwargs: Accepted for interface compatibility; not used here.
        """
        trt_logger = trt.Logger(trt.Logger.ERROR)
        onnx_file = str(model_orig)
        names_in = get_input_names(onnx_file)
        names_out = get_output_names(onnx_file)

        # NOTE: this mutates the caller's transformation object.
        input_tfms.append(VerifyContiguity())
        cuda_engine = trt.Runtime(trt_logger).deserialize_cuda_engine(model)

        learner_cls = TENSOR_RT_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_cls(
            engine=cuda_engine,
            input_tfms=input_tfms,
            network_parameters=model_params,
            input_names=names_in,
            output_names=names_out,
            nvidia_logger=trt_logger,
            device=self.device,
        )


class IntelNeuralCompressorBuildInferenceLearner(BuildInferenceLearner):
    """Build step that creates a PytorchNeuralCompressorInferenceLearner."""

    def execute(
        self,
        model: GraphModule,
        model_orig: Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (GraphModule): Forwarded to the learner as ``model_quant``.
            model_orig (Module): Forwarded to the learner as ``model``.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            kwargs: Accepted for interface compatibility; not used here.
        """
        learner = PytorchNeuralCompressorInferenceLearner(
            model=model_orig,
            model_quant=model,
            input_tfms=input_tfms,
            network_parameters=model_params,
            device=self.device,
        )
        self.inference_learner = learner


class PyTorchApacheTVMBuildInferenceLearner(BuildInferenceLearner):
    """Build step that creates a PytorchApacheTVMInferenceLearner."""

    def execute(
        self,
        model: ExecutorFactoryModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (ExecutorFactoryModule): Compiled library; its
                ``"default"`` entry is used to build the graph executor and
                the object itself is forwarded as ``lib``.
            model_params (ModelParams): Forwarded as ``network_parameters``;
                its ``input_infos`` length fixes the number of input names.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            kwargs: Accepted for interface compatibility; not used here.
        """
        # The target string is the CUDA target on GPU and "llvm" otherwise.
        if self.device.type is DeviceType.GPU:
            target_device = str(tvm.target.cuda())
        else:
            target_device = "llvm"
        dev = tvm.device(target_device, 0)

        # Inputs are named positionally: input_0, input_1, ...
        input_names = [
            f"input_{idx}" for idx, _ in enumerate(model_params.input_infos)
        ]

        executor = tvm.contrib.graph_executor.GraphModule(
            model["default"](dev)
        )
        learner = PytorchApacheTVMInferenceLearner(
            input_tfms=input_tfms,
            network_parameters=model_params,
            graph_executor_module=executor,
            input_names=input_names,
            lib=model,
            target=target_device,
            device=self.device,
        )
        self.inference_learner = learner


class ONNXApacheTVMBuildInferenceLearner(BuildInferenceLearner):
    """Build step that creates an Apache TVM learner for an ONNX model."""

    def execute(
        self,
        model: ExecutorFactoryModule,
        model_orig: str,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (ExecutorFactoryModule): Compiled library; its
                ``"default"`` entry is used to build the graph executor and
                the object itself is forwarded as ``lib``.
            model_orig (str): Path to the ONNX model used to read the input
                names. When ``None``, positional names are generated instead.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            source_dl_framework (DeepLearningFramework): Key selecting the
                learner class in ``APACHE_TVM_INFERENCE_LEARNERS``.
            kwargs: Accepted for interface compatibility; not used here.
        """
        # The target string is the CUDA target on GPU and "llvm" otherwise.
        if self.device.type is DeviceType.GPU:
            target_device = str(tvm.target.cuda())
        else:
            target_device = "llvm"
        dev = tvm.device(target_device, 0)

        if model_orig is None:
            # No ONNX file available: fall back to input_0, input_1, ...
            input_names = [
                f"input_{idx}"
                for idx, _ in enumerate(model_params.input_infos)
            ]
        else:
            input_names = get_input_names(model_orig)

        executor = tvm.contrib.graph_executor.GraphModule(
            model["default"](dev)
        )
        learner_cls = APACHE_TVM_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_cls(
            input_tfms=input_tfms,
            network_parameters=model_params,
            graph_executor_module=executor,
            input_names=input_names,
            lib=model,
            target=target_device,
            device=self.device,
        )


class FasterTransformerBuildInferenceLearner(BuildInferenceLearner):
    """Build step that creates a FasterTransformerInferenceLearner."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Create the learner and store it in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model forwarded as ``torch_model``.
            model_params (ModelParams): Forwarded as ``network_parameters``.
            input_tfms (MultiStageTransformation): Forwarded unchanged.
            kwargs: Accepted for interface compatibility; not used here.
        """
        learner_kwargs = dict(
            torch_model=model,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = FasterTransformerInferenceLearner(
            **learner_kwargs
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/deepsparse.py
================================================
import os
import shutil
from abc import ABC
from pathlib import Path
from typing import Union, List, Generator, Tuple, Dict, Type

import numpy as np
from loguru import logger

from nebullvm.config import ONNX_FILENAMES
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
    LearnerMetadata,
    PytorchBaseInferenceLearner,
)
from nebullvm.optional_modules.deepsparse import cpu, compile_model
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.transformations import MultiStageTransformation


class DeepSparseInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized on CPU using DeepSparse. DeepSparse is an engine
    accelerating sparse computations on CPUs.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    name = "DeepSparse"

    def __init__(
        self,
        onnx_path: Union[str, Path],
        input_names: List[str],
        output_names: List[str],
        device: Device,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.onnx_path = self._store_file(onnx_path)

        # The engine is compiled with the first value reported by
        # cpu.cpu_details() as number of cores and with the batch size
        # taken from the network parameters.
        core_count, _, _ = cpu.cpu_details()
        model_params = kwargs["network_parameters"]
        self.engine = compile_model(
            onnx_path, model_params.batch_size, core_count
        )

        self.input_names = input_names
        self.output_names = output_names
        self.device = device

    def get_size(self):
        """Return the size in bytes of the stored onnx file."""
        return os.path.getsize(self.onnx_path)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        The metadata file is written first, then the onnx file is copied
        into the target directory.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        LearnerMetadata.from_model(
            self,
            input_names=self.input_names,
            output_names=self.output_names,
            **kwargs,
        ).save(path)

        destination = Path(path) / ONNX_FILENAMES["model_name"]
        shutil.copy(self.onnx_path, destination)

    def free_gpu_memory(self):
        raise NotImplementedError("DeepSparse does not support GPU inference.")

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners.

        Returns:
            DeepSparseInferenceLearner: The optimized model.
        """
        if kwargs:
            logger.warning(
                f"No extra keywords expected for the load method. "
                f"Got {kwargs}."
            )
        onnx_path = os.path.join(str(path), ONNX_FILENAMES["model_name"])
        metadata = LearnerMetadata.read(path)

        # Transformations are stored as a dict and rebuilt when present.
        tfms_dict = metadata.input_tfms
        if tfms_dict is None:
            input_tfms = None
        else:
            input_tfms = MultiStageTransformation.from_dict(tfms_dict)

        return cls(
            input_tfms=input_tfms,
            network_parameters=ModelParams(**metadata.network_parameters),
            onnx_path=onnx_path,
            input_names=metadata["input_names"],
            output_names=metadata["output_names"],
            device=Device.from_str(metadata.device),
        )

    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):
        """Materialize the input arrays and run them through the engine."""
        return self.engine(list(input_arrays))


class PytorchDeepSparseInferenceLearner(
    DeepSparseInferenceLearner, PytorchBaseInferenceLearner
):
    """Model optimized on CPU using DeepSparse. DeepSparse is an engine
    accelerating sparse computations on CPUs.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:
        """Predict on the input tensors.

        Each tensor is moved to CPU, detached and converted to a numpy array
        before being handed to the engine; the engine outputs are converted
        back to torch tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors does
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        predictions = self._predict_arrays(numpy_inputs)
        return tuple(map(torch.from_numpy, predictions))


# Registry mapping a source deep learning framework to the DeepSparse
# learner class handling it. Only a PyTorch entry is registered here.
DEEPSPARSE_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[DeepSparseInferenceLearner]
] = {DeepLearningFramework.PYTORCH: PytorchDeepSparseInferenceLearner}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/faster_transformer.py
================================================
from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)


class FasterTransformerInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript learner variant overriding only ``name`` and
    ``MODEL_NAME``; all behavior is inherited from the parent class.
    """

    name = "FasterTransformer"
    MODEL_NAME = "faster_transformer_model_scripted.pt"


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/huggingface.py
================================================
from abc import ABC
from collections import OrderedDict
from pathlib import Path
from typing import List, Any, Dict, Union

from nebullvm.operations.inference_learners.base import (
    InferenceLearnerWrapper,
    PytorchBaseInferenceLearner,
    LearnerMetadata,
    BaseInferenceLearner,
)
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.diffusers import postprocess_diffusers
from nebullvm.tools.huggingface import restructure_output
from nebullvm.tools.pytorch import get_torch_model_size


class HuggingFaceInferenceLearner(InferenceLearnerWrapper):
    """Class wrapping an InferenceLearner model and giving to it the
    huggingface interface.

    The class fuses both the InferenceLearner and HuggingFace interfaces,
    giving to the final user a model which can be used with the preferred API
    without the need of adapting the previous code.

    Attributes:
        network_parameters (ModelParams): Model parameters of the model.
        core_inference_learner (PytorchBaseInferenceLearner): Inference learner
            built using the Pytorch interface.
        output_structure (Dict): Original output structure of the HuggingFace
            model.
        input_names (List[str]): List of all the input keys used for the
            original HuggingFace model.
        output_type (Any, optional): Original output type of the HuggingFace
            model.
    """

    @property
    def name(self) -> str:
        """Name of the wrapped core inference learner."""
        return self.core_inference_learner.name

    def __init__(
        self,
        core_inference_learner: PytorchBaseInferenceLearner,
        output_structure: OrderedDict,
        input_names: List[str],
        output_type: Any = None,
    ):
        super().__init__(core_inference_learner)
        self.output_structure = output_structure
        self.input_names = input_names
        self.output_type = output_type

    def _save_wrapper_extra_info(self):
        # Nothing to store besides the metadata produced by
        # _get_extra_metadata_kwargs.
        pass

    def get_size(self):
        """Return the size reported by the wrapped core learner."""
        return self.core_inference_learner.get_size()

    @staticmethod
    def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:
        # No extra information is loaded: inputs are returned untouched.
        return builder_inputs

    def run(self, *args, **kwargs) -> Any:
        """Run the underlying optimized model for getting a prediction.

        The method has an hybrid interface. It accepts inputs either as
        positional or keyword arguments. If only positional arguments are given
        the method expects the inputs to be in the canonical
        nebullvm interface. If only keyword arguments are given the method
        expects them to be in the HuggingFace interface. Mixed representation
        is not allowed and will result in an error.

        Raises:
            RuntimeError: If both positional and keyword arguments are given.
        """
        if len(args) > 0 and len(kwargs) > 0:
            raise RuntimeError(
                "Not allowed usage of the predict method. "
                "Either the positional or the keyword arguments must be given."
            )
        if len(args) > 0:
            return self.core_inference_learner(*args)
        # Keyword inputs are reordered following the stored input names.
        inputs = (kwargs.pop(name) for name in self.input_names)
        outputs = self.core_inference_learner(*inputs)

        if self.output_type is tuple:
            return outputs
        else:
            return restructure_output(
                outputs, self.output_structure, self.output_type
            )

    def _get_extra_metadata_kwargs(self) -> Dict:
        """Build the wrapper-specific entries stored in the metadata.

        The output type, when set, is stored as its name plus the name of
        the module defining it so that it can be re-imported at load time.
        """
        metadata_kwargs = {
            "output_structure": self.output_structure,
            "output_structure_keys": list(self.output_structure.keys()),
            "input_names": self.input_names,
        }
        if self.output_type is not None:
            metadata_kwargs.update(
                {
                    "output_type": self.output_type.__name__,
                    "output_type_module": self.output_type.__module__,
                }
            )
        return metadata_kwargs

    @staticmethod
    def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict:
        """Rebuild the constructor inputs of the wrapper from the metadata.

        Args:
            metadata (LearnerMetadata): Metadata holding the entries produced
                by ``_get_extra_metadata_kwargs``.

        Returns:
            Dict: Dictionary with ``output_structure``, ``input_names`` and,
                when stored, the re-imported ``output_type``.

        Raises:
            ImportError: If the stored output type cannot be found in its
                module.
        """
        # Imported locally so the block does not depend on file-level imports.
        import importlib

        # we need to guarantee the preservation of the output structure
        # elements order.
        output_structure = OrderedDict()
        for key in metadata["output_structure_keys"]:
            output_structure[key] = metadata["output_structure"][key]

        inputs = {
            "output_structure": output_structure,
            "input_names": metadata["input_names"],
        }
        output_type_name = metadata["output_type"]
        if output_type_name is not None:
            # Resolve the type through importlib instead of exec/eval: the
            # metadata strings are never executed as code, and the lookup
            # does not rely on exec() mutating the function locals.
            module_name = metadata["output_type_module"]
            module = importlib.import_module(module_name)
            try:
                inputs["output_type"] = getattr(module, output_type_name)
            except AttributeError as err:
                # Keep the exception type raised by a failing
                # "from module import name" statement.
                raise ImportError(
                    f"cannot import name {output_type_name!r} "
                    f"from {module_name!r}"
                ) from err
        return inputs


class DiffusionInferenceLearner(BaseInferenceLearner, ABC):
    """Inference learner wrapping a diffusers pipeline.

    Saving, naming and GPU memory handling are delegated to the object
    stored in ``pipeline.unet.model``.

    Attributes:
        pipeline (StableDiffusionPipeline): The wrapped pipeline.
    """

    @property
    def name(self) -> str:
        """Name of the learner stored in ``pipeline.unet.model``."""
        return self.pipeline.unet.model.name

    def __init__(self, pipeline: StableDiffusionPipeline):
        self.pipeline = pipeline

    def __call__(self, *args, **kwargs):
        """Call the wrapped pipeline and return its result."""
        return self.pipeline(*args, **kwargs)

    def run(self, *args, **kwargs) -> Any:
        """Call the wrapped pipeline and return its result."""
        # The result was previously discarded, so run() always returned
        # None while __call__ returned the pipeline output.
        return self.pipeline(*args, **kwargs)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the learner stored in ``pipeline.unet.model``.

        Args:
            path (Path or str): Path forwarded to the inner ``save`` method.
            kwargs (Dict): Accepted for interface compatibility; not used.
        """
        self.pipeline.unet.model.save(path)

    @classmethod
    def load(
        cls,
        path: Union[Path, str],
        **kwargs,
    ):
        """Load the optimized model and plug it into the given pipeline.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Must contain ``pipe``, the pipeline passed to
                ``postprocess_diffusers`` together with the loaded model.

        Returns:
            The value returned by ``postprocess_diffusers``.

        Raises:
            TypeError: If ``pipe`` is not given.
        """
        try:
            pipe = kwargs["pipe"]
        except KeyError:
            raise TypeError("Missing required argument 'pipe'")
        optimized_model = LearnerMetadata.read(path).load_model(path)
        return postprocess_diffusers(
            optimized_model,
            pipe,
            optimized_model.device,
        )

    def get_size(self):
        """Return the size of the optimized unet plus the other modules.

        The sizes of all the ``torch.nn.Module`` attributes of the pipeline
        other than ``unet`` are summed and divided by 1e6 before being added
        to the size reported by ``pipeline.unet.model``.
        """
        # NOTE(review): only the second term is divided by 1e6 — confirm
        # that both terms are expressed in the same unit.
        other_modules_size = sum(
            get_torch_model_size(module)
            for attr_name, module in self.pipeline.__dict__.items()
            if isinstance(module, torch.nn.Module) and attr_name != "unet"
        )
        # The value was previously computed but never returned.
        return self.pipeline.unet.model.get_size() + other_modules_size / 1e6

    def free_gpu_memory(self):
        """Delegate to the learner stored in ``pipeline.unet.model``."""
        # The call used to be the operand of a ``raise`` statement, which
        # turned every successful call into a TypeError.
        self.pipeline.unet.model.free_gpu_memory()

    def get_inputs_example(self):
        raise NotImplementedError()

    @property
    def output_format(self):
        return ".pt"

    @property
    def input_format(self):
        return ".pt"

    def list2tensor(self, listified_tensor: List) -> Any:
        raise NotImplementedError()


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/neural_compressor.py
================================================
from abc import ABC
from pathlib import Path
from typing import Union, Tuple, Dict, Type

from loguru import logger

from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
    LearnerMetadata,
    PytorchBaseInferenceLearner,
)
from nebullvm.optional_modules.neural_compressor import (
    cfgs_to_fx_cfgs,
    cfg_to_qconfig,
)
from nebullvm.optional_modules.torch import (
    torch,
    prepare_fx,
    convert_fx,
    Module,
    GraphModule,
)
from nebullvm.tools.pytorch import (
    save_with_torch_fx,
    load_with_torch_fx,
    create_model_inputs_torch,
    get_torch_model_size,
)
from nebullvm.tools.transformations import MultiStageTransformation
from nebullvm.tools.utils import check_module_version


class NeuralCompressorInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized on CPU using IntelNeuralCompressor.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        model (torch.fx.GraphModule): Torch fx graph model.
        model_quant (torch.fx.GraphModule): Quantized version of the model.
    """

    name = "IntelNeuralCompressor"

    def __init__(
        self,
        model: Union[Module, GraphModule],
        model_quant: GraphModule,
        device: Device,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.model = model
        self.model_quant = model_quant
        self.device = device

    def get_size(self):
        """Return the sum of the sizes of the quantized and original model."""
        return get_torch_model_size(self.model_quant) + get_torch_model_size(
            self.model
        )

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        The original model is stored in the ``model_orig`` sub-directory and
        the quantized one in ``model_quant``.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)

        path_orig_model = Path(path) / Path("model_orig")
        path_quant_model = Path(path) / Path("model_quant")

        save_with_torch_fx(self.model, path_orig_model)
        self.model_quant.save(str(path_quant_model))

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        The original model is loaded from disk, prepared and converted with
        the quantization configuration stored next to the quantized weights,
        and finally the quantized state dict is loaded into it.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners.

        Returns:
            NeuralCompressorInferenceLearner: The optimized model.
        """
        if len(kwargs) > 0:
            logger.warning(
                f"No extra keywords expected for the load method. "
                f"Got {kwargs}."
            )

        metadata = LearnerMetadata.read(path)
        input_tfms = metadata.input_tfms
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )

        network_parameters = ModelParams(**metadata.network_parameters)

        path_orig_model = Path(path) / "model_orig"
        path_quant_model = Path(path) / "model_quant" / "best_model.pt"

        model = load_with_torch_fx(path_orig_model, "state_dict.pt").eval()
        state_dict = torch.load(path_quant_model)

        # The tuning configuration is stored inside the state dict and must
        # be removed from it before the weights are loaded.
        tune_cfg = state_dict.pop("best_configure")
        op_cfgs = cfg_to_qconfig(tune_cfg, tune_cfg["approach"])
        fx_op_cfgs = cfgs_to_fx_cfgs(op_cfgs, tune_cfg["approach"])

        # Example inputs are passed to prepare_fx only for torch >= 1.13.0.
        additional_arguments = {}
        if check_module_version(torch, min_version="1.13.0"):
            additional_arguments["example_inputs"] = tuple(
                create_model_inputs_torch(
                    input_infos=network_parameters.input_infos,
                )
            )

        q_model = prepare_fx(
            model,
            fx_op_cfgs,
            **additional_arguments,
        )
        q_model = convert_fx(q_model)

        q_model.load_state_dict(state_dict)
        device = Device.from_str(metadata.device)

        return cls(
            model=model,
            model_quant=q_model,
            device=device,
            input_tfms=input_tfms,
            # Reuse the parameters parsed above instead of rebuilding them.
            network_parameters=network_parameters,
        )


class PytorchNeuralCompressorInferenceLearner(
    NeuralCompressorInferenceLearner, PytorchBaseInferenceLearner
):
    """Model optimized on CPU using IntelNeuralCompressor.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        model (torch.fx.GraphModule): Torch fx graph model.
    """

    def free_gpu_memory(self):
        raise NotImplementedError(
            "NeuralCompressor does not support GPU inference."
        )

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:
        """Predict on the input tensors.

        The tensors are moved to CPU and fed to the quantized model. A
        single tensor output is wrapped in a one-element tuple.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors does
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        cpu_inputs = [tensor.cpu() for tensor in input_tensors]
        result = self.model_quant(*cpu_inputs)
        return (result,) if isinstance(result, torch.Tensor) else result


# Registry mapping a source deep learning framework to the NeuralCompressor
# learner class handling it. Only a PyTorch entry is registered here.
NEURAL_COMPRESSOR_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[NeuralCompressorInferenceLearner]
] = {DeepLearningFramework.PYTORCH: PytorchNeuralCompressorInferenceLearner}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/onnx.py
================================================
import multiprocessing
import os
import shutil
from abc import ABC
from pathlib import Path
from typing import Union, List, Generator, Tuple, Dict, Type

import cpuinfo
import numpy as np
from loguru import logger

from nebullvm.config import (
    ONNX_FILENAMES,
    ONNX_PROVIDERS,
)
from nebullvm.core.models import (
    QuantizationType,
    Device,
    DeviceType,
    ModelParams,
    DeepLearningFramework,
)
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
    LearnerMetadata,
    PytorchBaseInferenceLearner,
    TensorflowBaseInferenceLearner,
    NumpyBaseInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import (
    tensorrt_is_available,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.onnxruntime import onnxruntime as ort
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.transformations import MultiStageTransformation


def _running_on_intel_cpu(use_gpu):
    if use_gpu:
        return False  # running on GPU
    cpu_info = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" in cpu_info:
        return True
    return False


def _get_ort_session_options(use_gpu) -> ort.SessionOptions:
    """Build the onnxruntime session options.

    All graph optimizations are always enabled. When not running on GPU the
    parallel execution mode is selected, one inter-op thread is used and the
    number of intra-op threads is read from the
    ``NEBULLVM_THREADS_PER_MODEL`` environment variable, falling back to the
    CPU count when the variable is unset or empty (never less than 1).

    Args:
        use_gpu: Truthy when the session will run on GPU.

    Returns:
        ort.SessionOptions: The configured session options.
    """
    options = ort.SessionOptions()
    optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.graph_optimization_level = optimization_level
    if use_gpu:
        return options

    options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
    options.inter_op_num_threads = 1
    thread_setting = os.environ.get("NEBULLVM_THREADS_PER_MODEL")
    if not thread_setting:
        thread_setting = multiprocessing.cpu_count()
    options.intra_op_num_threads = max(int(thread_setting), 1)
    return options


class ONNXInferenceLearner(BaseInferenceLearner, ABC):
    """Model converted to ONNX and run with Microsoft's onnxruntime.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
        device (Device): Device the onnxruntime session is created for.
        quantization_type (QuantizationType): Quantization applied to the
            model, if any. It selects the precision flags of the TensorRT
            execution provider.
    """

    name = "ONNXRuntime"

    def __init__(
        self,
        onnx_path: Union[str, Path],
        input_names: List[str],
        output_names: List[str],
        device: Device,
        quantization_type: QuantizationType,
        **kwargs,
    ):
        """Store the model files and create the onnxruntime session.

        Args:
            onnx_path (str or Path): Path to the onnx model.
            input_names (List[str]): Names of the model inputs.
            output_names (List[str]): Names of the model outputs.
            device (Device): Device used to run the model.
            quantization_type (QuantizationType): Quantization applied to
                the model, None if the model is not quantized.
            kwargs (Dict): Arguments forwarded to the base learner.
        """
        super().__init__(**kwargs)
        filename = Path(onnx_path).name
        dir_path = str(Path(onnx_path).parent)
        self.device = device
        use_gpu = self.device.type is DeviceType.GPU

        # _store_dir comes from the base class; presumably it moves the
        # model directory into storage owned by the learner (verify there).
        self.onnx_path = Path(self._store_dir(dir_path)) / filename
        self.sess_options = _get_ort_session_options(use_gpu)
        self.quantization_type = quantization_type

        if _running_on_intel_cpu(use_gpu):
            self.sess_options.add_session_config_entry(
                "session.set_denormal_as_zero", "1"
            )

        # Despite its name this creates the session for CPU devices too.
        self.set_model_on_gpu()

        self._is_gpu_ready = use_gpu
        self.input_names = input_names
        self.output_names = output_names

    @staticmethod
    def _provider_index(providers, provider_name):
        """Return the position of a provider inside a providers list.

        Entries of the list can be either plain provider names or
        ``(name, options)`` tuples.

        Args:
            providers (List): onnxruntime providers list.
            provider_name (str): Name of the provider to look for.

        Returns:
            Optional[int]: Index of the provider, None when not listed.
        """
        for idx, provider in enumerate(providers):
            current = provider[0] if isinstance(provider, tuple) else provider
            if current == provider_name:
                return idx
        return None

    @staticmethod
    def _setup_tensorrt(quantization_type: QuantizationType, device: Device):
        """Configure (or drop) the TensorRT provider in ONNX_PROVIDERS.

        When TensorRT is installed and reachable through LD_LIBRARY_PATH
        the TensorRT entry of the shared ``ONNX_PROVIDERS["cuda"]`` list is
        replaced with a configured ``(name, options)`` tuple. Otherwise a
        warning is logged and the entry is removed from the list, so that
        the CUDA provider is used.

        Args:
            quantization_type (QuantizationType): Quantization applied to
                the model. Any value enables fp16, STATIC also enables
                int8.
            device (Device): GPU device used to run the model.
        """
        trt_name = "TensorrtExecutionProvider"
        providers = ONNX_PROVIDERS["cuda"]
        # The entry is looked up by name: it may be a plain string or a
        # tuple configured by a previous call.
        trt_idx = ONNXInferenceLearner._provider_index(providers, trt_name)
        trt_installed = tensorrt_is_available()

        if trt_installed and "tensorrt" in os.environ.get(
            "LD_LIBRARY_PATH", ""
        ):
            trt_provider = (
                trt_name,
                {
                    "device_id": device.idx,
                    "trt_max_workspace_size": device.get_free_memory(),
                    "trt_fp16_enable": quantization_type is not None,
                    "trt_int8_enable": (
                        quantization_type is QuantizationType.STATIC
                    ),
                },
            )
            if trt_idx is None:
                # TensorRT must come first to take priority over CUDA.
                providers.insert(0, trt_provider)
            else:
                providers[trt_idx] = trt_provider
            return

        if trt_installed:
            logger.warning(
                "TensorrtExecutionProvider for onnx is not "
                "available. If you want to use it, please "
                "add the path to tensorrt to the "
                "LD_LIBRARY_PATH environment variable. "
                "CUDA provider will be used instead. "
            )
        else:
            logger.warning(
                "TensorRT is not available. "
                "If you want to use it, please install it and "
                "add the path to the LD_LIBRARY_PATH "
                "environment variable. "
                "CUDA provider will be used instead. "
            )
        if trt_idx is not None:
            del providers[trt_idx]

    def get_size(self):
        """Return the total size in bytes of the files stored next to the
        onnx model (sub-directories are ignored)."""
        model_dir = self.onnx_path.parents[0]
        return sum(
            os.path.getsize(model_dir / f)
            for f in os.listdir(model_dir)
            if os.path.isfile(model_dir / f)
        )

    def free_gpu_memory(self):
        """Drop the onnxruntime session and mark the model as not ready.

        The GPU learners re-create the session on the next ``run`` call.
        """
        del self._session
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """(Re)create the onnxruntime session for ``self.device``.

        For GPU devices the shared ``ONNX_PROVIDERS["cuda"]`` list is
        updated so that the CUDA (and, when still listed, TensorRT)
        providers target ``self.device.idx``. For any other device the
        CPU providers are used.
        """
        if self.device.type is DeviceType.GPU:
            providers = ONNX_PROVIDERS["cuda"]
            # Providers are located by name and not by position: once the
            # TensorRT entry has been removed from the shared list the CUDA
            # entry shifts, and its device id must still be refreshed for
            # learners living on a different GPU.
            cuda_idx = self._provider_index(providers, "CUDAExecutionProvider")
            if cuda_idx is not None:
                providers[cuda_idx] = (
                    "CUDAExecutionProvider",
                    {
                        "device_id": self.device.idx,
                    },
                )
            trt_idx = self._provider_index(
                providers, "TensorrtExecutionProvider"
            )
            if trt_idx is not None:
                self._setup_tensorrt(self.quantization_type, self.device)
        else:
            providers = ONNX_PROVIDERS["cpu"]

        ort_session = ort.InferenceSession(
            str(self.onnx_path),
            sess_options=self.sess_options,
            providers=providers,
        )
        self._session = ort_session
        self._is_gpu_ready = True

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored. Missing parent directories are created.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        metadata = LearnerMetadata.from_model(
            self,
            input_names=self.input_names,
            output_names=self.output_names,
            **kwargs,
        )

        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        metadata.save(path)

        saved_model = path / ONNX_FILENAMES["model_name"]
        shutil.copy(self.onnx_path, saved_model)

        try:
            # Tries to load the model
            onnx.load(str(saved_model))
        except FileNotFoundError:
            # If missing files, it means it's saved in onnx external_data
            # format: copy the side files stored next to the model too.
            src_dir = Path(self.onnx_path).parent
            for fname in os.listdir(src_dir):
                src_file = src_dir / fname
                # Sub-directories cannot be copied with copy2, skip them.
                if ".onnx" not in fname and src_file.is_file():
                    shutil.copy2(src_file, path / fname)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners.

        Returns:
            ONNXInferenceLearner: The optimized model.
        """
        if kwargs:
            logger.warning(
                f"No extra keywords expected for the load method. "
                f"Got {kwargs}."
            )
        path = Path(path)
        onnx_path = path / ONNX_FILENAMES["model_name"]
        metadata = LearnerMetadata.read(path)
        device = Device.from_str(metadata.device)

        # The metadata may not carry the quantization type, or may carry a
        # null one: in both cases the model is considered not quantized.
        stored_quantization = getattr(metadata, "quantization_type", None)
        quantization_type = (
            QuantizationType(stored_quantization)
            if stored_quantization is not None
            else None
        )

        input_tfms = metadata.input_tfms
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        return cls(
            input_tfms=input_tfms,
            network_parameters=ModelParams(**metadata.network_parameters),
            onnx_path=onnx_path,
            input_names=metadata["input_names"],
            output_names=metadata["output_names"],
            device=device,
            quantization_type=quantization_type,
        )

    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):
        """Run the session on the given arrays.

        Args:
            input_arrays (Generator[np.ndarray]): Input arrays, paired
                positionally with ``self.input_names``.

        Returns:
            The values produced by the session for ``self.output_names``.
        """
        input_dict = dict(zip(self.input_names, input_arrays))
        return self._session.run(self.output_names, input_dict)


class PytorchONNXInferenceLearner(
    ONNXInferenceLearner, PytorchBaseInferenceLearner
):
    """Model run with Microsoft's onnxruntime using a Pytorch interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor, ...]: Output tensors, moved to the learner
                device. Note that the output tensors do not correspond to
                the prediction on the input tensors with a 1 to 1 mapping.
                In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        # The session may have been dropped by free_gpu_memory.
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        # onnxruntime is fed with numpy arrays living in host memory.
        input_arrays = (
            input_tensor.cpu().detach().numpy()
            for input_tensor in input_tensors
        )
        outputs = self._predict_arrays(input_arrays)
        return tuple(
            torch.from_numpy(output).to(self.device.to_torch_format())
            for output in outputs
        )


class TensorflowONNXInferenceLearner(
    ONNXInferenceLearner, TensorflowBaseInferenceLearner
):
    """ONNX model executed by onnxruntime behind a tensorflow interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Compute the model outputs for a batch of inputs.

        All the inputs must belong to the same batch. Passing several
        tensors to a model expecting a single input tensor (with batch
        size >= 1) results in an error.

        Args:
            input_tensors (Tuple[Tensor]): Inputs of the model, each one
                shaped as (batch_size, dim1, dim2, ...). Numpy arrays are
                accepted as well and used as they are.

        Returns:
            Tuple[Tensor]: The outputs of the model. They are the multiple
                outputs produced for the given (multi-) tensor input, not
                one prediction per input tensor.
        """
        session_missing = (
            self.device.type is DeviceType.GPU and not self._is_gpu_ready
        )
        if session_missing:
            self.set_model_on_gpu()

        def _to_numpy(tensor):
            # Arrays are forwarded untouched, tensors are converted.
            if isinstance(tensor, np.ndarray):
                return tensor
            return tensor.numpy()

        predictions = self._predict_arrays(
            _to_numpy(tensor) for tensor in input_tensors
        )
        # noinspection PyTypeChecker
        return tuple(
            tf.convert_to_tensor(prediction) for prediction in predictions
        )


class NumpyONNXInferenceLearner(
    ONNXInferenceLearner, NumpyBaseInferenceLearner
):
    """ONNX model executed by onnxruntime behind a numpy interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Compute the model outputs for a batch of inputs.

        All the inputs must belong to the same batch. Passing several
        arrays to a model expecting a single input (with batch
        size >= 1) results in an error.

        Args:
            input_tensors (Tuple[np.ndarray, ...]): Inputs of the model,
                each one shaped as (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[np.ndarray, ...]: The outputs of the model. They are the
                multiple outputs produced for the given (multi-) array
                input, not one prediction per input array.
        """
        session_missing = (
            self.device.type is DeviceType.GPU and not self._is_gpu_ready
        )
        if session_missing:
            self.set_model_on_gpu()
        # The arrays are consumed lazily while being paired with the names.
        predictions = self._predict_arrays(iter(input_tensors))
        return tuple(predictions)


# Maps each supported deep learning framework to the onnxruntime learner
# class exposing the interface of that framework.
ONNX_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[ONNXInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchONNXInferenceLearner,
    DeepLearningFramework.TENSORFLOW: TensorflowONNXInferenceLearner,
    DeepLearningFramework.NUMPY: NumpyONNXInferenceLearner,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/openvino.py
================================================
import json
import shutil
from abc import ABC
from pathlib import Path
from typing import Dict, Union, Type, Generator, Tuple, List, Optional

import numpy as np
from loguru import logger

from nebullvm.config import OPENVINO_FILENAMES
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
    LearnerMetadata,
    PytorchBaseInferenceLearner,
    TensorflowBaseInferenceLearner,
    NumpyBaseInferenceLearner,
)
from nebullvm.optional_modules.openvino import (
    Core,
    Model,
    CompiledModel,
    InferRequest,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class OpenVinoInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized using OpenVINO.

    The class cannot be directly instantiated, but implements all the core
    methods needed for using OpenVINO at inference time.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        compiled_model (CompiledModel): The OpenVINO compiled model.
        infer_request (InferRequest): The inference request created from
            the compiled model and used to run the predictions.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
        device (Device): Device associated to the learner.
    """

    MODEL_NAME = "model.bin"
    name = "OpenVINO"

    def __init__(
        self,
        compiled_model: CompiledModel,
        infer_request: InferRequest,
        input_keys: List,
        output_keys: List,
        description_file: str,
        weights_file: str,
        device: Device,
        **kwargs,
    ):
        """Store the compiled model together with its description files.

        Args:
            compiled_model (CompiledModel): The OpenVINO compiled model.
            infer_request (InferRequest): Inference request used to run
                the predictions.
            input_keys (List): Keys associated to the inputs.
            output_keys (List): Keys associated to the outputs.
            description_file (str): File containing a description of the
                optimized model.
            weights_file (str): File containing the model weights.
            device (Device): Device associated to the learner.
            kwargs (Dict): Arguments forwarded to the base learner.
        """
        super().__init__(**kwargs)
        self.compiled_model = compiled_model
        self.infer_request = infer_request
        self.input_keys = input_keys
        self.output_keys = output_keys
        self.device = device
        # _store_file comes from the base class; presumably it copies the
        # file into storage owned by the learner (verify there).
        self.description_file = self._store_file(description_file)
        self.weights_file = self._store_file(weights_file)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for the
                `from_model_name` class method. They override the values
                read from the metadata file.

        Returns:
            OpenVinoInferenceLearner: The optimized model.
        """
        path = Path(path)

        with open(path / OPENVINO_FILENAMES["metadata"], "r") as fin:
            metadata = json.load(fin)
        metadata.update(kwargs)
        metadata["network_parameters"] = ModelParams(
            **metadata["network_parameters"]
        )
        input_tfms = metadata.get("input_tfms")
        if input_tfms is not None:
            metadata["input_tfms"] = MultiStageTransformation.from_dict(
                input_tfms
            )

        model_name = str(path / OPENVINO_FILENAMES["description_file"])
        model_weights = str(path / OPENVINO_FILENAMES["weights"])
        metadata["device"] = Device.from_str(metadata["device"])
        return cls.from_model_name(
            model_name=model_name, model_weights=model_weights, **metadata
        )

    def get_size(self):
        """Return the length of the exported compiled model."""
        return len(self.compiled_model.export_model())

    def free_gpu_memory(self):
        """Not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("OpenVino does not support GPU inference.")

    @classmethod
    def from_model_name(
        cls,
        network_parameters: ModelParams,
        model_name: str,
        model_weights: str,
        device: Device,
        input_tfms: MultiStageTransformation = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Build the optimized model from the network description and its
        weights.

        Args:
            network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
            model_name (str): File containing a description of the optimized
                model.
            model_weights (str): File containing the model weights.
            device (Device): Device used to run the model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            input_data (DataManager, optional): User defined data.
            kwargs (Dict): Unexpected extra parameters. They are reported
                with a warning and otherwise ignored.

        Returns:
            OpenVinoInferenceLearner: The optimized model.
        """
        if kwargs:
            logger.warning(f"Found extra parameters: {kwargs}")

        core = Core()
        model = core.read_model(model=model_name, weights=model_weights)

        dynamic_shape = cls._get_dynamic_shape(model, network_parameters)

        if dynamic_shape is not None:
            model.reshape(dynamic_shape)

        # The model is always compiled for the CPU device.
        compiled_model = core.compile_model(model=model, device_name="CPU")
        infer_request = compiled_model.create_infer_request()

        input_keys = [
            model_input.get_any_name()
            for model_input in compiled_model.inputs
        ]
        output_keys = [
            model_output.get_any_name()
            for model_output in compiled_model.outputs
        ]

        return cls(
            compiled_model,
            infer_request,
            input_keys,
            output_keys,
            input_tfms=input_tfms,
            network_parameters=network_parameters,
            description_file=model_name,
            weights_file=model_weights,
            input_data=input_data,
            device=device,
        )

    @staticmethod
    def _get_dynamic_shape(
        model: Model, network_parameters: ModelParams
    ) -> Optional[Dict[str, Tuple[int, ...]]]:
        """Compute the shapes to be used for reshaping a dynamic model.

        Args:
            model (Model): The OpenVINO model.
            network_parameters (ModelParams): The model parameters. Its
                ``dynamic_info`` tells, for each input, which axes are
                dynamic.

        Returns:
            Optional[Dict[str, Tuple[int, ...]]]: None when the model has
                no dynamic info. Otherwise a dictionary mapping a name of
                each model input to its shape, where the dynamic axes are
                set to -1.

        Raises:
            AssertionError: If the number of inputs described in the
                dynamic info differs from the number of input infos.
        """
        dynamic_info = network_parameters.dynamic_info
        if dynamic_info is None:
            return None

        input_names = [
            list(model_input.names)[0] for model_input in model.inputs
        ]
        input_shapes = [
            input_info.size for input_info in network_parameters.input_infos
        ]

        assert len(input_shapes) == len(dynamic_info.inputs), (
            f"Number of inputs defined in dynamic info "
            f"({len(dynamic_info.inputs)}) is different from the one "
            f"expected from the model "
            f"({len(input_shapes)})."
        )

        dynamic_shapes = []
        for input_shape, dynamic_axes in zip(
            input_shapes, dynamic_info.inputs
        ):
            shape = list(input_shape)
            # The keys of the dictionary are the indexes of the dynamic axes.
            for axis in dynamic_axes:
                shape[int(axis)] = -1
            dynamic_shapes.append(tuple(shape))

        return dict(zip(input_names, dynamic_shapes))

    def _get_metadata(self, **kwargs) -> LearnerMetadata:
        """Build the learner metadata, enriched with the given extra
        key-value pairs."""
        return LearnerMetadata.from_model(self, **kwargs)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored. Missing parent directories are created.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        metadata = self._get_metadata(**kwargs)

        metadata.save(path)

        shutil.copy(
            self.description_file,
            path / OPENVINO_FILENAMES["description_file"],
        )
        shutil.copy(self.weights_file, path / OPENVINO_FILENAMES["weights"])

    def _predict_array(
        self,
        input_arrays: Generator[np.ndarray, None, None],
    ) -> Generator[np.ndarray, None, None]:
        """Run the inference request on the given arrays.

        Args:
            input_arrays (Generator[np.ndarray]): Input arrays, paired
                positionally with ``self.input_keys``.

        Returns:
            Generator[np.ndarray]: The output arrays, ordered as
                ``self.output_keys``.
        """
        inputs = dict(zip(self.input_keys, input_arrays))
        raw_results = self.infer_request.infer(inputs=inputs)
        # The results are keyed by output objects: index them by name in
        # order to look them up through the stored output keys.
        results_by_name = {
            model_output.get_any_name(): output_arr
            for model_output, output_arr in raw_results.items()
        }

        return (results_by_name[output_key] for output_key in self.output_keys)


class PytorchOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, PytorchBaseInferenceLearner
):
    """Model optimized using OpenVINO with a Pytorch interface.

    At prediction time it takes as input pytorch tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Compute the model outputs for a batch of inputs.

        All the inputs must belong to the same batch. Passing several
        tensors to a model expecting a single input tensor (with batch
        size >= 1) results in an error.

        Args:
            input_tensors (Tuple[Tensor]): Inputs of the model, each one
                shaped as (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: The outputs of the model. They are the multiple
                outputs produced for the given (multi-) tensor input, not
                one prediction per input tensor.
        """
        # The tensors are brought to host memory and converted lazily.
        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        predictions = self._predict_array(numpy_inputs)
        output_tensors = []
        for prediction in predictions:
            output_tensors.append(torch.from_numpy(prediction))
        return tuple(output_tensors)


class TensorflowOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, TensorflowBaseInferenceLearner
):
    """Model optimized using OpenVINO with a tensorflow interface.

    At prediction time it takes as input tensorflow tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...). Numpy arrays are accepted as
                well and used as they are.

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors do
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        # Numpy arrays have no ``numpy`` method: forward them untouched, as
        # the onnxruntime tensorflow learner does.
        input_arrays = (
            input_tensor.numpy()
            if not isinstance(input_tensor, np.ndarray)
            else input_tensor
            for input_tensor in input_tensors
        )
        output_arrays = self._predict_array(input_arrays)
        # noinspection PyTypeChecker
        return tuple(
            tf.convert_to_tensor(output_array)
            for output_array in output_arrays
        )


class NumpyOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, NumpyBaseInferenceLearner
):
    """Model optimized using OpenVINO with a numpy interface.

    At prediction time it takes as input numpy arrays given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Compute the model outputs for a batch of inputs.

        All the inputs must belong to the same batch. Passing several
        arrays to a model expecting a single input (with batch
        size >= 1) results in an error.

        Args:
            input_tensors (Tuple[np.ndarray]): Inputs of the model, each
                one shaped as (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[np.ndarray]: The outputs of the model. They are the
                multiple outputs produced for the given (multi-) array
                input, not one prediction per input array.
        """
        # The arrays are consumed lazily while being paired with the keys.
        predictions = self._predict_array(iter(input_tensors))
        return tuple(predictions)


# Maps each supported deep learning framework to the OpenVINO learner
# class exposing the interface of that framework.
OPENVINO_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[OpenVinoInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchOpenVinoInferenceLearner,
    DeepLearningFramework.TENSORFLOW: TensorflowOpenVinoInferenceLearner,
    DeepLearningFramework.NUMPY: NumpyOpenVinoInferenceLearner,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tensor_rt.py
================================================
import json
import os
from abc import ABC
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Union, Dict, Type, List, Tuple, Generator, Optional

import numpy as np
from loguru import logger

from nebullvm.config import NVIDIA_FILENAMES
from nebullvm.core.models import (
    Device,
    DeviceType,
    ModelParams,
    DeepLearningFramework,
)
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
    LearnerMetadata,
    PytorchBaseInferenceLearner,
    TensorflowBaseInferenceLearner,
    NumpyBaseInferenceLearner,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.tensor_rt import tensorrt as trt, polygraphy
from nebullvm.optional_modules.torch import torch, ScriptModule
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    VerifyContiguity,
)


class ONNXTensorRTInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized using TensorRT.

    The class cannot be directly instantiated, but implements all the core
    methods needed for using TensorRT at inference time. Subclasses must
    provide `_synchronize_stream`, `stream_ptr` and
    `_get_default_cuda_stream`, which raise NotImplementedError here.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service
    """

    name = "TensorRT"

    def __init__(
        self,
        engine: Any,
        input_names: List[str],
        output_names: List[str],
        device: Device,
        cuda_stream: Any = None,
        nvidia_logger: Any = None,
        **kwargs,
    ):
        """Store the engine and create its execution context.

        Args:
            engine (any): The tensorRT engine.
            input_names (List[str]): Names associated to the model input
                tensors.
            output_names (List[str]): Names associated to the model output
                tensors.
            device (Device): Device where the model will be run.
            cuda_stream (any, optional): Stream used for communication with
                Nvidia GPUs. When None a default one is requested from
                `_get_default_cuda_stream`.
            nvidia_logger (any, optional): Logger used by the Nvidia service.
                When None a `trt.Logger` at WARNING level is created.
            kwargs (Dict): Forwarded to the parent class constructor.

        Raises:
            SystemError: If `device` is not a GPU device.
        """
        super().__init__(**kwargs)
        self.engine = engine
        self.context = self.engine.create_execution_context()
        self.input_names = input_names
        self.output_names = output_names
        self.cuda_stream = cuda_stream
        self.nvidia_logger = nvidia_logger
        # Filled by subclasses that keep their output buffers between calls.
        self.output_tensors = None
        self.device = device
        # Validates the device and fills in the default logger/stream.
        self._set_cuda_env(device.type is DeviceType.GPU)

    def _get_metadata(self, **kwargs) -> LearnerMetadata:
        """Build the learner metadata.

        The metadata contains the input and output names, updated with the
        extra key-value pairs given in `kwargs`.
        """
        metadata = {
            key: self.__dict__[key] for key in ("input_names", "output_names")
        }
        metadata.update(kwargs)
        return LearnerMetadata.from_model(self, **metadata)

    def _synchronize_stream(self):
        """Synchronize the cuda stream. Must be implemented by subclasses."""
        raise NotImplementedError()

    @property
    def stream_ptr(self):
        """Stream handle passed to the engine execution call.

        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    @staticmethod
    def _get_default_cuda_stream() -> Any:
        """Return the stream used when none is given by the user.

        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    @staticmethod
    def check_env(use_gpu):
        """Raise a SystemError if `use_gpu` is falsy."""
        if not use_gpu:
            raise SystemError(
                "You are trying to run an optimizer developed for NVidia gpus "
                "on a machine not connected to any GPU supporting CUDA."
            )

    def _set_cuda_env(self, use_gpu):
        """Check the environment and set the default logger and stream.

        The logger and the stream are created only when they have not been
        provided to the constructor.
        """
        self.check_env(use_gpu)
        if self.nvidia_logger is None:
            self.nvidia_logger = trt.Logger(trt.Logger.WARNING)
        if self.cuda_stream is None:
            self.cuda_stream = self._get_default_cuda_stream()

    @classmethod
    def from_engine_path(
        cls,
        network_parameters: ModelParams,
        engine_path: Union[str, Path],
        input_names: List[str],
        output_names: List[str],
        device: Device,
        nvidia_logger: Any = None,
        cuda_stream: Any = None,
        input_tfms: MultiStageTransformation = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Build the model from the serialised engine.

        Args:
            network_parameters (ModelParams): Model parameters.
            engine_path (str or Path): Path to the serialised engine. The
                serialised engine is the serialised version of the engine
                used for accelerating the inference.
            input_names (List[str]): Names associated to the model input
                tensors.
            output_names (List[str]): Names associated to the model output
                tensors.
            device: (Device): Device where the model will be run.
            cuda_stream (any, optional): Stream used for communication with
                Nvidia GPUs.
            nvidia_logger (any, optional): Logger used by the Nvidia service
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. A `VerifyContiguity` step is always
                appended to it.
            input_data (DataManager, optional): User defined data.
            kwargs (Dict): Extra keywords. They are only reported with a
                warning and are not forwarded to the constructor.

        Returns:
            NvidiaInferenceLearner: The optimized model.
        """
        if kwargs:
            logger.warning(
                f"Debug: Got extra keywords in "
                f"NvidiaInferenceLearner::from_engine_path: {kwargs}"
            )
        if nvidia_logger is None:
            nvidia_logger = trt.Logger(trt.Logger.WARNING)
        if input_tfms is None:
            input_tfms = MultiStageTransformation([])
        input_tfms.append(VerifyContiguity())
        runtime = trt.Runtime(nvidia_logger)
        with open(engine_path, "rb") as f:
            serialized_engine = f.read()
        engine = runtime.deserialize_cuda_engine(serialized_engine)
        return cls(
            input_tfms=input_tfms,
            network_parameters=network_parameters,
            engine=engine,
            input_names=input_names,
            output_names=output_names,
            nvidia_logger=nvidia_logger,
            cuda_stream=cuda_stream,
            input_data=input_data,
            device=device,
        )

    def _predict_tensors(
        self,
        input_ptrs: Generator[Any, None, None],
        output_ptrs: Generator[Any, None, None],
        input_shapes: Generator[Any, None, None] = None,
    ):
        """Run the engine on the given input and output buffers.

        Args:
            input_ptrs (Generator): Buffer handles of the inputs, given in
                the same order as `input_names`.
            output_ptrs (Generator): Buffer handles of the outputs, given in
                the same order as `output_names`.
            input_shapes (optional): Shapes of the inputs. When given, each
                shape is set on the execution context before running. When
                None (or empty) no binding shape is set.
        """
        # One slot per binding; the slot position is given by the engine.
        buffers = [None] * (len(self.input_names) + len(self.output_names))
        input_idxs = (
            self.engine[input_name] for input_name in self.input_names
        )
        output_idxs = (
            self.engine[output_name] for output_name in self.output_names
        )
        input_shapes = input_shapes or [None] * len(self.input_names)
        for input_idx, input_ptr, input_shape in zip(
            input_idxs, input_ptrs, input_shapes
        ):
            buffers[input_idx] = input_ptr
            if input_shape is not None:
                # If the input shape is empty, we set it to (1,) because
                # TensorRT doesn't accept empty shapes.
                if input_shape == torch.Size([]):
                    input_shape = torch.Size((1,))
                self.context.set_binding_shape(input_idx, input_shape)
        for output_idx, output_ptr in zip(output_idxs, output_ptrs):
            buffers[output_idx] = output_ptr
        self.context.execute_async_v2(buffers, self.stream_ptr)
        # Wait for the asynchronous execution before returning to the caller.
        self._synchronize_stream()

    def get_size(self):
        """Return the size in bytes of the serialised engine."""
        return self.engine.serialize().nbytes

    def free_gpu_memory(self):
        """Do nothing, see the comment below."""
        # ONNXtensorrt doesn't need to release gpu memory
        pass

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        The serialised engine and the metadata are written in two files
        whose names are taken from `NVIDIA_FILENAMES`.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        serialized_engine = self.engine.serialize()
        with open(path / NVIDIA_FILENAMES["engine"], "wb") as fout:
            fout.write(serialized_engine)
        metadata = self._get_metadata(**kwargs)
        with open(path / NVIDIA_FILENAMES["metadata"], "w") as fout:
            json.dump(metadata.to_dict(), fout)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for the
                `from_engine_path` class method. They override the values
                read from the metadata file.

        Returns:
            ONNXTensorRTInferenceLearner: The optimized model.
        """
        path = Path(path)
        with open(path / NVIDIA_FILENAMES["metadata"], "r") as fin:
            metadata = json.load(fin)
        metadata.update(kwargs)
        metadata["network_parameters"] = ModelParams(
            **metadata["network_parameters"]
        )
        input_tfms = metadata.get("input_tfms")
        if input_tfms is not None:
            metadata["input_tfms"] = MultiStageTransformation.from_dict(
                input_tfms
            )
        # The device is always a GPU one, whatever the metadata contains.
        metadata["device"] = Device(DeviceType.GPU)
        return cls.from_engine_path(
            engine_path=path / NVIDIA_FILENAMES["engine"],
            **metadata,
        )


class PytorchTensorRTInferenceLearner(PytorchBaseInferenceLearner):
    """Learner wrapping a scripted torch module compiled for TensorRT.

    The module is stored with `torch.jit.save` under `MODEL_NAME` and is
    always reloaded on a GPU device.
    """

    MODEL_NAME = "model_optimized.pt"
    name = "TensorRT"

    def __init__(
        self,
        torch_model: ScriptModule,
        device: Device,
        **kwargs,
    ):
        """Put the model in eval mode and move it to the GPU if requested.

        Args:
            torch_model (ScriptModule): The optimized module.
            device (Device): Device where the model will be run.
            kwargs (Dict): Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        runs_on_gpu = device.type is DeviceType.GPU
        if runs_on_gpu:
            self.model.to(device.to_torch_format())
        self.use_gpu = runs_on_gpu
        self.device = device
        self._is_gpu_ready = runs_on_gpu

    def get_size(self):
        """Return the size in bytes of the files produced by `save`."""
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            tmp_path = Path(tmp_dir)
            total_bytes = 0
            for entry in os.listdir(tmp_path):
                candidate = tmp_path / entry
                if os.path.isfile(candidate):
                    total_bytes += os.path.getsize(candidate)
            return total_bytes

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors, given as positional
                arguments.

        Returns:
            Tuple[Tensor]: Output tensors, moved to the learner device.
        """
        gpu_requested = self.device.type is DeviceType.GPU
        if gpu_requested and not self._is_gpu_ready:
            self.set_model_on_gpu()

        def _prepare(tensor):
            # PyTorch-TensorRT does not support int64
            if tensor.dtype == torch.int64:
                tensor = tensor.to(torch.int32)
            return tensor.to(self.device.to_torch_format())

        # Kept lazy: the tensors are converted when the model is called.
        prepared_inputs = (_prepare(tensor) for tensor in input_tensors)

        with torch.no_grad():
            outputs = self.model(*prepared_inputs)
            if isinstance(outputs, tuple):
                return tuple(
                    output.to(self.device.to_torch_format())
                    for output in outputs
                )
            return (outputs.to(self.device.to_torch_format()),)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the metadata and the scripted model in the given directory.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        target_dir = Path(path)
        target_dir.mkdir(exist_ok=True)
        LearnerMetadata.from_model(self, **kwargs).save(target_dir)
        torch.jit.save(self.model, target_dir / self.MODEL_NAME)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model from the given directory on a GPU device.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Not used.

        Returns:
            PytorchTensorRTInferenceLearner: The optimized model.
        """
        source_dir = Path(path)
        scripted_model = torch.jit.load(source_dir / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(source_dir)
        gpu_device = Device(DeviceType.GPU)
        network_parameters = ModelParams(**metadata.network_parameters)
        if metadata.input_tfms is None:
            input_tfms = None
        else:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        return cls(
            torch_model=scripted_model,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=gpu_device,
        )


class PytorchONNXTensorRTInferenceLearner(
    ONNXTensorRTInferenceLearner, PytorchBaseInferenceLearner
):
    """Model optimized using TensorRT with a Pytorch interface.

    This class can be used exactly in the same way as a pytorch Module object.
    At prediction time it takes as input pytorch tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
    """

    def _synchronize_stream(self):
        """Block until the work queued on the cuda stream is completed."""
        self.cuda_stream.synchronize()

    @staticmethod
    def _get_default_cuda_stream() -> Any:
        """Return the default torch cuda stream."""
        return torch.cuda.default_stream()

    @property
    def stream_ptr(self):
        """Raw handle of the torch cuda stream."""
        return self.cuda_stream.cuda_stream

    def _allocate_output(self, output_size, output_type) -> torch.Tensor:
        """Allocate an uninitialized output buffer on the learner device.

        The buffer is created directly with the final dtype and device,
        instead of being created on the CPU and then copied and cast.
        Note that an empty `output_size` produces a 0-dim tensor holding one
        element.
        """
        return torch.empty(
            tuple(output_size),
            dtype=output_type.to_torch_format(),
            device=self.device.to_torch_format(),
        )

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors does
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
                When the model has no dynamic shapes the same buffers are
                reused (and overwritten) by the following calls.
        """
        torch_device = self.device.to_torch_format()
        input_tensors = [
            input_tensor.to(torch_device) for input_tensor in input_tensors
        ]
        dynamic_info = self.network_parameters.dynamic_info
        if dynamic_info is None:
            # Static shapes: the output buffers are allocated only once.
            if self.output_tensors is None:
                self.output_tensors = [
                    self._allocate_output(output_size, output_type)
                    for output_size, output_type in zip(
                        self.network_parameters.output_sizes,
                        self.network_parameters.output_types,
                    )
                ]
            input_sizes = None
        else:
            # Dynamic shapes: the output sizes depend on the current inputs,
            # so new buffers are needed at each call.
            input_sizes = [
                input_tensor.size() for input_tensor in input_tensors
            ]
            self.output_tensors = [
                self._allocate_output(
                    (
                        x
                        if i not in dynamic_axis.keys()
                        else dynamic_info.retrieve_output_dim(
                            input_sizes, j, i, x
                        )
                        for i, x in enumerate(output_size)
                    ),
                    output_type,
                )
                for j, (output_size, output_type, dynamic_axis) in enumerate(
                    zip(
                        self.network_parameters.output_sizes,
                        self.network_parameters.output_types,
                        dynamic_info.outputs,
                    )
                )
            ]

        input_ptrs = (
            input_tensor.data_ptr() for input_tensor in input_tensors
        )
        output_ptrs = (
            output_tensor.data_ptr() for output_tensor in self.output_tensors
        )
        self._predict_tensors(input_ptrs, output_ptrs, input_sizes)
        # The buffers already live on the learner device.
        return tuple(self.output_tensors)


class BaseArrayONNXTensorRTInferenceLearner(ONNXTensorRTInferenceLearner, ABC):
    """Base Model that can be used for all array-based
    NvidiaInferenceLearners.
    """

    def _synchronize_stream(self):
        """Block until the work queued on the cuda stream is completed."""
        self.cuda_stream.synchronize()

    @staticmethod
    def _get_default_cuda_stream() -> Any:
        """Return a new polygraphy cuda stream."""
        return polygraphy.cuda.Stream()

    @property
    def stream_ptr(self):
        """Raw handle of the polygraphy cuda stream."""
        return self.cuda_stream.ptr

    @staticmethod
    def _convert_to_array_and_free_memory(cuda_array) -> np.ndarray:
        """Copy the device array into a numpy array and free the former."""
        array = cuda_array.numpy()
        cuda_array.free()
        return array

    def _predict_array(
        self,
        cuda_input_arrays: List,
        input_shapes: Optional[List[Tuple[int, ...]]],
    ) -> Generator[np.ndarray, None, None]:
        """Run the engine on device arrays and return the outputs.

        The input arrays are always freed before returning, also when the
        prediction fails. In case of failure the output arrays allocated so
        far are freed as well, so that no device memory is leaked.

        Args:
            cuda_input_arrays (List): Device arrays containing the inputs.
            input_shapes (List[Tuple[int, ...]], optional): Shapes of the
                inputs. They are used for computing the output shapes when
                the model has dynamic shapes.

        Returns:
            Generator[np.ndarray]: Lazy generator of the output arrays. Each
                device output array is freed when its item is consumed.
        """
        dynamic_info = self.network_parameters.dynamic_info
        cuda_output_arrays: List = []
        try:
            if dynamic_info is None:
                for output_size, output_type in zip(
                    self.network_parameters.output_sizes,
                    self.network_parameters.output_types,
                ):
                    cuda_output_arrays.append(
                        polygraphy.cuda.DeviceArray(
                            shape=output_size,
                            dtype=output_type.to_numpy_format(),
                        )
                    )
            else:
                for j, (output_size, output_type, dyn_out_axis) in enumerate(
                    zip(
                        self.network_parameters.output_sizes,
                        self.network_parameters.output_types,
                        dynamic_info.outputs,
                    )
                ):
                    # Dynamic axes are computed from the current inputs.
                    output_shape = tuple(
                        x
                        if i not in dyn_out_axis.keys()
                        else dynamic_info.retrieve_output_dim(
                            input_shapes, j, i, x
                        )
                        for i, x in enumerate(output_size)
                    )
                    cuda_output_arrays.append(
                        polygraphy.cuda.DeviceArray(
                            shape=output_shape,
                            dtype=output_type.to_numpy_format(),
                        )
                    )
            input_ptrs = (cuda_array.ptr for cuda_array in cuda_input_arrays)
            output_ptrs = (
                cuda_array.ptr for cuda_array in cuda_output_arrays
            )
            self._predict_tensors(input_ptrs, output_ptrs, input_shapes)
        except Exception:
            # Nobody will consume the outputs: release them here.
            for cuda_output_array in cuda_output_arrays:
                cuda_output_array.free()
            raise
        finally:
            for cuda_input_array in cuda_input_arrays:
                cuda_input_array.free()
        return (
            self._convert_to_array_and_free_memory(array)
            for array in cuda_output_arrays
        )


class TensorflowONNXTensorRTInferenceLearner(
    BaseArrayONNXTensorRTInferenceLearner, TensorflowBaseInferenceLearner
):
    """Model optimized using TensorRT with a tensorflow interface.

    This class can be used exactly in the same way as a tf.Module or
    keras.Model object.
    At prediction time it takes as input tensorflow tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors does
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        # Convert each tensor to numpy only once: the array is needed both
        # for its dtype and as source of the host-to-device copy.
        host_arrays = [input_tensor.numpy() for input_tensor in input_tensors]
        cuda_input_arrays = [
            polygraphy.cuda.DeviceArray(
                shape=tuple(input_tensor.shape),
                dtype=host_array.dtype,
            ).copy_from(host_array, stream=self.cuda_stream)
            for input_tensor, host_array in zip(input_tensors, host_arrays)
        ]
        # The input shapes are needed only by models with dynamic shapes.
        input_shapes = (
            [tuple(input_tensor.shape) for input_tensor in input_tensors]
            if self.network_parameters.dynamic_info is not None
            else None
        )
        out_arrays = self._predict_array(cuda_input_arrays, input_shapes)
        return tuple(tf.convert_to_tensor(array) for array in out_arrays)


class NumpyONNXTensorRTInferenceLearner(
    BaseArrayONNXTensorRTInferenceLearner, NumpyBaseInferenceLearner
):
    """Model optimized using TensorRT with a numpy interface.

    At prediction time it takes as input numpy arrays given as positional
    arguments and it returns numpy arrays.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Predict on the input arrays.

        Each input is copied into a device array using the learner cuda
        stream, then the engine is run on the device arrays.

        Args:
            input_tensors (Tuple[np.ndarray]): Input arrays belonging to
                the same batch, given as positional arguments.

        Returns:
            Tuple[np.ndarray]: Output arrays produced by the model for the
                given (multi-) array input. They do not map 1 to 1 to the
                input arrays.
        """
        cuda_input_arrays = []
        for input_tensor in input_tensors:
            device_array = polygraphy.cuda.DeviceArray(
                shape=tuple(input_tensor.shape), dtype=input_tensor.dtype
            )
            cuda_input_arrays.append(
                device_array.copy_from(input_tensor, stream=self.cuda_stream)
            )

        # The input shapes are needed only by models with dynamic shapes.
        if self.network_parameters.dynamic_info is None:
            input_shapes = None
        else:
            input_shapes = [
                tuple(input_tensor.shape) for input_tensor in input_tensors
            ]

        output_arrays = self._predict_array(cuda_input_arrays, input_shapes)
        return tuple(output_arrays)


# Registry mapping each supported deep learning framework to the
# ONNX-TensorRT inference learner class exposing that framework's tensor
# interface.
TENSOR_RT_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[ONNXTensorRTInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchONNXTensorRTInferenceLearner,
    DeepLearningFramework.TENSORFLOW: TensorflowONNXTensorRTInferenceLearner,
    DeepLearningFramework.NUMPY: NumpyONNXTensorRTInferenceLearner,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tensorflow.py
================================================
import pickle
from pathlib import Path
from typing import Tuple, Union, Dict, Type

from nebullvm.config import TENSORFLOW_BACKEND_FILENAMES
from nebullvm.core.models import DeviceType, Device, ModelParams
from nebullvm.operations.inference_learners.base import (
    TensorflowBaseInferenceLearner,
    LearnerMetadata,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf


class TensorflowBackendInferenceLearner(TensorflowBaseInferenceLearner):
    """Learner running a tensorflow model on the given device."""

    name = "XLA"

    def __init__(self, tf_model: tf.Module, device: Device, **kwargs):
        """Store the model and the device.

        Args:
            tf_model (tf.Module): The optimized model.
            device (Device): Device where the model will be run.
            kwargs (Dict): Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.model = tf_model
        self.device = device
        self._is_gpu_ready = self.device.type is DeviceType.GPU

    def get_size(self):
        """Return the size in bytes of the pickled model."""
        return len(pickle.dumps(self.model, -1))

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors, given as positional
                arguments.

        Returns:
            Tuple[Tensor]: Output tensors.
        """
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        with tf.device(self.device.to_tf_format()):
            # The inputs are passed to the model as a single tuple.
            res = self.model(input_tensors)
        if not isinstance(res, tuple):
            return (res,)
        return res

    def save(self, path: Union[str, Path], **kwargs):
        """Save the metadata and the model in the given directory.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        self.model.save(path / TENSORFLOW_BACKEND_FILENAMES["tf_model"])

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model from the given directory.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Not used.

        Returns:
            TensorflowBackendInferenceLearner: The optimized model.
        """
        # Local import: the module is not imported at the top of this file.
        from nebullvm.tools.transformations import MultiStageTransformation

        path = Path(path)
        metadata = LearnerMetadata.read(path)
        network_parameters = ModelParams(**metadata.network_parameters)
        input_tfms = metadata.input_tfms
        # As done by the other learners, rebuild the transformation object
        # when the metadata contains its serialized (dict) version.
        if isinstance(input_tfms, dict):
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        model = tf.keras.models.load_model(
            path / TENSORFLOW_BACKEND_FILENAMES["tf_model"]
        )
        device = Device.from_str(metadata.device)
        return cls(
            tf_model=model,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=device,
        )


class TFLiteBackendInferenceLearner(TensorflowBaseInferenceLearner):
    """Learner running a TFLite model through the TFLite interpreter."""

    name = "TFLite"

    def __init__(self, tflite_file: bytes, device: Device, **kwargs):
        """Store the model content and build the interpreter from it.

        Args:
            tflite_file (bytes): Content of the TFLite model file.
            device (Device): Device where the model will be run.
            kwargs (Dict): Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.tflite_file = tflite_file
        self.interpreter = tf.lite.Interpreter(model_content=tflite_file)
        self.device = device

    def get_size(self):
        """Return the size in bytes of the TFLite model content."""
        return len(self.tflite_file)

    def free_gpu_memory(self):
        """Not supported: always raise NotImplementedError."""
        raise NotImplementedError(
            "TFLite does not support GPU inference on Nvidia devices"
        )

    def run(self, *input_tensors: tf.Tensor):
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors, given as positional
                arguments in the same order as the interpreter input details.

        Returns:
            Tuple[Tensor]: Output tensors.

        Raises:
            ValueError: If more input tensors than model inputs are given.
        """
        input_details = self.interpreter.get_input_details()
        output_details = self.interpreter.get_output_details()
        if len(input_tensors) > len(input_details):
            raise ValueError(
                f"Got {len(input_tensors)} input tensors but the model "
                f"expects {len(input_details)} inputs."
            )
        # The interpreter identifies tensors by the "index" field of their
        # details, which is not guaranteed to match the input position.
        if self.network_parameters.dynamic_info:
            for input_tensor, detail in zip(input_tensors, input_details):
                if input_tensor.shape != tuple(detail["shape"]):
                    self.interpreter.resize_tensor_input(
                        detail["index"], input_tensor.shape
                    )
        self.interpreter.allocate_tensors()
        for input_tensor, detail in zip(input_tensors, input_details):
            self.interpreter.set_tensor(detail["index"], input_tensor)
        self.interpreter.invoke()
        return tuple(
            tf.convert_to_tensor(
                self.interpreter.get_tensor(output_detail["index"])
            )
            for output_detail in output_details
        )

    def save(self, path: Union[str, Path], **kwargs):
        """Save the metadata and the TFLite model in the given directory.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored. It is created if it does not exist.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        # As done by the other learners, be sure the directory exists.
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        with open(
            path / TENSORFLOW_BACKEND_FILENAMES["tflite_model"], "wb"
        ) as f:
            f.write(self.tflite_file)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model from the given directory.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Not used.

        Returns:
            TFLiteBackendInferenceLearner: The optimized model.
        """
        # Local import: the module is not imported at the top of this file.
        from nebullvm.tools.transformations import MultiStageTransformation

        path = Path(path)
        tflite_file_path = str(
            path / TENSORFLOW_BACKEND_FILENAMES["tflite_model"]
        )

        with open(tflite_file_path, "rb") as f:
            tflite_file = f.read()

        metadata = LearnerMetadata.read(path)
        network_parameters = ModelParams(**metadata.network_parameters)
        input_tfms = metadata.input_tfms
        # As done by the other learners, rebuild the transformation object
        # when the metadata contains its serialized (dict) version.
        if isinstance(input_tfms, dict):
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        device = Device.from_str(metadata.device)
        return cls(
            tflite_file=tflite_file,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=device,
        )


# Registry mapping a backend key ("tf" or "tflite") to the corresponding
# tensorflow inference learner class.
TF_BACKEND_LEARNERS_DICT: Dict[
    str,
    Type[
        Union[TensorflowBackendInferenceLearner, TFLiteBackendInferenceLearner]
    ],
] = {
    "tf": TensorflowBackendInferenceLearner,
    "tflite": TFLiteBackendInferenceLearner,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torch_dynamo.py
================================================
from pathlib import Path
from typing import Union

from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)


class TorchDynamoInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript-like learner for models compiled with TorchDynamo.

    Saving and loading are not supported yet.
    """

    name = "TorchDynamo"

    def save(self, path: Union[str, Path], **kwargs):
        """Not supported: always raise NotImplementedError."""
        # TODO: Implement save function
        # Saving it like a normal PyTorch model raises this error:
        # https://github.com/pytorch/pytorch/issues/93470
        raise NotImplementedError(
            "Saving a TorchDynamo model is not supported yet, see "
            "https://github.com/pytorch/pytorch/issues/93470"
        )

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Not supported: always raise NotImplementedError."""
        # TODO: Implement load function
        raise NotImplementedError(
            "Loading a TorchDynamo model is not supported yet."
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torch_neuron.py
================================================
import os
from pathlib import Path
from tempfile import TemporaryDirectory

from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)


class TorchNeuronInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript-like learner for models compiled with TorchNeuron."""

    name = "TorchNeuron"

    def get_size(self):
        """Return the size in bytes of the files produced by `save`.

        The model is saved in a temporary directory and the sizes of the
        regular files found at its top level are summed up.
        """
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            tmp_path = Path(tmp_dir)
            total_bytes = 0
            for entry in os.listdir(tmp_path):
                candidate = tmp_path / entry
                if os.path.isfile(candidate):
                    total_bytes += os.path.getsize(candidate)
            return total_bytes


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torch_xla.py
================================================
import os
import pickle
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Tuple, Union

from nebullvm.core.models import Device, DeviceType, ModelParams
from nebullvm.operations.inference_learners.base import (
    PytorchBaseInferenceLearner,
    LearnerMetadata,
)
from nebullvm.optional_modules.torch import (
    torch,
)
from nebullvm.tools.transformations import MultiStageTransformation


class TorchXLAInferenceLearner(PytorchBaseInferenceLearner):
    """Learner running a torch model on a TPU device.

    The model is stored with `torch.save` under `MODEL_NAME`.
    """

    MODEL_NAME = "model_scripted.pt"
    name = "TorchXLA"

    def __init__(self, torch_model: torch.nn.Module, device: Device, **kwargs):
        """Put the model in eval mode and move it to the TPU if requested.

        Args:
            torch_model (torch.nn.Module): The optimized model.
            device (Device): Device where the model will be run.
            kwargs (Dict): Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        if device.type is DeviceType.TPU:
            self.model.to(device.to_torch_format())
        self.device = device
        # The flag is named after GPUs but here it tracks the TPU placement.
        self._is_gpu_ready = self.device.type is DeviceType.TPU

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors, given as positional
                arguments. They are moved to the TPU when it is the learner
                device.

        Returns:
            Tuple[Tensor]: Output tensors.
        """
        if self.device.type is DeviceType.TPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        if self.device.type is DeviceType.TPU:
            input_tensors = (
                t.to(self.device.to_torch_format()) for t in input_tensors
            )
        with torch.no_grad():
            res = self.model(*input_tensors)
            if not isinstance(res, tuple):
                return (res,)
            return tuple(res)

    def get_size(self):
        """Return the size in bytes of the model.

        The size of the pickled model is used. If pickling fails with a
        RuntimeError, the size of the files produced by `save` is returned.
        """
        try:
            if hasattr(self.model, "core_model"):
                return len(pickle.dumps(self.model.core_model, -1))
            else:
                # Normal torch model
                return len(pickle.dumps(self.model, -1))
        except RuntimeError:
            with TemporaryDirectory() as tmp_dir:
                self.save(tmp_dir)
                return sum(
                    os.path.getsize(Path(tmp_dir) / f)
                    for f in os.listdir(Path(tmp_dir))
                    if os.path.isfile(Path(tmp_dir) / f)
                )

    def save(self, path: Union[str, Path], **kwargs):
        """Save the metadata and the model in the given directory.

        The model is moved to the CPU for being saved and then it is moved
        back to the TPU (when it is the learner device), so that the learner
        can still be used for predicting after the call.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        self.model.cpu()
        try:
            torch.save(self.model, path / self.MODEL_NAME)
        finally:
            # Restore the placement done in the constructor: `run` moves the
            # inputs to the TPU and expects the model to be there too.
            if self.device.type is DeviceType.TPU:
                self.model.to(self.device.to_torch_format())

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model from the given directory.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Not used.

        Returns:
            TorchXLAInferenceLearner: The optimized model.
        """
        path = Path(path)
        model = torch.load(path / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(path)
        device = Device.from_str(metadata.device)
        model.to(device.to_torch_format())
        return cls(
            torch_model=model,
            network_parameters=ModelParams(**metadata.network_parameters),
            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)
            if metadata.input_tfms is not None
            else None,
            device=device,
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torchscript.py
================================================
from pathlib import Path
from typing import Tuple, Union, Optional, List

from nebullvm.core.models import Device, DeviceType, ModelParams
from nebullvm.operations.inference_learners.base import (
    PytorchBaseInferenceLearner,
    LearnerMetadata,
)
from nebullvm.optional_modules.torch import (
    torch,
    symbolic_trace,
    Module,
    ScriptModule,
    GraphModule,
)
from nebullvm.tools.transformations import MultiStageTransformation


class TorchScriptInferenceLearner(PytorchBaseInferenceLearner):
    """Inference learner running a TorchScript model.

    Attributes:
        model (ScriptModule): Scripted or traced model, set in eval mode.
        device (Device): Device used for running the model.
    """

    MODEL_NAME = "model_scripted.pt"
    name = "TorchScript"

    def __init__(self, torch_model: ScriptModule, device: Device, **kwargs):
        """Wrap a TorchScript module.

        Args:
            torch_model (ScriptModule): Scripted or traced model.
            device (Device): Device where the model will be executed. The
                model is moved to it only when it is a GPU.
            kwargs (Dict): Arguments forwarded to the base class
                constructor.
        """
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        if device.type is DeviceType.GPU:
            self.model.to(device.to_torch_format())
        self.device = device
        self._is_gpu_ready = self.device.type is DeviceType.GPU

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors given as positional
                arguments. On GPU they are moved to the learner device
                before the forward pass.

        Returns:
            Tuple[Tensor]: Model outputs, always wrapped in a tuple and
                moved to the learner device.
        """
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        if self.device.type is DeviceType.GPU:
            input_tensors = (
                t.to(self.device.to_torch_format()) for t in input_tensors
            )
        with torch.no_grad():
            res = self.model(*input_tensors)
            if not isinstance(res, tuple):
                res = res.to(self.device.to_torch_format())
                return (res,)
            return tuple(out.to(self.device.to_torch_format()) for out in res)

    def save(self, path: Union[str, Path], **kwargs):
        """Store the learner metadata and the TorchScript model.

        Args:
            path (Path or str): Directory where the model is stored. It is
                created when missing (its parent must already exist).
            kwargs (Dict): Extra key-value pairs forwarded to
                ``LearnerMetadata.from_model``.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)

        torch.jit.save(self.model, path / self.MODEL_NAME)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Rebuild a learner from a directory written by ``save``.

        Args:
            path (Path or str): Directory containing the serialized model
                and its metadata.
            kwargs (Dict): Accepted but not used by this implementation.

        Returns:
            TorchScriptInferenceLearner: The loaded learner.
        """
        path = Path(path)
        model = torch.jit.load(path / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(path)
        device = Device.from_str(metadata.device)
        return cls(
            torch_model=model,
            network_parameters=ModelParams(**metadata.network_parameters),
            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)
            if metadata.input_tfms is not None
            else None,
            device=device,
        )

    @classmethod
    def from_torch_model(
        cls,
        model: Union[Module, GraphModule],
        network_parameters: ModelParams,
        device: Device,
        input_tfms: Optional[MultiStageTransformation] = None,
        input_data: Optional[List[torch.Tensor]] = None,
    ):
        """Build the learner by converting a torch model to TorchScript.

        For a model that is not already a ``GraphModule`` the conversion
        tries, in order: symbolic tracing followed by scripting, plain
        scripting, and finally ``torch.jit.trace`` on ``input_data``. A
        ``GraphModule`` is scripted directly.

        Args:
            model (Module or GraphModule): Model to convert.
            network_parameters (ModelParams): The model parameters.
            device (Device): Device where the model will be executed.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be applied to the model inputs.
            input_data (List[Tensor], optional): Example inputs. They are
                only needed by the tracing fallback.

        Returns:
            TorchScriptInferenceLearner: Learner wrapping the converted
                model.

        Raises:
            TypeError: If the tracing fallback is reached while
                ``input_data`` is None.
        """
        # Scripting does not need example inputs, so a missing `input_data`
        # must not make the GPU path fail before the conversion is tried.
        if device.type is DeviceType.GPU and input_data is not None:
            input_data = [t.to(device.to_torch_format()) for t in input_data]

        if not isinstance(model, torch.fx.GraphModule):
            model.eval()
            try:
                model_scripted = symbolic_trace(model)
                model_scripted = torch.jit.script(model_scripted)
            except Exception:
                try:
                    model_scripted = torch.jit.script(model)
                except Exception:
                    # Last resort: record the operations executed on the
                    # example inputs.
                    model_scripted = torch.jit.trace(model, tuple(input_data))
        else:
            model_scripted = torch.jit.script(model)

        return cls(
            torch_model=model_scripted,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            input_data=input_data,
            device=device,
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tvm.py
================================================
import os
import shutil
from abc import ABC
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Union, Type, Dict, Any, List, Generator, Tuple, Optional

import numpy as np

from nebullvm.config import (
    TVM_FILENAMES,
)
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
    LearnerMetadata,
    PytorchBaseInferenceLearner,
    TensorflowBaseInferenceLearner,
    NumpyBaseInferenceLearner,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.tvm import (
    GraphModule,
    tvm,
    ExecutorFactoryModule,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    HalfPrecisionTransformation,
)


class ApacheTVMInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized using ApacheTVM.

    The class cannot be directly instantiated, but implements all the core
    methods needed for using ApacheTVM at inference time.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or `cuda` for targeting GPUs.
        engine_path (Path, optional): Path to the serialized engine. To be used
            after loading the model (avoiding double engine serialization).
        device (Device): The device where the model is executed.
    """

    name = "ApacheTVM"

    def __init__(
        self,
        graph_executor_module: GraphModule,
        input_names: List[str],
        lib: ExecutorFactoryModule,
        target: str,
        device: Device,
        engine_path: Optional[Path] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.graph_executor_module = graph_executor_module
        self.input_names = input_names
        self.lib = lib
        self.target = target
        # A provided engine file is handed to `_store_file`, and the path it
        # returns is the one kept on the instance.
        self.engine_path = (
            self._store_file(engine_path)
            if engine_path is not None
            else engine_path
        )
        self.device = device

    def get_size(self):
        """Return the size in bytes of the saved model.

        The learner is saved into a temporary directory and the sizes of the
        regular files found at the top level of that directory are summed.
        """
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            return sum(
                os.path.getsize(Path(tmp_dir) / f)
                for f in os.listdir(Path(tmp_dir))
                if os.path.isfile(Path(tmp_dir) / f)
            )

    def _has_half_precision_transformation(self):
        """Tell whether the input transformations include half precision.

        Returns:
            bool: True when a `HalfPrecisionTransformation` is among the
                input transformations, False otherwise (also when no
                transformation is set).
        """
        # `input_tfms` defaults to None in `from_runtime_module`.
        if self.input_tfms is None:
            return False
        return any(
            isinstance(tfm, HalfPrecisionTransformation)
            for tfm in self.input_tfms.to_list()
        )

    def _predict_array(
        self, input_arrays: Generator[np.ndarray, None, None]
    ) -> Generator[np.ndarray, None, None]:
        """Run the graph executor on the given arrays.

        Args:
            input_arrays (Generator[ndarray]): Input arrays, paired by
                position with `input_names`.

        Returns:
            Generator[ndarray]: Lazily fetched outputs, one per entry of
                `network_parameters.output_sizes`.
        """
        for name, array in zip(self.input_names, input_arrays):
            self.graph_executor_module.set_input(name, array)
        self.graph_executor_module.run()

        # The dtype is the same for every output: compute it once instead
        # of once per fetched output.
        output_dtype = (
            "float16"
            if self._has_half_precision_transformation()
            else "float32"
        )
        tvm_outputs = (
            self.graph_executor_module.get_output(
                i,
                tvm.nd.empty(shape=output_size, dtype=output_dtype),
            ).numpy()
            for i, output_size in enumerate(
                self.network_parameters.output_sizes
            )
        )
        return tvm_outputs

    def free_gpu_memory(self):
        # TODO: check if tvm needs to release GPU
        pass

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(
            self, input_names=self.input_names, target=self.target, **kwargs
        )
        metadata.save(path)
        if self.engine_path is None:
            self.lib.export_library(path / TVM_FILENAMES["engine"])
        else:
            # Reuse the already serialized engine instead of exporting again.
            shutil.copy(self.engine_path, path)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Accepted for interface compatibility; not used
                by this implementation.

        Returns:
            ApacheTVMInferenceLearner: The optimized model.
        """
        path = Path(path)
        metadata = LearnerMetadata.read(path).to_dict()
        network_parameters = ModelParams(**metadata["network_parameters"])
        lib = tvm.runtime.load_module(path / TVM_FILENAMES["engine"])
        target_device = metadata["target"]
        input_names = metadata["input_names"]
        input_tfms = metadata.get("input_tfms")
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        device = Device.from_str(metadata["device"])
        # The stored input transformations must be handed to the new
        # learner, otherwise the reloaded model silently loses them.
        self = cls.from_runtime_module(
            network_parameters=network_parameters,
            lib=lib,
            target_device=target_device,
            input_names=input_names,
            device=device,
            input_tfms=input_tfms,
        )
        self.engine_path = path / TVM_FILENAMES["engine"]
        return self

    @classmethod
    def from_runtime_module(
        cls,
        network_parameters: ModelParams,
        lib: ExecutorFactoryModule,
        target_device: str,
        input_names: List[str],
        device: Device,
        input_tfms: MultiStageTransformation = None,
        input_data: DataManager = None,
    ):
        """Build the model from the runtime module (lib).

        Args:
            network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
            lib (Module): Component needed for loading the ApacheTVM optimized
                model.
            target_device (str): The target device. Either `llvm` (CPU)
                or `cuda`.
            input_names (List[str]): Names associated to the model input
                tensors.
            device (Device): The device where the model will be executed.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            input_data (DataManager, optional): User defined data.
        """
        dev = tvm.device(str(target_device), 0)
        graph_executor_module = GraphModule(lib["default"](dev))
        return cls(
            input_tfms=input_tfms,
            network_parameters=network_parameters,
            graph_executor_module=graph_executor_module,
            input_names=input_names,
            lib=lib,
            target=target_device,
            input_data=input_data,
            device=device,
        )


class BaseArrayApacheTVMInferenceLearner(ApacheTVMInferenceLearner, ABC):
    """Base Model that can be used for all array-based
    ApacheTVMInferenceLearners.

    It provides `_inner_predict`, the numpy-level prediction shared by the
    framework-specific subclasses.
    """

    def _inner_predict(
        self,
        input_arrays: Generator[np.ndarray, None, None],
        input_shapes: Optional[List[Tuple[int, ...]]],
    ) -> Generator[np.ndarray, None, None]:
        """Predict on numpy arrays, handling dynamic shapes when configured.

        Args:
            input_arrays (Generator[ndarray]): Model inputs as numpy arrays.
            input_shapes (List[Tuple[int, ...]], optional): Shapes of the
                inputs. They must be given when
                `network_parameters.dynamic_info` is set, since they are
                used to compute how the outputs are cropped.

        Returns:
            Generator[ndarray]: Lazily evaluated model outputs.
        """
        if self.network_parameters.dynamic_info is not None:
            # Zero-pad the end of every axis by the absolute difference
            # between the actual dimension and the one declared in
            # `network_parameters.input_sizes`.
            # NOTE(review): presumably inputs are never larger than the
            # declared size; because of `abs`, a larger input would be
            # padded even further -- confirm.
            input_arrays = (
                np.pad(
                    input_array,
                    [
                        (0, abs(x - y))
                        for x, y in zip(
                            input_array.shape,
                            input_size,
                        )
                    ],
                    mode="constant",
                    constant_values=0,
                )
                for input_array, input_size in zip(
                    input_arrays, self.network_parameters.input_sizes
                )
            )

        # Both generators are lazy: padding and output fetching only happen
        # while the returned generator is consumed.
        output_arrays = self._predict_array(input_arrays)
        if self.network_parameters.dynamic_info is not None:
            assert input_shapes is not None
            dynamic_info = self.network_parameters.dynamic_info
            # Crop each output: an axis is kept whole (stop=None) unless
            # `x` is a key of the output's dynamic dict, in which case the
            # stop index comes from `retrieve_output_dim`.
            # NOTE(review): `x` is the dimension size, not the axis index
            # `i`; verify that the dict keys are meant to be compared
            # against sizes.
            return (
                output_array[
                    tuple(
                        slice(
                            0,
                            None
                            if x not in out_dynamic_dict.keys()
                            else dynamic_info.retrieve_output_dim(
                                input_shapes, j, i, x
                            ),
                        )
                        for i, x in enumerate(output_array.shape)
                    )
                ]
                for j, (output_array, out_dynamic_dict) in enumerate(
                    zip(output_arrays, dynamic_info.outputs)
                )
            )

        return output_arrays


class PytorchApacheTVMInferenceLearner(
    BaseArrayApacheTVMInferenceLearner, PytorchBaseInferenceLearner
):
    """ApacheTVM-optimized model exposing a Pytorch interface.

    The learner is called like a pytorch Module: torch tensors are given as
    positional arguments and torch tensors are returned.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or `cuda` for targeting GPUs.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        All the tensors must belong to the same batch: they are the
        (possibly multiple) inputs of a single model call, not a sequence
        of independent samples.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch, with dimensions (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors of the model, moved to the learner
                device. They are the (multi-) output of one model call, so
                they do not map 1 to 1 onto the input tensors.
        """
        # Shapes are only needed for cropping the outputs of dynamic models.
        shapes = None
        if self.network_parameters.dynamic_info is not None:
            shapes = [tuple(tensor.shape) for tensor in input_tensors]

        # TVM works on numpy arrays: conversion is lazy, one tensor at a
        # time.
        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        predictions = self._inner_predict(numpy_inputs, shapes)
        return tuple(
            torch.from_numpy(prediction).to(self.device.to_torch_format())
            for prediction in predictions
        )

    @staticmethod
    def _convert_device(device: Any):
        """Map an integer device to "cpu"; return anything else as is."""
        return "cpu" if isinstance(device, int) else device


class TensorflowApacheTVMInferenceLearner(
    BaseArrayApacheTVMInferenceLearner, TensorflowBaseInferenceLearner
):
    """ApacheTVM-optimized model exposing a tensorflow interface.

    The learner is called like a tf.Module or keras.Model: tensorflow
    tensors are given as positional arguments and tensorflow tensors are
    returned.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or `cuda` for targeting GPUs.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        All the tensors must belong to the same batch: they are the
        (possibly multiple) inputs of a single model call, not a sequence
        of independent samples.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch, with dimensions (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors of the model. They are the
                (multi-) output of one model call, so they do not map
                1 to 1 onto the input tensors.
        """
        # Shapes are only needed for cropping the outputs of dynamic models.
        shapes = None
        if self.network_parameters.dynamic_info is not None:
            shapes = [tuple(tensor.shape) for tensor in input_tensors]

        numpy_inputs = (tensor.numpy() for tensor in input_tensors)
        predictions = self._inner_predict(numpy_inputs, shapes)
        return tuple(
            tf.convert_to_tensor(prediction) for prediction in predictions
        )


class NumpyApacheTVMInferenceLearner(
    BaseArrayApacheTVMInferenceLearner, NumpyBaseInferenceLearner
):
    """ApacheTVM-optimized model exposing a numpy interface.

    At prediction time it takes numpy arrays given as positional arguments
    and returns numpy arrays.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or `cuda` for targeting GPUs.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Predict on the input arrays.

        All the arrays must belong to the same batch: they are the
        (possibly multiple) inputs of a single model call, not a sequence
        of independent samples.

        Args:
            input_tensors (Tuple[ndarray]): Input arrays belonging to the
                same batch, with dimensions (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[ndarray]: Output arrays of the model. They are the
                (multi-) output of one model call, so they do not map
                1 to 1 onto the input arrays.
        """
        # Shapes are only needed for cropping the outputs of dynamic models.
        shapes = None
        if self.network_parameters.dynamic_info is not None:
            shapes = [tuple(array.shape) for array in input_tensors]

        # The inputs are already numpy arrays: no conversion is required.
        predictions = self._inner_predict(iter(input_tensors), shapes)
        return tuple(predictions)


# Maps each deep learning framework to the ApacheTVM learner class exposing
# the matching tensor interface (torch tensors, tf tensors, numpy arrays).
APACHE_TVM_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[ApacheTVMInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchApacheTVMInferenceLearner,
    DeepLearningFramework.TENSORFLOW: TensorflowApacheTVMInferenceLearner,
    DeepLearningFramework.NUMPY: NumpyApacheTVMInferenceLearner,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/utils.py
================================================
from pathlib import Path
from typing import Union, Any

from nebullvm.operations.inference_learners.base import LearnerMetadata
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.tools.diffusers import postprocess_diffusers


def load_model(path: Union[Path, str], pipe: StableDiffusionPipeline = None):
    """Load an optimized model from the directory where it was saved.

    Args:
        path (Union[Path, str]): Path to the directory where the model is
            saved.
        pipe (StableDiffusionPipeline): Diffusion pipeline the loaded model
            is post-processed with. Only needed when the model to be loaded
            is a diffusion model. Default: None.

    Returns:
        The loaded model; when `pipe` is given, the result of
        `postprocess_diffusers` applied to it.
    """
    learner = LearnerMetadata.read(path).load_model(path)
    if pipe is None:
        return learner
    return postprocess_diffusers(learner, pipe, learner.device)


def save_model(model: Any, path: Union[Path, str]):
    """Save the optimized model in the given path.

    For a `StableDiffusionPipeline` the object that gets saved is
    `model.unet.model`; any other model is saved through its own `save`
    method.

    Args:
        model (Any): Model to be saved.
        path (Union[Path, str]): Path to the directory where to
            save the model.
    """
    if isinstance(model, StableDiffusionPipeline):
        saveable = model.unet.model
    else:
        saveable = model
    saveable.save(path)


================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/base.py
================================================
import abc

from nebullvm.operations.base import Operation


class Measure(Operation, abc.ABC):
    """Abstract operation that computes a measurement.

    Attributes:
        measure_result: Value produced by the measure. It is None right
            after construction.
    """

    def __init__(self):
        super().__init__()
        self.measure_result = None

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the measurement; concrete subclasses must override it."""
        raise NotImplementedError


================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/measures.py
================================================
from typing import List, Tuple, Any, Callable, Dict

import numpy as np

from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import (
    BenchmarkOriginalModelResult,
    DeepLearningFramework,
)
from nebullvm.operations.inference_learners.base import BaseInferenceLearner
from nebullvm.operations.measures.base import Measure
from nebullvm.operations.measures.utils import (
    compute_torch_latency,
    compute_tf_latency,
    compute_onnx_latency,
    compute_relative_difference,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import run_onnx_model
from nebullvm.tools.pytorch import run_torch_model
from nebullvm.tools.tf import run_tf_model

# Per-framework function used to run the original model on one batch of
# inputs. The NUMPY framework is served by the ONNX helper.
COMPUTE_OUTPUT_FRAMEWORK: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: run_torch_model,
    DeepLearningFramework.TENSORFLOW: run_tf_model,
    DeepLearningFramework.NUMPY: run_onnx_model,
}

# Per-framework function used to benchmark the latency of the original
# model. The NUMPY framework is served by the ONNX helper.
COMPUTE_LATENCY_FRAMEWORK: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: compute_torch_latency,
    DeepLearningFramework.TENSORFLOW: compute_tf_latency,
    DeepLearningFramework.NUMPY: compute_onnx_latency,
}


class MetricDropMeasure(Measure):
    """Measure how far an optimized learner drifts from baseline outputs.

    Attributes:
        valid (bool): Whether the aggregated metric is within the allowed
            threshold. It is None until `execute` has been run.
    """

    def __init__(self):
        super().__init__()
        self.valid = None

    def execute(
        self,
        optimized_learner: BaseInferenceLearner,
        input_data: List[Tuple[Any, ...]],
        base_outputs_list: List[Tuple[Any, ...]],
        perf_loss_ths: float,
        metric_func: Callable = None,
        ys: List = None,
        aggregation_func: Callable = np.mean,
    ):
        """Compute the metric drop and check it against the threshold.

        Args:
            optimized_learner (BaseInferenceLearner): Optimized model.
            input_data (List[Tuple]): Batches of model inputs.
            base_outputs_list (List[Tuple]): Outputs of the original model,
                one tuple per batch of `input_data`.
            perf_loss_ths (float): Maximum accepted aggregated metric.
            metric_func (Callable, optional): Function called as
                `metric_func(base_output, optimized_output, label)`.
                Default: `compute_relative_difference`.
            ys (List, optional): Labels, one per batch. Default: a None
                label for every batch.
            aggregation_func (Callable): Function reducing the per-batch
                values to a single one. Default: `np.mean`.
        """
        if not metric_func:
            metric_func = compute_relative_difference
        if ys is None:
            ys = [None] * len(input_data)

        assert len(input_data) == len(base_outputs_list) == len(ys), (
            "INTERNAL ASSERT FAILED: error during computation of precision "
            "of the optimized model, got wrong dimensions of the data. "
        )

        # For every batch keep the worst (largest) value among its outputs.
        per_batch_worst = [
            max(
                metric_func(reference, predicted, label)
                for reference, predicted in zip(
                    reference_outputs, optimized_learner(*batch)
                )
            )
            for batch, reference_outputs, label in zip(
                input_data, base_outputs_list, ys
            )
        ]
        aggregated = aggregation_func(per_batch_worst)
        self.valid = aggregated <= perf_loss_ths
        self.measure_result = aggregated

    def get_result(self) -> Tuple[bool, float]:
        """Return the validity flag together with the aggregated metric."""
        return self.valid, self.measure_result


class LatencyOriginalModelMeasure(Measure):
    """Benchmark the original model: collect its outputs and its latency.

    Attributes:
        outputs (List[Tuple]): Outputs of the original model, one tuple per
            item of the input data. It is None until `execute` has been run.
    """

    def __init__(self):
        super().__init__()
        self.outputs = None

    def execute(
        self,
        model: Any,
        input_data: DataManager,
        dl_framework: DeepLearningFramework,
    ) -> BenchmarkOriginalModelResult:
        """Run the original model and measure its latency.

        Args:
            model (Any): Original model.
            input_data (DataManager): Data used for the benchmark. The first
                element of each item is used as the model inputs.
            dl_framework (DeepLearningFramework): Framework of the model,
                selecting the functions used to run and to time it.

        Returns:
            BenchmarkOriginalModelResult: Latency and outputs of the model.
        """
        self.logger.info("Benchmark performance of original model")

        collected_outputs = []
        for sample in input_data:
            run_model = COMPUTE_OUTPUT_FRAMEWORK[dl_framework]
            collected_outputs.append(
                tuple(run_model(model, tuple(sample[0]), self.device))
            )
        self.outputs = collected_outputs

        benchmark_inputs = input_data.get_list(QUANTIZATION_DATA_NUM)
        compute_latency = COMPUTE_LATENCY_FRAMEWORK[dl_framework]
        # Only the average latency is kept; the single timings are dropped.
        average_latency, _ = compute_latency(
            benchmark_inputs, model, self.device
        )
        self.measure_result = average_latency
        self.logger.info(
            f"Original model latency: {self.measure_result} sec/iter"
        )

        return BenchmarkOriginalModelResult(
            latency_seconds=self.measure_result,
            model_outputs=self.outputs,
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/utils.py
================================================
import time
from typing import Tuple, List, Union, Any

import numpy as np
from loguru import logger

from nebullvm.config import ONNX_PROVIDERS
from nebullvm.core.models import Device, DeviceType
from nebullvm.operations.inference_learners.base import BaseInferenceLearner
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import (
    convert_to_numpy,
    get_input_names,
    get_output_names,
)


def compute_torch_latency(
    xs: List[Tuple[torch.Tensor]],
    model: Module,
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the torch model.

    Unless the device is a TPU, both the inputs and the model are moved to
    the given device before timing. The model is set in eval mode and run
    without gradient tracking.

    Args:
        xs (List[Tuple[torch.Tensor]]): List of tuples containing the
            input tensors (a single batch for the model).
        model (Module): Torch model.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of input data to be used to compute the
            latency of the model. It must be a number <= len(xs). Default: 100.
        warmup_steps (int, optional): Number of input data to be used to warm
            up the model. It must be a number <= len(xs). Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.

    Raises:
        IndexError: If `steps` or `warmup_steps` is greater than len(xs).
    """
    if device.type is not DeviceType.TPU:
        torch_device = device.to_torch_format()
        xs = [tuple(t.to(torch_device) for t in tensors) for tensors in xs]
        model = model.to(torch_device)
    model.eval()
    latencies = []
    with torch.no_grad():
        for i in range(warmup_steps):
            _ = model.forward(*xs[i])
        # NOTE(review): no device synchronization is performed around the
        # timed call; confirm whether that is acceptable on GPU.
        for i in range(steps):
            # perf_counter is monotonic and high resolution, unlike the
            # wall clock returned by time.time().
            starting_time = time.perf_counter()
            _ = model.forward(*xs[i])
            latencies.append(time.perf_counter() - starting_time)
        latency = np.mean(latencies)
    return latency, latencies


def compute_tf_latency(
    xs: List[Tuple[tf.Tensor]],
    model: Union[tf.Module, tf.keras.Model],
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the tensorflow model.

    Each element of `xs` is passed to the model as a single argument, inside
    a `tf.device` context for the given device.

    Args:
        xs (List[Tuple[tf.Tensor]]): List of tuples containing the
            input tensors (a single batch for the model).
        model (Module or keras.Model): TF model.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of input data to be used to compute the
            latency of the model. It must be a number <= len(xs). Default: 100.
        warmup_steps (int, optional): Number of input data to be used to warm
            up the model. It must be a number <= len(xs). Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.

    Raises:
        IndexError: If `steps` or `warmup_steps` is greater than len(xs).
    """
    latencies = []
    with tf.device(device.to_tf_format()):
        for i in range(warmup_steps):
            _ = model(xs[i])
        for i in range(steps):
            # perf_counter is monotonic and high resolution, unlike the
            # wall clock returned by time.time().
            starting_time = time.perf_counter()
            _ = model(xs[i])
            latencies.append(time.perf_counter() - starting_time)
        latency = np.mean(latencies)
        return latency, latencies


def compute_onnx_latency(
    xs: List[Tuple[np.array]],
    model: str,
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the ONNX model.

    Args:
        xs (List[Tuple[np.array]]): List of tuples containing the
            inputs (a single batch for the model).
        model (str): ONNX model path.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of input data to be used to compute the
            latency of the model. It must be a number <= len(xs). Default: 100.
        warmup_steps (int, optional): Number of input data to be used to warm
            up the model. It must be a number <= len(xs). Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.

    Raises:
        IndexError: If `steps` or `warmup_steps` is greater than len(xs).
    """
    from nebullvm.optional_modules.onnxruntime import onnxruntime as ort

    input_names = get_input_names(model)
    output_names = get_output_names(model)

    # NOTE: this updates the shared ONNX_PROVIDERS configuration in place,
    # so that the CUDA provider entry carries the index of the device.
    if device.type is DeviceType.GPU and len(ONNX_PROVIDERS["cuda"]) == 3:
        ONNX_PROVIDERS["cuda"][1] = (
            "CUDAExecutionProvider",
            {
                "device_id": device.idx,
            },
        )

    # On GPU the first entry of the cuda providers list is skipped.
    providers = (
        ONNX_PROVIDERS["cuda"][1:]
        if device.type is DeviceType.GPU
        else ONNX_PROVIDERS["cpu"]
    )
    session = ort.InferenceSession(model, providers=providers)

    latencies = []
    for i in range(warmup_steps):
        inputs = dict(zip(input_names, xs[i]))
        _ = session.run(output_names=output_names, input_feed=inputs)
    for i in range(steps):
        inputs = dict(zip(input_names, xs[i]))
        # perf_counter is monotonic and high resolution, unlike the wall
        # clock returned by time.time().
        starting_time = time.perf_counter()
        _ = session.run(output_names=output_names, input_feed=inputs)
        latencies.append(time.perf_counter() - starting_time)
    latency = np.mean(latencies)
    return latency, latencies


def compute_optimized_running_time(
    optimized_model: BaseInferenceLearner,
    input_data: DataManager,
    steps: int = 100,
    min_steps: int = 5,
    warmup_steps: int = 10,
) -> float:
    """Compute the running time of the optimized model.

    The model is first called on ``warmup_steps`` samples of the "test"
    split without timing. It is then timed on up to ``steps`` samples.
    Once more than ``min_steps`` latencies have been collected, the
    running median is compared with the previous one after every call and
    the measurement stops early as soon as the relative change is below
    5%.

    Args:
        optimized_model (BaseInferenceLearner): Optimized model.
        input_data: (DataManager): Dataset used to compute latency.
        steps (int, optional): Number of input data to be used to
            compute the latency of the model. Default: 100.
        min_steps (int, optional): Minimum number of iterations to
            be performed. Default: 5.
        warmup_steps (int, optional): Number of input data to be used
            to warm up the model. Default: 10.

    Returns:
        Float: Median latency of the timed calls.
    """
    # Warmup
    for model_inputs in input_data.get_split("test").get_list(warmup_steps):
        optimized_model(*model_inputs)

    # Compute latency
    latencies = []
    last_median = None
    for model_inputs in input_data.get_split("test").get_list(steps):
        # perf_counter is monotonic and has the highest available
        # resolution; time.time() can be adjusted by the system and may be
        # too coarse for fast models, which corrupts the relative check.
        starting_time = time.perf_counter()
        optimized_model(*model_inputs)
        latencies.append(time.perf_counter() - starting_time)

        if len(latencies) <= min_steps:
            continue

        median = np.median(latencies)
        if last_median is not None:
            relative_change = np.abs(median - last_median) / last_median
            if relative_change < 0.05:
                # The median has stabilised: stop measuring.
                return median
        last_median = median
    return np.median(latencies)


def compute_relative_difference(
    tensor_1: Any,
    tensor_2: Any,
    y: Any = None,
    eps: float = 1e-5,
) -> float:
    """Return the mean element-wise relative difference of two outputs.

    Both inputs are converted with ``convert_to_numpy``. For each element
    the absolute difference is divided by the larger of the two absolute
    values plus ``eps``.

    Args:
        tensor_1 (Any): First output.
        tensor_2 (Any): Second output; must have the same shape as
            ``tensor_1`` once converted.
        y (Any, optional): Label. It is not used: when given, only a
            debug message is logged. Default: None.
        eps (float, optional): Constant added to the denominator.
            Default: 1e-5.

    Returns:
        Float: Mean of the element-wise relative differences.
    """
    if y is not None:
        logger.debug(
            "Received a label for the precision computation. "
            "It will be ignored."
        )

    array_1 = convert_to_numpy(tensor_1)
    array_2 = convert_to_numpy(tensor_2)

    assert array_1.shape == array_2.shape, (
        "The outputs of the original and optimized models have "
        "different shapes"
    )

    absolute_gap = np.abs(array_1 - array_2)
    scale = np.maximum(np.abs(array_1), np.abs(array_2)) + eps
    return float(np.mean(absolute_gap / scale))


def compute_accuracy_drop(tensor_1: Any, tensor_2: Any, y: Any) -> float:
    """Return the accuracy of ``tensor_1`` minus the accuracy of ``tensor_2``.

    All inputs are converted with ``convert_to_numpy``. The prediction of
    each output is its argmax over the last axis, and the accuracy is the
    mean of the element-wise comparison between predictions and ``y``.

    Args:
        tensor_1 (Any): First output.
        tensor_2 (Any): Second output.
        y (Any): Labels. Must not be None.

    Returns:
        Float: ``accuracy(tensor_1) - accuracy(tensor_2)``.
    """
    assert y is not None, (
        "No label found in the dataloader provided. "
        "To use accuracy metric, you must set also the labels"
    )
    scores_1 = convert_to_numpy(tensor_1)
    scores_2 = convert_to_numpy(tensor_2)
    labels = convert_to_numpy(y)

    def _accuracy(scores):
        # Fraction of predictions (argmax on the last axis) matching labels.
        return np.mean(scores.argmax(axis=-1) == labels)

    return _accuracy(scores_1) - _accuracy(scores_2)


# Maps a metric name to the function that computes it. Both functions
# accept (tensor_1, tensor_2, y) and return a float.
QUANTIZATION_METRIC_MAP = {
    "accuracy": compute_accuracy_drop,
    "numeric_precision": compute_relative_difference,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/base.py
================================================
import abc
from typing import Any, Dict, List, Optional

from nebullvm.core.models import QuantizationType
from nebullvm.operations.base import Operation


class Compiler(Operation, abc.ABC):
    """Abstract base class of the compiler operations.

    Concrete compilers declare ``supported_ops`` and implement
    ``execute``, ``_compile_model`` and ``_quantize_model``. The value
    stored in ``compiled_model`` is what ``get_result`` returns.
    """

    # Quantization types accepted for each key of the mapping.
    supported_ops: Dict[str, List[Optional[QuantizationType]]]

    def __init__(self):
        super().__init__()
        # Stays None until a subclass stores a result here.
        self.compiled_model = None

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the compilation; must be implemented by subclasses."""
        raise NotImplementedError

    @abc.abstractmethod
    def _compile_model(self, **kwargs) -> Any:
        """Compile the model; must be implemented by subclasses."""
        raise NotImplementedError

    @abc.abstractmethod
    def _quantize_model(self, **kwargs) -> Any:
        """Quantize the model; must be implemented by subclasses."""
        raise NotImplementedError

    def get_result(self) -> Any:
        """Return the current value of ``compiled_model``."""
        return self.compiled_model


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/deepsparse.py
================================================
from pathlib import Path
from typing import Union

from nebullvm.core.models import (
    ModelParams,
    QuantizationType,
)
from nebullvm.operations.conversions.converters import (
    PytorchConverter,
)
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.optional_modules.torch import (
    Module,
    GraphModule,
)
from nebullvm.tools.data import DataManager


class DeepSparseCompiler(Compiler):
    """Compiler operation for DeepSparse.

    The model is exported with a ``PytorchConverter`` and the result
    stored in ``compiled_model`` is the path of the exported ONNX file.
    Only the non-quantized case on CPU is listed in ``supported_ops``.
    """

    supported_ops = {
        "cpu": [None],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()
        self.conversion_op = PytorchConverter()

    def execute(
        self,
        model: Module,
        onnx_output_path: str,
        model_params: ModelParams,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using DeepSparse Compiler.

        When ``quantization_type`` is not supported on the current device
        ``compiled_model`` is set to None and nothing else is done.

        Args:
            model (torch.nn.Module): The pytorch model.
            onnx_output_path (str): Path where the converted ONNX model will be
                stored.
            model_params (ModelParams): The model parameters.
            quantization_type (QuantizationType): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                input data.
        """
        allowed_q_types = self.supported_ops[self.device.type.value]
        if quantization_type not in allowed_q_types:
            self.compiled_model = None
            return

        is_static = quantization_type is QuantizationType.STATIC
        if is_static and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        self.compiled_model = self._compile_model(
            model, onnx_output_path, input_data, model_params
        )

    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        onnx_output_path: str,
        input_data: DataManager,
        model_params: ModelParams,
    ) -> str:
        """Export ``model`` through the conversion op.

        Returns:
            str: ``<onnx_output_path>/model_pruned.onnx``.
        """
        self.conversion_op.model_name = "model_pruned"
        output_dir = Path(onnx_output_path)
        converter = self.conversion_op.to(self.device)
        converter = converter.set_state(model, input_data)
        converter.execute(output_dir, model_params)
        return str(output_dir / "model_pruned.onnx")

    @staticmethod
    def _quantize_model(**kwargs):
        """Quantization is not implemented for this compiler."""
        raise NotImplementedError()


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/__init__.py
================================================
from copy import deepcopy
from typing import Union

from nebullvm.core.models import QuantizationType, DeviceType
from nebullvm.operations.optimizations.compilers.faster_transformer.bert import (  # noqa: E501
    detect_and_swap_bert_model,
)
from nebullvm.operations.optimizations.compilers.torchscript import (
    TorchScriptCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import (
    get_faster_transformer_repo_path,
)
from nebullvm.optional_modules.torch import (
    GraphModule,
    Module,
    ScriptModule,
    torch,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.huggingface import PyTorchTransformerWrapper

# Default path of the FasterTransformer torch library:
# <faster transformer repo path>/build/lib/libth_transformer.so
default_lib_path = str(
    get_faster_transformer_repo_path()
    / "build"
    / "lib"
    / "libth_transformer.so"
)


def detect_and_swap_model(model, data_type="fp16", remove_padding=False):
    """Swap the supported parts of ``model`` and cast it to ``data_type``.

    Currently only supports:
    - BertModel and model with BertModel as .bert attribute

    Args:
        model: Model to be processed.
        data_type (str, optional): "fp16" casts the returned model with
            ``.half()``, "bf16" with ``.bfloat16()``; any other value
            applies no cast. Default: "fp16".
        remove_padding (bool, optional): Forwarded to
            ``detect_and_swap_bert_model``. Default: False.

    Returns:
        The model returned by ``detect_and_swap_bert_model``.
    """
    swapped_model = detect_and_swap_bert_model(
        model,
        data_type=data_type,
        lib_path=default_lib_path,
        remove_padding=remove_padding,
    )

    if data_type == "fp16":
        swapped_model.half()
        return swapped_model
    if data_type == "bf16":
        swapped_model.bfloat16()
    return swapped_model


class FasterTransformerCompiler(TorchScriptCompiler):
    """TorchScript compiler that first swaps in FasterTransformer modules.

    Only models wrapped in ``PyTorchTransformerWrapper`` are modified;
    every other model is deep-copied and handed unchanged to
    ``TorchScriptCompiler._compile_model``.
    """

    supported_ops = {
        "cpu": [None, QuantizationType.STATIC, QuantizationType.DYNAMIC],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    @torch.no_grad()
    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> ScriptModule:
        """Swap the core model when possible, then compile with the parent.

        Args:
            model: Model to compile; it is deep-copied before any change.
            input_data (DataManager): Forwarded to the parent compiler.
            quantization_type (QuantizationType): ``HALF`` selects the
                "fp16" data type for the swap, anything else "fp32".

        Returns:
            ScriptModule: Result of the parent ``_compile_model``.
        """
        # Some operations modify the model in-place, so work on a copy.
        model_copy = deepcopy(model)

        if isinstance(model_copy, PyTorchTransformerWrapper):
            # .core_model is a huggingface model
            if quantization_type is QuantizationType.HALF:
                data_type = "fp16"
            else:
                data_type = "fp32"
            model_copy.core_model = detect_and_swap_model(
                model_copy.core_model,
                data_type=data_type,
                remove_padding=False,
            )
            if self.device.type is DeviceType.GPU:
                model_copy.cuda()

        return super()._compile_model(
            model_copy, input_data, quantization_type
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/__init__.py
================================================
import os

from nebullvm.operations.optimizations.compilers.faster_transformer.bert.modeling_bert import (  # noqa: E501
    BertModel as FasterBertModel,
)
from nebullvm.operations.optimizations.compilers.faster_transformer.bert.modeling_bert import (  # noqa: E501
    CustomEncoder,
    EncoderWeights,
)
from nebullvm.operations.optimizations.compilers.utils import (
    get_faster_transformer_repo_path,
)
from nebullvm.optional_modules.huggingface import BertModel as HFBertModel
from nebullvm.optional_modules.torch import torch


# Default path of the FasterTransformer torch library, used as the default
# ``lib_path`` of detect_and_swap_bert_model:
# <faster transformer repo path>/build/lib/libth_transformer.so
default_lib_path = str(
    get_faster_transformer_repo_path()
    / "build"
    / "lib"
    / "libth_transformer.so"
)


def swap_bert_encoder(model, data_type, lib_path, remove_padding=False):
    """
    Replace the encoder of the model with a custom encoder
    that uses the Faster Transformer library.

    The encoder weights are built from ``model.state_dict()``, moved to
    CUDA and, for "fp16" / "bf16", converted to the matching precision.
    The custom encoder is scripted with ``torch.jit.script`` and installed
    through ``model.replace_encoder``; nothing is returned.

    Args:
        model: Model exposing ``config``, ``state_dict()`` and
            ``replace_encoder()``.
        data_type (str): "fp16", "bf16" or any other value (no weight
            conversion).
        lib_path (str): Path of the Faster Transformer library; it is
            made absolute before being passed to the encoder.
        remove_padding (bool, optional): Forwarded to ``CustomEncoder``.
            Default: False.
    """
    config = model.config
    encoder_weights = EncoderWeights(
        config.num_hidden_layers,
        config.hidden_size,
        model.state_dict(),
    )
    encoder_weights.to_cuda()
    if data_type == "fp16":
        encoder_weights.to_half()
    elif data_type == "bf16":
        encoder_weights.to_bfloat16()

    absolute_lib_path = os.path.abspath(lib_path)
    size_per_head = config.hidden_size // config.num_attention_heads
    custom_encoder = CustomEncoder(
        config.num_hidden_layers,
        config.num_attention_heads,
        size_per_head,
        encoder_weights,
        remove_padding=remove_padding,
        path=absolute_lib_path,
    )
    model.replace_encoder(torch.jit.script(custom_encoder))


def swap_model(
    model: HFBertModel, data_type, lib_path, remove_padding=False
) -> FasterBertModel:
    """Build a ``FasterBertModel`` equivalent to ``model``.

    The bert model needs some custom code to call the custom encoder, so
    a custom bert class is instantiated from ``model.config``, loaded with
    the state dict of ``model`` and its encoder is then swapped with
    ``swap_bert_encoder``.

    Args:
        model (HFBertModel): Source model.
        data_type: Forwarded to ``swap_bert_encoder``.
        lib_path: Forwarded to ``swap_bert_encoder``.
        remove_padding (bool, optional): Forwarded to
            ``swap_bert_encoder``. Default: False.

    Returns:
        FasterBertModel: The newly created model.
    """
    faster_model = FasterBertModel(model.config)
    faster_model.load_state_dict(model.state_dict())
    swap_bert_encoder(
        faster_model,
        data_type,
        lib_path,
        remove_padding=remove_padding,
    )
    return faster_model


def detect_and_swap_bert_model(
    model, data_type, lib_path=default_lib_path, remove_padding=False
):
    """Swap ``model`` and/or its ``.bert`` attribute when they are BertModel.

    The checks are on the exact type: instances of subclasses of the
    huggingface ``BertModel`` are left untouched.

    Args:
        model: Model to inspect.
        data_type: Forwarded to ``swap_model``.
        lib_path (str, optional): Forwarded to ``swap_model``.
            Default: ``default_lib_path``.
        remove_padding (bool, optional): Forwarded to ``swap_model``.
            Default: False.

    Returns:
        The swapped model when ``model`` itself is a BertModel, otherwise
        the same object (whose ``.bert`` attribute may have been replaced).
    """
    if type(model) is HFBertModel:
        model = swap_model(model, data_type, lib_path, remove_padding)

    inner_bert = getattr(model, "bert", None)
    if type(inner_bert) is HFBertModel:
        model.bert = swap_model(
            inner_bert, data_type, lib_path, remove_padding
        )
    return model


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/checkpoint_quantization.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/checkpoint_quantization.py # noqa: E501
# Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import numpy as np
from loguru import logger

from nebullvm.optional_modules.torch import torch

# Sizing constants of the per-layer ``amaxList`` buffer built by
# checkpoint_quantization. Its length is the sum of the four values below
# plus 9 * size(1) of the "intermediate.dense.weight" tensor.

# Index at which the kernel (weight) amax values start in amaxList.
ACTIVATION_AMAX_NUM = 72
# Number of int8O gemm de-quantization entries written to amaxList.
INT8O_GEMM_NUM = 8
# Number of entries counted for the trt fused MHA amax values.
TRT_FUSED_MHA_AMAX_NUM = 3
# Extra entries counted in the buffer length; checkpoint_quantization
# does not write to them.
SCALE_RESERVE_NUM = 21


def checkpoint_quantization(
    init_dict, sparse, ths_path="./lib/libth_transformer.so"
):
    """Quantize the encoder weights of a checkpoint and add its amax lists.

    For every encoder layer found in ``init_dict`` the function:

    - checks that the quantizers listed in ``same_value_tuple_list`` share
      the same amax value;
    - replaces the six dense kernels with the output of the
      ``fastertransformer.weight_quantize`` op;
    - stores a float32 ``bert.encoder.layer.<i>.amaxList`` tensor holding
      the activation amax values (4 entries each), the kernel amax
      values, the int8O gemm de-quantization scales and the trt fused
      MHA amax values.

    Args:
        init_dict: Mapping from parameter name to tensor. It must contain
            the ``bert.encoder.layer.<i>.*`` weights together with the
            ``_amax`` entries read below. It is modified in place.
        sparse: Forwarded as last argument to ``weight_quantize``.
        ths_path (str, optional): Path of the library loaded with
            ``torch.classes.load_library``.
            Default: "./lib/libth_transformer.so".

    Returns:
        The same ``init_dict`` object, updated.
    """
    logger.info("Quantizing checkpoint ...")
    torch.classes.load_library(ths_path)
    weight_quantize = torch.ops.fastertransformer.weight_quantize

    def init_graph():
        # Find the number of layers and the amaxList length, then register
        # a zero-filled amaxList for every layer.
        layer_num = 0
        # Raw string: "\d" in a normal literal is an invalid escape
        # sequence. The compiled pattern is unchanged.
        regex = re.compile(r"layer.\d+")
        amaxTotalNum = 0
        for name, tensor_value in init_dict.items():
            if "intermediate.dense.weight" in name and amaxTotalNum == 0:
                amaxTotalNum = (
                    ACTIVATION_AMAX_NUM
                    + 9 * tensor_value.size(1)
                    + INT8O_GEMM_NUM
                    + TRT_FUSED_MHA_AMAX_NUM
                    + SCALE_RESERVE_NUM
                )
            tmp = regex.findall(name)
            if len(tmp) < 1:
                continue
            num_tmp = int(tmp[0].replace("layer.", ""))
            if layer_num < num_tmp:
                layer_num = num_tmp
        # Layer indices are zero-based.
        layer_num = layer_num + 1
        # add new var for amax
        for i in range(layer_num):
            init_dict[
                "bert.encoder.layer.{}.amaxList".format(i)
            ] = torch.zeros((amaxTotalNum,), dtype=torch.float32)
        return layer_num, amaxTotalNum

    layer_num, amaxTotalNum = init_graph()

    kernel_name_list = [
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
    ]

    # Order matters: each name takes 4 consecutive slots of amaxList.
    amax_name_list = [
        "attention.self.query._input_quantizer",
        "attention.self.query._aftergemm_quantizer",
        "attention.self.matmul_q_input_quantizer",
        "attention.self.key._aftergemm_quantizer",
        "attention.self.matmul_k_input_quantizer",
        "attention.self.value._aftergemm_quantizer",
        "attention.self.matmul_v_input_quantizer",
        "attention.self.softmax_input_quantizer",
        "attention.self.matmul_a_input_quantizer",
        "attention.output.dense._input_quantizer",
        "attention.output.dense._aftergemm_quantizer",
        "intermediate.dense._input_quantizer",
        "intermediate.dense._aftergemm_quantizer",
        "output.dense._input_quantizer",
        "output.dense._aftergemm_quantizer",
        "special_F2Bias_scale",
    ]

    # The three *_amax_list buffers are created once and reused for every
    # layer; entries are overwritten as the matching names are processed.
    int8O_gemm_weight_amax_list = [0] * INT8O_GEMM_NUM
    int8O_gemm_weight_list = [
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.self.matmul_k_input_quantizer",
        "attention.self.matmul_v_input_quantizer",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
    ]

    int8O_gemm_input_amax_list = [0] * INT8O_GEMM_NUM
    int8O_gemm_input_list = [
        "attention.self.query._input_quantizer",
        "attention.self.key._input_quantizer",
        "attention.self.value._input_quantizer",
        "attention.self.matmul_q_input_quantizer",
        "attention.self.matmul_a_input_quantizer",
        "attention.output.dense._input_quantizer",
        "intermediate.dense._input_quantizer",
        "output.dense._input_quantizer",
    ]

    int8O_gemm_output_amax_list = [0] * INT8O_GEMM_NUM
    int8O_gemm_output_list = [
        "attention.self.query._aftergemm_quantizer",
        "attention.self.key._aftergemm_quantizer",
        "attention.self.value._aftergemm_quantizer",
        "attention.self.softmax_input_quantizer",
        "attention.output.dense._input_quantizer",
        "attention.output.dense._aftergemm_quantizer",
        "intermediate.dense._aftergemm_quantizer",
        "output.dense._aftergemm_quantizer",
    ]

    same_value_tuple_list = [
        (
            "attention.self.query._input_quantizer",
            "attention.self.key._input_quantizer",
            "attention.self.value._input_quantizer",
            "attention.output.add_residual_input_quantizer",
        ),
        (
            "intermediate.dense._input_quantizer",
            "output.add_residual_input_quantizer",
        ),
    ]

    for i in range(layer_num):
        amaxList = np.zeros([amaxTotalNum]).astype(np.float32)
        amax_id = 0
        # verify some quantizers have same value.
        # input_quantizer is per-tensor quantization
        for same_value_tuple in same_value_tuple_list:
            tmp_v = init_dict[
                "bert.encoder.layer.{}.{}._amax".format(i, same_value_tuple[0])
            ].numpy()
            for same_value_name in same_value_tuple:
                tmp_v_2 = init_dict[
                    "bert.encoder.layer.{}.{}._amax".format(i, same_value_name)
                ].numpy()
                assert np.allclose(tmp_v, tmp_v_2)

        for amax_name in amax_name_list:
            if amax_name == "special_F2Bias_scale":
                if i != layer_num - 1:
                    # Uses the first quantizer of the next layer.
                    quant_max = init_dict[
                        "bert.encoder.layer.{}.{}._amax".format(
                            i + 1, amax_name_list[0]
                        )
                    ].item()
                    amax = abs(quant_max)
                else:
                    # not used, placeholder
                    amax = 1.0
                amaxList[amax_id] = amax
                amax_id += 1
                amaxList[amax_id] = amax / 127.0
                amax_id += 1
                amaxList[amax_id] = amax / 127.0 / 127.0
                amax_id += 1
                amaxList[amax_id] = 127.0 / amax
                amax_id += 1
                continue

            quant_max = init_dict[
                "bert.encoder.layer.{}.{}._amax".format(i, amax_name)
            ].item()
            amax = abs(quant_max)
            if amax_name in int8O_gemm_input_list:
                int8O_gemm_input_amax_list[
                    int8O_gemm_input_list.index(amax_name)
                ] = amax
                if amax_name == "attention.self.query._input_quantizer":
                    # The key and value input quantizers are not in
                    # amax_name_list; they reuse the query input amax.
                    int8O_gemm_input_amax_list[
                        int8O_gemm_input_list.index(
                            "attention.self.key._input_quantizer"
                        )
                    ] = amax
                    int8O_gemm_input_amax_list[
                        int8O_gemm_input_list.index(
                            "attention.self.value._input_quantizer"
                        )
                    ] = amax
            if amax_name in int8O_gemm_output_list:
                int8O_gemm_output_amax_list[
                    int8O_gemm_output_list.index(amax_name)
                ] = amax
            if amax_name in int8O_gemm_weight_list:
                int8O_gemm_weight_amax_list[
                    int8O_gemm_weight_list.index(amax_name)
                ] = amax
            # Four slots per quantizer: amax, amax/127, amax/127^2, 127/amax.
            amaxList[amax_id] = amax
            amax_id += 1
            amaxList[amax_id] = amax / 127.0
            amax_id += 1
            amaxList[amax_id] = amax / 127.0 / 127.0
            amax_id += 1
            amaxList[amax_id] = 127.0 / amax
            amax_id += 1

        # 16 names in amax_name_list, 4 slots each.
        # kernel amax starts from ACTIVATION_AMAX_NUM
        assert amax_id == 64
        amax_id = ACTIVATION_AMAX_NUM
        for kernel_name in kernel_name_list:
            kernel = (
                init_dict[
                    "bert.encoder.layer.{}.{}.weight".format(i, kernel_name)
                ]
                .transpose(-1, -2)
                .contiguous()
            )
            quant_max2 = init_dict[
                "bert.encoder.layer.{}.{}._weight_quantizer._amax".format(
                    i, kernel_name
                )
            ]
            amax2 = abs(quant_max2)
            if amax2.dim() == 0:
                # Scalar amax: repeat it once per column of the kernel.
                quant_max_processed = torch.full(
                    (kernel.size(1),),
                    amax2.item(),
                    dtype=amax2.dtype,
                    device=amax2.device,
                )
            else:
                quant_max_processed = amax2.view(-1)
            kernel_processed = weight_quantize(
                kernel, quant_max_processed.cuda(), sparse
            )
            init_dict[
                "bert.encoder.layer.{}.{}.weight".format(i, kernel_name)
            ] = kernel_processed
            if kernel_name in int8O_gemm_weight_list:
                int8O_gemm_weight_amax_list[
                    int8O_gemm_weight_list.index(kernel_name)
                ] = quant_max_processed[0]
            for e in quant_max_processed:
                amaxList[amax_id] = e
                amax_id += 1

        # for int8O gemm deQuant
        for j in range(INT8O_GEMM_NUM):
            amaxList[amax_id] = (
                int8O_gemm_input_amax_list[j] * int8O_gemm_weight_amax_list[j]
            ) / (127.0 * int8O_gemm_output_amax_list[j])
            amax_id += 1

        # for trt fused MHA amax
        # QKV_addBias_amax: max of the matmul q/k/v input amax values
        # (slots 8, 16 and 24 of amaxList).
        amaxList[amax_id] = np.maximum(
            np.maximum(amaxList[8], amaxList[16]), amaxList[24]
        )
        amax_id += 1
        # softmax amax (slot 32: matmul_a_input_quantizer)
        amaxList[amax_id] = amaxList[32]
        amax_id += 1
        # bmm2 amax (slot 36: attention.output.dense._input_quantizer)
        amaxList[amax_id] = amaxList[36]
        amax_id += 1

        init_dict["bert.encoder.layer.{}.amaxList".format(i)] = torch.tensor(
            amaxList, dtype=torch.float32
        )
    logger.info("Quantizing checkpoint done.")
    return init_dict


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/modeling_bert.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/modeling_bert.py  # noqa: E501
# This file is mostly copied from the FasterTransformer repo
# https://github.com/NVIDIA/FasterTransformer
# Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional

from loguru import logger

from nebullvm.optional_modules.torch import torch, torch_distributed as dist

from nebullvm.optional_modules.huggingface import (
    BertConfig,
    BertEmbeddings,
    BertEncoder,
    BertPooler,
    BertPreTrainedModel,
)

from .checkpoint_quantization import checkpoint_quantization


class EncoderWeights(object):
    def __init__(
        self,
        layer_num,
        hidden_dim,
        weights=None,
        sparse=False,
        tensor_para_size=1,
        pipeline_para_size=1,
    ):
        """weights need be a state_dict of bert model"""
        self.layer_num = layer_num
        self.int8 = False
        self.hidden_dim = hidden_dim
        self.weights = {}
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size

        self.use_mpi = dist.is_mpi_available()

        if self.use_mpi:
            try:
                dist.init_process_group(backend="mpi")
            except:  # noqa: E722
                logger.info(
                    "[INFO] WARNING: Exception occurred in "
                    "dist.init_process_group(backend='mpi')."
                    "Maybe the process group has been initialized somewhere else."  # noqa: E501
                )
        else:
            logger.info("[INFO] MPI is not available in this PyTorch build.")
            assert (
                tensor_para_size == 1
            ), "[FATAL] MPI is required for tensor_para_size > 1."
            assert (
                pipeline_para_size == 1
            ), "[FATAL] MPI is required for pipeline_para_size > 1."

        self.rank = dist.get_rank() if self.use_mpi else 0
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size() if self.use_mpi else 1  # noqa: F841
        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size
        if weights is None:
            self._generated_weights = True
            for i in range(layer_num):
                pre = "encoder.layer." + str(i) + "."
                self.weights[
                    pre + "attention.self.query.weight"
                ] = torch.zeros(hidden_dim, hidden_dim)
                self.weights[pre + "attention.self.query.bias"] = torch.zeros(
                    hidden_dim
                )
                self.weights[pre + "attention.self.key.weight"] = torch.zeros(
                    hidden_dim, hidden_dim
                )
                self.weights[pre + "attention.self.key.bias"] = torch.zeros(
                    hidden_dim
                )
                self.weights[
                    pre + "attention.self.value.weight"
                ] = torch.zeros(hidden_dim, hidden_dim)
                self.weights[pre + "attention.self.value.bias"] = torch.zeros(
                    hidden_dim
                )
                self.weights[
                    pre + "attention.output.dense.weight"
                ] = torch.zeros(hidden_dim, hidden_dim)
                self.weights[
                    pre + "attention.output.dense.bias"
                ] = torch.zeros(hidden_dim)
                self.weights[
                    pre + "attention.output.LayerNorm.weight"
                ] = torch.zeros(hidden_dim)
                self.weights[
                    pre + "attention.output.LayerNorm.bias"
                ] = torch.zeros(hidden_dim)
                self.weights[pre + "intermediate.dense.weight"] = torch.zeros(
                    4 * hidden_dim, hidden_dim
                )  # noqa: E501
                self.weights[pre + "intermediate.dense.bias"] = torch.zeros(
                    4 * hidden_dim
                )
                self.weights[pre + "output.dense.weight"] = torch.zeros(
                    hidden_dim, 4 * hidden_dim
                )
                self.weights[pre + "output.dense.bias"] = torch.zeros(
                    hidden_dim
                )
                self.weights[pre + "output.LayerNorm.weight"] = torch.zeros(
                    hidden_dim
                )
                self.weights[pre + "output.LayerNorm.bias"] = torch.zeros(
                    hidden_dim
                )
            for k, v in self.weights.items():
                if not k.endswith("_amax"):
                    self.weights[k] = torch.nn.init.uniform_(v, -1, 1)
            if sparse:
                for k, v in self.weights.items():
                    if (
                        "query.weight" in k
                        or "key.weight" in k
                        or "value.weight" in k
                        or "dense.weight" in k
                    ):
                        v_shape = v.shape
                        v = v.view(-1, 4)
                        _, indices = torch.topk(
                            torch.abs(v), 2, dim=-1, largest=False
                        )
                        v.scatter_(1, indices, 0)
                        self.weights[k] = v.view(v_shape)
        else:
            self._generated_weights = False
            for k, v in weights.items():
                ks = k.split(".")
                if ks[-2] == "LayerNorm":
                    if ks[-1] == "gamma":
                        ks[-1] = "weight"
                    elif ks[-1] == "beta":
                        ks[-1] = "bias"
                self.weights[".".join(ks)] = v

    def listed_weights(self):
        ret = []
        start_layer = (
            self.pipeline_para_rank * self.layer_num // self.pipeline_para_size
        )
        end_layer = (
            (self.pipeline_para_rank + 1)
            * self.layer_num
            // self.pipeline_para_size
        )
        if not self.int8:
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.query.weight"
                        ].transpose(-1, -2)
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )  # 0
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.query.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.key.weight"
                        ].transpose(-1, -2)
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )  # 2
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.key.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.value.weight"
                        ].transpose(-1, -2)
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )  # 4
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.value.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.dense.weight"
                        ].transpose(-1, -2)
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )  # 6
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[1] // self.tensor_para_size, dim=1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.dense.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.LayerNorm.weight"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.LayerNorm.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "intermediate.dense.weight"
                        ].transpose(-1, -2)
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )  # 10
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "intermediate.dense.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[-1] // self.tensor_para_size, dim=-1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.dense.weight"
                        ].transpose(-1, -2)
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )  # 12
            ret[-1] = (
                ret[-1]
                .split(ret[-1].shape[1] // self.tensor_para_size, dim=1)[
                    self.tensor_para_rank
                ]
                .contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.dense.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.LayerNorm.weight"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.LayerNorm.bias"
                        ]
                        for layer_idx in range(start_layer, end_layer)
                    ],
                    0,
                ).contiguous()
            )
        else:
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.query.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )  # 0
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.query.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.key.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )  # 2
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.key.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.value.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )  # 4
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.self.value.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.dense.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )  # 6
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.dense.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.LayerNorm.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "attention.output.LayerNorm.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "intermediate.dense.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )  # 10
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "intermediate.dense.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.dense.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )  # 12
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.dense.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.LayerNorm.weight"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "output.LayerNorm.bias"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "amaxList"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
            ret.append(
                torch.stack(
                    [
                        self.weights[
                            "encoder.layer."
                            + str(layer_idx)
                            + "."
                            + "h_amaxList"
                        ]
                        for layer_idx in range(self.layer_num)
                    ],
                    0,
                ).contiguous()
            )
        return ret

    def to_cuda(self):
        """Move every tensor in ``self.weights`` to the GPU with ``.cuda()``.

        In int8 mode each entry whose key contains ``amaxList`` is also
        retained, exactly as it was before the move, under the same key with
        ``amaxList`` replaced by ``h_amaxList`` (presumably the host-side
        copy expected by the int8 kernels -- confirm against the extension).

        The method is safe to call more than once: retained ``h_amaxList``
        entries are neither moved to the GPU nor re-derived, so no
        ``h_h_amaxList`` keys appear and the retained copies are not
        overwritten by tensors that already went through ``.cuda()``.
        """
        if not self.int8:
            for name, tensor in self.weights.items():
                self.weights[name] = tensor.cuda()
            return

        retained = {}
        for name, tensor in self.weights.items():
            if "h_amaxList" in name:
                # Written by an earlier call; must stay as it is.
                continue
            if "amaxList" in name:
                retained_name = name.replace("amaxList", "h_amaxList")
                if retained_name not in self.weights:
                    retained[retained_name] = tensor
            self.weights[name] = tensor.cuda()
        # New keys are added only after the iteration: a dict must not change
        # size while it is being iterated.
        self.weights.update(retained)

    def to_half(self):
        """Replace every tensor in ``self.weights`` by its ``.half()`` cast.

        Raises:
            RuntimeError: if the weights were already converted to int8.
        """
        if self.int8:
            raise RuntimeError(
                "Cannot cast to half if the weights have been casted to int8."
            )
        # Iterate over a snapshot of the keys; values are swapped one by one.
        for name in tuple(self.weights):
            self.weights[name] = self.weights[name].half()

    def to_bfloat16(self):
        """Replace every tensor in ``self.weights`` by its ``.bfloat16()`` cast.

        Raises:
            RuntimeError: if the weights were already converted to int8.
        """
        if self.int8:
            raise RuntimeError(
                "Cannot cast to bfloat16 if the weights have been casted to int8."  # noqa: E501
            )
        # Iterate over a snapshot of the keys; values are swapped one by one.
        for name in tuple(self.weights):
            self.weights[name] = self.weights[name].bfloat16()

    def to_int8(self, sparse=False, ths_path="./lib/libth_transformer.so"):
        """Quantize the stored weights through ``checkpoint_quantization``.

        When ``self._generated_weights`` is set, placeholder ``_amax``
        entries filled with 127.0 are first written for every quantizer of
        every layer. The call is a no-op (apart from those placeholders) if
        ``self.int8`` is already true.

        Args:
            sparse: forwarded unchanged to ``checkpoint_quantization``.
            ths_path: forwarded unchanged to ``checkpoint_quantization``.

        Raises:
            RuntimeError: if ``self.weights`` holds no ``_amax`` entry for the
                query input quantizer of layer 0.
        """
        if self._generated_weights:
            # Built once; the very same tensor objects are stored under every
            # weight-quantizer key of every layer.
            hidden_amax = torch.Tensor(self.hidden_dim).fill_(127.0)
            wide_amax = torch.Tensor(self.hidden_dim * 4).fill_(127.0)

            def gemm(module, weight_amax):
                # The three quantizers attached to one dense module.
                return (
                    (module + "._input_quantizer", None),
                    (module + "._weight_quantizer", weight_amax),
                    (module + "._aftergemm_quantizer", None),
                )

            # (quantizer name, shared tensor) in insertion order; ``None``
            # means "store a fresh scalar tensor(127.0)".
            layout = (
                *gemm("attention.self.query", hidden_amax),
                *gemm("attention.self.key", hidden_amax),
                *gemm("attention.self.value", hidden_amax),
                ("attention.self.matmul_q_input_quantizer", None),
                ("attention.self.matmul_k_input_quantizer", None),
                ("attention.self.matmul_v_input_quantizer", None),
                ("attention.self.matmul_a_input_quantizer", None),
                ("attention.self.softmax_input_quantizer", None),
                *gemm("attention.output.dense", hidden_amax),
                ("attention.output.add_local_input_quantizer", None),
                ("attention.output.add_residual_input_quantizer", None),
                *gemm("intermediate.dense", wide_amax),
                *gemm("output.dense", hidden_amax),
                ("output.add_local_input_quantizer", None),
                ("output.add_residual_input_quantizer", None),
            )
            for layer_idx in range(self.layer_num):
                for quantizer, shared in layout:
                    key = f"encoder.layer.{layer_idx}.{quantizer}._amax"
                    if shared is None:
                        self.weights[key] = torch.tensor(127.0)
                    else:
                        self.weights[key] = shared

        probe = "encoder.layer.0.attention.self.query._input_quantizer._amax"
        if probe not in self.weights:
            raise RuntimeError(
                "There is no quantization node in the checkpoint, cannot be quantized to int8."  # noqa: E501
            )
        if self.int8:
            return
        self.int8 = True

        for name, tensor in self.weights.items():
            if name.endswith(("bias", "LayerNorm.weight")):
                converted = tensor.half()
            elif name.endswith("weight"):
                converted = tensor.float().cuda()
            else:
                converted = tensor.float().cpu()
            self.weights[name] = converted
        self.weights = checkpoint_quantization(
            self.weights, sparse, ths_path, verbose=False
        )


class CustomEncoder(torch.nn.Module):
    """Encoder backed by the FasterTransformer BERT torch extension.

    The shared library at ``path`` is loaded with
    ``torch.classes.load_library`` and one of its BERT classes is
    instantiated from ``weights.listed_weights()``.

    Args:
        layer_num: number of encoder layers, forwarded to the extension.
        head_num: number of attention heads, forwarded to the extension.
        head_size: size of one attention head, forwarded to the extension.
        weights: object exposing ``listed_weights()``; the returned list must
            hold 16 entries when ``int8_mode == 0`` and 18 otherwise.
        int8_mode: ``0`` selects the ``Bert`` class, any other value the
            ``INT8Bert`` class (which also receives the value itself).
        remove_padding: forwarded to the extension.
        sparse: forwarded to the extension.
        path: location of the FasterTransformer shared library.
        tensor_para_size: must be 1 when MPI is unavailable or in int8 mode.
        pipeline_para_size: must be 1 when MPI is unavailable or in int8 mode.

    Raises:
        AssertionError: if the number of listed weights does not match the
            selected mode, or if a parallel size other than 1 is requested
            where it is not supported.
    """

    def __init__(
        self,
        layer_num,
        head_num,
        head_size,
        weights,
        int8_mode=0,
        remove_padding=False,
        sparse=False,
        path="./lib/libth_transformer.so",
        tensor_para_size=1,
        pipeline_para_size=1,
    ):
        super().__init__()
        self.layer_num = layer_num
        self.remove_padding = remove_padding
        self.int8_mode = int8_mode
        logger.info(f"loading faster transformer library from {path}")
        torch.classes.load_library(path)

        weights_ = weights.listed_weights()

        self.use_mpi = dist.is_mpi_available()

        if self.use_mpi:
            try:
                dist.init_process_group(backend="mpi")
            except Exception:
                # Best effort: a failure here is only reported, not raised.
                # ``Exception`` (instead of a bare ``except``) lets
                # KeyboardInterrupt / SystemExit propagate.
                logger.info(
                    "[INFO] WARNING: Exception occurred in "
                    "dist.init_process_group(backend='mpi'). "
                    "Maybe the process group has been initialized somewhere else."  # noqa: E501
                )
        else:
            logger.info("[INFO] MPI is not available in this PyTorch build.")
            assert (
                tensor_para_size == 1
            ), "[FATAL] MPI is required for tensor_para_size > 1."
            assert (
                pipeline_para_size == 1
            ), "[FATAL] MPI is required for pipeline_para_size > 1."

        if int8_mode == 0:
            assert len(weights_) == 16
            ctor_args = (
                *weights_,
                head_num,
                head_size,
                4 * head_num * head_size,  # four times head_num * head_size
                remove_padding,
                layer_num,
                sparse,
                1.0,  # presumably FasterTransformer's q_scaling -- confirm
                tensor_para_size,
                pipeline_para_size,
            )
            try:
                self.encoders = torch.classes.FasterTransformer.Bert(
                    *ctor_args
                )
            except Exception:
                # legacy ths for 20.03 image
                self.encoders = torch.classes.FasterTransformerBert(
                    *ctor_args
                )
        else:
            assert len(weights_) == 18
            assert (
                tensor_para_size == 1
            ), "INT8 BERT still only support tensor_para_size = 1"
            assert (
                pipeline_para_size == 1
            ), "INT8 BERT still only support pipeline_para_size = 1"
            ctor_args = (
                *weights_,
                head_num,
                head_size,
                remove_padding,
                layer_num,
                int8_mode,
                sparse,
                1.0,  # presumably FasterTransformer's q_scaling -- confirm
            )
            try:
                self.encoders = torch.classes.FasterTransformer.INT8Bert(
                    *ctor_args
                )
            except Exception:
                # legacy ths for 20.03 image
                self.encoders = torch.classes.FasterTransformerINT8Bert(
                    *ctor_args
                )

    def forward(self, hidden_states, attention_mask, sequence_lengths):
        """Run the extension encoder and return a 1-tuple with its output.

        ``attention_mask`` is accepted but not used: only ``hidden_states``
        and ``sequence_lengths`` are handed to the extension.
        """
        hidden_states = self.encoders.forward(hidden_states, sequence_lengths)
        return (hidden_states,)


class HuggingFaceEncoder(torch.nn.Module):
    """Reference encoder built from the HuggingFace ``BertEncoder``.

    Args:
        layer_num: number of hidden layers of the ``BertConfig``.
        head_num: number of attention heads.
        head_size: size of one head; the hidden size is
            ``head_num * head_size`` and the intermediate size four times that.
        weights: optional object whose ``weights`` attribute maps parameter
            names to tensors. Entries starting with ``"encoder."`` (except
            ``_amax`` quantizer entries) are loaded into the encoder with
            that prefix removed. When ``None`` the encoder keeps the
            parameters it was initialised with.
    """

    def __init__(self, layer_num, head_num, head_size, weights=None):
        super().__init__()
        hidden_dim = head_num * head_size
        # TODO(bhsueh) The implementation of hidden_act='gelu' is different
        #  to FT's (and google BERT) implementation
        # FT's implementation is equivalent to hidden_act='gelu_new',
        # but there are some issues for int8 sparse under gelu_new
        conf = BertConfig(
            hidden_size=hidden_dim,
            intermediate_size=4 * hidden_dim,
            num_attention_heads=head_num,
            num_hidden_layers=layer_num,
            hidden_act="gelu",
        )
        self.encoder = BertEncoder(conf)
        if weights is not None:
            self.encoder.load_state_dict(
                self._encoder_state_dict(weights.weights)
            )
        self.head_mask = [None] * layer_num

    @staticmethod
    def _encoder_state_dict(weight_map):
        """Select the encoder entries of ``weight_map`` and strip the prefix.

        Keys look like ``encoder.layer.<i>...`` while the encoder's own state
        dict uses ``layer.<i>...``, so exactly the ``"encoder."`` prefix is
        removed. Entries ending in ``_amax`` are dropped.
        """
        prefix = "encoder."
        return {
            name[len(prefix):]: tensor
            for name, tensor in weight_map.items()
            if name.startswith(prefix) and not name.endswith("_amax")
        }

    def forward(self, hidden_states, attention_mask):
        """Run the HuggingFace encoder with an additive attention mask."""
        # 0 where the mask is 1, -10000 where the mask is 0.
        extended_attention_mask = (1.0 - attention_mask) * -10000.0
        output = self.encoder(
            hidden_states,
            extended_attention_mask,
            self.head_mask,
            return_dict=False,
        )
        return output


# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/modeling_bert.py # noqa: E501
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # noqa: E501
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model modified from HuggingFace transformers. """


class BertModel(BertPreTrainedModel):
    """BERT model (embeddings, encoder, pooler) with a swappable encoder.

    After ``replace_encoder`` the model calls the new encoder positionally
    with ``(embeddings, mask, sequence_lengths)`` instead of the HuggingFace
    keyword interface.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

        self.init_weights()
        # Flipped by ``replace_encoder``.
        self.use_ext_encoder = False

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Embed the inputs, run the encoder and pool its first output.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given.
        ``attention_mask`` defaults to all ones and ``token_type_ids`` to all
        zeros. The remaining keyword arguments are accepted but not read
        (a passed ``head_mask`` is replaced by a list of ``None``).

        Returns:
            Tuple ``(sequence_output, pooled_output, *encoder_outputs[1:])``.

        Raises:
            ValueError: if both or neither of ``input_ids`` and
                ``inputs_embeds`` are given, or if the attention mask is
                neither 2-D nor 3-D (built-in encoder only).
        """
        have_ids = input_ids is not None
        have_embeds = inputs_embeds is not None
        if have_ids and have_embeds:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"  # noqa: E501
            )
        if have_ids:
            input_shape = input_ids.size()
            device = input_ids.device
        elif have_embeds:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device
        else:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds"
            )

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(
                input_shape, dtype=torch.long, device=device
            )

        if self.use_ext_encoder:
            # Only 2-D masks are handled on this path.
            assert attention_mask.dim() == 2
            row_mask = attention_mask.view(-1, 1, 1, attention_mask.size(-1))
            # Product of the mask with its transpose over the last two axes.
            pair_mask = row_mask * row_mask.transpose(-1, -2)
            extended_attention_mask = pair_mask.to(
                dtype=next(self.parameters()).dtype
            )  # fp16 compatibility
            seq_lens = torch.sum(attention_mask, 1, dtype=torch.int32).cuda()
        else:
            # A 3-D mask is taken as given and only gains a head axis; a 2-D
            # mask gains two singleton axes so it broadcasts.
            mask_rank = attention_mask.dim()
            if mask_rank == 3:
                broadcast_mask = attention_mask[:, None, :, :]
            elif mask_rank == 2:
                broadcast_mask = attention_mask[:, None, None, :]
            else:
                raise ValueError(
                    "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(  # noqa: E501
                        input_shape, attention_mask.shape
                    )
                )
            broadcast_mask = broadcast_mask.to(
                dtype=next(self.parameters()).dtype
            )  # fp16 compatibility
            # 1.0 -> 0.0 (attend), 0.0 -> -10000.0 (masked); the result is
            # passed to the encoder as its attention mask.
            extended_attention_mask = (1.0 - broadcast_mask) * -10000.0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )

        if self.use_ext_encoder:
            encoder_outputs = self.encoder(
                embedding_output, extended_attention_mask, seq_lens
            )
        else:
            head_mask = [None] * self.config.num_hidden_layers
            encoder_outputs = self.encoder(
                embedding_output,
                attention_mask=extended_attention_mask,
                head_mask=head_mask,
            )

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        # sequence_output, pooled_output, (hidden_states), (attentions)
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    def replace_encoder(self, new_encoder):
        """Install ``new_encoder`` and switch ``forward`` to the external path."""
        self.encoder = new_encoder
        self.use_ext_encoder = True


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/__init__.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/gpt_summarization.py # noqa: E501
# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
from typing import Callable, Iterable, List, Optional, Tuple, Union

from nebullvm.operations.optimizations.compilers.faster_transformer.gpt.utils import \
    gpt_decoder
from nebullvm.operations.optimizations.compilers.faster_transformer.gpt.utils.huggingface_gpt_convert import (  # noqa: E501
    main as convert_huggingface_gpt_to_faster_transformer,
)
from nebullvm.operations.optimizations.compilers.utils import (
    get_faster_transformer_repo_path,
)
from nebullvm.optional_modules.huggingface import GPT2LMHeadModel
from nebullvm.optional_modules.torch import torch

# Location of ``libth_transformer.so`` under ``build/lib`` of the path
# returned by ``get_faster_transformer_repo_path()``. It is passed as
# ``lib_path`` to ``gpt_decoder.Gpt`` in ``convert_gpt2_lm_head_model``.
lib_path = default_lib_path = str(
    get_faster_transformer_repo_path()
    / "build"
    / "lib"
    / "libth_transformer.so"
)


class FasterTransformerGPT2Wrapper(torch.nn.Module):
    """Wrap a FasterTransformer GPT model behind a HF-style ``generate``.

    The wrapper keeps the FasterTransformer model, the original Hugging Face
    config (used to infer the default generation length) and the model's
    device.
    """

    def __init__(self, model: gpt_decoder.Gpt, config):
        """Store the FasterTransformer ``model`` and the HF ``config``."""
        super().__init__()
        self.model = model
        self.config = config
        self.device = model.device

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        max_length: Optional[int] = None,
        min_length: Optional[int] = None,
        do_sample: Optional[bool] = None,
        early_stopping: Optional[bool] = None,
        num_beams: Optional[int] = 1,
        temperature: Optional[float] = None,
        penalty_alpha: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        typical_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        bad_words_ids: Optional[Iterable[int]] = None,
        force_words_ids: Optional[
            Union[Iterable[int], Iterable[Iterable[int]]]
        ] = None,
        bos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        length_penalty: Optional[float] = None,
        no_repeat_ngram_size: Optional[int] = None,
        encoder_no_repeat_ngram_size: Optional[int] = None,
        num_return_sequences: Optional[int] = None,
        max_time: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
        decoder_start_token_id: Optional[int] = None,
        use_cache: Optional[bool] = None,
        num_beam_groups: Optional[int] = None,
        diversity_penalty: Optional[float] = None,
        prefix_allowed_tokens_fn: Optional[
            Callable[[int, torch.Tensor], List[int]]
        ] = None,
        # logits_processor: Optional[LogitsProcessorList] = None,
        # renormalize_logits: Optional[bool] = None,
        # stopping_criteria: Optional[StoppingCriteriaList] = None,
        # constraints: Optional[List[Constraint]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        forced_bos_token_id: Optional[int] = None,
        forced_eos_token_id: Optional[int] = None,
        remove_invalid_values: Optional[bool] = None,
        synced_gpus: Optional[bool] = False,
        exponential_decay_length_penalty: Optional[Tuple[int, float]] = None,
        suppress_tokens: Optional[List[int]] = None,
        begin_suppress_tokens: Optional[List[int]] = None,
        forced_decoder_ids: Optional[List[List[int]]] = None,
    ):
        """Generate token ids with the wrapped FasterTransformer model.

        Only ``inputs``, ``max_length``, ``max_new_tokens``, ``min_length``,
        ``num_beams``, ``temperature``, ``top_k``, ``top_p``,
        ``repetition_penalty``, ``eos_token_id`` and ``length_penalty`` are
        forwarded to the underlying model. Every other argument is accepted
        only for signature compatibility and is ignored.

        Args:
            inputs: Batch of input token ids; required despite the ``None``
                default.
            max_length: Total length (prompt + generated tokens). Defaults
                to ``self.config.n_ctx`` when neither ``max_length`` nor
                ``max_new_tokens`` is given.
            max_new_tokens: Number of tokens to generate on top of the
                prompt. Takes precedence over ``max_length`` when set.

        Returns:
            A list with one tensor per batch element containing the first
            beam's token ids truncated to the reported output length.

        Raises:
            ValueError: If ``inputs`` is ``None``.
        """
        if inputs is None:
            # Fail early with a clear message instead of a TypeError raised
            # while iterating over ``None``.
            raise ValueError("`inputs` must be provided to `generate`.")

        batch_size = len(inputs)
        input_lengths = torch.tensor(
            [len(sequence) for sequence in inputs],
            dtype=torch.int32,
            device=self.model.device,
        )

        def convert_to_tensor_if_not(value, dtype=torch.float32):
            # Broadcast a scalar to one value per batch element; tensors and
            # unset (None) values are passed through untouched.
            if value is None or isinstance(value, torch.Tensor):
                return value
            return value * torch.ones(batch_size, dtype=dtype)  # cpu tensor

        top_k = convert_to_tensor_if_not(top_k, dtype=torch.int32)
        top_p = convert_to_tensor_if_not(top_p, dtype=torch.float32)
        temperature = convert_to_tensor_if_not(
            temperature, dtype=torch.float32
        )
        repetition_penalty = convert_to_tensor_if_not(
            repetition_penalty, dtype=torch.float32
        )
        min_length = convert_to_tensor_if_not(min_length, dtype=torch.int32)
        len_penalty = convert_to_tensor_if_not(
            length_penalty, dtype=torch.float32
        )

        # gen_length is required for faster transformer.
        if max_new_tokens is not None:
            # Explicit budget of new tokens wins over max_length.
            gen_length = max_new_tokens
        else:
            if max_length is None:
                # infer it from the model config
                max_length = self.config.n_ctx
            gen_length = max_length - len(inputs[0])

        output_dict = self.model.generate(
            input_token_ids=inputs,
            input_lengths=input_lengths,
            gen_length=gen_length,
            eos_token_id=eos_token_id,
            # local_batch_size=None,
            beam_width=num_beams,
            top_k=top_k,
            top_p=top_p,
            # top_p_decay: Optional[torch.FloatTensor] = None,
            # top_p_min: Optional[torch.FloatTensor] = None,
            # top_p_reset_ids: Optional[torch.IntTensor] = None,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # presence_penalty: Optional[torch.FloatTensor] = None,
            min_length=min_length,
            len_penalty=len_penalty,
            # beam_search_diversity_rate: Optional[torch.FloatTensor] = None,
            # stop_words_list: Optional[torch.IntTensor] = None,
            # bad_words_list: Optional[torch.IntTensor] = None,
            # sequence_limit_lengths: Optional[torch.IntTensor] = None,
            # random_seed: Optional[torch.LongTensor] = None,
            # memory_length: Optional[int] = None,
            return_output_length=True,
            return_log_probs=False,
        )
        output_token_ids = output_dict["output_token_ids"]
        output_lengths = output_dict["output_lengths"]
        # Keep the first beam of every batch element, prompt included, cut
        # at the length reported by the model.
        tokens = [
            output_token_ids[i, 0, : output_lengths[i]]
            for i in range(batch_size)
        ]
        return tokens


def convert_gpt2_lm_head_model(
    model: GPT2LMHeadModel,
    tokenizer,
    weight_data_type="fp32",
    data_type="fp16",
    use_fp32_to_compute_logit=False,
):
    """Convert a Hugging Face GPT-2 LM-head model to FasterTransformer.

    The Hugging Face weights are exported to a temporary directory, loaded
    into a ``gpt_decoder.Gpt`` instance and wrapped so that the result
    exposes a ``generate`` method. The temporary directory is removed as
    soon as the weights have been loaded.

    Currently doesn't support fp8 or multi-gpu.

    Args:
        model: The Hugging Face model; ``model.transformer`` is exported.
        tokenizer: Tokenizer whose ``vocab_size`` is used for the FT model.
        weight_data_type: Data type used for the exported weight files.
        data_type: Inference data type passed to the FT model.
        use_fp32_to_compute_logit: Forwarded to ``gpt_decoder.Gpt``.

    Returns:
        A ``FasterTransformerGPT2Wrapper`` around the loaded model.
    """
    weights_data_type = weight_data_type
    hf_config = model.config.to_dict()

    # Use a context manager so the exported checkpoint is deleted
    # deterministically instead of relying on the finalizer of a dangling
    # TemporaryDirectory object.
    with tempfile.TemporaryDirectory() as temp_dir_path:
        ft_model_location = saved_dir = os.path.join(temp_dir_path, "gpt2")
        # convert huggingface model to faster transformer model
        convert_huggingface_gpt_to_faster_transformer(
            saved_dir=saved_dir,
            model=model.transformer,
            weight_data_type=weight_data_type,
        )

        head_num = hf_config["n_head"]
        layer_num = hf_config["n_layer"]
        start_id = hf_config["bos_token_id"]
        end_id = hf_config["eos_token_id"]
        size_per_head = hf_config["n_embd"] // head_num

        vocab_size = tokenizer.vocab_size

        tensor_para_size = 1
        pipeline_para_size = 1
        ckpt_path = os.path.join(
            ft_model_location, f"{tensor_para_size}-gpu"
        )
        max_seq_len = hf_config["n_ctx"]
        int8_mode = 0  # 0: no quantization, 1: quantize weights to int8
        # load faster transformer model, note that the lm_head is not saved
        # it's reconstructed during loading from the embedding weights
        gpt = gpt_decoder.Gpt(
            num_heads=head_num,
            size_per_head=size_per_head,
            num_layers=layer_num,
            vocab_size=vocab_size,
            start_id=start_id,
            end_id=end_id,
            tensor_para_size=tensor_para_size,
            pipeline_para_size=pipeline_para_size,
            lib_path=lib_path,
            max_seq_len=max_seq_len,
            int8_mode=int8_mode,
            inference_data_type=data_type,
            weights_data_type=weights_data_type,
            use_fp32_to_compute_logit=use_fp32_to_compute_logit,
        )
        gpt.load(ckpt_path, data_type)
    return FasterTransformerGPT2Wrapper(gpt, model.config)


# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token
# model = hf_model = GPT2LMHeadModel.from_pretrained("gpt2").to("cuda").eval()
# hf_config = hf_model.config.to_dict()


# model = GPT2LMHeadModel.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# weight_data_type = weights_data_type = "fp32" # fp32 or fp16
# data_type = "fp32" # fp32 or fp16
# faster_model= convert_gpt2_lm_head_model(
# model, tokenizer,
# weight_data_type=weight_data_type,
# data_type=data_type)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/gpt_decoder.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/gpt_decoder.py # noqa: E501
# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from abc import abstractmethod
from pathlib import Path
from typing import List, Literal, Optional, Union
import os

import numpy as np

from . import comm
from . import profiler
from .gpt import GptInitModelParameters

from nebullvm.optional_modules.torch import torch

# Type alias for path arguments: either a plain string or a ``pathlib.Path``.
PathLike = Union[str, Path]


def to_numpy_dtype(maybe_str_dtype: Union[str, np.dtype]):
    """Translate a data-type name into the matching numpy type.

    ``np.dtype`` instances are returned unchanged. Strings must be one of
    ``"fp16"``, ``"float16"``, ``"fp32"`` or ``"float32"``.

    Raises:
        AssertionError: If the argument is neither a ``str`` nor an
            ``np.dtype``.
        ValueError: If the string is not a known data-type name.
    """
    assert isinstance(maybe_str_dtype, (str, np.dtype))
    if not isinstance(maybe_str_dtype, str):
        # Already a numpy dtype: nothing to translate.
        return maybe_str_dtype

    half, single = np.float16, np.float32
    known_names = {
        "fp16": half,
        "float16": half,
        "fp32": single,
        "float32": single,
    }
    try:
        return known_names[maybe_str_dtype]
    except KeyError:
        raise ValueError(
            f"Cannot convert to numpy data type, got {maybe_str_dtype}"
        )


def to_torch_dtype(maybe_str_dtype: Union[str, torch.dtype]):
    """Translate a data-type name into the matching torch dtype.

    ``torch.dtype`` instances are returned unchanged. Any other value is
    looked up among ``"bf16"``/``"bfloat16"``, ``"fp16"``/``"float16"`` and
    ``"fp32"``/``"float32"``.

    Raises:
        ValueError: If the name is not one of the supported ones.
    """
    if isinstance(maybe_str_dtype, torch.dtype):
        return maybe_str_dtype

    # Short and long spellings map to the same torch dtype.
    known_names = {}
    for short, long, torch_dtype in (
        ("bf16", "bfloat16", torch.bfloat16),
        ("fp16", "float16", torch.float16),
        ("fp32", "float32", torch.float32),
    ):
        known_names[short] = torch_dtype
        known_names[long] = torch_dtype

    try:
        return known_names[maybe_str_dtype]
    except KeyError:
        raise ValueError(
            f"Cannot convert to torch data type, got {maybe_str_dtype}"
        )


def load_weight_from_bin(
    checkpoint_path: PathLike,
    shape: List[int],
    weight_dtype: Union[str, np.dtype],
):
    """Read a raw binary weight file into a torch tensor.

    The file is read with ``np.fromfile`` and returned as-is, i.e. as a flat
    tensor.

    # Args.
        checkpoint_path: str or Path,
            a checkpoint file path of an FT's layer weight.
        shape: list of int, the shape of weight tensor.
            NOTE: accepted but currently not used; no reshape is applied.
        weight_dtype: str or np.dtype, the data type of the stored weight.
    """
    numpy_dtype = to_numpy_dtype(weight_dtype)
    raw_values = np.fromfile(checkpoint_path, dtype=numpy_dtype)
    return torch.from_numpy(raw_values)


# Allowed values for the ``layernorm_type`` argument of the decoder modules.
LayernormType = Literal["pre_layernorm", "post_layernorm"]


class GptLayerWeights:
    """Holder for the transformer-layer weights of a FasterTransformer GPT.

    Weights live in ``self.weights`` as a flat list grouped by weight kind
    (every local layer's input-layernorm weight first, then every local
    layer's input-layernorm bias, and so on). ``self.expected_weight_shapes``
    lists the expected shape of each entry in that same order.
    """

    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        inter_size: int,
        num_layers: int,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
    ):
        """Record the model geometry and derive the expected weight shapes.

        Args:
            num_heads: Number of attention heads; must be a multiple of
                ``tensor_para_size``.
            size_per_head: Hidden size of a single attention head.
            inter_size: Size of the feed-forward intermediate layer.
            num_layers: Total number of transformer layers.
            tensor_para_size: Tensor-parallel world size.
            pipeline_para_size: Pipeline-parallel world size.
            has_adapters: Whether adapter weights are part of the model.
            adapter_inter_size: Intermediate size of the adapters.
            int8_mode: 0 for no quantization, 1 for int8 weight quantization.
        """
        # Both halves must be f-strings, otherwise the placeholder of the
        # second half would be printed literally.
        assert num_heads % tensor_para_size == 0, (
            f"num_heads ({num_heads}) is not multiple of "
            f"tensor para size ({tensor_para_size})"
        )

        self.num_heads = num_heads
        self.size_per_head = size_per_head
        self.hidden_units = num_heads * size_per_head
        self.num_layers = num_layers

        self.tensor_para_size = tensor_para_size
        self.tensor_para_rank = comm.get_tensor_para_rank()
        self.pipeline_para_size = pipeline_para_size
        self.pipeline_para_rank = comm.get_pipeline_para_rank()

        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size

        # Per-rank sizes after splitting across tensor / pipeline ranks.
        self.local_num_layers = num_layers // pipeline_para_size
        self.local_num_heads = num_heads // tensor_para_size
        self.local_hidden_units = self.local_num_heads * size_per_head
        self.local_inter_size = inter_size // tensor_para_size
        self.local_adapter_inter_size = (
            self.adapter_inter_size // tensor_para_size
        )

        self.weight_transpose_calibrate_quantize = None
        assert int8_mode in [0, 1], "Invalid int8 mode for GPT. Must be 0 or 1"
        self.int8_mode = int8_mode
        if self.int8_mode == 1:
            quant = (
                torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix  # noqa: E501
            )
            self.weight_transpose_calibrate_quantize = lambda x: quant(
                x, torch.int8
            )

        self.weights = None
        self.int8_weights = None
        self.int8_scales = None

        hidden = self.hidden_units
        local_hidden = self.local_hidden_units
        local_inter = self.local_inter_size

        # One shape per weight kind, in the order the checkpoint is loaded.
        shapes_per_kind = [
            (hidden,),  # input layernorm weight
            (hidden,),  # input layernorm bias
            (hidden, local_hidden * 3),  # attention qkv weight
            (local_hidden * 3,),  # attention qkv bias
            (local_hidden, hidden),  # attention dense weight
            (hidden,),  # attention dense bias
            (hidden,),  # post attention layernorm weight
            (hidden,),  # post attention layernorm bias
            (hidden, local_inter),  # ffn_kernel1
            (local_inter,),  # ffn_bias1
            (local_inter, hidden),  # ffn_kernel2
            (hidden,),  # ffn_bias2
        ]

        # Adapters
        if self.has_adapters:
            local_adapter = self.local_adapter_inter_size
            single_adapter_shapes = [
                (hidden, local_adapter),  # kernel1
                (local_adapter,),  # bias1
                (local_adapter, hidden),  # kernel2
                (hidden,),  # bias2
            ]
            # adaptor1 followed by adaptor2, both with identical shapes.
            shapes_per_kind.extend(single_adapter_shapes * 2)

        # Every kind is repeated once per local layer.
        self.expected_weight_shapes = [
            shape
            for shape in shapes_per_kind
            for _ in range(self.local_num_layers)
        ]

    @classmethod
    def from_config(cls, config: GptInitModelParameters):
        """Build an instance from a ``GptInitModelParameters`` object.

        The intermediate size is fixed to four times the hidden size.
        """
        return cls(
            num_heads=config.head_num,
            size_per_head=config.size_per_head,
            inter_size=4 * config.head_num * config.size_per_head,
            num_layers=config.layer_num,
            tensor_para_size=config.tensor_para_size,
            pipeline_para_size=config.pipeline_para_size,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
        )

    @property
    def dtype(self):
        """Data type of the first loaded weight."""
        return self.weights[0].dtype

    @property
    def device(self):
        """Device of the first loaded weight."""
        return self.weights[0].device

    @staticmethod
    def _apply_inplace(tensors, func):
        """Replace each entry (or nested list item) of ``tensors`` by
        ``func(entry)``, in place."""
        for i, entry in enumerate(tensors):
            if isinstance(entry, list):
                for j, item in enumerate(entry):
                    entry[j] = func(item)
            else:
                tensors[i] = func(entry)

    def _map(self, func):
        """Apply ``func`` in place to every tensor in ``self.weights``."""
        self._apply_inplace(self.weights, func)

    def _map_int8(self, func):
        """Apply ``func`` in place to the int8 weights, then the scales."""
        self._apply_inplace(self.int8_weights, func)
        self._apply_inplace(self.int8_scales, func)

    def float(self):
        """Cast the weights to float32 (no-op if already float32)."""
        if self.dtype == torch.float32:
            return
        self._map(lambda x: x.float())

    def half(self):
        """Cast the weights to float16 (no-op if already float16)."""
        if self.dtype == torch.float16:
            return
        self._map(lambda x: x.half())
        if self.int8_mode == 1:
            self._map_int8(lambda w: w.half())

    def bfloat16(self):
        """Cast the weights to bfloat16 (no-op if already bfloat16)."""
        if self.dtype == torch.bfloat16:
            return
        self._map(lambda x: x.bfloat16())
        if self.int8_mode == 1:
            self._map_int8(lambda w: w.bfloat16())

    def cuda(self, device=None):
        """Move all weights (and int8 data, if any) to a CUDA device."""
        self._map(lambda x: x.cuda(device))
        if self.int8_mode == 1:
            self._map_int8(lambda x: x.cuda(device))

    def to(self, device=None):
        """Move all weights (and int8 data, if any) to ``device``."""
        self._map(lambda x: x.to(device))
        if self.int8_mode == 1:
            self._map_int8(lambda x: x.to(device))

    def is_valid_pp_group(self, layer, pp_rank):
        """Return whether layer index ``layer`` belongs to rank ``pp_rank``.

        Each pipeline rank owns ``local_num_layers`` consecutive layers,
        matching the layer offset used by ``load``. The previous code read
        ``self.layers_per_device``, an attribute this class never defines.
        """
        return layer // self.local_num_layers == pp_rank

    def load(
        self,
        checkpoint_path: PathLike,
        compute_dtype: torch.dtype,
        weight_dtype: Optional[Union[str, np.dtype]] = None,
        device: Optional[Union[int, str, torch.device]] = None,
    ):
        """Load checkpoint weights.

        # Args.
            checkpoint_path: str or Path,
                a checkpoint directory where FT checkpoint files locate.
            compute_dtype: data type the loaded weights are converted to.
            weight_dtype: str or np.dtype, the data type of stored weights.
            device: optional device the loaded tensors are moved to.

        # Raises.
            FileNotFoundError: if ``checkpoint_path`` does not exist.
            ValueError: if a weight file cannot be reshaped to its expected
                shape.
        """

        checkpoint_path = Path(checkpoint_path)
        if not checkpoint_path.exists():
            raise FileNotFoundError(
                f"Could not find checkpoint {str(checkpoint_path)}"
            )

        weight_dtype = to_numpy_dtype(weight_dtype)
        print(
            f"Load weights from {str(checkpoint_path)} (data type: {weight_dtype}"  # noqa: E501
        )

        self.weights = list()
        self.int8_weights = list()
        self.int8_scales = list()
        torch.cuda.empty_cache()

        # File-name fragments of the weights quantized in int8 mode.
        quant_sub_names = (
            "attention.query_key_value.weight",
            "attention.dense.weight",
            "dense_h_to_4h.weight",
            "dense_4h_to_h.weight",
        )

        def _load_from_file(fname):
            _weight = torch.from_numpy(
                np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
            )
            _weight = _weight.to(compute_dtype)
            # The next slot in self.weights decides which shape is expected.
            weight_index = len(self.weights)
            expected_shape = self.expected_weight_shapes[weight_index]

            try:
                if _weight.nelement() > 0:
                    _weight = _weight.reshape(expected_shape)
            except RuntimeError as err:
                raise ValueError(
                    f"num_heads, size_per_head, vocab_size, and max_seq_len must be the same "  # noqa: E501
                    f"as the ones during training (weight: {fname} expected shape: {expected_shape}, "  # noqa: E501
                    f"got shape: {_weight.shape})."
                ) from err

            should_quantize = any(
                sub_name in fname for sub_name in quant_sub_names
            )
            if self.int8_mode != 0 and should_quantize:
                calibrate = self.weight_transpose_calibrate_quantize
                int8_weight, int8_scales = calibrate(_weight)

                # int8 weights should appear in same order as FP weights.
                # Move to device and add to the int8 list.
                dummy_weight = torch.empty(0, dtype=compute_dtype)
                if device is not None:
                    int8_weight = int8_weight.to(device)
                    int8_scales = int8_scales.to(device)
                    dummy_weight = dummy_weight.to(device)

                self.int8_weights.append(int8_weight)
                self.int8_scales.append(int8_scales)
                self.weights.append(dummy_weight)
            else:
                if device is not None:
                    _weight = _weight.to(device)
                self.weights.append(_weight)

        # Load
        tp_rank = self.tensor_para_rank
        # Per-layer file suffixes, in the same order as
        # self.expected_weight_shapes.
        weight_files = [
            "input_layernorm.weight.bin",
            "input_layernorm.bias.bin",
            f"attention.query_key_value.weight.{tp_rank}.bin",
            f"attention.query_key_value.bias.{tp_rank}.bin",
            f"attention.dense.weight.{tp_rank}.bin",
            "attention.dense.bias.bin",
            "post_attention_layernorm.weight.bin",
            "post_attention_layernorm.bias.bin",
            f"mlp.dense_h_to_4h.weight.{tp_rank}.bin",
            f"mlp.dense_h_to_4h.bias.{tp_rank}.bin",
            f"mlp.dense_4h_to_h.weight.{tp_rank}.bin",
            "mlp.dense_4h_to_h.bias.bin",
        ]
        if self.has_adapters:
            for adapter in ("after_attention_adapter", "after_ffn_adapter"):
                weight_files.extend(
                    [
                        f"{adapter}.dense_h_to_4h.weight.{tp_rank}.bin",
                        f"{adapter}.dense_h_to_4h.bias.{tp_rank}.bin",
                        f"{adapter}.dense_4h_to_h.weight.{tp_rank}.bin",
                        f"{adapter}.dense_4h_to_h.bias.bin",
                    ]
                )

        # Layers owned by this pipeline rank start at layer_offset.
        layer_offset = self.local_num_layers * self.pipeline_para_rank
        for suffix in weight_files:
            for i in range(self.local_num_layers):
                _load_from_file(f"model.layers.{layer_offset + i}.{suffix}")

        assert len(self.weights) == len(
            self.expected_weight_shapes
        ), "Incorrect number of weights loaded"


class FtModuleBase:
    """Common base for FasterTransformer modules backed by layer weights.

    Subclasses provide ``from_config``, ``_initialize_model`` and
    ``forward``; this base handles weight bookkeeping and dtype / device
    conversions.
    """

    def __init__(self):
        # No weights attached until ``set_weight`` is called.
        self.weight = None

    @classmethod
    @abstractmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        """Build the module from a config object (subclass hook)."""
        raise NotImplementedError

    @abstractmethod
    def _initialize_model(self, force_init=False):
        """Create the underlying model (subclass hook)."""
        raise NotImplementedError

    @abstractmethod
    def forward(self, *args, **kwargs):
        """Run the module (subclass hook)."""
        raise NotImplementedError

    def _reinitialized(self):
        """Force a model re-initialization and return ``self``."""
        self._initialize_model(force_init=True)
        return self

    def set_weight(self, weight: GptLayerWeights):
        """Attach ``weight``; re-initialize if the dtype is new or changed."""
        previous = self.weight
        previous_dtype = None if previous is None else previous.dtype
        self.weight = weight
        same_dtype = (
            previous_dtype is not None
            and previous_dtype == self.weight.dtype
        )
        if not same_dtype:
            self._initialize_model(force_init=True)

    @property
    def dtype(self):
        """Data type of the attached weights."""
        assert self.weight is not None
        return self.weight.dtype

    @property
    def device(self):
        """Device of the attached weights."""
        assert self.weight is not None
        return self.weight.device

    def cuda(self, device=None):
        """Move the weights to a CUDA device and return ``self``."""
        assert torch.cuda.is_available()
        self.weight.cuda(device)
        return self

    def to(self, device=None):
        """Move the weights to ``device`` and return ``self``."""
        self.weight.to(device)
        return self

    def float(self):
        """Cast the weights to float32, re-initialize, return ``self``."""
        self.weight.float()
        return self._reinitialized()

    def half(self):
        """Cast the weights to float16, re-initialize, return ``self``."""
        self.weight.half()
        return self._reinitialized()

    def bfloat16(self):
        """Cast the weights to bfloat16, re-initialize, return ``self``."""
        self.weight.bfloat16()
        return self._reinitialized()


class GptContextDecoder(FtModuleBase):
    """Wrapper around FasterTransformer's ``ParallelGptContextDecoderOp``.

    The op is created lazily by ``_initialize_model`` from the constructor
    settings and the tensors held by ``self.weight``.
    """

    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        inter_size: int,
        num_layers: int,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        remove_padding: bool = True,
        shared_contexts_ratio: float = 1.0,
        layernorm_eps: float = 1e-6,
        layernorm_type: LayernormType = "pre_layernorm",
        activation_type: str = "gelu",
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
    ):
        """Store the op configuration; the op itself is built on first use.

        ``hidden_size`` is derived as ``num_heads * size_per_head``.
        ``int8_mode`` must be 0 or 1. ``shared_contexts_ratio`` is only
        stored on the instance; it is not passed to the op constructor.
        """
        super().__init__()
        self.num_heads = num_heads
        self.size_per_head = size_per_head
        self.hidden_size = self.num_heads * self.size_per_head
        self.inter_size = inter_size
        self.num_layers = num_layers

        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size

        self.remove_padding = remove_padding
        self.shared_contexts_ratio = shared_contexts_ratio

        self.layernorm_eps = layernorm_eps
        self.layernorm_type = layernorm_type
        self.activation_type = activation_type
        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size

        assert int8_mode in [0, 1]
        self.int8_mode = int8_mode

        self.ft_op = None
        self.weight = None

    def __repr__(self):
        """Return a multi-line ``ClassName[ key: value, ... ]`` summary."""
        args_dict = dict(
            num_heads=self.num_heads,
            size_per_head=self.size_per_head,
            hidden_size=self.hidden_size,
            inter_size=self.inter_size,
            num_layers=self.num_layers,
            tensor_para_size=self.tensor_para_size,
            pipeline_para_size=self.pipeline_para_size,
            remove_padding=self.remove_padding,
            shared_contexts_ratio=self.shared_contexts_ratio,
            layernorm_eps=self.layernorm_eps,
            layernorm_type=self.layernorm_type,
            activation_type=self.activation_type,
            has_adapters=self.has_adapters,
            adapter_inter_size=self.adapter_inter_size,
            int8_mode=self.int8_mode,
        )
        args_str = ",\n    ".join([f"{k}: {v}" for k, v in args_dict.items()])
        # The indent has to sit outside the braces; inside them it is just
        # whitespace in the expression and the first field is not indented.
        return f"{self.__class__.__name__}[\n    {args_str}\n]"

    @classmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        """Build a context decoder from a model configuration.

        ``inter_size`` is fixed to four times the hidden size.
        ``remove_padding`` and ``shared_contexts_ratio`` are read from
        ``kwargs`` (defaults ``True`` and ``1.0``); other keyword arguments
        are ignored.
        """
        return cls(
            num_heads=config.head_num,
            size_per_head=config.size_per_head,
            inter_size=4 * config.head_num * config.size_per_head,
            num_layers=config.layer_num,
            tensor_para_size=config.tensor_para_size,
            pipeline_para_size=config.pipeline_para_size,
            remove_padding=kwargs.get("remove_padding", True),
            shared_contexts_ratio=kwargs.get("shared_contexts_ratio", 1.0),
            layernorm_eps=config.layernorm_eps,
            layernorm_type=config.layernorm_type,
            activation_type=config.activation_type,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
        )

    def _initialize_model(self, force_init=False):
        """Create the FT op, building a default weight container if needed.

        An existing op is kept unless ``force_init`` is True, in which case
        it is deleted and rebuilt from the current weights.
        """
        if self.weight is None:
            self.weight = GptLayerWeights(
                num_heads=self.num_heads,
                size_per_head=self.size_per_head,
                inter_size=self.inter_size,
                num_layers=self.num_layers,
                tensor_para_size=self.tensor_para_size,
                pipeline_para_size=self.pipeline_para_size,
                has_adapters=self.has_adapters,
                adapter_inter_size=self.adapter_inter_size,
                int8_mode=self.int8_mode,
            )
        if not force_init and self.ft_op is not None:
            return
        if self.ft_op is not None:
            del self.ft_op

        self.ft_op = (
            torch.classes.FasterTransformer.ParallelGptContextDecoderOp(
                self.num_heads,
                self.size_per_head,
                self.inter_size,
                self.num_layers,
                self.tensor_para_size,
                self.pipeline_para_size,
                self.layernorm_eps,
                self.layernorm_type,
                self.activation_type,
                self.has_adapters,
                self.adapter_inter_size,
                self.int8_mode,
                self.weight.weights,
                self.weight.int8_weights,
                self.weight.int8_scales,
                self.remove_padding,
            )
        )

    def forward(
        self,
        input_embeds: torch.Tensor,
        attention_mask: torch.Tensor,
        input_lengths: torch.IntTensor,
        memory_length: Optional[int] = None,
        compact_index: Optional[torch.IntTensor] = None,
        batch_to_compact_index: Optional[torch.IntTensor] = None,
        linear_bias_slopes: Optional[torch.Tensor] = None,
    ):
        """Run the context decoder op, creating it first if necessary.

        # Args.
            input_embeds: Tensor, (batch * beam, max_input_length, hidden_dim),
                input hidden states.
            attention_mask: Tensor,
                (batch * beam, max_input_length, max_input_length),
                input attention mask.
            input_lengths: (batch * beam,), input sequence lengths.
            memory_length: int, the length of memory to keep key/value
                caches.
            compact_index: IntTensor, (compact_batch_size,)
                The index of input sequences of a compact batch. If None,
                the FT op doesn't apply the shared context feature and as a
                result the inference time may increase.
            batch_to_compact_index: IntTensor, (batch * beam,)
                The index map from the original input batch to the compact
                batch. This must be provided if compact_index is not None.
            linear_bias_slopes: (num_heads,)
                The slope per head of linear attention bias - ALiBi. If
                None, a base self attention will be performed.
        # Returns
            hidden_states: Tensor,
                (batch * beam, max_input_length, hidden_dim),
                decoder outputs.
            key_cache: Tensor,
                (num_layers, batch * beam, local_num_heads,
                 size_per_head / x, memory_length, x),
                key cache of attention of inputs, where
                x = 16 / sizeof(T) and memory_length is max_input_length or
                max_input_length + gen_length.
            value_cache: Tensor,
                (num_layers, batch * beam, local_num_heads, memory_length,
                 hidden_dim),
                value cache of attention.
            last_token_hidden_states: Tensor, (batch * beam, hidden_dim)
                hidden states of the last input token.
        """
        self._initialize_model()
        (
            decoder_output,
            key_cache,
            value_cache,
            last_token_hidden_states,
        ) = self.ft_op.forward(
            input_embeds,
            attention_mask,
            input_lengths,
            memory_length,
            compact_index,
            batch_to_compact_index,
            linear_bias_slopes,
        )
        return decoder_output, key_cache, value_cache, last_token_hidden_states


class GptDecoder(FtModuleBase):
    """Wrapper around FasterTransformer's ``ParallelGptDecoderOp``.

    The op is created lazily by ``_initialize_model`` from the constructor
    settings and the tensors held by ``self.weight``.
    """

    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        inter_size: int,
        num_layers: int,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        layernorm_eps: float = 1e-6,
        layernorm_type: LayernormType = "pre_layernorm",
        activation_type: str = "gelu",
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
    ):
        """Record the op configuration; the op is built on first use."""
        super().__init__()

        # Model geometry.
        self.num_heads = num_heads
        self.size_per_head = size_per_head
        self.hidden_size = self.num_heads * self.size_per_head
        self.inter_size = inter_size
        self.num_layers = num_layers

        # Parallelism layout.
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size

        # Layer flavour.
        self.layernorm_eps = layernorm_eps
        self.layernorm_type = layernorm_type
        self.activation_type = activation_type
        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size
        self.int8_mode = int8_mode

        # Both are filled in lazily by _initialize_model / set_weight.
        self.ft_op = None
        self.weight = None

    def __repr__(self):
        """Return a multi-line ``ClassName[ key: value, ... ]`` summary."""
        shown_fields = (
            "num_heads",
            "size_per_head",
            "hidden_size",
            "inter_size",
            "num_layers",
            "tensor_para_size",
            "pipeline_para_size",
            "layernorm_eps",
            "layernorm_type",
            "activation_type",
            "has_adapters",
            "adapter_inter_size",
            "int8_mode",
        )
        rendered = [f"{name}: {getattr(self, name)}" for name in shown_fields]
        body = ",\n    ".join(rendered)
        return f"{self.__class__.__name__}[\n    {body}\n]"

    @classmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        """Build a decoder from a model configuration.

        ``inter_size`` is fixed to four times the hidden size
        (``head_num * size_per_head``). Extra keyword arguments are
        accepted but not used.
        """
        heads = config.head_num
        head_width = config.size_per_head
        return cls(
            num_heads=heads,
            size_per_head=head_width,
            inter_size=4 * (heads * head_width),
            num_layers=config.layer_num,
            tensor_para_size=config.tensor_para_size,
            pipeline_para_size=config.pipeline_para_size,
            layernorm_eps=config.layernorm_eps,
            layernorm_type=config.layernorm_type,
            activation_type=config.activation_type,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
        )

    def _initialize_model(self, force_init=False):
        """Create the FT op, building a default weight container if needed.

        An existing op is kept unless ``force_init`` is True, in which case
        it is deleted and rebuilt from the current weights.
        """
        if self.weight is None:
            self.weight = GptLayerWeights(
                num_heads=self.num_heads,
                size_per_head=self.size_per_head,
                inter_size=self.inter_size,
                num_layers=self.num_layers,
                tensor_para_size=self.tensor_para_size,
                pipeline_para_size=self.pipeline_para_size,
                has_adapters=self.has_adapters,
                adapter_inter_size=self.adapter_inter_size,
                int8_mode=self.int8_mode,
            )

        if self.ft_op is not None:
            if not force_init:
                # Already built and nobody asked for a rebuild.
                return
            del self.ft_op

        # Note: int8_mode is taken from the weight container here.
        op_args = (
            self.num_heads,
            self.size_per_head,
            self.inter_size,
            self.num_layers,
            self.tensor_para_size,
            self.pipeline_para_size,
            self.layernorm_eps,
            self.layernorm_type,
            self.activation_type,
            self.has_adapters,
            self.adapter_inter_size,
            self.weight.int8_mode,
            self.weight.weights,
            self.weight.int8_weights,
            self.weight.int8_scales,
        )
        self.ft_op = torch.classes.FasterTransformer.ParallelGptDecoderOp(
            *op_args
        )

    def forward(
        self,
        max_input_length: int,
        step: int,
        ite: int,
        input_embeds: torch.Tensor,
        sequence_lengths: torch.IntTensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        finished: torch.BoolTensor,
        total_padding_tokens: torch.IntTensor,
        masked_tokens: torch.BoolTensor,
        cache_indirection: Optional[torch.IntTensor] = None,
        linear_bias_slopes: Optional[torch.Tensor] = None,
    ):
        """Run one decoding step through the FT op.

        The op receives its arguments in a different order than this
        method: ``finished``, ``total_padding_tokens`` and ``masked_tokens``
        are passed before the two cache buffers.

        # Args.
            max_input_length: int, maximum input context length.
            step: int, the current step index.
            ite: int, local batch iteration.
            input_embeds: Tensor, (local_batch * beam, hidden_dim),
                input hidden state to decoder.
            sequence_lengths: IntTensor, (local_batch * beam,),
                the current sequence lengths.
            key_cache: Tensor, key cache buffer.
            value_cache: Tensor, value cache buffer.
            finished: BoolTensor, (local_batch * beam,),
                whether to finish sentence generation.
            total_padding_tokens: IntTensor, (local_batch * beam,),
                the number of padded tokens.
            masked_tokens: BoolTensor, (local_batch * beam, memory_length),
                a mask tensor that indicates padded tokens.
            cache_indirection: IntTensor, (local_batch * beam,),
                cache of beam positions, needed if beam > 1.
            linear_bias_slopes: Tensor, (num_heads,)
                slopes per head of linear position bias (ALiBi) (optional).
        # Returns
            The first element of the op's output.
            NOTE(review): previously documented as "IntTensor,
            (batch * beam,) output token ids"; presumably this is the
            decoder's output hidden state - confirm against the FT op.
        """
        self._initialize_model()
        return self.ft_op.forward(
            max_input_length,
            step,
            ite,
            input_embeds,
            sequence_lengths,
            finished,
            total_padding_tokens,
            masked_tokens,
            key_cache,
            value_cache,
            cache_indirection,
            linear_bias_slopes,
        )[0]


class Gpt:
    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        num_layers: int,
        vocab_size: int,
        start_id: int,
        end_id: int,
        lib_path: PathLike,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        remove_padding: bool = True,
        shared_contexts_ratio: float = 1.0,
        layernorm_eps: float = 1e-6,
        layernorm_type: LayernormType = "pre_layernorm",
        activation_type: str = "gelu",
        has_positional_encoding: bool = True,
        max_seq_len: int = 0,
        has_pre_decoder_layernorm: bool = False,
        has_post_decoder_layernorm: bool = True,
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
        inference_data_type: Optional[str] = None,
        weights_data_type: str = "fp32",
        use_fp32_to_compute_logit: bool = False,
        **kwargs,
    ):
        """Assemble the GPT model: config, FT ops and torch sub-modules.

        Loads the FasterTransformer library from ``lib_path``, creates the
        dynamic decode op and the two decoder wrappers, and - depending on
        this rank's position in the pipeline group - the embedding,
        layernorm and LM-head modules. Weights are not loaded here.

        ``inference_data_type`` falls back to ``weights_data_type`` when it
        is not given. The only key read from ``kwargs`` in this method is
        ``"sparse"``; all of ``kwargs`` is also forwarded to the decoder
        ``from_config`` constructors.
        """
        super().__init__()

        if not inference_data_type:
            inference_data_type = weights_data_type

        self.config = GptInitModelParameters(
            head_num=num_heads,
            size_per_head=size_per_head,
            layer_num=num_layers,
            max_seq_len=max_seq_len,
            tensor_para_size=tensor_para_size,
            vocab_size=vocab_size,
            start_id=start_id,
            end_id=end_id,
            pipeline_para_size=pipeline_para_size,
            data_type=inference_data_type,
            weights_data_type=weights_data_type,
            layernorm_eps=layernorm_eps,
            layernorm_type=layernorm_type,
            activation_type=activation_type,
            has_positional_encoding=has_positional_encoding,
            has_pre_decoder_layernorm=has_pre_decoder_layernorm,
            has_post_decoder_layernorm=has_post_decoder_layernorm,
            has_adapters=has_adapters,
            adapter_inter_size=adapter_inter_size,
            int8_mode=int8_mode,
            sparse=kwargs.get("sparse", False),
        )
        self.use_fp32_to_compute_logit = use_fp32_to_compute_logit
        self.weight = None
        self.shared_contexts_ratio = shared_contexts_ratio

        # The custom torch classes used below live in this shared library.
        torch.classes.load_library(os.path.abspath(lib_path))

        hidden_size = num_heads * size_per_head

        # Pad the vocabulary for FT: split it evenly over the tensor-parallel
        # ranks and, for fp16, round each rank's share up to a multiple of 8.
        tp_size = self.config.tensor_para_size
        vocab_per_rank = math.ceil(self.config.vocab_size / tp_size)
        if self.config.data_type == "fp16":
            vocab_per_rank = math.ceil(vocab_per_rank / 8) * 8
        self.vocab_size_padded = vocab_per_rank * tp_size
        self.vocab_size = self.config.vocab_size

        self.decode_op = torch.classes.FasterTransformer.DynamicDecodeOp(
            self.vocab_size,
            self.vocab_size_padded,
            self.config.tensor_para_size,
            self.config.pipeline_para_size,
            torch.float,
        )

        # name -> sub-module; iterated by cuda()/to()/float()/half()/...
        self._parameters = {}

        def _register(attr_name, module):
            self._parameters[attr_name] = module
            setattr(self, attr_name, module)

        context_decoder = GptContextDecoder.from_config(
            self.config,
            remove_padding=remove_padding,
            shared_contexts_ratio=shared_contexts_ratio,
            **kwargs,
        )
        _register("context_decoder", context_decoder)
        _register("decoder", GptDecoder.from_config(self.config, **kwargs))

        torch_dtype = to_torch_dtype(inference_data_type)

        if comm.is_pipeline_group_first():
            # Input side of the pipeline: embeddings and the optional
            # pre-decoder layernorm.
            word_embedding = torch.nn.Embedding(
                self.vocab_size_padded, hidden_size, dtype=torch_dtype
            )
            _register("word_embedding", word_embedding)
            self._mask_padded_vocab_weights(self.word_embedding.weight)

            if not self.config.has_positional_encoding:
                self.position_encoding = None
            else:
                position_encoding = torch.nn.Embedding(
                    self.config.max_seq_len,
                    hidden_size,
                    dtype=torch_dtype,
                )
                _register("position_encoding", position_encoding)

            if not self.config.has_pre_decoder_layernorm:
                self.pre_decoder_layernorm = None
            else:
                pre_layernorm = torch.nn.LayerNorm(
                    hidden_size, eps=layernorm_eps, dtype=torch_dtype
                )
                _register("pre_decoder_layernorm", pre_layernorm)

        if comm.is_pipeline_group_last():
            # Output side of the pipeline: optional final layernorm and the
            # LM head.
            if not has_post_decoder_layernorm:
                self.post_decoder_layernorm = None
            else:
                post_layernorm = torch.nn.LayerNorm(
                    hidden_size, eps=layernorm_eps, dtype=torch_dtype
                )
                _register("post_decoder_layernorm", post_layernorm)

            if self.use_fp32_to_compute_logit:
                self.lm_head_ctype = torch.float32
            else:
                self.lm_head_ctype = torch_dtype

            lm_head = torch.nn.Linear(
                hidden_size,
                self.vocab_size_padded,
                bias=False,
                dtype=self.lm_head_ctype,
            )
            _register("lm_head", lm_head)
            self._mask_padded_vocab_weights(self.lm_head.weight)

    @classmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        return cls(
            num_heads=config.head_num,
            size_per_head=config.size_per_head,
            num_layers=config.layer_num,
            max_seq_len=config.max_seq_len,
            tensor_para_size=config.tensor_para_size,
            vocab_size=config.vocab_size,
            start_id=config.start_id,
            end_id=config.end_id,
            pipeline_para_size=config.pipeline_para_size,
            inference_data_type=config.data_type,
            weights_data_type=config.weights_data_type,
            layernorm_eps=config.layernorm_eps,
            layernorm_type=config.layernorm_type,
            activation_type=config.activation_type,
            has_positional_encoding=config.has_positional_encoding,
            has_pre_decoder_layernorm=config.has_pre_decoder_layernorm,
            has_post_decoder_layernorm=config.has_post_decoder_layernorm,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
            **kwargs,
        )

    def load(
        self,
        checkpoint_path: PathLike,
        inference_data_type: Optional[Union[str, torch.dtype]] = None,
        config: Optional[GptInitModelParameters] = None,
        device: Optional[Union[str, int, torch.device]] = None,
    ):
        """Load all weights from a checkpoint directory and move to device.

        # Args.
            checkpoint_path: directory that holds the ``*.bin`` weight files.
            inference_data_type: data type to convert the weights to. When
                omitted, the dtype of the currently loaded weights is used,
                or ``config.data_type`` if nothing has been loaded yet.
            config: model configuration; defaults to ``self.config``.
            device: target device; defaults to ``comm.get_device()``. A
                device ordinal of ``0`` is honoured.
        # Raises
            FileNotFoundError: if a required positional-encoding or
                layernorm file is missing. Missing embedding / LM-head
                files only print a message and leave that parameter
                randomly initialised.
        """
        checkpoint_path = Path(checkpoint_path)
        # Compare against None explicitly: ``device or ...`` would silently
        # replace a valid device ordinal 0 with the default device.
        if device is None:
            device = comm.get_device()
        config = config or self.config

        requested_dtype = inference_data_type
        if not requested_dtype:
            # self.dtype asserts that weights exist, which is not the case
            # on the very first load; use the configured type then.
            requested_dtype = (
                self.dtype if self.weight is not None else config.data_type
            )
        compute_dtype = to_torch_dtype(requested_dtype)

        self.weight = GptLayerWeights.from_config(config)
        self.weight.load(
            checkpoint_path, compute_dtype, config.weights_data_type, device
        )

        self.context_decoder.set_weight(self.weight)
        self.decoder.set_weight(self.weight)

        weight_dtype = to_numpy_dtype(config.weights_data_type)

        def _safe_load_from_bin(param: torch.nn.Parameter, fname):
            """Replace ``param`` with the contents of a mandatory file."""
            if not (checkpoint_path / fname).exists():
                raise FileNotFoundError(f"Failed to load {fname}")
            # The file is a flat array since a bin file has no shape info.
            flat = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
            param.data = (
                torch.from_numpy(flat)
                .reshape(param.data.shape)
                .to(compute_dtype)
            )

        def _safe_load_lm_head_from_bin(param, fname, ctype):
            """Fill the un-padded vocab rows of ``param`` from a file.

            Falls back to a normal initialisation when the file is missing.
            In both cases the parameter ends up in ``ctype`` with the padded
            vocabulary rows zeroed.
            """
            if (checkpoint_path / fname).exists():
                shape = (
                    self.vocab_size,
                    self.config.head_num * self.config.size_per_head,
                )
                # The file is a flat array since a bin file has no shape
                # info.
                flat = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
                param.data = param.data.to(ctype)
                param.data[: self.vocab_size, :] = (
                    torch.from_numpy(flat).reshape(shape).to(ctype)
                )
            else:
                print(f"Failed to load {fname}")
                torch.nn.init.normal_(param)
                # Tensor.to() is not in-place; assign the result so the
                # cast actually takes effect.
                param.data = param.data.to(ctype)
            self._mask_padded_vocab_weights(param)

        # pylint:disable=line-too-long
        if comm.is_pipeline_group_first():
            # The helper already zeroes the padded vocabulary rows.
            _safe_load_lm_head_from_bin(
                self.word_embedding.weight, "model.wte.bin", compute_dtype
            )
            if self.position_encoding is not None:
                _safe_load_from_bin(
                    self.position_encoding.weight, "model.wpe.bin"
                )
            if self.pre_decoder_layernorm is not None:
                _safe_load_from_bin(
                    self.pre_decoder_layernorm.weight,
                    "model.pre_decoder_layernorm.weight.bin",
                )
                _safe_load_from_bin(
                    self.pre_decoder_layernorm.bias,
                    "model.pre_decoder_layernorm.bias.bin",
                )
        if comm.is_pipeline_group_last():
            if self.post_decoder_layernorm is not None:
                _safe_load_from_bin(
                    self.post_decoder_layernorm.weight,
                    "model.final_layernorm.weight.bin",
                )
                _safe_load_from_bin(
                    self.post_decoder_layernorm.bias,
                    "model.final_layernorm.bias.bin",
                )
            if (checkpoint_path / "model.lm_head.weight.bin").exists():
                _safe_load_lm_head_from_bin(
                    self.lm_head.weight,
                    "model.lm_head.weight.bin",
                    self.lm_head_ctype,
                )
            elif self.use_fp32_to_compute_logit:
                _safe_load_lm_head_from_bin(
                    self.lm_head.weight, "model.wte.bin", torch.float32
                )
            elif comm.get_pipeline_para_size() == 1:
                # The pre and post decoder embeddings can be shared, but
                # ONLY when the pipeline size is 1.
                self.lm_head.weight = self.word_embedding.weight
            else:
                # When pipeline size > 1 the two weights end up on
                # different GPUs, so the post decoder weight must be loaded
                # again.
                _safe_load_lm_head_from_bin(
                    self.lm_head.weight, "model.wte.bin", compute_dtype
                )

        self.to(device)

    @property
    def dtype(self):
        assert self.weight is not None
        return self.weight.dtype

    @property
    def device(self):
        assert self.weight is not None
        return self.weight.device

    def cuda(self, device=None):
        assert torch.cuda.is_available()
        for name, param in self._parameters.items():
            setattr(self, name, param.cuda(device))
        return self

    def to(self, device=None):
        for name, param in self._parameters.items():
            setattr(self, name, param.to(device))
        return self

    def float(self):
        for name, param in self._parameters.items():
            setattr(self, name, param.float())
        return self

    def half(self):
        for name, param in self._parameters.items():
            setattr(self, name, param.half())
        return self

    def bfloat16(self):
        for name, param in self._parameters.items():
            setattr(self, name, param.bfloat16())
        return self

    def _mask_padded_vocab_weights(self, weight: torch.Tensor):
        assert self.vocab_size_padded >= self.vocab_size
        if self.vocab_size_padded > self.vocab_size:
            weight.data[self.vocab_size :, ...] = 0  # noqa: E203

    def generate_pad_mask(self, input_lengths, memory_length, init_step=0):
        """Generate a pad mask tensor.

        # Args.
            input_lengths: (batch_size * beam_width,), input lengths
            memory_length: the length of key/value cache memory.
            init_step: int, initial step.
        # Return
            masked_tokens: BoolTensor,
                (batch_size * beam_width, memory_length),
                True if init_step + input_length[i] <= j <
                    init_step + max_input_length,
                where i is a batch-beam index and j is a time step
                modulo by memory_length.
        """
        max_input_length = input_lengths.max()
        input_lengths = input_lengths.unsqueeze(1)
        shift = init_step % memory_length
        step_indices = torch.arange(
            init_step, init_step + memory_length, device=input_lengths.device
        )
        step_indices = (
            step_indices.roll(shift)
            .unsqueeze(0)
            .tile(input_lengths.shape[0], 1)
        )
        masked_tokens = torch.logical_and(
            step_indices >= input_lengths,
            step_indices < init_step + max_input_length,
        )
        return masked_tokens

    def get_local_batch_size(self, batch_size):
        """Get a local batch size by the same way that FT Gpt does."""
        local_batch_size = batch_size
        pp_size = self.decoder.pipeline_para_size
        if pp_size > 1:
            if local_batch_size % pp_size == 0:
                local_batch_size //= pp_size
            while local_batch_size > 1024 and local_batch_size % 2 == 0:
                local_batch_size //= 2
        return local_batch_size

    @torch.no_grad()
    def generate(
        self,
        input_token_ids: torch.IntTensor,
        input_lengths: torch.IntTensor,
        gen_length: int,
        eos_token_id: Optional[int] = None,
        local_batch_size: Optional[int] = None,
        beam_width: int = 1,
        top_k: Optional[torch.IntTensor] = None,
        top_p: Optional[torch.FloatTensor] = None,
        top_p_decay: Optional[torch.FloatTensor] = None,
        top_p_min: Optional[torch.FloatTensor] = None,
        top_p_reset_ids: Optional[torch.IntTensor] = None,
        temperature: Optional[torch.FloatTensor] = None,
        repetition_penalty: Optional[torch.FloatTensor] = None,
        presence_penalty: Optional[torch.FloatTensor] = None,
        min_length: Optional[torch.IntTensor] = None,
        len_penalty: Optional[torch.FloatTensor] = None,
        beam_search_diversity_rate: Optional[torch.FloatTensor] = None,
        stop_words_list: Optional[torch.IntTensor] = None,
        bad_words_list: Optional[torch.IntTensor] = None,
        sequence_limit_lengths: Optional[torch.IntTensor] = None,
        random_seed: Optional[torch.LongTensor] = None,
        memory_length: Optional[int] = None,
        return_output_length: bool = False,
        return_log_probs: bool = False,
    ):
        """

        # Args.
            input_token_ids: IntTensor, (batch_size, max_input_length),
                input hidden state to decoder.
            input_lengths: IntTensor, (batch_size),
                the lengths of input context sequences.
            gen_length: int, the number of tokens to generate.
            local_batch_size: int, optional, a batch size of
                local iteration. (disabled)
            eos_token_id: int, eos token id.
            beam_width: int, number of beams for beam search.
                If 1, sampling decode will be used.
            top_k: IntTensor, (batch_size,) top-k sampling.
                The number of most probable tokens to keep
                for sampling per sentence in a batcch.
            top_p: FloatTensor, (batch_size,), top-p sampling.
                The cumulative probability
                of to filter the set of most probable tokens.
            top_p_decay: FloatTensor, (batch_size,)
                The decay of top-p value for top_p sampling.
            top_p_min: FloatTensor, (batch_size,)
                The minimum top p values in top-p decaying.
            top_p_reset_ids: IntTensor, (batch_size,)
                reset ids for resetting top_p values for top p sampling
            temperature: FloatTensor, (batch_size,),
                The temperature value for smoothing the logit distribution.
            repetition_penalty: FloatTensor, (batch_size,),
                The repetition penalty.
            presence_penalty: FloatTensor, (batch_size,),
                The presence penalty, which is exclusive with
                repetition_penalty.
                Only one of repetition and presence penalties is allowed.
            min_length: IntTensor, (batch_size,),
                Minimum length for each sentences. EOS is masked if length is
                below min.
            len_penalty: FloatTensor, (batch_size,)
                The exponent of the length penalty of beam scores.
            beam_search_diversity_rate: FloatTensor, (batch_size,),
                The diversity rate of beam search.
            stop_words_list: IntTensor, (batch_size, 2, stop_words_length)
                When FT generates words in this list, it will stop the
                generation. An extension of stop id.
            bad_words_list IntTensor, (batch_size, 2, bad_words_length)
                The words in the list will never be sampled.
            sequence_limit_lengths: IntTensor, (batch_size,), The maximum
                length of a generated sequence.
            memory_length: int, the length of cache memory. If None, it will
                be max_input_length + gen_length.
        # Returns
            IntTensor, (batch_size, beam_width, max_seq_length) output
            token ids.
        """
        assert (
            self.weight is not None
        ), "Please call load() first to initialize weights."

        input_token_ids = input_token_ids.type(torch.int32).to(self.device)
        input_lengths = input_lengths.type(torch.int32).to(self.device)

        batch_size = len(input_token_ids)
        max_input_length = input_token_ids.shape[-1]
        max_seq_length = max_input_length + gen_length
        memory_length = memory_length or max_seq_length

        # TODO: Enable local batch later. We currently disable local batching due to # noqa: E501
        #   an input mismatch issue of FT's decode_op: FT's decode_op requires logits # noqa: E501
        #   of shape (batch_size, ...) but we have logits of shape (local_batch_size, ...) # noqa: E501
        #   After fixing FT's side, we will enable local batch.
        # local_batch_size = local_batch_size or self.get_local_batch_size(batch_size) # noqa: E501
        # num_local_batches, last_chunk = divmod(batch_size, local_batch_size)
        # if last_chunk > 0:
        #     num_local_batches += 1
        assert local_batch_size is None or local_batch_size == batch_size
        local_batch_size = batch_size
        num_local_batches = 1

        device = self.device

        eos_token_id = (
            eos_token_id if eos_token_id is not None else self.config.end_id
        )
        assert (
            eos_token_id is not None
        ), "eos_token-id must be specified in generation."
        eos_token_ids = eos_token_id * torch.ones(
            batch_size, dtype=torch.int32, device=device
        )
        assert repetition_penalty is None or presence_penalty is None, (
            "Found ambiguous parameters repetition_penalty and "
            "presence_penalty which are mutually exclusive. "
            "Please provide one of repetition_penalty and presence_penalty."
        )
        # Setup decoder_op prior to calling the forward function.
        self.decode_op.setup(
            batch_size,
            beam_width,
            top_k,
            top_p,
            temperature,
            repetition_penalty,
            presence_penalty,
            min_length,
            len_penalty,
            beam_search_diversity_rate,
            random_seed,
            top_p_decay,
            top_p_min,
            top_p_reset_ids,
        )

        # Prepare input and output arguments.
        if beam_width > 1:
            # Tiling for beam search.
            input_token_ids = input_token_ids.repeat(1, beam_width).view(
                batch_size * beam_width, -1
            )
            input_lengths = (
                input_lengths.view(-1, 1).repeat(1, beam_width).view(-1)
            )
            if sequence_limit_lengths is not None:
                sequence_limit_lengths = (
                    sequence_limit_lengths.view(-1, 1)
                    .repeat(1, beam_width)
                    .view(-1)
                )
            # src/tgt cache indirections.
            cache_indirection = torch.zeros(
                (2, batch_size, beam_width, memory_length),
                dtype=torch.int32,
                device=device,
            )
            parent_ids = torch.zeros(
                max_seq_length,
                batch_size * beam_width,
                dtype=torch.int32,
                device=device,
            )
        else:
            cache_indirection = None
            src_cache_indirection = None
            tgt_cache_indirection = None
            parent_ids = None

        pad_lengths = max_input_length - input_lengths
        # Since tril() doesn't support bf16 dtype,
        # we create of bool type and then cast it to dtype.
        attention_mask = (
            torch.ones(
                (max_input_length, max_input_length),
                dtype=torch.bool,
                device=device,
            )
            .tril()
            .unsqueeze(0)
            .tile(input_token_ids.shape[0], 1, 1)
            .to(self.dtype)
        )
        for b, input_length in enumerate(input_lengths):
            attention_mask[b, input_length:, ...] = 0
        masked_tokens = self.generate_pad_mask(input_lengths, memory_length)
        finished = torch.zeros_like(input_lengths).bool()
        sequence_lengths = (max_input_length - 1) * torch.ones_like(
            input_lengths
        )

        if return_log_probs or beam_width > 1:
            cum_log_probs = torch.zeros(batch_size * beam_width, device=device)
            output_log_probs = torch.zeros(
                (gen_length, batch_size * beam_width), device=device
            )
        else:
            cum_log_probs = None
            output_log_probs = None

        # Contiguous buffer for each decode_op step,
        # it will be transposed tensor for the final output.
        output_token_ids = torch.zeros(
            (max_seq_length, batch_size * beam_width),
            dtype=torch.int32,
            device=device,
        )
        output_token_ids[:max_input_length, ...] = input_token_ids.T

        if comm.is_pipeline_group_first():
            # Prepare input tensors of decoder.
            input_embeds = self.word_embedding(input_token_ids)
            if self.position_encoding is not None:
                position_ids = torch.arange(
                    0, max_input_length, dtype=torch.int, device=device
                )
                position_ids = position_ids.unsqueeze(0).view(
                    -1, max_input_length
                )
                input_embeds += self.position_encoding(position_ids)
            if self.pre_decoder_layernorm is not None:
                input_embeds = self.pre_decoder_layernorm(input_embeds)
        else:
            # Dummy input_embeds
            input_embeds = torch.empty(
                size=(
                    batch_size * beam_width,
                    max_input_length,
                    self.context_decoder.hidden_size,
                ),
                dtype=self.context_decoder.dtype,
                device=device,
            )

        use_shared_contexts = (
            (self.shared_contexts_ratio > 0.0)
            and (max_input_length >= 1)
            and (batch_size > 1)
        )
        batch_to_compact, compact_to_batch = None, None
        if use_shared_contexts:
            find_context_duplications = (
                torch.ops.fastertransformer.find_context_duplications
            )
            batch_to_compact, compact_to_batch = find_context_duplications(
                input_token_ids
            )
            use_shared_contexts = (
                compact_to_batch.shape[0]
                <= self.shared_contexts_ratio * batch_size
            )

            if not use_shared_contexts:
                batch_to_compact, compact_to_batch = None, None

        profiler.start("ft-context-decoder")
        (
            _,
            k_cache,
            v_cache,
            last_token_hidden_states,
        ) = self.context_decoder.forward(
            input_embeds=input_embeds,
            attention_mask=attention_mask,
            input_lengths=input_lengths,
            memory_length=memory_length,
            batch_to_compact_index=batch_to_compact,
            compact_index=compact_to_batch,
        )
        profiler.stop("ft-context-decoder")

        for step in range(max_input_length, max_seq_length):
            src_indir_idx = (step - max_input_length) % 2
            tgt_indir_idx = 1 - src_indir_idx

            is_generation_done = torch.tensor(
                [True], dtype=torch.bool, device=device
            )
            for ite in range(num_local_batches):
                # The indices of the current local batch-beam.
                bbidx = range(
                    ite * local_batch_size * beam_width,
                    min(
                        (ite + 1) * local_batch_size * beam_width,
                        batch_size * beam_width,
                    ),
                )
                if cache_indirection is not None:
                    bidx = range(
                        ite * local_batch_size,
                        min((ite + 1) * local_batch_size, batch_size),
                    )
                    src_cache_indirection = cache_indirection[
                        src_indir_idx, bidx, ...
                    ]
                    tgt_cache_indirection = cache_indirection[
                        tgt_indir_idx, bidx, ...
                    ]

                if step == max_input_length:
                    hidden_states = last_token_hidden_states[bbidx, ...]
                else:
                    if comm.is_pipeline_group_first():
                        input_embeds = self.word_embedding(
                            output_token_ids[step - 1, bbidx]
                        )
                        if self.position_encoding is not None:
                            position_ids = (step - 1) * torch.ones_like(
                                pad_lengths[bbidx]
                            )
                            input_embeds += self.position_encoding(
                                position_ids
                            )
                        if self.pre_decoder_layernorm is not None:
                            input_embeds = self.pre_decoder_layernorm(
                                input_embeds
                            )
                    else:
                        # Dummy input_imbeds
                        input_embeds = torch.empty(
                            size=(len(bbidx), self.decoder.hidden_size),
                            dtype=self.decoder.dtype,
                            device=device,
                        )

                    profiler.start("ft-decoder")
                    hidden_states = self.decoder.forward(
                        max_input_length=max_input_length,
                        step=step,
                        ite=ite,
                        input_embeds=input_embeds,
                        sequence_lengths=sequence_lengths[bbidx],
                        key_cache=k_cache,
                        value_cache=v_cache,
                        finished=finished[bbidx],
                        total_padding_tokens=pad_lengths[bbidx],
                        cache_indirection=src_cache_indirection,
                        masked_tokens=masked_tokens[bbidx, ...],
                    )
                    profiler.stop("ft-decoder")

                if comm.is_pipeline_group_last():
                    if self.post_decoder_layernorm is not None:
                        hidden_states = self.post_decoder_layernorm(
                            hidden_states
                        )

                    # We use logits of fp32 type to avoid overflow issue.
                    if self.use_fp32_to_compute_logit:
                        # The FT GPT op internally uses FP32 compute type
                        # for matrix multiplication.
                        # This will produce the same result with the
                        # end-to-end FT's GPT op.
                        logits = torch.nn.functional.linear(
                            hidden_states.float(), self.lm_head.weight
                        )
                    else:
                        logits = self.lm_head(hidden_states).float()

                    profiler.start("ft-decode")
                    should_stop = self.decode_op.forward(
                        logits.view(batch_size, beam_width, -1),
                        step,
                        max_input_length,
                        ite,
                        local_batch_size,
                        eos_token_ids,
                        top_k,
                        top_p,
                        temperature,
                        repetition_penalty,
                        presence_penalty,
                        min_length,
                        len_penalty,
                        beam_search_diversity_rate,
                        top_p_decay,
                        top_p_min,
                        top_p_reset_ids,
                        None,
                        input_lengths,
                        sequence_limit_lengths,
                        stop_words_list,
                        bad_words_list,
                        src_cache_indirection,
                        output_token_ids.view(-1, batch_size, beam_width),
                        finished,
                        sequence_lengths,
                        cum_log_probs,
                        output_log_probs,
                        parent_ids,
                        tgt_cache_indirection,
                    )
                    profiler.stop("ft-decode")
                    is_generation_done &= should_stop

            # Broadcast from the last pipeline node if needed.
            profiler.start("ft-bcast")
            tensors_to_bcast = [
                output_token_ids[step, ...],
                finished,
                sequence_lengths,
                is_generation_done,
            ]
            if beam_width > 1:
                tensors_to_bcast.append(tgt_cache_indirection)
            self.decode_op.broadcast_from_last_pipeline(tensors_to_bcast)
            profiler.stop("ft-bcast")

            if is_generation_done or finished.all():
                break

        # Transpose (L, batch, beam) -> (batch, beam, L)
        output_token_ids = output_token_ids.view(
            -1, batch_size, beam_width
        ).permute(1, 2, 0)

        # Increase sequence_length by 1 because the sequence length of time step t is t - 1. # noqa: E501
        sequence_lengths += 1

        # Outputs
        output_dict = dict(output_token_ids=output_token_ids)
        if return_output_length:
            output_dict["output_lengths"] = sequence_lengths
        if return_log_probs:
            output_dict["cum_log_probs"] = cum_log_probs
            output_dict["output_log_probs"] = output_log_probs
        return output_dict


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/huggingface_gpt_convert.py
================================================
# Based on https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/huggingface_gpt_convert.py # noqa: E501
# Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Convert huggingface GPT model. Use https://huggingface.co/gpt2 as demo.
"""

import argparse
import configparser
import os
import sys

from loguru import logger
import numpy as np
from transformers import GPT2Model  # transformers-4.10.0-py3

from nebullvm.optional_modules.torch import torch

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(dir_path + "/../../../..")
sys.path.append(dir_path)


def get_weight_data_type(data_type):
    """Map a weight precision label to the matching NumPy scalar type.

    Args:
        data_type: Precision label, either ``"fp32"`` or ``"fp16"``.

    Returns:
        ``np.float32`` for ``"fp32"`` and ``np.float16`` for ``"fp16"``.

    Raises:
        AssertionError: If ``data_type`` is any other value.
    """
    known_types = (("fp32", np.float32), ("fp16", np.float16))
    for label, numpy_type in known_types:
        if data_type == label:
            return numpy_type
    assert False, f"Invalid weight data type {data_type}"


def split_and_convert_process(i, saved_dir, factor, key, args, val):
    """Write one converted weight tensor to disk, sharding it if required.

    The on-disk layout is chosen from substrings of ``key``:

    * layernorm weights/biases and the ``attention.dense`` /
      ``mlp.dense_4h_to_h`` biases are written unsplit, and only when
      ``i == 0``;
    * ``attention.dense.weight`` and ``mlp.dense_4h_to_h.weight`` are split
      into ``factor`` pieces along axis 0;
    * ``mlp.dense_h_to_4h`` weight and bias are split along the last axis;
    * ``attention.query_key_value`` bias and weight are first reshaped so
      that the fused last dimension becomes ``(3, local_dim)`` and are then
      split along the last axis.

    Unsplit tensors go to ``<saved_dir>/model.<key>.bin``; shard ``j`` goes
    to ``<saved_dir>/model.<key>.<i * factor + j>.bin``. A key that matches
    none of the groups only produces a warning.

    Args:
        i: Index of the source rank the tensor comes from.
        saved_dir: Directory the ``.bin`` files are written to.
        factor: Number of shards each splittable tensor is cut into.
        key: Converted (FasterTransformer-style) parameter name.
        args: Unused; kept for signature compatibility.
        val: NumPy array holding the parameter values.
    """
    unsplit_tags = (
        "input_layernorm.weight",
        "input_layernorm.bias",
        "attention.dense.bias",
        "post_attention_layernorm.weight",
        "post_attention_layernorm.bias",
        "mlp.dense_4h_to_h.bias",
        "final_layernorm.weight",
        "final_layernorm.bias",
    )
    first_axis_tags = ("attention.dense.weight", "mlp.dense_4h_to_h.weight")
    last_axis_tags = ("mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias")

    def key_has(tags):
        # True when any of the given substrings occurs in the key.
        return any(tag in key for tag in tags)

    def write_shards(tensor, axis):
        # Cut the tensor into `factor` equal pieces and dump each one to
        # its own file, numbered globally across source ranks.
        for j, shard in enumerate(np.split(tensor, factor, axis=axis)):
            shard_path = (
                saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            )
            shard.tofile(shard_path)

    if key_has(unsplit_tags):
        # Shared weights: identical on every rank, so only rank 0 writes.
        if i == 0:
            val.tofile(saved_dir + "/model." + key + ".bin")
    elif key_has(first_axis_tags):
        write_shards(val, 0)
    elif key_has(last_axis_tags):
        write_shards(val, -1)
    elif "attention.query_key_value.bias" in key:
        local_dim = int(val.shape[-1] / 3)
        write_shards(val.reshape(3, local_dim), -1)
    elif "attention.query_key_value.weight" in key:
        hidden_dim = val.shape[0]
        local_dim = int(val.shape[-1] / 3)
        write_shards(val.reshape(hidden_dim, 3, local_dim), -1)
    else:
        logger.warning("[ERROR] cannot find key '{}'".format(key))


def split_and_convert(args):
    """Load the Hugging Face GPT-2 checkpoint named in ``args`` and convert it.

    The model is placed on CUDA when available, otherwise on the CPU, and is
    then handed to :func:`main` together with the conversion options.

    Args:
        args: Parsed command-line namespace providing ``in_file``,
            ``saved_dir``, ``trained_gpu_num``, ``infer_gpu_num``,
            ``processes`` and ``weight_data_type``.
    """
    if torch.cuda.is_available():
        torch_device = "cuda"
    else:
        torch_device = "cpu"

    hf_model = GPT2Model.from_pretrained(args.in_file)
    hf_model = hf_model.to(torch_device)

    main(
        saved_dir=args.saved_dir,
        model=hf_model,
        trained_gpu_num=args.trained_gpu_num,
        infer_gpu_num=args.infer_gpu_num,
        processes=args.processes,
        weight_data_type=args.weight_data_type,
    )


def main(
    saved_dir,
    model: GPT2Model,
    trained_gpu_num=1,
    infer_gpu_num=1,
    processes=1,
    weight_data_type="fp32",
):
    """Export a Hugging Face GPT-2 model as FasterTransformer weight files.

    All files are written under ``<saved_dir>/<infer_gpu_num>-gpu/``:

    * ``config.ini`` with the model hyper-parameters (best effort: a failure
      while building or writing it is only logged);
    * one ``.bin`` file for each embedding / final layernorm / LM head
      parameter;
    * per-layer parameters, renamed to the FasterTransformer convention and
      sharded by :func:`split_and_convert_process`.

    Args:
        saved_dir: Root output directory.
        model: The GPT-2 model to export.
        trained_gpu_num: Number of GPUs the checkpoint was trained with.
        infer_gpu_num: Number of GPUs targeted for inference. Must be a
            multiple of ``trained_gpu_num``.
        processes: Unused; the conversion runs sequentially. Kept for
            interface compatibility.
        weight_data_type: ``"fp32"`` or ``"fp16"``; precision of the dumped
            weights.

    Raises:
        AssertionError: If ``model`` is not a ``GPT2Model``, if
            ``infer_gpu_num`` is not a multiple of ``trained_gpu_num``, or
            if ``weight_data_type`` is not supported.
    """
    assert isinstance(model, GPT2Model), "model must be GPT2Model"
    # Only forwarded to split_and_convert_process, which does not use it.
    args = None
    saved_dir = saved_dir + "/%d-gpu/" % infer_gpu_num

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(saved_dir, exist_ok=True)

    t_gpu_num = trained_gpu_num
    i_gpu_num = infer_gpu_num
    assert i_gpu_num % t_gpu_num == 0

    # Number of inference shards produced per training rank.
    factor = int(i_gpu_num / t_gpu_num)

    hf_config = vars(model.config)

    # NOTE: save parameters to config files (loaded by triton backends)
    config = configparser.ConfigParser()
    config["gpt"] = {}
    try:
        config["gpt"]["model_name"] = (
            "gpt"
            if hf_config["_name_or_path"] == ""
            else hf_config["_name_or_path"]
        )
        config["gpt"]["head_num"] = str(hf_config["n_head"])
        n_embd = hf_config["n_embd"]
        config["gpt"]["size_per_head"] = str(n_embd // hf_config["n_head"])
        config["gpt"]["inter_size"] = str(n_embd * 4)
        config["gpt"]["max_pos_seq_len"] = str(hf_config["n_positions"])
        config["gpt"]["num_layer"] = str(hf_config["n_layer"])
        config["gpt"]["vocab_size"] = str(hf_config["vocab_size"])
        config["gpt"]["start_id"] = str(hf_config["bos_token_id"])
        config["gpt"]["end_id"] = str(hf_config["eos_token_id"])
        config["gpt"]["weight_data_type"] = weight_data_type
        with open(saved_dir + "/config.ini", "w") as configfile:
            config.write(configfile)
    except Exception:
        # Writing the config is best effort, but KeyboardInterrupt and
        # SystemExit must still propagate, hence no bare ``except``.
        logger.warning("Fail to save the config in config.ini.")

    np_weight_data_type = get_weight_data_type(weight_data_type)

    # Parameters that are dumped unsplit under a fixed file name.
    top_level_files = {
        "wpe.weight": "model.wpe.bin",
        "wte.weight": "model.wte.bin",
        "ln_f.bias": "model.final_layernorm.bias.bin",
        "ln_f.weight": "model.final_layernorm.weight.bin",
        "lm_head.weight": "model.lm_head.weight.bin",
    }

    # (Hugging Face per-layer suffix, FasterTransformer per-layer suffix)
    layer_name_map = (
        ("ln_1.bias", "input_layernorm.bias"),
        ("ln_1.weight", "input_layernorm.weight"),
        ("attn.c_attn.bias", "attention.query_key_value.bias"),
        ("attn.c_attn.weight", "attention.query_key_value.weight"),
        ("attn.c_proj.bias", "attention.dense.bias"),
        ("attn.c_proj.weight", "attention.dense.weight"),
        ("ln_2.bias", "post_attention_layernorm.bias"),
        ("ln_2.weight", "post_attention_layernorm.weight"),
        ("mlp.c_fc.bias", "mlp.dense_h_to_4h.bias"),
        ("mlp.c_fc.weight", "mlp.dense_h_to_4h.weight"),
        ("mlp.c_proj.bias", "mlp.dense_4h_to_h.bias"),
        ("mlp.c_proj.weight", "mlp.dense_4h_to_h.weight"),
    )

    for name, param in model.named_parameters():
        if "weight" not in name and "bias" not in name:
            continue

        weights = param.detach().cpu().numpy().astype(np_weight_data_type)

        if name in top_level_files:
            weights.tofile(saved_dir + top_level_files[name])
            continue

        for hf_pattern, ft_pattern in layer_name_map:
            if hf_pattern in name:
                # "h.<n>.<suffix>" -> "layers.<n>.<converted suffix>"
                new_name = name.replace("h.", "layers.").replace(
                    hf_pattern, ft_pattern
                )
                split_and_convert_process(
                    0, saved_dir, factor, new_name, args, weights
                )
                # The suffixes are mutually exclusive; stop at the match.
                break


if __name__ == "__main__":
    # Command-line entry point: parse the conversion options, log them and
    # run the Hugging Face -> FasterTransformer conversion.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "-saved_dir",
        "-o",
        type=str,
        help="file name of output file",
        required=True,
    )
    parser.add_argument(
        "-in_file",
        "-i",
        type=str,
        help="file name of input checkpoint file",
        required=True,
    )
    parser.add_argument(
        "-trained_gpu_num",
        "-t_g",
        type=int,
        # Previously a copy of the inference option's help text.
        help="How many gpus for training",
        default=1,
    )
    parser.add_argument(
        "-infer_gpu_num",
        "-i_g",
        type=int,
        help="How many gpus for inference",
        required=True,
    )
    parser.add_argument(
        "-processes",
        "-p",
        type=int,
        help="How many processes to spawn for conversion (default: 4)",
        default=4,
    )
    parser.add_argument(
        "-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]
    )

    args = parser.parse_args()
    logger.info("\n=============== Argument ===============")
    for key, value in vars(args).items():
        logger.info("{}: {}".format(key, value))
    logger.info("========================================")

    split_and_convert(args)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/intel_neural_compressor.py
================================================
from pathlib import Path
from typing import Union

from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.intel_neural_compressor import (  # noqa: E501
    quantize_neural_compressor,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.torch import Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class IntelNeuralCompressorCompiler(Compiler):
    """Compiler that quantizes PyTorch models with Intel Neural Compressor.

    Only static and dynamic quantization on CPU are supported; any other
    device / quantization combination makes :meth:`execute` a no-op that
    leaves ``compiled_model`` set to ``None``.
    """

    # Quantization types accepted per device type.
    supported_ops = {
        "cpu": [
            QuantizationType.STATIC,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()
        # Reference to the un-quantized model passed to the last execute().
        self.model_orig = None

    def execute(
        self,
        model: Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using IntelNeuralCompressor library.

        The result is stored in ``self.compiled_model``; nothing is returned.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Required for static
                quantization. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)
        # input_data is only mandatory for static quantization, so do not
        # dereference it when the caller left it as None.
        train_input_data = (
            input_data.get_split("train") if input_data is not None else None
        )

        self.model_orig = model

        if quantization_type is not None:
            quantized_model = self._quantize_model(
                model, quantization_type, input_tfms, train_input_data
            )
            self.compiled_model = self._compile_model(quantized_model)

    def _compile_model(self, model: Union[str, Path]):
        """Return ``model`` unchanged; no extra compilation step is run."""
        return model

    @staticmethod
    def _quantize_model(
        model: Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data: DataManager,
    ):
        """Delegate quantization to ``quantize_neural_compressor``."""
        return quantize_neural_compressor(
            model, quantization_type, input_tfms, input_data
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/onnxruntime.py
================================================
from pathlib import Path
from typing import Union, List, Tuple

import numpy as np

from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler

from nebullvm.operations.optimizations.compilers.quantizations.onnx import (
    quantize_onnx,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class ONNXCompiler(Compiler):
    """Compiler that prepares ONNX models for ONNX Runtime.

    The model is optionally quantized with ``quantize_onnx``; no further
    compilation step is applied, so ``compiled_model`` holds the (possibly
    quantized) model returned by that step.
    """

    # Quantization types accepted per device type; ``None`` means the model
    # is used without quantization.
    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    def execute(
        self,
        model: str,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using ONNX Runtime Compiler.

        The result is stored in ``self.compiled_model``; nothing is returned.

        Args:
            model (str): The onnx model path.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Required for static
                quantization. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)
        # input_data defaults to None and is only mandatory for static
        # quantization, so it must not be dereferenced unconditionally.
        train_input_data = None
        if input_data is not None:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        if quantization_type is not None:
            model = self._quantize_model(
                model, train_input_data, quantization_type, input_tfms
            )

        self.compiled_model = self._compile_model(model)

    def _compile_model(self, model: Union[str, Path]):
        """Return ``model`` unchanged; no extra compilation step is run."""
        return model

    def _quantize_model(
        self,
        model_path: str,
        input_data: List[Tuple[np.ndarray, ...]],
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
    ):
        """Delegate quantization to ``quantize_onnx`` for this device."""
        return quantize_onnx(
            model_path, input_data, quantization_type, self.device, input_tfms
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/openvino.py
================================================
import subprocess
from pathlib import Path
from typing import Tuple, List, Union

import numpy as np

from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType, ModelParams
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.openvino import (  # noqa: E501
    quantize_openvino,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.openvino import (
    Core,
    CompiledModel,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.transformations import MultiStageTransformation


class OpenVINOCompiler(Compiler):
    """Compiler converting an ONNX model into OpenVINO IR files.

    The conversion is delegated to the ``mo`` command line tool. When a
    quantization type other than ``None``/``HALF`` is requested, the
    converted model is further processed by ``quantize_openvino``.
    """

    # Quantization types accepted for each device type. ``None`` stands for
    # a plain conversion without quantization.
    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
        ],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using OpenVINO library.

        On success ``self.compiled_model`` is set to the path of the
        generated IR files without extension. It is set to ``None`` when
        the requested quantization type is not supported on the current
        device.

        Args:
            model (str): The onnx model path.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                providing ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # ``input_data`` defaults to None and is mandatory only for static
        # quantization: a plain or half-precision conversion must not fail
        # just because no data was given.
        train_input_data = None
        if input_data is not None:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        model_file = Path(model)
        # Read the input names once; they are needed both by the ``mo``
        # command line and by the quantization step.
        input_names = get_input_names(str(model))

        cmd = [
            "mo",
            "--input_model",
            str(model),
            "--output_dir",
            str(model_file.parent),
            "--input",
            ",".join(input_names),
            "--input_shape",
            ",".join([f"{list(shape)}" for shape in model_params.input_sizes]),
        ]

        # DYNAMIC is not listed in ``supported_ops``, so this guard only
        # matters if the supported types get extended (e.g. by a subclass).
        if quantization_type is QuantizationType.DYNAMIC:
            return None

        if quantization_type is QuantizationType.HALF:
            cmd = cmd + ["--compress_to_fp16"]

        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            # Do not raise, so the original best-effort flow is kept, but
            # make the failure visible instead of ignoring it.
            self.logger.warning(
                f"The 'mo' command exited with code {completed.returncode}."
            )

        base_path = model_file.parent
        openvino_model_path = base_path / f"{model_file.stem}.xml"
        openvino_model_weights = base_path / f"{model_file.stem}.bin"

        if quantization_type not in [QuantizationType.HALF, None]:
            openvino_model_path, openvino_model_weights = self._quantize_model(
                model_topology=str(openvino_model_path),
                model_weights=str(openvino_model_weights),
                input_names=input_names,
                input_data=train_input_data,
            )

        # The compiled model is referenced by its path without extension.
        self.compiled_model = str(
            Path(openvino_model_path).parent / Path(openvino_model_path).stem
        )

    def _compile_model(
        self,
        model_name: str,
        model_weights: str,
        network_parameters: ModelParams,
    ) -> CompiledModel:
        """Read an OpenVINO model and compile it for the CPU device.

        Args:
            model_name (str): Path passed to ``Core.read_model`` as model.
            model_weights (str): Path passed to ``Core.read_model`` as
                weights.
            network_parameters (ModelParams): The model parameters, used to
                compute the dynamic shape.

        Returns:
            CompiledModel: The model compiled with ``device_name="CPU"``.
        """
        core = Core()
        model = core.read_model(model=model_name, weights=model_weights)

        # NOTE(review): ``_get_dynamic_shape`` is not defined in this class;
        # presumably it is provided elsewhere in the hierarchy - confirm.
        dynamic_shape = self._get_dynamic_shape(model, network_parameters)

        if dynamic_shape is not None:
            model.reshape(dynamic_shape)

        return core.compile_model(model=model, device_name="CPU")

    @staticmethod
    def _quantize_model(
        model_topology: str,
        model_weights: str,
        input_data: List[Tuple[np.ndarray, ...]],
        input_names: List[str],
    ) -> Tuple[str, str]:
        """Quantize the converted model through ``quantize_openvino``.

        Args:
            model_topology (str): Path of the model topology file.
            model_weights (str): Path of the model weights file.
            input_data (List[Tuple[np.ndarray, ...]]): Calibration samples.
            input_names (List[str]): Names of the model inputs.

        Returns:
            Tuple[str, str]: The pair returned by ``quantize_openvino``.
        """
        return quantize_openvino(
            model_topology, model_weights, input_data, input_names
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/intel_neural_compressor.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any

import yaml

from nebullvm.core.models import QuantizationType
from nebullvm.optional_modules.neural_compressor import (
    MixedPrecision,
    Quantization,
)
from nebullvm.optional_modules.torch import DataLoader, Module, GraphModule
from nebullvm.tools.data import DataManager, PytorchDataset
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    HalfPrecisionTransformation,
)


def _prepare_quantization_config(model: Any, tmp_dir: str, approach: str):
    """Dump the quantization configuration to a YAML file.

    Args:
        model (Any): The model; only its class name is written to the
            configuration.
        tmp_dir (str): Directory where the file is created.
        approach (str): Value stored under ``quantization.approach``.

    Returns:
        Path: Path of the written ``temp_qt.yaml`` file.
    """
    model_section = {
        "name": model.__class__.__name__,
        "framework": "pytorch_fx",
    }
    tuning_section = {"accuracy_criterion": {"relative": 0.01}}
    config = {
        "model": model_section,
        "quantization": {"approach": approach},
        "evaluation": {"accuracy": {"metric": {"topk": 1}}},
        "tuning": tuning_section,
    }

    path_file = Path(tmp_dir) / "temp_qt.yaml"
    with open(path_file, "w") as stream:
        yaml.dump(config, stream)

    return path_file


def _prepare_mixed_precision_config(model: Any, tmp_dir: str):
    """Dump the mixed precision (bf16) configuration to a YAML file.

    Args:
        model (Any): The model; only its class name is written to the
            configuration.
        tmp_dir (str): Directory where the file is created.

    Returns:
        Path: Path of the written ``temp_mp.yaml`` file.
    """
    model_section = {
        "name": model.__class__.__name__,
        "framework": "pytorch_fx",
    }
    tuning_section = {"accuracy_criterion": {"relative": 0.01}}
    config = {
        "model": model_section,
        "mixed_precision": {"precisions": "bf16"},
        "evaluation": {"accuracy": {"metric": {"topk": 1}}},
        "tuning": tuning_section,
    }

    path_file = Path(tmp_dir) / "temp_mp.yaml"
    with open(path_file, "w") as stream:
        yaml.dump(config, stream)

    return path_file


def _get_dataloader(input_data: DataManager):
    """Wrap ``input_data`` into a ``DataLoader`` over a ``PytorchDataset``.

    The batch size is read from the first dimension of
    ``input_data[0][0][0]``.

    Args:
        input_data (DataManager): The data to be served by the loader.

    Returns:
        DataLoader: Loader built on a ``PytorchDataset`` with labels.
    """
    first_tensor = input_data[0][0][0]
    batch_size = first_tensor.shape[0]
    dataset = PytorchDataset(input_data, has_labels=True)
    return DataLoader(dataset, batch_size)


def _quantize_static(model: Module, input_data: DataManager) -> GraphModule:
    """Run post-training static quantization on ``model``.

    The configuration file lives in a temporary directory that is removed
    once the quantizer has run.

    Args:
        model (Module): The model to quantize.
        input_data (DataManager): Data used to build both the calibration
            and the evaluation dataloaders.

    Returns:
        GraphModule: The object returned by calling the quantizer.
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_quantization_config(
            model, workdir, "post_training_static_quant"
        )
        quantizer = Quantization(str(config_path))
        quantizer.model = model
        # Two distinct loaders are built over the same data.
        quantizer.calib_dataloader = _get_dataloader(input_data)
        quantizer.eval_dataloader = _get_dataloader(input_data)
        result = quantizer()

    return result


def _quantize_dynamic(model: Module) -> GraphModule:
    """Run post-training dynamic quantization on ``model``.

    The configuration file lives in a temporary directory that is removed
    once the quantizer has run.

    Args:
        model (Module): The model to quantize.

    Returns:
        GraphModule: The object returned by calling the quantizer.
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_quantization_config(
            model, workdir, "post_training_dynamic_quant"
        )
        quantizer = Quantization(str(config_path))
        quantizer.model = model
        result = quantizer()

    return result


def _mixed_precision(
    model: Module, input_tfms: MultiStageTransformation
) -> GraphModule:
    """Convert ``model`` with the ``MixedPrecision`` converter.

    After the conversion a ``HalfPrecisionTransformation`` is appended to
    ``input_tfms`` (the object is modified in place).

    Args:
        model (Module): The model to convert.
        input_tfms (MultiStageTransformation): Input transformations to be
            extended.

    Returns:
        GraphModule: The object returned by calling the converter.
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_mixed_precision_config(model, workdir)
        converter = MixedPrecision(str(config_path))
        converter.model = model
        result = converter()

    half_tfm = HalfPrecisionTransformation()
    input_tfms.append(half_tfm)

    return result


def quantize_neural_compressor(
    model: Module,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data: DataManager,
) -> GraphModule:
    """Dispatch to the routine matching ``quantization_type``.

    Args:
        model (Module): The model to quantize.
        quantization_type (QuantizationType): STATIC, DYNAMIC or HALF.
        input_tfms (MultiStageTransformation): Input transformations; only
            used (and extended) for HALF.
        input_data (DataManager): Data; only used for STATIC.

    Returns:
        GraphModule: The quantized model.

    Raises:
        ValueError: If ``quantization_type`` is none of the above.
    """
    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(model, input_data)
    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(model)
    if quantization_type is QuantizationType.HALF:
        return _mixed_precision(model, input_tfms)

    raise ValueError(
        f"Quantization type {quantization_type} is not "
        f"supported by Intel Neural Compressor"
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/onnx.py
================================================
from pathlib import Path
from typing import Union, Iterable, Tuple, List

import cpuinfo
import numpy as np

from nebullvm.core.models import QuantizationType, Device, DeviceType
from nebullvm.optional_modules.onnx import (
    onnx,
    convert_float_to_float16_model_path,
)
from nebullvm.optional_modules.onnxruntime import (
    CalibrationDataReader,
    QuantType,
    quantize_dynamic,
    quantize_static,
)
from nebullvm.optional_modules.torch import DataLoader
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    HalfPrecisionTransformation,
)


class _IterableCalibrationDataReader(CalibrationDataReader):
    """Calibration data reader serving samples taken from an iterable.

    Every sample of the dataset is a tuple of input values, which gets
    paired positionally with ``input_names`` to form one feed dictionary.
    """

    def __init__(
        self,
        iterable_dataset: Union[Iterable[Tuple], List[Tuple]],
        input_names: List[str],
    ):
        """Build the feed dictionaries.

        Args:
            iterable_dataset: Iterable of tuples, one tuple per sample.
            input_names: Names paired positionally with the values of
                each sample.
        """
        # One dictionary PER SAMPLE: the sample loop must be the outer
        # loop of the list, not an inner loop of a single dict
        # comprehension, otherwise every sample overwrites the previous
        # one and only the last sample is ever served.
        self.iterable_dataset = iter(
            [
                dict(zip(input_names, inputs))
                for inputs in iterable_dataset
            ]
        )

    def get_next(self) -> dict:
        """Return the next feed dictionary, or None when exhausted."""
        return next(self.iterable_dataset, None)

    @classmethod
    def from_dataloader(
        cls, dl: DataLoader, input_names: List[str], contains_y: bool = True
    ):
        """Build a reader from a dataloader.

        Args:
            dl: Dataloader yielding one sequence per item.
            input_names: Names of the model inputs.
            contains_y: When True, the last element of every item is
                dropped before building the reader. Default: True.

        Returns:
            A new instance of the reader.
        """
        iterable_ds = iter(
            inputs[:-1] if contains_y else inputs for inputs in dl
        )
        return cls(iterable_ds, input_names)


def _quantize_dynamic(model_path: str) -> str:
    """Apply ONNX Runtime dynamic quantization to the model.

    The output is written into an ``int8_dynamic`` directory created next
    to the directory containing the model, as ``<stem>.quant.onnx``.

    Args:
        model_path (str): Path of the model to quantize.

    Returns:
        str: Path of the quantized model.
    """
    source = Path(model_path)
    output_dir = source.parent.parent / "int8_dynamic"
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / (source.stem + ".quant.onnx")
    quantize_dynamic(
        source,
        target,
        weight_type=QuantType.QUInt8,
        optimize_model=False,
    )
    return str(target)


def _get_quantization_type_for_static(use_gpu) -> Tuple[QuantType, QuantType]:
    """Returns the quantization types for activations and weights,
    depending on the underlying hardware

    Args:
        use_gpu: Whether the model is going to run on GPU.

    Returns:
        Tuple[QuantType, QuantType]: ``(activation_type, weight_type)``.
        Both are ``QInt8`` on GPU. On x86 CPUs whose brand string contains
        both "intel" and "xeon" the pair is ``(QUInt8, QInt8)``. In every
        other case both are ``QUInt8``.
    """
    if use_gpu:
        # The CPU description is irrelevant here, so the (slow) cpuinfo
        # probe is skipped altogether.
        return QuantType.QInt8, QuantType.QInt8

    # Probe the CPU a single time and reuse the result for every check.
    cpu_info = cpuinfo.get_cpu_info()
    # Missing or empty fields are treated as "unknown" and therefore fall
    # back to the default pair instead of raising.
    arch = (cpu_info.get("arch") or "").lower()
    brand = (cpu_info.get("brand_raw") or "").lower()

    if "x86" in arch and "intel" in brand and "xeon" in brand:
        return QuantType.QUInt8, QuantType.QInt8

    return QuantType.QUInt8, QuantType.QUInt8


def _quantize_static(
    model_path: str, input_data: List[Tuple[np.ndarray, ...]], use_gpu: bool
) -> str:
    """Apply ONNX Runtime static quantization to the model.

    The output is written into an ``int8_static`` directory created next
    to the directory containing the model, as ``<stem>.quant.onnx``.

    Args:
        model_path (str): Path of the model to quantize.
        input_data (List[Tuple[np.ndarray, ...]]): Calibration samples.
        use_gpu (bool): Forwarded to the selection of the quantization
            types for activations and weights.

    Returns:
        str: Path of the quantized model.
    """
    source = Path(model_path)
    output_dir = source.parent.parent / "int8_static"
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / (source.stem + ".quant.onnx")

    reader = _IterableCalibrationDataReader(
        input_names=get_input_names(str(source)),
        iterable_dataset=input_data,
    )
    activation_type, weight_type = _get_quantization_type_for_static(use_gpu)
    quantize_static(
        source,
        Path(target),
        reader,
        activation_type=activation_type,
        weight_type=weight_type,
        optimize_model=False,
    )
    return str(target)


def _convert_to_half_precision(
    model_path: str, input_tfms: MultiStageTransformation
) -> str:
    """Convert the ONNX model to float16 and save it.

    The output is written into an ``fp16`` directory created next to the
    directory containing the model, as ``<stem>_fp16.onnx``. A
    ``HalfPrecisionTransformation`` is appended to ``input_tfms`` (the
    object is modified in place).

    Args:
        model_path (str): Path of the model to convert.
        input_tfms (MultiStageTransformation): Input transformations to be
            extended.

    Returns:
        str: Path of the converted model.
    """
    model_path = Path(model_path)
    model_quant = model_path.parent.parent / "fp16"
    # exist_ok=True, consistently with the int8 variants: converting the
    # same model twice must not fail because the folder is already there.
    model_quant.mkdir(parents=True, exist_ok=True)
    model_quant = model_quant / (model_path.stem + "_fp16.onnx")
    new_onnx_model = convert_float_to_float16_model_path(str(model_path))
    input_tfms.append(HalfPrecisionTransformation())
    try:
        onnx.save(new_onnx_model, str(model_quant))
    except ValueError:
        # Model larger than 2GB must be saved as external data
        onnx.save(
            new_onnx_model,
            str(model_quant),
            save_as_external_data=True,
            all_tensors_to_one_file=False,
            convert_attribute=True,
        )
    return str(model_quant)


def quantize_onnx(
    model_path: str,
    input_data: List[Tuple[np.ndarray, ...]],
    quantization_type: QuantizationType,
    device: Device,
    input_tfms: MultiStageTransformation,
) -> str:
    """Quantize an ONNX model according to ``quantization_type``.

    Args:
        model_path (str): Path of the model to quantize.
        input_data (List[Tuple[np.ndarray, ...]]): Calibration samples;
            only used for STATIC.
        quantization_type (QuantizationType): DYNAMIC, STATIC or HALF.
        device (Device): Target device; for STATIC its type decides
            whether the GPU settings are used.
        input_tfms (MultiStageTransformation): Input transformations; only
            used for HALF.

    Returns:
        str: Path of the quantized model.

    Raises:
        ValueError: If ``quantization_type`` is none of the above.
    """
    if quantization_type == QuantizationType.DYNAMIC:
        return _quantize_dynamic(model_path)

    if quantization_type == QuantizationType.STATIC:
        on_gpu = device.type is DeviceType.GPU
        return _quantize_static(model_path, input_data, on_gpu)

    if quantization_type == QuantizationType.HALF:
        return _convert_to_half_precision(model_path, input_tfms)

    raise ValueError(f"Quantization type {quantization_type} not supported")


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/openvino.py
================================================
from typing import List, Tuple, Any

import numpy as np

from nebullvm.optional_modules.openvino import (
    DataLoader,
    load_model,
    IEEngine,
    create_pipeline,
    compress_model_weights,
    save_model,
)


class _CalibrationDataLoader(DataLoader):
    """Data loader exposing the calibration samples by index.

    Each item is returned as ``((index, None), inputs)`` where ``inputs``
    maps every input name to the matching value of the sample.
    """

    def __init__(
        self, input_data: List[Tuple[Any, ...]], input_names: List[str]
    ):
        self._input_names = input_names
        self._input_data = input_data

    def __len__(self):
        """Return the number of available samples."""
        return len(self._input_data)

    def __getitem__(self, item):
        """Return the sample at position ``item``."""
        sample = self._input_data[item]
        # Values are paired positionally with the input names.
        feed = dict(zip(self._input_names, sample))
        return (item, None), feed


def quantize_openvino(
    model_topology: str,
    model_weights: str,
    input_data: List[Tuple[np.ndarray, ...]],
    input_names: List[str],
) -> Tuple[str, str]:
    """Quantize an OpenVINO model with the ``DefaultQuantization`` algorithm.

    The compressed model is saved under the ``quantized_model`` path with
    ``quantized_model`` as model name.

    Args:
        model_topology (str): Path of the model topology file.
        model_weights (str): Path of the model weights file.
        input_data (List[Tuple[np.ndarray, ...]]): Calibration samples;
            their number is used as ``stat_subset_size``.
        input_names (List[str]): Names of the model inputs.

    Returns:
        Tuple[str, str]: The ``"model"`` and ``"weights"`` entries of the
        first element returned by ``save_model``.
    """
    quantization_params = {
        "target_device": "ANY",
        "preset": "performance",
        "stat_subset_size": len(input_data),
    }
    algorithms = [
        {"name": "DefaultQuantization", "params": quantization_params}
    ]

    calibration_loader = _CalibrationDataLoader(
        input_data=input_data, input_names=input_names
    )
    model = load_model(
        model_config={
            "model_name": "model",
            "model": model_topology,
            "weights": model_weights,
        }
    )
    engine = IEEngine(
        config={"device": "CPU"}, data_loader=calibration_loader
    )
    pipeline = create_pipeline(algorithms, engine)
    compressed_model = pipeline.run(model=model)
    compress_model_weights(compressed_model)

    saved = save_model(
        model=compressed_model,
        save_path="quantized_model",
        model_name="quantized_model",
    )
    first_saved = saved[0]

    return first_saved["model"], first_saved["weights"]


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/pytorch.py
================================================
import copy
from typing import List, Tuple, Union

from loguru import logger

from nebullvm.core.models import DeviceType, Device, QuantizationType
from nebullvm.optional_modules.torch import (
    torch,
    Module,
    symbolic_trace,
    QuantStub,
    DeQuantStub,
    GraphModule,
    default_dynamic_qconfig,
    prepare_fx,
    convert_fx,
    ScriptModule,
)
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    HalfPrecisionTransformation,
)
from nebullvm.tools.utils import check_module_version


class _QuantWrapper(Module):
    def __init__(self, model: Module):
        super(_QuantWrapper, self).__init__()
        qconfig = model.qconfig if hasattr(model, "qconfig") else None
        self.quant = QuantStub(qconfig)
        self.model = model
        self.dequant = DeQuantStub()

    def forward(self, *inputs: torch.Tensor):
        inputs = (self.quant(x) for x in inputs)
        outputs = self.model(*inputs)
        return tuple(self.dequant(x) for x in outputs)


def _quantize_dynamic_torch(model: Module):
    layer_types = {
        type(layer)
        for layer in model.children()
        if len(list(layer.parameters())) > 0
    }
    return torch.quantization.quantize_dynamic(
        model=model, qconfig_spec=layer_types, dtype=torch.qint8
    )


def _quantize_dynamic_torch_fx(
    model: GraphModule,
    input_data: List[Tuple[torch.Tensor, ...]],
):
    """Dynamically quantize a traced model through the FX workflow.

    Args:
        model (GraphModule): The traced model.
        input_data (List[Tuple[torch.Tensor, ...]]): Samples; the first
            one is passed as ``example_inputs`` when the version check on
            torch (min version 1.13.0) succeeds.

    Returns:
        The model returned by ``convert_fx``.
    """
    qconfig_mapping = {"": default_dynamic_qconfig}

    prepare_kwargs = {}
    if check_module_version(torch, min_version="1.13.0"):
        prepare_kwargs["example_inputs"] = input_data[0]

    prepared = prepare_fx(model, qconfig_mapping, **prepare_kwargs)
    return convert_fx(prepared)


def _quantize_static_torch(
    model: Module,
    input_data: List[Tuple[torch.Tensor, ...]],
    backend: str,
):
    """Statically quantize a model with the eager mode workflow.

    The model is wrapped in ``_QuantWrapper``, prepared, fed with every
    sample of ``input_data`` (without gradients) and finally converted.

    Args:
        model (Module): The model to quantize.
        input_data (List[Tuple[torch.Tensor, ...]]): Calibration samples.
        backend (str): Backend whose default qconfig is used.

    Returns:
        The model returned by ``torch.quantization.convert``.
    """
    wrapped = _QuantWrapper(model)
    wrapped.qconfig = torch.quantization.get_default_qconfig(backend)
    # TODO: change line below, it's wrong
    # wrapped = torch.quantization.fuse_modules(wrapped, [["conv", "relu"]])
    prepared = torch.quantization.prepare(wrapped)

    # Calibration pass: the outputs are discarded on purpose.
    with torch.no_grad():
        for sample in input_data:
            prepared(*sample)

    return torch.quantization.convert(prepared)


def _quantize_static_torch_fx(
    model: GraphModule,
    input_data: List[Tuple[torch.Tensor, ...]],
    backend: str,
):
    """Statically quantize a traced model through the FX workflow.

    The prepared model is fed with every sample of ``input_data``
    (without gradients) before being converted.

    Args:
        model (GraphModule): The traced model.
        input_data (List[Tuple[torch.Tensor, ...]]): Calibration samples;
            the first one is also passed as ``example_inputs`` when the
            version check on torch (min version 1.13.0) succeeds.
        backend (str): Backend whose default qconfig is used.

    Returns:
        The model returned by ``convert_fx``.
    """
    qconfig_mapping = {"": torch.quantization.get_default_qconfig(backend)}

    prepare_kwargs = {}
    if check_module_version(torch, min_version="1.13.0"):
        prepare_kwargs["example_inputs"] = input_data[0]

    prepared = prepare_fx(model, qconfig_mapping, **prepare_kwargs)

    # Calibration pass: the outputs are discarded on purpose.
    with torch.no_grad():
        for sample in input_data:
            prepared(*sample)

    return convert_fx(prepared)


def _quantize_static(
    model: Union[Module, GraphModule],
    input_data: List[Tuple[torch.Tensor, ...]],
    device: Device,
):
    """Statically quantize a torch model (CPU only).

    The quantized engine is set to ``fbgemm`` when available, otherwise to
    ``qnnpack``. Traced models (``GraphModule``) go through the FX
    workflow, the others through the eager mode one.

    Args:
        model (Union[Module, GraphModule]): The model to quantize.
        input_data (List[Tuple[torch.Tensor, ...]]): Calibration samples.
        device (Device): The target device.

    Returns:
        The quantized model.

    Raises:
        AssertionError: If the target device is a GPU.
    """
    # ``device`` is a ``Device`` object: the check has to look at its
    # ``type``, since comparing the object itself with the enum member can
    # never match. A bare ``DeviceType`` is tolerated as well.
    device_type = getattr(device, "type", device)
    assert (
        device_type is not DeviceType.GPU
    ), "Quantization for torch is only available on CPU"

    backend = (
        "fbgemm"
        if "fbgemm" in torch.backends.quantized.supported_engines
        else "qnnpack"
    )

    torch.backends.quantized.engine = backend

    if isinstance(model, GraphModule):
        return _quantize_static_torch_fx(model, input_data, backend)
    else:
        return _quantize_static_torch(model, input_data, backend)


def _quantize_dynamic(
    model: Union[Module, GraphModule],
    input_data: List[Tuple[torch.Tensor, ...]],
    device: Device,
):
    """Dynamically quantize a torch model (CPU only).

    The quantized engine is set to ``fbgemm`` when available, otherwise to
    ``qnnpack``. Traced models (``GraphModule``) go through the FX
    workflow, the others through the eager mode one.

    Args:
        model (Union[Module, GraphModule]): The model to quantize.
        input_data (List[Tuple[torch.Tensor, ...]]): Samples; only used by
            the FX workflow.
        device (Device): The target device.

    Returns:
        The quantized model.

    Raises:
        AssertionError: If the target device is a GPU.
    """
    # ``device`` is a ``Device`` object: the check has to look at its
    # ``type``, since comparing the object itself with the enum member can
    # never match. A bare ``DeviceType`` is tolerated as well.
    device_type = getattr(device, "type", device)
    assert (
        device_type is not DeviceType.GPU
    ), "Quantization for torch is only available on CPU"

    backend = (
        "fbgemm"
        if "fbgemm" in torch.backends.quantized.supported_engines
        else "qnnpack"
    )

    torch.backends.quantized.engine = backend

    if isinstance(model, GraphModule):
        return _quantize_dynamic_torch_fx(model, input_data)
    else:
        return _quantize_dynamic_torch(model)


def _half_precision(model: Module):
    return model.half()


def quantize_pytorch(
    model: Module,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data_torch: List[Tuple[torch.Tensor, ...]],
    device: Device,
) -> Union[torch.nn.Module, ScriptModule, GraphModule]:
    """Quantize a copy of a torch model.

    The model is deep-copied and put in eval mode; the copy is traced with
    ``symbolic_trace`` when possible, otherwise a warning is logged and
    the untraced copy is used.

    Args:
        model (Module): The model to quantize (left untouched).
        quantization_type (QuantizationType): HALF, STATIC or DYNAMIC.
        input_tfms (MultiStageTransformation): Input transformations; a
            ``HalfPrecisionTransformation`` is appended for HALF.
        input_data_torch (List[Tuple[torch.Tensor, ...]]): Samples used by
            the STATIC and DYNAMIC routines.
        device (Device): The target device.

    Returns:
        The quantized model.

    Raises:
        NotImplementedError: If ``quantization_type`` is none of the above.
    """
    working_model = copy.deepcopy(model).eval()

    try:
        working_model = symbolic_trace(working_model)
    except Exception:
        logger.warning("Unable to trace model with torch.fx")

    if quantization_type is QuantizationType.HALF:
        input_tfms.append(HalfPrecisionTransformation())
        return _half_precision(working_model)

    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(working_model, input_data_torch, device)

    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(working_model, input_data_torch, device)

    raise NotImplementedError(
        f"No quantization implemented for quantization "
        f"type {quantization_type}"
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tensor_rt.py
================================================
from typing import List, Tuple

import numpy as np

from nebullvm.core.models import QuantizationType, ModelParams
from nebullvm.optional_modules.tensor_rt import (
    tensorrt as trt,
    IInt8EntropyCalibrator2,
    polygraphy,
)
from nebullvm.tools.transformations import (
    MultiStageTransformation,
)


def quantize_tensorrt(
    quantization_type: QuantizationType,
    model_params: ModelParams,
    config,
    input_tfms: MultiStageTransformation,
    input_data: List[Tuple[np.ndarray, ...]] = None,
):
    """Set the quantization flags on a TensorRT builder config.

    Args:
        quantization_type (QuantizationType): HALF sets the FP16 flag,
            STATIC sets the INT8 flag and installs a calibrator. Any other
            value leaves ``config`` untouched.
        model_params (ModelParams): The model parameters; the batch size
            is given to the calibrator.
        config: The builder config, modified in place.
        input_tfms (MultiStageTransformation): Not used by this function.
        input_data (List[Tuple[np.ndarray, ...]], optional): Calibration
            samples, required for STATIC. Default: None.

    Returns:
        The ``config`` object received as input.
    """
    if quantization_type is QuantizationType.STATIC:
        assert input_data is not None, (
            "You need to specify the calibration data for "
            "performing static quantization."
        )
        int8_calibrator = TensorRTCalibrator(
            batch_size=model_params.batch_size,
            input_data=input_data,
        )
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = int8_calibrator
    elif quantization_type is QuantizationType.HALF:
        # Tensor RT does not need to transform input data
        # to fp16 because it expects always fp32
        config.set_flag(trt.BuilderFlag.FP16)

    return config


class TensorRTCalibrator(IInt8EntropyCalibrator2):
    """INT8 calibrator serving the samples of a list, one per batch."""

    def __init__(
        self, batch_size: int, input_data: List[Tuple[np.ndarray, ...]]
    ):
        super().__init__()
        self._bs = batch_size
        # Single-pass iterator over the calibration samples.
        self.batches = iter(input_data)

    def get_batch(self, names):
        """Copy the next sample to device arrays and return their pointers.

        Args:
            names: Not used.

        Returns:
            The list of the ``ptr`` of each device array, or None when no
            sample is left.
        """
        cuda_stream = polygraphy.Stream()
        try:
            host_tensors = next(self.batches)

            device_arrays = []
            for host_tensor in host_tensors:
                device_array = polygraphy.DeviceArray(
                    shape=host_tensor.shape, dtype=host_tensor.dtype
                )
                device_array.copy_from(
                    host_buffer=host_tensor, stream=cuda_stream
                )
                device_arrays.append(device_array)

            pointers = [device_array.ptr for device_array in device_arrays]
        except StopIteration:
            return None
        return pointers

    def get_batch_size(self):
        """Return the batch size given at construction time."""
        return self._bs

    def read_calibration_cache(self):
        """Return None: no calibration cache is read."""
        return None

    def write_calibration_cache(self, cache):
        """Return None: the given cache is not stored."""
        return None


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tensorflow.py
================================================
from typing import List, Tuple

from nebullvm.core.models import QuantizationType
from nebullvm.optional_modules.tensorflow import tensorflow as tf


def _quantize_dynamic(model: tf.Module):
    """Convert a keras model to TFLite with the default optimizations.

    Args:
        model (tf.Module): The model to convert.

    Returns:
        The value returned by the converter's ``convert()``.
    """
    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    return tflite_converter.convert()


def _quantize_static(model: tf.Module, dataset: List[Tuple[tf.Tensor, ...]]):
    """Convert a keras model to TFLite using a representative dataset.

    Args:
        model (tf.Module): The model to convert.
        dataset (List[Tuple[tf.Tensor, ...]]): Samples; each tuple is
            yielded to the converter as a list.

    Returns:
        The value returned by the converter's ``convert()``.
    """

    def _calibration_samples():
        for sample in dataset:
            yield list(sample)

    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.representative_dataset = _calibration_samples
    return tflite_converter.convert()


def _half_precision(model: tf.Module):
    """Convert a keras model to TFLite with float16 as supported type.

    Args:
        model (tf.Module): The model to convert.

    Returns:
        The value returned by the converter's ``convert()``.
    """
    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.target_spec.supported_types = [tf.float16]
    return tflite_converter.convert()


def quantize_tensorflow(
    model: tf.Module,
    quantization_type: QuantizationType,
    input_data_tensorflow: List[Tuple[tf.Tensor, ...]],
):
    """Dispatch to the routine matching ``quantization_type``.

    Args:
        model (tf.Module): The model to quantize.
        quantization_type (QuantizationType): DYNAMIC, STATIC or HALF.
        input_data_tensorflow (List[Tuple[tf.Tensor, ...]]): Samples; only
            used for STATIC.

    Returns:
        The quantized model.

    Raises:
        NotImplementedError: If ``quantization_type`` is none of the above.
    """
    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(model)
    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(model, input_data_tensorflow)
    if quantization_type is QuantizationType.HALF:
        return _half_precision(model)

    raise NotImplementedError(
        f"Quantization not supported for type {quantization_type}"
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tvm.py
================================================
from typing import List, Sequence, Any

from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType
from nebullvm.optional_modules.tvm import (
    relay,
    ToMixedPrecision,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    HalfPrecisionTransformation,
)


class TVMCalibrator(DataManager):
    """Data manager returning every sample as a name -> value mapping."""

    def __init__(self, data_reader: Sequence, input_names: List[str]):
        super().__init__(data_reader=data_reader)
        self._input_names = input_names

    def __getitem__(self, item: int):
        """Return the sample at ``item`` keyed by the input names."""
        sample = self._data_reader[item]
        # Values are paired positionally with the input names.
        return dict(zip(self._input_names, sample))


def quantize_apache_tvm(
    model: Any,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data: DataManager,
    params: Any,
):
    """Quantize a TVM relay model according to ``quantization_type``.

    Args:
        model (Any): The relay model.
        quantization_type (QuantizationType): HALF applies
            ``ToMixedPrecision`` with float16, DYNAMIC quantizes with the
            ``global_scale`` calibration mode, STATIC quantizes with the
            ``kl_divergence`` mode over the "train" split of the data.
        input_tfms (MultiStageTransformation): Input transformations; a
            ``HalfPrecisionTransformation`` is appended for HALF.
        input_data (DataManager): Data; only used for STATIC.
        params (Any): Parameters passed to ``relay.quantize.quantize``.

    Returns:
        The quantized model, or None when ``quantization_type`` is None or
        is not one of the types listed above.
    """
    if quantization_type is None:
        return None

    if quantization_type is QuantizationType.HALF:
        to_fp16 = ToMixedPrecision(mixed_precision_type="float16")
        quantized_model = to_fp16(model)
        input_tfms.append(HalfPrecisionTransformation())
        return quantized_model

    if quantization_type is QuantizationType.DYNAMIC:
        # No calibration data: a fixed global scale is used.
        with relay.quantize.qconfig(
            calibrate_mode="global_scale", global_scale=8.0
        ):
            quantized_model = relay.quantize.quantize(model, params)
        return quantized_model

    if quantization_type is QuantizationType.STATIC:
        samples = input_data.get_split("train").get_numpy_list(
            QUANTIZATION_DATA_NUM
        )
        # Inputs are named after their position in the sample tuple.
        input_names = [f"input_{n}" for n in range(len(samples[0]))]
        calibrator = TVMCalibrator(samples, input_names)
        with relay.quantize.qconfig(
            calibrate_mode="kl_divergence", weight_scale="max"
        ):
            quantized_model = relay.quantize.quantize(
                model, params, dataset=calibrator
            )
        return quantized_model

    return None


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/utils.py
================================================
from loguru import logger

from nebullvm.core.models import QuantizationType


def check_quantization(
    quantization_type: QuantizationType, perf_loss_ths: float
):
    """Warn when a quantization type comes without a threshold.

    Args:
        quantization_type (QuantizationType): The requested quantization,
            None when no quantization is requested.
        perf_loss_ths (float): The quantization threshold, None when it
            has not been given.
    """
    threshold_missing = perf_loss_ths is None
    if quantization_type is None or not threshold_missing:
        return

    logger.warning(
        "Got a valid quantization type without any given quantization "
        "threshold. The quantization step will be ignored."
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/tensor_rt.py
================================================
import abc
import copy
import os
import subprocess
from pathlib import Path
from typing import List, Any, Tuple

import numpy as np

from nebullvm.config import QUANTIZATION_DATA_NUM, TORCH_TENSORRT_PRECISIONS
from nebullvm.core.models import QuantizationType, ModelParams
from nebullvm.operations.optimizations.compilers.base import Compiler

from nebullvm.operations.optimizations.compilers.quantizations.tensor_rt import (  # noqa: E501
    quantize_tensorrt,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.tensor_rt import tensorrt as trt
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.optional_modules.torch_tensorrt import (
    torch_tensorrt,
    DataLoaderCalibrator,
)
from nebullvm.tools.data import DataManager, PytorchDataset
from nebullvm.tools.diffusers import UNet
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.transformations import (
    MultiStageTransformation,
    HalfPrecisionTransformation,
)


class TensorRTCompiler(Compiler, abc.ABC):
    """Abstract base class shared by the TensorRT compilers."""

    # Quantization types accepted for each device type. ``None`` stands for
    # a compilation without quantization.
    supported_ops = {
        "cpu": [],
        "gpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
        ],
    }

    def __init__(self):
        super().__init__()
        self.model_orig = None

    @staticmethod
    def _extract_dynamic_shape_ranges(model_params: ModelParams):
        """Build the shape description of every model input.

        Args:
            model_params (ModelParams): The model parameters.

        Returns:
            A list with one dictionary per input. Without dynamic info the
            dictionary is ``{"shape": <static shape>}``. Otherwise it holds
            the ``min_shape``, ``opt_shape`` and ``max_shape`` lists, where
            each dimension listed in the dynamic info takes the matching
            min/opt/max value and the other dimensions keep the static
            size.

        Raises:
            AssertionError: If a dynamic dimension misses one of the
                min/opt/max values.
        """
        # (key of the produced dict, key read from the dynamic info)
        bounds = (
            ("min_shape", "min_val"),
            ("opt_shape", "opt_val"),
            ("max_shape", "max_val"),
        )
        inputs_shapes = []

        for index, info in enumerate(model_params.input_infos):
            static_shape = info.size

            if model_params.dynamic_info is None:
                inputs_shapes.append({"shape": static_shape})
                continue

            dynamic_dims = model_params.dynamic_info.inputs[index]

            assert all(
                key in dim_range
                for dim_range in dynamic_dims.values()
                for key in ["min_val", "opt_val", "max_val"]
            ), (
                "Missing min/opt/max ranges, TensorRT needs them to "
                "enable dynamic shape properly"
            )

            shape_dict = {}
            for shape_key, value_key in bounds:
                shape_dict[shape_key] = [
                    dynamic_dims[axis][value_key]
                    if axis in dynamic_dims
                    else static_shape[axis]
                    for axis in range(len(static_shape))
                ]
            inputs_shapes.append(shape_dict)

        return inputs_shapes

    @abc.abstractmethod
    def execute(self, *args, **kwargs):
        """Compile the model; to be implemented by the subclasses."""
        pass


class PyTorchTensorRTCompiler(TensorRTCompiler):
    """Compile PyTorch modules with TensorRT through ``torch_tensorrt``."""

    def execute(
        self,
        model: Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using TensorRT Compiler from the
            PyTorch interface.

        The outcome is stored in ``self.compiled_model``; it is reset to
        ``None`` whenever the requested quantization cannot be applied.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. When given and half precision is
                requested, a ``HalfPrecisionTransformation`` is appended
                to it. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. One sample is
                always read from it to provide the example inputs, and
                its "train" split feeds the calibrator for static
                quantization. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Only the static branch builds a calibrator.
        calibrator = None

        if quantization_type is QuantizationType.HALF:
            dtype = torch.half
            # ``input_tfms`` defaults to None: register the half-precision
            # cast only when a transformation pipeline was provided.
            if input_tfms is not None:
                input_tfms.append(HalfPrecisionTransformation())
        elif quantization_type is QuantizationType.STATIC:
            if model_params.dynamic_info is not None:
                self.logger.warning(
                    "Static quantization is not available when "
                    "using dynamic shape"
                )
                # Clear any result left by a previous call so that a
                # stale model is not reported for this quantization type.
                self.compiled_model = None
                return
            dtype = torch.int8

            dataset = PytorchDataset(input_data.get_split("train"))
            dataloader = torch.utils.data.DataLoader(
                dataset,
                batch_size=dataset.batch_size,
                shuffle=False,
                num_workers=0,
            )

            calibrator = torch_tensorrt.ptq.DataLoaderCalibrator(
                dataloader,
                use_cache=False,
                algo_type=torch_tensorrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,  # noqa E501
                device=torch.device(self.device.to_torch_format()),
            )
        else:
            dtype = torch.float32

        # Convert int64 to int32 for transformers inputs
        input_tensors = [
            tensor.to(self.device.to_torch_format())
            if tensor.dtype != torch.int64
            else tensor.to(torch.int32).to(self.device.to_torch_format())
            for tensor in input_data.get_list(1)[0]
        ]

        self.compiled_model = self._compile_model(
            model=model,
            model_params=model_params,
            input_tensors=input_tensors,
            dtype=dtype,
            calibrator=calibrator,
            quantization_type=quantization_type,
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: Module,
        model_params: ModelParams,
        input_tensors: List[torch.Tensor],
        dtype: torch.dtype,
        calibrator: DataLoaderCalibrator,
        quantization_type: QuantizationType,
    ):
        """Convert the model to TorchScript and compile it with TensorRT.

        Args:
            model (torch.nn.Module): The pytorch model. It is moved to the
                compiler device and switched to eval mode in place.
            model_params (ModelParams): The model parameters, used to
                derive the (possibly dynamic) input shapes.
            input_tensors (List[torch.Tensor]): Example inputs, used for
                tracing and to pick the dtype of each TensorRT input.
            dtype (torch.dtype): Precision selecting the entry of
                ``TORCH_TENSORRT_PRECISIONS`` to enable.
            calibrator (DataLoaderCalibrator): Calibrator forwarded only
                for static quantization.
            quantization_type (QuantizationType): The quantization type.

        Returns:
            The module returned by ``torch_tensorrt.compile``.
        """

        model.to(self.device.to_torch_format()).eval()

        # Scripting is attempted first; tracing with the example inputs
        # is the fallback when scripting fails. For half precision a
        # deep copy is converted so the original weights are untouched.
        try:
            if quantization_type is QuantizationType.HALF:
                ts_model = torch.jit.script(copy.deepcopy(model).half()).half()
            else:
                ts_model = torch.jit.script(model)
        except Exception:
            if quantization_type is QuantizationType.HALF:
                ts_model = torch.jit.trace(
                    copy.deepcopy(model).half(),
                    [t.half() for t in input_tensors],
                ).half()
            else:
                ts_model = torch.jit.trace(model, input_tensors)

        with torch_tensorrt.logging.errors():
            inputs_shapes = self._extract_dynamic_shape_ranges(model_params)
            trt_model = torch_tensorrt.compile(
                ts_model,
                inputs=[
                    torch_tensorrt.Input(
                        **inputs_shapes[i],
                        # Integer inputs keep their dtype even when the
                        # model is compiled in half precision.
                        dtype=torch.half
                        if (
                            dtype == torch.half
                            and tensor.dtype not in [torch.int8, torch.int32]
                        )
                        else tensor.dtype,
                    )
                    for i, tensor in enumerate(input_tensors)
                ],
                enabled_precisions=TORCH_TENSORRT_PRECISIONS[str(dtype)],
                calibrator=calibrator
                if quantization_type is QuantizationType.STATIC
                else None,
                workspace_size=self.device.get_free_memory(),
                device={
                    "device_type": torch_tensorrt.DeviceType.GPU,
                    "gpu_id": self.device.idx,
                    "dla_core": 0,
                    "allow_gpu_fallback": False,
                    "disable_tf32": False,
                },
                truncate_long_and_double=True,
            )

        # Delete calibration cache
        if os.path.exists("calibration.cache"):
            os.remove("calibration.cache")

        return trt_model

    @staticmethod
    def _quantize_model(**kwargs) -> Any:
        """Not used: quantization is handled inside ``execute``."""
        raise NotImplementedError


class ONNXTensorRTCompiler(TensorRTCompiler):
    """Build serialized TensorRT engines starting from an ONNX file."""

    def __init__(self):
        super().__init__()
        self.model_orig = None
        # Path of the ONNX file handed to the TensorRT parser. It is
        # resolved by ``execute`` and kept for the following calls.
        self.onnx_model_path = None
        # True until the simplification / graph optimization step has
        # been attempted once.
        self.simplify_model = True

    def execute(
        self,
        model: str,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        is_diffusion: bool = False,
        **kwargs,
    ):
        """Compile the input model using TensorRT Compiler from the
            ONNX interface.

        The outcome is stored in ``self.compiled_model``; it is reset to
        ``None`` whenever the requested configuration is skipped.

        Args:
            model (str): The path to the onnx model.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Only read for
                static quantization. Default: None
            is_diffusion (bool): Whether the model is a diffusion model.
                Default: False.

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``, or if the ONNX file cannot be parsed.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Calibration samples are only consumed by static quantization,
        # so ``input_data`` is not touched in the other cases (it may
        # legitimately be None there).
        train_input_data = None
        if quantization_type is QuantizationType.STATIC:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        if self.simplify_model and not is_diffusion:
            try:
                import onnxsim  # noqa: F401

                # Simplify model, otherwise tensor RT won't work
                # on gpt2 and some other models.
                simplified_model = str(model) + "_simplified"
                if not Path(simplified_model).is_file():
                    cmd = [
                        "onnxsim",
                        str(model),
                        simplified_model,
                    ]
                    subprocess.run(cmd, stdout=subprocess.DEVNULL)

                # First try with simplified model
                self.onnx_model_path = simplified_model
                # Explicit check (instead of ``assert``) so the fallback
                # below is still triggered when running with ``-O``.
                if not os.path.isfile(self.onnx_model_path):
                    raise FileNotFoundError(self.onnx_model_path)
            except Exception:
                # Use original model
                self.logger.warning(
                    "Unable to simplify model with ONNX Simplifier. "
                    "Original ONNX model will be used to build "
                    "TensorRT engine"
                )
                self.onnx_model_path = str(model)
            self.simplify_model = False
        elif self.onnx_model_path is None:
            self.onnx_model_path = str(model)

        if is_diffusion:
            if quantization_type is None:
                self.logger.warning(
                    "Skipping float32 precision for Stable Diffusion, "
                    "half precision will be used instead."
                )
                # Do not leave the result of a previous call in place.
                self.compiled_model = None
                return
            if quantization_type is QuantizationType.STATIC:
                self.logger.warning(
                    "Skipping static quantization for Stable Diffusion "
                    "because for now it's not supported."
                )
                # Do not leave the result of a previous call in place.
                self.compiled_model = None
                return

        if self.simplify_model and is_diffusion:
            optimized_model = str(Path(model).parent / "model_opt.onnx")
            unet = UNet(hf_token=None)
            opt_graph = unet.optimize(onnx.load(str(model)))
            try:
                onnx.save(opt_graph, optimized_model)
            except Exception:
                # Retry storing the weights as external data.
                onnx.save(
                    opt_graph, optimized_model, save_as_external_data=True
                )
            self.onnx_model_path = optimized_model
            self.simplify_model = False

        # -- Build phase --
        nvidia_logger = trt.Logger(trt.Logger.ERROR)
        builder = trt.Builder(nvidia_logger)
        # create network definition
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        )
        # build the engine
        # TODO: setup config value for the class in a config file
        config = builder.create_builder_config()
        try:
            config.set_memory_pool_limit(
                trt.MemoryPoolType.WORKSPACE, self.device.get_free_memory()
            )
        except AttributeError:
            # The method set_memory_pool_limit is not available
            # until TensorRT Release 8.4.1
            self.logger.warning(
                "Cannot call method set_memory_pool_limit for TensorRT. "
                "because your version is lower than 8.4.1. "
                "Please update TensorRT version."
            )

        if quantization_type is not None:
            config = self._quantize_model(
                quantization_type,
                model_params,
                config,
                input_tfms,
                train_input_data,
            )

        self.compiled_model = self._compile_model(
            onnx_model_path=str(self.onnx_model_path),
            model_params=model_params,
            config=config,
            network=network,
            builder=builder,
            nvidia_logger=nvidia_logger,
        )
        self.model_orig = self.onnx_model_path

    def _compile_model(
        self,
        onnx_model_path: str,
        model_params: ModelParams,
        config,
        network,
        builder,
        nvidia_logger,
    ):
        """Parse the ONNX file into ``network`` and build the engine.

        Args:
            onnx_model_path (str): Path of the ONNX file to parse.
            model_params (ModelParams): The model parameters; when they
                carry dynamic info an optimization profile with the
                min/opt/max shapes of every input is added to ``config``.
            config: TensorRT builder config.
            network: TensorRT network definition filled by the parser.
            builder: TensorRT builder.
            nvidia_logger: TensorRT logger given to the ONNX parser.

        Returns:
            The value returned by ``builder.build_serialized_network``.

        Raises:
            ValueError: If the ONNX file cannot be parsed.
        """
        parser = trt.OnnxParser(network, nvidia_logger)
        success = parser.parse_from_file(onnx_model_path)

        if not success:
            for idx in range(parser.num_errors):
                self.logger.debug(parser.get_error(idx))
            raise ValueError(
                f"Errors occurred while processing the "
                f"ONNX file at {onnx_model_path}"
            )

        if model_params.dynamic_info is not None:
            inputs_shapes = self._extract_dynamic_shape_ranges(model_params)
            profile = builder.create_optimization_profile()
            for i, input_name in enumerate(get_input_names(onnx_model_path)):
                profile.set_shape(
                    input_name,
                    inputs_shapes[i]["min_shape"],
                    inputs_shapes[i]["opt_shape"],
                    inputs_shapes[i]["max_shape"],
                )
            config.add_optimization_profile(profile)
        return builder.build_serialized_network(network, config)

    @staticmethod
    def _quantize_model(
        quantization_type: QuantizationType,
        model_params: ModelParams,
        config,
        input_tfms: MultiStageTransformation,
        input_data: List[Tuple[np.ndarray, ...]] = None,
    ):
        """Delegate to ``quantize_tensorrt`` and return its result."""
        return quantize_tensorrt(
            quantization_type,
            model_params,
            config,
            input_tfms,
            input_data,
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/tensorflow.py
================================================
from typing import List, Tuple

from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler

from nebullvm.operations.optimizations.compilers.quantizations.tensorflow import (  # noqa: E501
    quantize_tensorflow,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class TensorflowBackendCompiler(Compiler):
    """Compiler for the plain TensorFlow backend.

    No quantization is supported and the model is stored unchanged as
    the compilation result.
    """

    supported_ops = {"cpu": [None], "gpu": [None]}

    def execute(
        self,
        model: tf.Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Store the tensorflow model as the compiled model.

        Args:
            model (tf.Module): The tensorflow model.
            input_tfms (MultiStageTransformation, optional): Input
                transformations; not used here. Default: None.
            metric_drop_ths (float, optional): Accepted metric drop,
                forwarded to ``check_quantization``. Default: None.
            quantization_type (QuantizationType, optional): The requested
                quantization. When it is not listed in ``supported_ops``
                for the current device the result is set to None.
                Default: None.
            input_data (DataManager): User defined data. Default: None.

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        allowed_types = self.supported_ops[self.device.type.value]
        if quantization_type not in allowed_types:
            self.compiled_model = None
            return

        wants_static = quantization_type is QuantizationType.STATIC
        if wants_static and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        compiler_name = self.__class__.__name__
        self.logger.info(
            f"Optimizing with {compiler_name} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Nothing to compile: the model itself is the result.
        self.compiled_model = model

    def _compile_model(self):
        """No-op: ``execute`` stores the model directly."""
        pass

    @staticmethod
    def _quantize_model(**kwargs):
        """Quantization is not available for this backend."""
        raise NotImplementedError()


class TFLiteBackendCompiler(Compiler):
    """Convert TensorFlow models to TFLite, optionally quantizing them."""

    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [],
    }

    def execute(
        self,
        model: tf.Module,
        input_tfms: MultiStageTransformation,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Optimize the input model by converting it to TFLite.

        The outcome is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (tf.Module): The tensorflow model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Its "train" split
                is only read when a quantization is requested.
                Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        if quantization_type is None:
            # The plain conversion needs no data, so ``input_data`` is
            # not touched (it may be None here).
            self.compiled_model = self._compile_model(model)
            return

        train_input_data = input_data.get_split("train").get_list(
            QUANTIZATION_DATA_NUM
        )
        self.compiled_model = self._quantize_model(
            model, quantization_type, train_input_data
        )

    def _compile_model(
        self,
        model: tf.Module,
    ):
        """Convert the model with ``TFLiteConverter.from_keras_model``.

        Returns:
            The object returned by the converter's ``convert()`` call.
        """
        converter = tf.lite.TFLiteConverter.from_keras_model(model)
        tflite_model = converter.convert()
        return tflite_model

    @staticmethod
    def _quantize_model(
        model: tf.Module,
        quantization_type: QuantizationType,
        input_data_tensorflow: List[Tuple[tf.Tensor, ...]],
    ):
        """Delegate to ``quantize_tensorflow`` and return its result."""
        return quantize_tensorflow(
            model, quantization_type, input_data_tensorflow
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_dynamo.py
================================================
from typing import Union, Any

from nebullvm.core.models import ModelParams, QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler

from nebullvm.optional_modules.torch import (
    torch,
    Module,
    GraphModule,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class TorchDynamoCompiler(Compiler):
    """Compiler wrapping ``torch.compile``; no quantization supported."""

    supported_ops = {"cpu": [None], "gpu": [None]}

    def execute(
        self,
        model: Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the model with ``torch.compile``.

        The outcome is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Input
                transformations; not used here. Default: None.
            metric_drop_ths (float, optional): Accepted metric drop; not
                used here. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data; not used here.
                Default: None.
        """
        device_key = self.device.type.value
        if quantization_type not in self.supported_ops[device_key]:
            self.compiled_model = None
            return

        compiler_name = self.__class__.__name__
        self.logger.info(
            f"Optimizing with {compiler_name} and "
            f"q_type: {quantization_type}."
        )

        self.compiled_model = self._compile_model(model, model_params)

    @torch.no_grad()
    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        network_parameters: ModelParams,
    ) -> Any:
        """Return ``torch.compile(model)``.

        Dynamic mode is enabled exactly when the model parameters carry
        dynamic shape information.
        """
        has_dynamic_shapes = network_parameters.dynamic_info is not None
        return torch.compile(model, dynamic=has_dynamic_shapes)

    def _quantize_model(self, **kwargs) -> Any:
        """Quantization is not available for this compiler."""
        raise NotImplementedError


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_neuron.py
================================================
from typing import List, Tuple

from nebullvm.core.models import QuantizationType, ModelParams, DeviceType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.torch import (
    torch,
    symbolic_trace,
)
from nebullvm.optional_modules.torch_neuron import torch_neuron
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class TorchNeuronCompiler(Compiler):
    """Compiler tracing PyTorch models with ``torch_neuron``."""

    supported_ops = {
        "cpu": [],
        "gpu": [],
        "neuron": [None, QuantizationType.HALF],
    }

    @staticmethod
    def _check_dynamic_shape(network_parameters: ModelParams) -> bool:
        """Validate the dynamic shape info of the model.

        Only a dynamic batch dimension (axis 0) is accepted for each
        input.

        Args:
            network_parameters (ModelParams): The model parameters.

        Returns:
            bool: False when there is no dynamic info, True otherwise.

        Raises:
            ValueError: If an input declares a dynamic axis other than
                axis 0.
        """
        dynamic_info = network_parameters.dynamic_info
        if dynamic_info is None:
            return False

        for idx, dyn_axes in enumerate(dynamic_info.inputs):
            n_axes = len(dyn_axes)
            # Accepted: no dynamic axis at all, or exactly one entry
            # that is the batch axis (key 0).
            batch_only = n_axes == 0 or (
                n_axes == 1 and dyn_axes.get(0) is not None
            )
            if batch_only:
                continue
            raise ValueError(
                f"TorchNeuronCompiler only supports dynamic shapes for "
                f"batch dimension. Provided dynamic info for input {idx} "
                f"is: {dyn_axes}. Please use padding for the other "
                f"dimensions."
            )

        return True

    def execute(
        self,
        model: torch.nn.Module,
        model_params: ModelParams,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Trace the input model with ``torch_neuron``.

        The outcome is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            metric_drop_ths (float, optional): Threshold for the accepted
                drop in terms of precision, forwarded to
                ``check_quantization``. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data, from which the
                example input is read. Default: None.

        Raises:
            ValueError: If static quantization is requested without
                ``input_data`` or the dynamic shape info is not limited
                to the batch dimension.
            RuntimeError: If the model cannot be traced.
        """
        allowed_types = self.supported_ops[self.device.type.value]
        if quantization_type not in allowed_types:
            self.compiled_model = None
            return

        wants_static = quantization_type is QuantizationType.STATIC
        if wants_static and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        compiler_name = self.__class__.__name__
        self.logger.info(
            f"Optimizing with {compiler_name} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)
        has_dynamic_batch = self._check_dynamic_shape(model_params)

        self.compiled_model = self._compile_model(
            model,
            input_data,
            quantization_type,
            dynamic_batch_size=has_dynamic_batch,
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: torch.nn.Module,
        input_data: DataManager,
        quantization_type: QuantizationType,
        dynamic_batch_size: bool,
    ) -> torch.jit.ScriptModule:
        """Trace the model, first through ``symbolic_trace`` then directly.

        Args:
            model (torch.nn.Module): The pytorch model; switched to eval
                mode in place.
            input_data (DataManager): Data providing the example input.
            quantization_type (QuantizationType): The quantization type.
                When None, ``--fast-math none`` is passed as compiler
                arguments.
            dynamic_batch_size (bool): Forwarded to ``torch_neuron.trace``.

        Returns:
            The module returned by ``torch_neuron.trace``.

        Raises:
            RuntimeError: If both tracing attempts fail.
        """
        input_sample = input_data.get_list(1)[0]

        if self.device.type is DeviceType.GPU:
            to_half = quantization_type is QuantizationType.HALF
            moved = []
            for tensor in input_sample:
                on_device = tensor.to(self.device.to_torch_format())
                # Only floating point tensors are cast to half precision.
                if to_half and torch.is_floating_point(tensor):
                    on_device = on_device.half()
                moved.append(on_device)
            input_sample = moved
            model.to(self.device.to_torch_format())
        model.eval()

        def _neuron_trace(target):
            # A fresh argument list is built for every attempt.
            if quantization_type is None:
                extra_args = ["--fast-math", "none"]
            else:
                extra_args = None
            return torch_neuron.trace(
                target,
                input_sample,
                dynamic_batch_size=dynamic_batch_size,
                compiler_args=extra_args,
            )

        try:
            # First attempt: trace the torch.fx graph of the model.
            traced = _neuron_trace(symbolic_trace(model))
        except Exception:
            try:
                # Fallback: trace the original module directly.
                traced = _neuron_trace(model)
            except Exception:
                raise RuntimeError("Unable to trace model with torch_neuron.")

        return traced

    @torch.no_grad()
    def _quantize_model(
        self,
        model: torch.nn.Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data_torch: List[Tuple[torch.Tensor, ...]],
    ):
        """Quantization is not implemented for this compiler."""
        raise NotImplementedError()


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_xla.py
================================================
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.torchscript import (
    TorchScriptCompiler,
)
from nebullvm.optional_modules.torch import (
    torch,
)
from nebullvm.tools.data import DataManager


class TorchXLACompiler(TorchScriptCompiler):
    """TorchScript compiler variant for TPU devices.

    Compilation only consists in moving the model to the target device.
    """

    supported_ops = {
        "cpu": [],
        "gpu": [],
        "tpu": [None, QuantizationType.HALF],
    }

    @torch.no_grad()
    def _compile_model(
        self,
        model: torch.nn.Module,
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> torch.nn.Module:
        """Move the model to the compiler device and return it.

        ``input_data`` and ``quantization_type`` are part of the
        signature but are not used by this implementation.
        """
        target_device = self.device.to_torch_format()
        return model.to(target_device)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torchscript.py
================================================
from typing import Union, List, Tuple

from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType, DeviceType
from nebullvm.operations.optimizations.compilers.base import Compiler

from nebullvm.operations.optimizations.compilers.quantizations.pytorch import (
    quantize_pytorch,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.torch import (
    torch,
    Module,
    ScriptModule,
    GraphModule,
    symbolic_trace,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation


class TorchScriptCompiler(Compiler):
    """Compile PyTorch modules to TorchScript, optionally quantized."""

    supported_ops = {
        "cpu": [None, QuantizationType.STATIC, QuantizationType.DYNAMIC],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    def execute(
        self,
        model: Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Optimize the input model using pytorch built-in techniques.

        The outcome is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Its "train" split
                is only read when a quantization is requested; the object
                itself is forwarded to ``_compile_model``. Default: None.

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        if quantization_type is not None:
            # The training samples are only consumed by the quantization
            # step, so they are not materialized for plain compilation.
            train_input_data = input_data.get_split("train").get_list(
                QUANTIZATION_DATA_NUM
            )
            model = self._quantize_model(
                model, quantization_type, input_tfms, train_input_data
            )

        self.compiled_model = self._compile_model(
            model, input_data, quantization_type
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> ScriptModule:
        """Turn the model into a TorchScript module.

        For a regular module the attempts are, in order: ``torch.fx``
        symbolic tracing followed by scripting, direct scripting, and
        finally ``torch.jit.trace`` with one example input. A
        ``GraphModule`` is scripted directly.

        Args:
            model (Union[Module, GraphModule]): The model to compile.
            input_data (DataManager): Data providing the example input.
            quantization_type (QuantizationType): The quantization type.
                On GPU with half precision, floating point example
                inputs are cast to half.

        Returns:
            ScriptModule: The scripted or traced model.
        """
        input_sample = input_data.get_list(1)[0]
        if self.device.type is DeviceType.GPU:
            if quantization_type is QuantizationType.HALF:
                input_sample = [
                    t.to(self.device.to_torch_format()).half()
                    if torch.is_floating_point(t)
                    else t.to(self.device.to_torch_format())
                    for t in input_sample
                ]
            else:
                input_sample = [
                    t.to(self.device.to_torch_format()) for t in input_sample
                ]
            model.to(self.device.to_torch_format())

        if not isinstance(model, torch.fx.GraphModule):
            model.eval()
            try:
                model_scripted = symbolic_trace(model)
                model_scripted = torch.jit.script(model_scripted)
            except Exception:
                # The warning is only emitted for the non-quantized run.
                if quantization_type is None:
                    self.logger.warning("Unable to trace model with torch.fx")
                try:
                    model_scripted = torch.jit.script(model)
                except Exception:
                    model_scripted = torch.jit.trace(model, input_sample)
        else:
            model_scripted = torch.jit.script(model)

        return model_scripted

    @torch.no_grad()
    def _quantize_model(
        self,
        model: Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data_torch: List[Tuple[torch.Tensor, ...]],
    ):
        """Delegate to ``quantize_pytorch`` and return its result."""
        return quantize_pytorch(
            model, quantization_type, input_tfms, input_data_torch, self.device
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/tvm.py
================================================
import abc
import os
import uuid
from abc import ABC
from typing import Any, Tuple, Dict, Union

from nebullvm.config import (
    AUTO_TVM_PARAMS,
    AUTO_TVM_TUNING_OPTION,
)
from nebullvm.core.models import (
    QuantizationType,
    ModelParams,
    DeviceType,
    Device,
)
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.tvm import (
    TVMCalibrator,
    quantize_apache_tvm,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
    check_quantization,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.torch import Module, torch
from nebullvm.optional_modules.tvm import (
    tvm,
    IRModule,
    NDArray,
    XGBTuner,
    autotvm,
    relay,
    ExecutorFactoryModule,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.pytorch import create_model_inputs_torch
from nebullvm.tools.transformations import MultiStageTransformation


class ApacheTVMCompiler(Compiler, ABC):
    """Base class for compilers that build models with Apache TVM.

    Subclasses implement ``_build_tvm_model`` to convert their source
    format into a Relay module plus its parameters; quantization, tuning
    and the final build are shared here.
    """

    # Quantization types accepted for each device type value.
    supported_ops = {
        "cpu": [
            None,
            # QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [
            None,
            # QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
    }

    def __init__(self):
        super().__init__()
        self.model_orig = None

    def execute(
        self,
        model: Union[Module, str],
        input_tfms: MultiStageTransformation,
        model_params: ModelParams,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using Apache TVM compiler.

        The result is stored in ``self.compiled_model``; it is set to None
        when ``quantization_type`` is not supported on the current device.

        Args:
            model (Union[Module, str]): The input model. Can be a torch model
                or a path to an onnx model.
            input_tfms (MultiStageTransformation): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            model_params (ModelParams): Model parameters.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """

        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        mod, params = self._build_tvm_model(model, model_params)

        if quantization_type is not None:
            mod = self._quantize_model(
                mod, quantization_type, input_tfms, input_data, params
            )

        self.compiled_model = self._compile_model(mod, params)

    @abc.abstractmethod
    def _build_tvm_model(self, model: Any, model_params: ModelParams):
        """Convert ``model`` into a ``(mod, params)`` pair for TVM."""
        raise NotImplementedError()

    @staticmethod
    def _build_tvm_model_from_torch(
        torch_model: Module, model_params: ModelParams, device: Device
    ) -> Tuple[IRModule, Dict[str, NDArray]]:
        """Trace a torch model and import it through the Relay frontend.

        Inputs are generated from ``model_params.input_infos`` and, like the
        model, placed on the GPU device when ``device`` is a GPU and on the
        CPU otherwise. Relay inputs are named ``input_<i>``.
        """
        shape_dict = {
            f"input_{i}": input_size
            for i, input_size in enumerate(model_params.input_sizes)
        }
        inputs = tuple(create_model_inputs_torch(model_params.input_infos))
        if device.type is not DeviceType.GPU:
            inputs = tuple(input_.cpu() for input_ in inputs)
            torch_model.cpu()
        else:
            inputs = tuple(
                input_.to(device.to_torch_format()) for input_ in inputs
            )
            torch_model.to(device.to_torch_format())
        with torch.no_grad():
            # One plain forward pass is run before tracing.
            _ = torch_model(*inputs)
            model_trace = torch.jit.trace(torch_model, inputs)
            model_trace.eval()
        mod, params = relay.frontend.from_pytorch(
            model_trace, list(shape_dict.items())
        )
        return mod, params

    @staticmethod
    def _build_tvm_model_from_onnx(
        onnx_model_path: str, model_params: ModelParams
    ) -> Tuple[IRModule, Dict[str, NDArray]]:
        """Load an ONNX file and import it through the Relay frontend.

        Input names read from the ONNX file are paired positionally with
        ``model_params.input_sizes`` to build the shape dictionary.
        """
        shape_dict = dict(
            zip(get_input_names(onnx_model_path), model_params.input_sizes)
        )
        onnx_model = onnx.load(onnx_model_path)
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
        return mod, params

    @staticmethod
    def _quantize(
        mod: IRModule,
        params: Dict[str, NDArray],
        input_data: TVMCalibrator = None,
    ) -> IRModule:
        """Quantize a Relay module with ``relay.quantize``.

        With ``input_data`` the KL-divergence calibration mode is used and
        the data is passed as calibration dataset; without it a fixed global
        scale of 8.0 is applied.
        """
        if input_data is not None:
            with relay.quantize.qconfig(
                calibrate_mode="kl_divergence", weight_scale="max"
            ):
                mod = relay.quantize.quantize(mod, params, dataset=input_data)
        else:
            with relay.quantize.qconfig(
                calibrate_mode="global_scale", global_scale=8.0
            ):
                mod = relay.quantize.quantize(mod, params)
        return mod

    @staticmethod
    def _get_target(device) -> str:
        """Return the TVM target string for ``device``."""
        if device.type is DeviceType.GPU:
            return str(tvm.target.cuda())
        else:
            return "llvm"  # run on CPU

    @staticmethod
    def _tune_tvm_model(
        target: str, mod: IRModule, params: Dict[str, NDArray]
    ) -> str:
        """Tune the model using AutoTVM.

        Returns:
            The name of the records file the tuning log is written to. The
            name is relative, so the file is created in the current working
            directory; the caller is responsible for deleting it.
        """
        # TODO: add support to Ansor
        tuning_records = f"{uuid.uuid4()}_model_records.json"
        # create a TVM runner
        runner = autotvm.LocalRunner(
            number=AUTO_TVM_PARAMS["number"],
            repeat=AUTO_TVM_PARAMS["repeat"],
            timeout=AUTO_TVM_PARAMS["timeout"],
            min_repeat_ms=AUTO_TVM_PARAMS["min_repeat_ms"],
            # TODO modify min_repeat_ms for GPU usage
            enable_cpu_cache_flush=True,
        )
        # begin by extracting the tasks from the relay module
        tasks = autotvm.task.extract_from_program(
            mod["main"], target=target, params=params
        )

        # Tune the extracted tasks sequentially.
        for task in tasks:
            tuner_obj = XGBTuner(task, loss_type="rank")
            tuner_obj.tune(
                n_trial=min(
                    AUTO_TVM_TUNING_OPTION["trials"], len(task.config_space)
                ),
                early_stopping=AUTO_TVM_TUNING_OPTION["early_stopping"],
                measure_option=autotvm.measure_option(
                    builder=autotvm.LocalBuilder(build_func="default"),
                    runner=runner,
                ),
                callbacks=[
                    autotvm.callback.log_to_file(tuning_records),
                ],
            )
        return tuning_records

    def _compile_model(self, model: Any, params: Any) -> ExecutorFactoryModule:
        """Tune ``model`` and build it for this compiler's device.

        The tuning records file is deleted afterwards, also when the build
        fails, so that no stray file is left in the working directory.
        """
        target = self._get_target(self.device)
        tuning_records = self._tune_tvm_model(target, model, params)
        try:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=3, config={}):
                    lib = relay.build(model, target=target, params=params)
        finally:
            # Remove temporary file created by tvm. The existence check
            # keeps a missing file from masking the original error.
            if os.path.exists(tuning_records):
                os.remove(tuning_records)

        return lib

    @staticmethod
    def _quantize_model(
        model: Any,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data: DataManager,
        params,
    ):
        """Quantize ``model`` by delegating to ``quantize_apache_tvm``."""
        return quantize_apache_tvm(
            model, quantization_type, input_tfms, input_data, params
        )


class PyTorchApacheTVMCompiler(ApacheTVMCompiler):
    """Apache TVM compiler whose input model is a torch module."""

    def _build_tvm_model(self, model: Any, model_params: ModelParams):
        # Delegates to the torch builder, passing this compiler's device.
        return self._build_tvm_model_from_torch(
            model, model_params, self.device
        )


class ONNXApacheTVMCompiler(ApacheTVMCompiler):
    """Apache TVM compiler whose input model is an ONNX model."""

    def _build_tvm_model(self, model: Any, model_params: ModelParams):
        # Keep a reference to the original input model before conversion.
        self.model_orig = model
        return self._build_tvm_model_from_onnx(model, model_params)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/utils.py
================================================
from pathlib import Path

import nebullvm
from nebullvm.core.models import Device, ModelCompiler, DeviceType


def onnxruntime_is_available() -> bool:
    """Return True when the ``onnxruntime`` package can be imported."""
    try:
        import onnxruntime  # noqa F401
    except ImportError:
        return False
    else:
        return True


def tvm_is_available() -> bool:
    """Return True when ``tvm`` and ``tvm.runtime.Module`` can be imported."""
    try:
        import tvm  # noqa F401
        from tvm.runtime import Module  # noqa F401
    except ImportError:
        return False
    else:
        return True


def bladedisc_is_available() -> bool:
    """Return True when the ``torch_blade`` package can be imported."""
    try:
        import torch_blade  # noqa F401
    except ImportError:
        return False
    else:
        return True


def tensorrt_is_available() -> bool:
    """Return True when both ``polygraphy`` and ``tensorrt`` import."""
    try:
        import polygraphy  # noqa F401
        import tensorrt  # noqa F401
    except ImportError:
        return False
    else:
        return True


def torch_tensorrt_is_available() -> bool:
    """Return True when the ``torch_tensorrt`` package can be imported."""
    try:
        import torch_tensorrt  # noqa F401
    except ImportError:
        return False
    else:
        return True


def openvino_is_available() -> bool:
    """Return True when ``openvino.runtime.Core`` can be imported."""
    try:
        from openvino.runtime import Core  # noqa F401

        return True
    except ImportError:
        return False


def deepsparse_is_available() -> bool:
    """Return True when the ``deepsparse`` package can be imported."""
    try:
        import deepsparse  # noqa F401

        return True
    except ImportError:
        return False


def intel_neural_compressor_is_available() -> bool:
    """Return True when the ``neural_compressor`` package can be imported."""
    try:
        import neural_compressor  # noqa F401

        return True
    except ImportError:
        return False


def torch_xla_is_available() -> bool:
    """Return True when the ``torch_xla`` package can be imported."""
    try:
        import torch_xla  # noqa F401
    except ImportError:
        return False
    else:
        return True


def torch_neuron_is_available() -> bool:
    """Return True when the ``torch_neuron`` package can be imported."""
    try:
        import torch_neuron  # noqa F401
    except ImportError:
        return False
    else:
        return True


def get_faster_transformer_repo_path() -> Path:
    """Return the ``FasterTransformer`` path inside the nebullvm package."""
    package_dir = Path(nebullvm.__file__).parent
    return package_dir / "FasterTransformer"


def faster_transformer_is_available() -> bool:
    """Return True when the FasterTransformer build marker exists.

    The marker is ``FasterTransformer_build_success``, located next to the
    FasterTransformer repository directory.
    """
    repo_path = get_faster_transformer_repo_path()
    marker = repo_path.parent.joinpath("FasterTransformer_build_success")
    return marker.exists()


def select_compilers_from_hardware_onnx(device: Device):
    """List the compilers usable for ONNX models on ``device``.

    Nothing is returned when onnx itself is unavailable. Otherwise ONNX
    Runtime and Apache TVM are added when installed, TensorRT only on GPU
    and OpenVINO only on CPU.
    """
    from nebullvm.optional_modules.utils import onnx_is_available

    selected = []
    if not onnx_is_available():
        return selected

    if onnxruntime_is_available():
        selected.append(ModelCompiler.ONNX_RUNTIME)
    if tvm_is_available():
        selected.append(ModelCompiler.APACHE_TVM)
    if device.type is DeviceType.GPU:
        if tensorrt_is_available():
            selected.append(ModelCompiler.TENSOR_RT)
    if device.type is DeviceType.CPU:
        if openvino_is_available():
            selected.append(ModelCompiler.OPENVINO)
    return selected


def select_compilers_from_hardware_torch(device: Device):
    """List the compilers usable for torch models on ``device``.

    Nothing is returned when torch itself is unavailable. TorchScript is
    always included; Apache TVM, BladeDISC and Torch Neuron are added when
    installed. DeepSparse and Intel Neural Compressor are considered on CPU
    only, TensorRT (through ``torch_tensorrt``) on GPU only.
    """
    from nebullvm.optional_modules.utils import torch_is_available

    compilers = []
    if torch_is_available():
        compilers.append(ModelCompiler.TORCHSCRIPT)
        if tvm_is_available():
            compilers.append(ModelCompiler.APACHE_TVM)
        if bladedisc_is_available():
            compilers.append(ModelCompiler.BLADEDISC)
        if torch_neuron_is_available():
            compilers.append(ModelCompiler.TORCH_NEURON)

        if device.type is DeviceType.CPU:
            if deepsparse_is_available():
                compilers.append(ModelCompiler.DEEPSPARSE)
            if intel_neural_compressor_is_available():
                compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)
        elif device.type is DeviceType.GPU:
            # The check must be *called*: the bare function object is always
            # truthy and would select TensorRT even when it is not installed.
            if torch_tensorrt_is_available():
                compilers.append(ModelCompiler.TENSOR_RT)
    return compilers


def select_compilers_from_hardware_tensorflow():
    """List the compilers usable for TensorFlow models.

    Returns XLA and TFLite when tensorflow is available, otherwise an
    empty list.
    """
    from nebullvm.optional_modules.utils import tensorflow_is_available

    if not tensorflow_is_available():
        return []
    return [ModelCompiler.XLA, ModelCompiler.TFLITE]


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/base.py
================================================
from abc import ABC, abstractmethod
from typing import Any, Optional, Dict, Callable, Tuple

import yaml

from nebullvm.operations.base import Operation
from nebullvm.tools.data import DataManager


class Compressor(Operation, ABC):
    """Base operation for model compressors.

    A compressor starts from the defaults given by ``_get_default_config``
    and, when a YAML config file is provided, overrides them with the
    section stored under ``config_key``. ``execute`` is expected to fill
    ``compressed_model`` and ``new_metric_ths``, which ``get_result``
    returns.
    """

    def __init__(self, config_file: str = None):
        super().__init__()
        self._config = self._read_config(config_file)
        self.compressed_model = None
        self.new_metric_ths = None

    @abstractmethod
    def execute(
        self,
        model: Any,
        train_input_data: DataManager,
        eval_input_data: DataManager,
        metric_drop_ths: float,
        metric: Callable,
    ) -> Tuple[Any, Optional[float]]:
        """Compress ``model``; implemented by subclasses."""
        raise NotImplementedError()

    def _read_config(self, config_file: Optional[str]) -> Dict:
        """Return the default config updated with the user's YAML section.

        Args:
            config_file: Path to a YAML file, or None to use the defaults.

        Returns:
            The configuration dictionary. An empty file, or a file without
            the ``config_key`` section, leaves the defaults untouched.
        """
        config = self._get_default_config()
        if config_file is None:
            return config

        # The C loader only exists when PyYAML was built against libyaml,
        # so fall back to the equivalent pure-Python loader.
        # NOTE(security): both are full (unsafe) loaders and can construct
        # arbitrary Python objects; only pass trusted config files.
        loader = getattr(yaml, "CLoader", yaml.Loader)
        with open(config_file, "r") as f:
            data = yaml.load(f, Loader=loader)

        # yaml.load returns None for an empty document, and a section may
        # be present with a null value: both mean "no overrides".
        if data:
            config.update(data.get(self.config_key) or {})
        return config

    @staticmethod
    @abstractmethod
    def _get_default_config() -> Dict:
        """Return the default configuration dictionary."""
        raise NotImplementedError

    @property
    @abstractmethod
    def config_key(self) -> str:
        """Key of this compressor's section inside the YAML config file."""
        raise NotImplementedError()

    def get_result(self) -> Tuple[Any, Optional[float]]:
        """Return ``(compressed_model, new_metric_ths)``."""
        return self.compressed_model, self.new_metric_ths


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/intel.py
================================================
import copy
import re
from abc import ABC, abstractmethod
from pathlib import Path
from tempfile import mkdtemp
from typing import Dict, Any, Callable

import numpy as np
import yaml

from nebullvm.operations.optimizations.compressors.base import Compressor
from nebullvm.optional_modules.neural_compressor import Pruning
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import DataLoader, Dataset, Module
from nebullvm.tools.data import DataManager


def _get_model_framework(model: Any) -> str:
    """Return the framework name of ``model``.

    The result is "torch" for torch modules, "tensorflow" for non-None
    ``tf.Module`` instances and "numpy" for anything else.
    """
    if isinstance(model, Module):
        return "torch"
    if model is not None and isinstance(model, tf.Module):
        return "tensorflow"
    return "numpy"


class IntelPruningCompressor(Compressor, ABC):
    """Compressor that prunes a model with Intel Neural Compressor.

    A YAML configuration is generated in a temporary directory and handed
    to ``Pruning``; the pruned model is kept only when the error computed
    by ``_compute_error`` does not exceed the allowed metric drop.
    """

    def __init__(self, config_file: str = None):
        super().__init__(config_file)
        self._temp_dir = mkdtemp()

    @property
    def config_key(self) -> str:
        return "intel_pruning"

    @staticmethod
    def _get_default_config() -> Dict:
        # see https://github.com/intel/neural-compressor/blob/master/neural_compressor/conf/config.py  # noqa
        # for further details
        train_settings = {
            "optimizer": {
                "SGD": {"learning_rate": 0.001},
            },
            "criterion": {
                "CrossEntropyLoss": {
                    "reduction": "mean",
                    "from_logits": False,
                },
            },
            "epoch": 10,
            "start_epoch": 0,
            "end_epoch": 10,
            "iteration": 30,
            "execution_mode": "eager",  # either eager or graph
            # "hostfile": None,  # str for multinode training support
        }
        weight_compression = {
            "initial_sparsity": 0.0,
            "target_sparsity": 0.60,
            "start_epoch": 0,
            "end_epoch": 8,
            "pruners": [
                {
                    "start_epoch": 0,
                    "end_epoch": 8,
                    "prune_type": "basic_magnitude",
                },
            ],
        }
        return {
            "train": train_settings,
            "approach": {"weight_compression": weight_compression},
        }

    def _prepare_pruning_config(self, model: Any):
        """Write the pruning YAML for ``model`` and return its path."""
        framework = _get_model_framework(model)
        full_config = {
            "model": {
                "name": model.__class__.__name__,
                "framework": "pytorch" if framework == "torch" else framework,
            },
            "evaluation": {"accuracy": {"metric": {"topk": 1}}},
            "device": "cpu",
            "tuning": {
                "random_seed": 1978,
                "tensorboard": False,
                "workspace": {"path": self._temp_dir},
            },
            "pruning": copy.deepcopy(self._config),
        }
        # Dump to text first so that the `!Pruner` tag can be inserted on
        # the first pruner entry before the file is written.
        yaml_text = re.sub(
            "pruners:\n      - end_epoch:",
            "pruners:\n      - !Pruner\n        end_epoch:",
            yaml.dump(full_config),
        )
        path_file = Path(self._temp_dir) / "temp.yaml"
        with open(path_file, "w") as f:
            f.write(yaml_text)
        return path_file

    def execute(
        self,
        model: Any,
        train_input_data: DataManager,
        eval_input_data: DataManager,
        metric_drop_ths: float,
        metric: Callable,
    ):
        """Prune ``model`` and keep the result if it is accurate enough.

        ``compressed_model`` is reset to None when the measured error is
        larger than ``metric_drop_ths``; otherwise ``new_metric_ths`` is
        set to the remaining budget (``metric_drop_ths - error``).
        """
        pruner = Pruning(str(self._prepare_pruning_config(model)))
        pruner.model = model
        pruner.train_dataloader = self._get_dataloader(train_input_data)
        pruner.eval_dataloader = self._get_dataloader(eval_input_data)
        self.compressed_model = pruner.fit()

        if self.compressed_model is None:
            return

        error = self._compute_error(
            model, self.compressed_model, eval_input_data, metric
        )
        if error > metric_drop_ths:
            self.compressed_model = None
            return
        self.new_metric_ths = metric_drop_ths - error

    @abstractmethod
    def _compute_error(
        self,
        model: Any,
        compressed_model: Any,
        eval_input_data: DataManager,
        metric: Callable,
    ):
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def _get_dataloader(input_data: DataManager):
        raise NotImplementedError


class INCDataset(Dataset):
    """Item-level view over batched input data.

    Each element of ``input_data`` is an ``(inputs, labels)`` pair where
    ``inputs`` is a sequence of batched tensors. The batch size is read
    from the first input of the first batch and used to map a flat item
    index onto a batch and a position inside it.
    """

    def __init__(self, input_data: DataManager):
        self.data = input_data
        self.batch_size = input_data[0][0][0].shape[0]

    def __len__(self):
        """Return the total number of items over all batches."""
        return sum(batch_inputs[0].shape[0] for batch_inputs, _ in self.data)

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for the flat item index ``idx``."""
        # divmod keeps the arithmetic in integers (no float rounding) and
        # floors, so the batch and item indices stay consistent with each
        # other for negative ``idx`` as well.
        batch_idx, item_idx = divmod(idx, self.batch_size)
        batch_inputs, batch_labels = self.data[batch_idx]
        item_inputs = tuple(tensor[item_idx] for tensor in batch_inputs)
        return item_inputs, batch_labels[item_idx]


class TorchIntelPruningCompressor(IntelPruningCompressor):
    """Intel pruning compressor for torch models."""

    @staticmethod
    def _get_dataloader(input_data: DataManager):
        """Wrap ``input_data`` into a ``DataLoader`` of the same batch size.

        The batch size is taken from the first input of the first batch.
        """
        batch_size = input_data[0][0][0].shape[0]
        return DataLoader(INCDataset(input_data), batch_size)

    def _compute_error(
        self,
        model: Module,
        compressed_model: Module,
        eval_input_data: DataManager,
        metric: Callable,
    ):
        """Average ``metric`` between both models over the eval batches.

        Returns ``np.inf`` when there is no evaluation data.
        """
        n_batches = len(eval_input_data)
        if n_batches == 0:
            return np.inf
        total = sum(
            metric(model(*inputs), compressed_model(*inputs), y)
            for inputs, y in eval_input_data
        )
        return total / n_batches


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/scripts/__init__.py
================================================
import json
import logging
import os.path
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Tuple, List, Any, Dict

import torch
from sparseml.onnx.optim import ModelAnalyzer, pruning_loss_sens_magnitude
from sparseml.pytorch.optim import (
    ScheduledModifierManager,
)
from sparseml.pytorch.sparsification import (
    EpochRangeModifier,
    GMPruningModifier,
)
from sparseml.pytorch.utils import ModuleExporter
from sparsify.blueprints.utils import (
    default_epochs_distribution,
    PruningModelEvaluator,
    default_pruning_settings,
)
from sparsify.schemas import ProjectModelAnalysisSchema
from torch.nn import CrossEntropyLoss, MSELoss
from torch.optim import SGD
from tqdm.auto import tqdm

# Loss functions that can be selected by name.
CRITERION_FNS = {
    "CrossEntropy": CrossEntropyLoss(),
    "MSE": MSELoss(),
}

# Configure the root logging format and make the nebullvm logger emit
# INFO-level messages.
logging.basicConfig(
    format=" %(asctime)s [%(levelname)s] %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
)
logger = logging.getLogger("nebullvm_logger")
logger.setLevel(logging.INFO)


def _export_model_onnx(
    model: torch.nn.Module,
    save_path: Path,
    model_name: str,
    input_batch: Tuple,
):
    """Export ``model`` to ONNX and return the path of the exported file.

    When CUDA is available both the model and the input batch are moved
    to the GPU before exporting. The model is run once on the batch to
    obtain the example outputs passed to the exporter.
    """
    if torch.cuda.is_available():
        input_batch = tuple(tensor.cuda() for tensor in input_batch)
        model.cuda()

    onnx_exporter = ModuleExporter(model, output_dir=save_path)
    with torch.no_grad():
        sample_outputs = model(*input_batch)
    onnx_exporter.export_onnx(
        input_batch, name=model_name, example_outputs=sample_outputs
    )
    return save_path / model_name


class RecipeBuilder:
    """Build a SparseML pruning recipe for an exported ONNX model."""

    def __init__(self, model_path):
        self.model_path = model_path

    def _make_analysis(self):
        """Analyze the ONNX model and store the result in ``analysis``."""
        analyzer = ModelAnalyzer(self.model_path)
        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())

    def _compute_loss_sensitivity(self):
        """Run the magnitude loss-sensitivity analysis.

        The measurements are reshaped into the dictionary layout passed to
        ``PruningModelEvaluator`` and the evaluated values are stored in
        ``final_analysis``. Requires ``_make_analysis`` to have run first.
        """
        loss_analysis = pruning_loss_sens_magnitude(self.model_path)

        results_model = loss_analysis.results_model
        model_measurements = {
            "baseline_measurement_key": (
                str(results_model.baseline_measurement_key)
            ),
            "measurements": {
                str(key): val for key, val in results_model.averages.items()
            },
        }
        ops = [
            {
                "id": res.id_,
                "name": res.name,
                "index": res.index,
                "baseline_measurement_key": (
                    str(res.baseline_measurement_key)
                ),
                "measurements": {
                    str(key): val for key, val in res.averages.items()
                },
            }
            for res in loss_analysis.results
        ]

        loss = {
            "baseline": {},
            "pruning": {"model": model_measurements, "ops": ops},
        }

        evaluator = PruningModelEvaluator(
            self.analysis,
            None,
            loss,
        )
        evaluator.eval_baseline(default_pruning_settings().sparsity)
        evaluator.eval_pruning(default_pruning_settings())

        self.final_analysis = evaluator.to_dict_values()

    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):
        """Return a ``ScheduledModifierManager`` describing the pruning run.

        Args:
            epochs_pruning_window: Optional mapping whose entries override
                fields of the default epochs distribution.
            training_epochs: Number of epochs the default distribution is
                computed for.
        """
        self._make_analysis()
        self._compute_loss_sensitivity()

        epochs = default_epochs_distribution(training_epochs)
        if epochs_pruning_window is not None:
            # TODO: set custom parameters
            epochs_dict = epochs._asdict()
            epochs_dict.update(epochs_pruning_window)
            epochs = epochs.__class__(**epochs_dict)

        mods = [
            EpochRangeModifier(
                start_epoch=epochs.start_epoch,
                end_epoch=epochs.end_epoch,
            )
        ]

        node_weight_name_lookup = {
            node["id"]: node["weight_name"]
            for node in self.analysis["nodes"]
            if node["prunable"]
        }

        # Group the weight names by the sparsity assigned to their node so
        # that one modifier is created per distinct sparsity level.
        sparsity_to_params = {}
        for node in self.final_analysis[0]:
            sparsity = node["sparsity"]
            if sparsity is None:
                # Nodes without a sparsity are not pruned, so their weight
                # name is not needed.
                continue
            weight_name = node_weight_name_lookup[node["node_id"]]
            sparsity_to_params.setdefault(sparsity, []).append(weight_name)

        for sparsity, params in sparsity_to_params.items():
            mods.append(
                GMPruningModifier(
                    init_sparsity=0.05,
                    final_sparsity=sparsity,
                    start_epoch=epochs.pruning_start_epoch,
                    end_epoch=epochs.pruning_end_epoch,
                    update_frequency=epochs.pruning_update_frequency,
                    params=params,
                )
            )

        return ScheduledModifierManager(mods)


class PruningTrainer:
    """Train a model while a SparseML manager applies the pruning recipe."""

    def __init__(self, model, bs):
        self.data_loader = None
        self.optimizer = None
        self.model = model
        self.batch_size = bs

    def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9):
        """Select the device, the loss criterion and the SGD optimizer.

        Args:
            loss_fn: Key of ``CRITERION_FNS``. None or an unknown key falls
                back to cross entropy.
            lr: Learning rate of the SGD optimizer.
            momentum: Momentum of the SGD optimizer.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        if loss_fn is None:
            loss_fn = CrossEntropyLoss()
        else:
            loss_fn = CRITERION_FNS.get(loss_fn, CrossEntropyLoss())
        self.criterion = loss_fn
        self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum)

    def _run_model_one_epoch(self, train=False):
        """Run one pass over the train or validation data.

        Returns:
            The summed batch loss divided by the number of batches (plus a
            small epsilon that avoids a division by zero).
        """
        if train:
            self.model.train()
            data_loader = self.train_data_loader
        else:
            self.model.eval()
            data_loader = self.val_data_loader

        running_loss = 0.0

        # Gradients are only needed when the weights are updated; disabling
        # them during validation avoids building the autograd graph.
        with torch.set_grad_enabled(train):
            for inputs, labels in tqdm(data_loader, total=len(data_loader)):
                inputs = tuple(t.to(self.device) for t in inputs)
                if not isinstance(labels, torch.Tensor):
                    labels = torch.tensor(labels)
                    if len(labels.shape) == 0:
                        # Scalar label: add a batch dimension.
                        labels = labels.unsqueeze(0)
                labels = labels.to(self.device)

                if train:
                    self.optimizer.zero_grad()

                # NOTE(review): the model output is passed to the criterion
                # as is, so it is expected to be a single tensor.
                outputs = self.model(*inputs)
                loss = self.criterion(outputs, labels)

                if train:
                    loss.backward()
                    self.optimizer.step()

                running_loss += loss.item()

        loss = running_loss / (len(data_loader) + 1e-5)
        return loss

    def train(
        self, manager, train_data_loader, val_data_loader, **train_kwargs
    ):
        """Run the pruning schedule defined by ``manager``.

        Args:
            manager: Recipe manager; it wraps the optimizer and defines the
                epoch range of the run.
            train_data_loader: Iterable of ``(inputs, labels)`` batches
                used for training.
            val_data_loader: Iterable of ``(inputs, labels)`` batches used
                for validation after every epoch.
            **train_kwargs: Forwarded to ``_setup_training``.

        Returns:
            The trained (pruned) model.
        """
        self.train_data_loader = train_data_loader
        self.val_data_loader = val_data_loader
        self._setup_training(**train_kwargs)
        self.optimizer = manager.modify(
            self.model,
            self.optimizer,
            steps_per_epoch=len(self.train_data_loader),
        )
        self.model.train()
        # Run model pruning
        epoch = manager.min_epochs
        while epoch < manager.max_epochs:
            # run training loop
            epoch_name = "{}/{}".format(epoch + 1, manager.max_epochs)
            logger.info("Running Training Epoch {}".format(epoch_name))
            train_loss = self._run_model_one_epoch(train=True)
            logger.info(
                ("Training Epoch: {}\nTraining Loss: {}\n").format(
                    epoch_name, train_loss
                )
            )

            # run validation loop
            logger.info("Running Validation Epoch {}".format(epoch_name))
            val_loss = self._run_model_one_epoch()
            logger.info(
                "Validation Epoch: {}\nVal Loss: {}\n".format(
                    epoch_name, val_loss
                )
            )

            epoch += 1

        manager.finalize(self.model)

        return self.model


def _load_config(config_file: str):
    with open(config_file, "r") as f:
        config = json.load(f)
    return config


def _load_data(data_dir: str):
    data_dir = Path(data_dir)
    return [torch.load(input_path) for input_path in data_dir.glob("*.pt")]


def _load_model(model_file: str):
    if os.path.isdir(model_file):
        path = Path(model_file)
        module_file = path / "module.py"
        with open(module_file, "r") as f:
            module_str = f.read()
        exec(module_str, globals())
        model = eval("NebullvmFxModule")()
        model.load_state_dict(torch.load(path / "state_dict.pt"))
    else:
        model = torch.load(model_file)
    return model


def _train_model(
    model: torch.nn.Module,
    train_data: List[Tuple[Tuple, Any]],
    eval_data: List[Tuple[Tuple, Any]],
    epochs_pruning_window: Dict = None,
    training_epochs: int = 10,
    lr: float = 1e-3,
    momentum: float = 0.9,
    loss_fn: str = "CrossEntropy",
):
    """Prune ``model`` by training it under a generated pruning recipe.

    The model is exported to ONNX in a temporary directory, a recipe is
    built from that export and the model is then trained with it.

    Args:
        model: Model to prune.
        train_data: Batches of ``(inputs, labels)`` used for training. The
            first batch is also used for the ONNX export and to read the
            batch size.
        eval_data: Batches of ``(inputs, labels)`` used for validation.
        epochs_pruning_window: Optional overrides for the default epochs
            distribution of the recipe.
        training_epochs: Number of epochs the recipe is built for.
        lr: Learning rate of the optimizer.
        momentum: Momentum of the optimizer.
        loss_fn: Name of the loss criterion (a key of ``CRITERION_FNS``).

    Returns:
        The pruned model.
    """
    batch_size = train_data[0][0][0].shape[0]
    with TemporaryDirectory() as tmp_dir:
        onnx_path = _export_model_onnx(
            model, Path(tmp_dir), "model.onnx", train_data[0][0]
        )
        onnx_path = onnx_path.as_posix()

        recipe = RecipeBuilder(onnx_path)
        # TODO: implement custom parameters support
        manager = recipe.build_recipe(
            epochs_pruning_window=epochs_pruning_window,
            training_epochs=training_epochs,
        )
        trainer = PruningTrainer(model, batch_size)
        # loss_fn must be forwarded, otherwise the configured criterion is
        # silently ignored and cross entropy is always used.
        pruned_model = trainer.train(
            manager,
            train_data,
            eval_data,
            loss_fn=loss_fn,
            lr=lr,
            momentum=momentum,
        )
        return pruned_model


def _save_model(model: torch.nn.Module, path: str):
    if path.endswith(".pt"):
        torch.save(model, path)
    else:
        torch.save(model.state_dict(), Path(path) / "pruned_state_dict.pt")


def main(
    model_file: str,
    train_data_dir: str,
    eval_data_dir: str,
    config_file: str,
    out_file: str,
):
    """Prune the model stored at ``model_file`` and save it to ``out_file``.

    The entries of the JSON config are forwarded as keyword arguments to
    ``_train_model``.
    """
    train_kwargs = _load_config(config_file)
    original_model = _load_model(model_file)
    train_batches = _load_data(train_data_dir)
    eval_batches = _load_data(eval_data_dir)
    _save_model(
        _train_model(
            original_model, train_batches, eval_batches, **train_kwargs
        ),
        out_file,
    )


# Command line entry point: parse the paths and run the pruning pipeline.
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--model", help="The model to be pruned.")
    parser.add_argument(
        "--train_dir",
        help="The directory contained the pickled training data.",
    )
    parser.add_argument(
        "--eval_dir", help="The directory contained the pickled test data."
    )
    parser.add_argument("--config", help="The config file.")
    parser.add_argument(
        "--pruned_model", help="Path where storing the pruned model."
    )
    args = parser.parse_args()
    # Every option maps one-to-one onto a parameter of main().
    main(
        model_file=args.model,
        train_data_dir=args.train_dir,
        eval_data_dir=args.eval_dir,
        config_file=args.config,
        out_file=args.pruned_model,
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/scripts/neural_magic_training.py
================================================
import json
import logging
import os.path
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Tuple, List, Any, Dict

import torch
from sparseml.onnx.optim import ModelAnalyzer, pruning_loss_sens_magnitude
from sparseml.pytorch.optim import (
    ScheduledModifierManager,
)
from sparseml.pytorch.sparsification import (
    EpochRangeModifier,
    GMPruningModifier,
)
from sparseml.pytorch.utils import ModuleExporter
from sparsify.blueprints.utils import (
    default_epochs_distribution,
    PruningModelEvaluator,
    default_pruning_settings,
)
from sparsify.schemas import ProjectModelAnalysisSchema
from torch.nn import CrossEntropyLoss, MSELoss
from torch.optim import SGD
from tqdm.auto import tqdm

# Loss functions selectable by name; ``PruningTrainer._setup_training`` looks
# the ``loss_fn`` string up in this mapping.
CRITERION_FNS = {
    "CrossEntropy": CrossEntropyLoss(),
    "MSE": MSELoss(),
}

# Set the message/date layout of the root logging configuration and expose
# the "nebullvm_logger" logger at INFO level for the training progress logs.
logging.basicConfig(
    format=" %(asctime)s [%(levelname)s] %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
)
logger = logging.getLogger("nebullvm_logger")
logger.setLevel(logging.INFO)


def _export_model_onnx(
    model: torch.nn.Module,
    save_path: Path,
    model_name: str,
    input_batch: Tuple,
):
    """Export ``model`` to ONNX and return the path of the produced file.

    Args:
        model: Module to export. It is moved to the GPU when CUDA is
            available.
        save_path: Directory handed to the exporter as output directory.
        model_name: File name of the ONNX model inside ``save_path``.
        input_batch: Tuple of input tensors used both for a dry-run forward
            pass and as sample input for the export.

    Returns:
        ``save_path / model_name``.
    """
    on_gpu = torch.cuda.is_available()
    if on_gpu:
        # Keep the sample inputs and the model on the same device.
        input_batch = tuple(tensor.cuda() for tensor in input_batch)
        model.cuda()

    onnx_exporter = ModuleExporter(model, output_dir=save_path)
    # Dry run without autograd to obtain the example outputs for the export.
    with torch.no_grad():
        sample_outputs = model(*input_batch)
    onnx_exporter.export_onnx(
        input_batch,
        name=model_name,
        example_outputs=sample_outputs,
    )
    return save_path / model_name


class RecipeBuilder:
    """Build a SparseML pruning recipe for an ONNX model.

    The builder analyzes the ONNX graph, runs a magnitude-based loss
    sensitivity analysis, lets ``PruningModelEvaluator`` pick a sparsity per
    node and finally groups the prunable weights by sparsity into
    ``GMPruningModifier`` instances.
    """

    def __init__(self, model_path):
        # Path of the ONNX file that is analyzed.
        self.model_path = model_path

    def _make_analysis(self):
        """Analyze the ONNX model and store the result in ``self.analysis``."""
        analyzer = ModelAnalyzer(self.model_path)
        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())

    @staticmethod
    def _measurements_to_dict(result):
        """Convert a sensitivity result into the plain-dict layout."""
        return {
            "baseline_measurement_key": str(result.baseline_measurement_key),
            "measurements": {
                str(key): val for key, val in result.averages.items()
            },
        }

    def _compute_loss_sensitivity(self):
        """Evaluate per-node sparsities and store them in
        ``self.final_analysis``.

        Requires ``self.analysis`` (see ``_make_analysis``).
        """
        loss_analysis = pruning_loss_sens_magnitude(self.model_path)

        model_summary = self._measurements_to_dict(loss_analysis.results_model)
        ops = [
            {
                "id": res.id_,
                "name": res.name,
                "index": res.index,
                **self._measurements_to_dict(res),
            }
            for res in loss_analysis.results
        ]
        loss = {
            "baseline": {},
            "pruning": {"model": model_summary, "ops": ops},
        }

        evaluator = PruningModelEvaluator(self.analysis, None, loss)
        settings = default_pruning_settings()
        evaluator.eval_baseline(settings.sparsity)
        evaluator.eval_pruning(settings)

        self.final_analysis = evaluator.to_dict_values()

    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):
        """Create the modifier manager that drives the pruning schedule.

        Args:
            epochs_pruning_window: Optional mapping whose entries override
                the fields of the default epochs distribution (for example
                ``start_epoch`` / ``end_epoch``).
            training_epochs: Number of epochs used to compute the default
                epochs distribution.

        Returns:
            A ``ScheduledModifierManager`` holding one ``EpochRangeModifier``
            plus one ``GMPruningModifier`` per distinct target sparsity.
        """
        self._make_analysis()
        self._compute_loss_sensitivity()

        epochs = default_epochs_distribution(training_epochs)
        if epochs_pruning_window is not None:
            # TODO: set custom parameters
            overridden = epochs._asdict()
            overridden.update(epochs_pruning_window)
            epochs = epochs.__class__(**overridden)

        mods = [
            EpochRangeModifier(
                start_epoch=epochs.start_epoch,
                end_epoch=epochs.end_epoch,
            )
        ]

        node_weight_name_lookup = {
            node["id"]: node["weight_name"]
            for node in self.analysis["nodes"]
            if node["prunable"]
        }

        # Group the weight names by the sparsity selected for their node so
        # that a single modifier handles all weights sharing a target.
        sparsity_to_params = {}
        for node in self.final_analysis[0]:
            sparsity = node["sparsity"]
            weight_name = node_weight_name_lookup[node["node_id"]]
            if sparsity is None:
                continue
            sparsity_to_params.setdefault(sparsity, []).append(weight_name)

        for sparsity, params in sparsity_to_params.items():
            mods.append(
                GMPruningModifier(
                    init_sparsity=0.05,
                    final_sparsity=sparsity,
                    start_epoch=epochs.pruning_start_epoch,
                    end_epoch=epochs.pruning_end_epoch,
                    update_frequency=epochs.pruning_update_frequency,
                    params=params,
                )
            )

        return ScheduledModifierManager(mods)


class PruningTrainer:
    """Minimal SGD training loop driven by a SparseML modifier manager."""

    def __init__(self, model, bs):
        self.data_loader = None
        self.optimizer = None
        self.model = model
        self.batch_size = bs

    def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9):
        """Select device, criterion and optimizer.

        Args:
            loss_fn: Key of ``CRITERION_FNS`` or ``None``. ``None`` and
                unknown keys both select cross-entropy; unknown keys are
                additionally reported with a warning.
            lr: Learning rate of the SGD optimizer.
            momentum: Momentum of the SGD optimizer.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        if loss_fn is None:
            criterion = CrossEntropyLoss()
        elif loss_fn in CRITERION_FNS:
            criterion = CRITERION_FNS[loss_fn]
        else:
            # Keep the historical fallback, but make it visible.
            logger.warning(
                "Unknown loss function '{}'. Falling back to "
                "CrossEntropy.".format(loss_fn)
            )
            criterion = CrossEntropyLoss()
        self.criterion = criterion
        self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum)

    def _run_model_one_epoch(self, train=False):
        """Run one pass over the train (``train=True``) or validation data.

        Returns:
            The summed batch losses divided by the number of batches (plus a
            small epsilon that guards against an empty loader).
        """
        if train:
            self.model.train()
            data_loader = self.train_data_loader
        else:
            self.model.eval()
            data_loader = self.val_data_loader

        running_loss = 0.0

        for inputs, labels in tqdm(data_loader, total=len(data_loader)):
            inputs = tuple(t.to(self.device) for t in inputs)
            if not isinstance(labels, torch.Tensor):
                labels = torch.tensor(labels)
                if len(labels.shape) == 0:
                    # Scalar label: add a batch dimension.
                    labels = labels.unsqueeze(0)
            labels = labels.to(self.device)

            if train:
                self.optimizer.zero_grad()

            # Autograd is only needed while training; validation runs
            # without building the graph to save memory and time.
            with torch.set_grad_enabled(train):
                outputs = self.model(*inputs)
                loss = self.criterion(outputs, labels)

            if train:
                loss.backward()
                self.optimizer.step()

            running_loss += loss.item()

        return running_loss / (len(data_loader) + 1e-5)

    def train(
        self, manager, train_data_loader, val_data_loader, **train_kwargs
    ):
        """Train (and prune) the model following ``manager``'s schedule.

        Args:
            manager: Modifier manager; it wraps the optimizer and defines
                ``min_epochs`` / ``max_epochs``.
            train_data_loader: Sized iterable of ``(inputs, labels)`` pairs
                used for training.
            val_data_loader: Sized iterable of ``(inputs, labels)`` pairs
                used for validation.
            **train_kwargs: Forwarded to ``_setup_training``.

        Returns:
            The trained model.
        """
        self.train_data_loader = train_data_loader
        self.val_data_loader = val_data_loader
        self._setup_training(**train_kwargs)
        self.optimizer = manager.modify(
            self.model,
            self.optimizer,
            steps_per_epoch=len(self.train_data_loader),
        )
        self.model.train()
        # Run model pruning
        epoch = manager.min_epochs
        while epoch < manager.max_epochs:
            # run training loop
            epoch_name = "{}/{}".format(epoch + 1, manager.max_epochs)
            logger.info("Running Training Epoch {}".format(epoch_name))
            train_loss = self._run_model_one_epoch(train=True)
            logger.info(
                ("Training Epoch: {}\nTraining Loss: {}\n").format(
                    epoch_name, train_loss
                )
            )

            # run validation loop
            logger.info("Running Validation Epoch {}".format(epoch_name))
            val_loss = self._run_model_one_epoch()
            logger.info(
                "Validation Epoch: {}\nVal Loss: {}\n".format(
                    epoch_name, val_loss
                )
            )

            epoch += 1

        manager.finalize(self.model)

        return self.model


def _load_config(config_file: str):
    with open(config_file, "r") as f:
        config = json.load(f)
    return config


def _load_data(data_dir: str):
    data_dir = Path(data_dir)
    return [torch.load(input_path) for input_path in data_dir.glob("*.pt")]


def _load_model(model_file: str):
    if os.path.isdir(model_file):
        path = Path(model_file)
        module_file = path / "module.py"
        with open(module_file, "r") as f:
            module_str = f.read()
        exec(module_str, globals())
        model = eval("NebullvmFxModule")()
        model.load_state_dict(torch.load(path / "state_dict.pt"))
    else:
        model = torch.load(model_file)
    return model


def _train_model(
    model: torch.nn.Module,
    train_data: List[Tuple[Tuple, Any]],
    eval_data: List[Tuple[Tuple, Any]],
    epochs_pruning_window: Dict = None,
    training_epochs: int = 10,
    lr: float = 1e-3,
    momentum: float = 0.9,
    loss_fn: str = "CrossEntropy",
):
    """Prune ``model`` while fine-tuning it on ``train_data``.

    The model is first exported to ONNX (in a temporary directory) so that
    ``RecipeBuilder`` can derive the pruning recipe, then it is trained with
    ``PruningTrainer`` following that recipe.

    Args:
        model: Module to prune.
        train_data: Non-empty list of ``(inputs, label)`` samples, where
            ``inputs`` is a tuple of tensors. The first sample is also used
            as example input for the ONNX export and to read the batch size.
        eval_data: List of ``(inputs, label)`` samples used for validation.
        epochs_pruning_window: Optional overrides for the default epochs
            distribution of the recipe.
        training_epochs: Number of epochs used to build the recipe.
        lr: Learning rate of the optimizer.
        momentum: Momentum of the optimizer.
        loss_fn: Name of the loss function, resolved by the trainer.

    Returns:
        The pruned model.
    """
    example_inputs = train_data[0][0]
    batch_size = example_inputs[0].shape[0]
    with TemporaryDirectory() as tmp_dir:
        onnx_path = _export_model_onnx(
            model, Path(tmp_dir), "model.onnx", example_inputs
        )
        onnx_path = onnx_path.as_posix()

        recipe = RecipeBuilder(onnx_path)
        # TODO: implement custom parameters support
        manager = recipe.build_recipe(
            epochs_pruning_window=epochs_pruning_window,
            training_epochs=training_epochs,
        )
        trainer = PruningTrainer(model, batch_size)
        # loss_fn must be forwarded too, otherwise the configured loss is
        # silently ignored and cross-entropy is always used.
        pruned_model = trainer.train(
            manager,
            train_data,
            eval_data,
            lr=lr,
            momentum=momentum,
            loss_fn=loss_fn,
        )
        return pruned_model


def _save_model(model: torch.nn.Module, path: str):
    if path.endswith(".pt"):
        torch.save(model, path)
    else:
        torch.save(model.state_dict(), Path(path) / "pruned_state_dict.pt")


def main(
    model_file: str,
    train_data_dir: str,
    eval_data_dir: str,
    config_file: str,
    out_file: str,
):
    """Entry point of the pruning script: load, train and save.

    Args:
        model_file: Path of the model to prune (file or directory).
        train_data_dir: Directory with the serialized training samples.
        eval_data_dir: Directory with the serialized evaluation samples.
        config_file: JSON file whose entries become keyword arguments of
            ``_train_model``.
        out_file: Destination passed to ``_save_model``.
    """
    train_options = _load_config(config_file)
    original_model = _load_model(model_file)
    train_set = _load_data(train_data_dir)
    eval_set = _load_data(eval_data_dir)
    result = _train_model(
        original_model, train_set, eval_set, **train_options
    )
    _save_model(result, out_file)


if __name__ == "__main__":
    from argparse import ArgumentParser

    # main() needs all five paths. Marking the options as required lets
    # argparse reject an incomplete command line right away instead of
    # failing later on a ``None`` path (for --pruned_model that would only
    # happen after the whole training has already run).
    parser = ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="The model to be pruned."
    )
    parser.add_argument(
        "--train_dir",
        required=True,
        help="The directory contained the pickled training data.",
    )
    parser.add_argument(
        "--eval_dir",
        required=True,
        help="The directory contained the pickled test data.",
    )
    parser.add_argument("--config", required=True, help="The config file.")
    parser.add_argument(
        "--pruned_model",
        required=True,
        help="Path where storing the pruned model.",
    )
    args = parser.parse_args()
    main(
        model_file=args.model,
        train_data_dir=args.train_dir,
        eval_data_dir=args.eval_dir,
        config_file=args.config,
        out_file=args.pruned_model,
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/sparseml.py
================================================
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Dict

import numpy as np
from loguru import logger

from nebullvm.operations.optimizations.compressors.base import Compressor
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.pytorch import save_with_torch_fx, load_with_torch_fx
from nebullvm.tools.venv import run_in_different_venv


def _save_model(model: Module, path: Path):
    """Save ``model`` under ``path`` and return where it can be loaded from.

    The TorchFX export is tried first. If it raises, a warning is logged and
    the model is pickled with ``torch.save`` as ``model.pt`` instead.

    Returns:
        ``path`` itself when the TorchFX export succeeded, otherwise the
        path of the ``model.pt`` fallback file.
    """
    try:
        save_with_torch_fx(model, path)
    except Exception as ex:
        logger.warning(
            f"Got an error while exporting with TorchFX. The model will be "
            f"saved using the standard PyTorch save pickling method. Error "
            f"got: {ex}"
        )
        pickled_file = path / "model.pt"
        torch.save(model, pickled_file)
        return pickled_file
    return path


def _load_model(path: Path):
    if path.is_file():
        return torch.load(path)
    else:
        return load_with_torch_fx(path)


def _save_dataset(input_data: DataManager, path: Path):
    """Write every element of ``input_data`` to ``path`` as ``input_<i>.pt``.

    The directory is created when missing (its parent must already exist).
    """
    path.mkdir(exist_ok=True)
    for position, sample in enumerate(input_data):
        sample_file = path / f"input_{position}.pt"
        torch.save(sample, sample_file)


def _save_json(dictionary: Dict, path: Path):
    with open(path, "w") as f:
        json.dump(dictionary, f)


def _write_requirements_file(path: Path):
    requirements = "sparseml\nsparsify\ntqdm"
    with open(path, "w") as f:
        f.write(requirements)


class SparseMLCompressor(Compressor):
    """Compressor that prunes a PyTorch model through the SparseML script.

    The model, the datasets and the configuration are serialized into a
    temporary directory and the ``neural_magic_training.py`` script is run
    through ``run_in_different_venv``. The pruned model is then loaded back
    and kept only if its metric drop stays within the allowed threshold.
    """

    def execute(
        self,
        model: Module,
        train_input_data: DataManager,
        eval_input_data: DataManager,
        metric_drop_ths: float,
        metric: Callable,
    ):
        """Prune ``model`` and store the outcome in ``self.compressed_model``.

        Args:
            model: The model to prune.
            train_input_data: Samples used to train during pruning.
            eval_input_data: Samples used for validation and for measuring
                the metric drop.
            metric_drop_ths: Maximum accepted value of the error computed by
                ``_compute_error``.
            metric: Callable invoked as
                ``metric(original_pred, pruned_pred, label)``.

        Side effects:
            Sets ``self.compressed_model`` (``None`` when the error exceeds
            the threshold) and, when the model is accepted,
            ``self.new_metric_ths`` with the remaining error budget.
        """
        script_path = (
            Path(__file__).parent / "scripts/neural_magic_training.py"
        )
        with TemporaryDirectory(dir="") as tmp_dir:
            tmp_dir = Path(tmp_dir)
            requirements_file = tmp_dir / "requirements.txt"
            model_path = _save_model(model, tmp_dir)
            training_data_dir = tmp_dir / "train"
            eval_data_dir = tmp_dir / "eval"
            config_file = tmp_dir / "config.json"
            # A pickled input model is returned as a single file, a TorchFX
            # export as a directory; the output location mirrors the input.
            pruned_model_path = (
                tmp_dir / "pruned_model.pt"
                if model_path.is_file()
                else tmp_dir
            )

            _write_requirements_file(requirements_file)
            _save_dataset(train_input_data, training_data_dir)
            _save_dataset(eval_input_data, eval_data_dir)
            _save_json(self._config, config_file)

            run_in_different_venv(
                str(requirements_file),
                str(script_path),
                torch.cuda.is_available(),
                "--model",
                f"{model_path}",
                "--train_dir",
                f"{training_data_dir}",
                "--eval_dir",
                f"{eval_data_dir}",
                "--config",
                f"{config_file}",
                "--pruned_model",
                f"{pruned_model_path}",
            )

            self.compressed_model = _load_model(pruned_model_path)

            if self.compressed_model is not None:
                error = self._compute_error(
                    model, self.compressed_model, eval_input_data, metric
                )
                if error > metric_drop_ths:
                    self.compressed_model = None
                else:
                    self.new_metric_ths = metric_drop_ths - error

    @staticmethod
    @torch.no_grad()
    def _compute_error(
        model: Module,
        pruned_model: Module,
        eval_input_data: DataManager,
        metric: Callable,
    ) -> float:
        """Return the mean of ``metric`` over ``eval_input_data``.

        Both models are switched to eval mode and, when CUDA is available,
        moved to the GPU together with the inputs.

        Returns:
            ``np.inf`` when ``eval_input_data`` is empty, otherwise the
            accumulated metric divided by ``len(eval_input_data)``.
        """
        if len(eval_input_data) == 0:
            return np.inf

        model.eval()
        pruned_model.eval()
        # Device placement does not depend on the batch: query CUDA and move
        # the models once instead of on every iteration.
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            pruned_model.cuda()
            model.cuda()

        metric_val = 0.0
        for inputs, y in eval_input_data:
            if use_cuda:
                inputs = tuple(data.cuda() for data in inputs)
            model_pred = model(*inputs)
            pruned_pred = pruned_model(*inputs)
            metric_val += metric(model_pred, pruned_pred, y)
        return metric_val / len(eval_input_data)

    @staticmethod
    def _get_default_config() -> Dict:
        """Return the default options forwarded to the pruning script."""
        return {
            "training_epochs": 10,
            "epochs_pruning_window": {"start_epoch": 0, "end_epoch": 10},
            "loss_fn": "CrossEntropy",
            "lr": 1e-3,
            "momentum": 0.9,
        }

    @property
    def config_key(self) -> str:
        """Key identifying this compressor's configuration section."""
        return "sparseml"


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimize_inference.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Iterable, Callable, List, Union, Dict, Optional

from nebullvm.config import TRAIN_TEST_SPLIT_RATIO
from nebullvm.core import types
from nebullvm.core.models import (
    OptimizeInferenceResult,
    OriginalModel,
    OptimizedModel,
    BenchmarkOriginalModelResult,
    ModelCompiler,
    ModelCompressor,
    OptimizationTime,
    ModelParams,
    DeepLearningFramework,
)
from nebullvm.operations.base import Operation
from nebullvm.operations.conversions.utils import get_conversion_op
from nebullvm.operations.measures.measures import LatencyOriginalModelMeasure
from nebullvm.operations.measures.utils import QUANTIZATION_METRIC_MAP
from nebullvm.operations.optimizations.optimizers.optimizers import (
    PytorchOptimizer,
    TensorflowOptimizer,
    ONNXOptimizer,
)
from nebullvm.operations.optimizations.utils import (
    map_compilers_and_compressors,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import DataLoader as TorchDataLoader
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.utils import (
    check_dependencies,
)
from nebullvm.tools.adapters import (
    ModelAdapter,
    DiffusionAdapter,
    HuggingFaceAdapter,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.diffusers import (
    is_diffusion_model_pipe,
    is_diffusion_model,
)
from nebullvm.tools.hardware_utils import get_hw_setup
from nebullvm.tools.utils import (
    is_huggingface_data,
    check_input_data,
    is_data_subscriptable,
    get_dl_framework,
    extract_info_from_data,
    get_model_name,
    get_model_size_mb,
    get_throughput,
)


class OptimizeInferenceOp(Operation):
    """Operation that optimizes a model for inference.

    It benchmarks the original model, converts it with the conversion
    operation of its framework and runs the PyTorch / TensorFlow / ONNX
    optimization pipeline on each converted model, returning the
    lowest-latency result that has an inference learner.
    """

    def __init__(self):
        super().__init__()
        # One optimizer per supported pipeline; picked in ``_optimize``
        # according to the type of the converted model.
        self.torch_optimization_op = PytorchOptimizer()
        self.onnx_optimization_op = ONNXOptimizer()
        self.tensorflow_optimization_op = TensorflowOptimizer()

    @staticmethod
    def _as_data_manager(data) -> DataManager:
        """Wrap ``data`` into a ``DataManager`` (returned as is if already one).

        Raises:
            ValueError: If ``check_input_data`` rejects the data format.
        """
        if isinstance(data, DataManager):
            return data
        if check_input_data(data) is False:
            raise ValueError(
                "The provided data does not match the expected "
                "format.\n"
                "Speedster supports data in the following formats: \n"
                "- PyTorch DataLoader\n"
                "- TensorFlow Dataset\n"
                "- List of tuples: [((input_0, ... ), label), ...] \n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            )
        if is_data_subscriptable(data):
            return DataManager(data)
        else:
            return DataManager.from_iterable(data)

    @staticmethod
    def _check_inputs(model: Any, input_data: types.InputData):
        """Validate the user inputs.

        Raises:
            ValueError: If ``model`` is ``None`` or ``input_data`` is empty.
        """
        if model is None:
            raise ValueError("Input model cannot be None")
        if len(input_data) == 0:
            raise ValueError("Input data cannot be empty")

    def execute(
        self,
        model: Any,
        input_data: types.InputData,
        metric_drop_ths: float = None,
        metric: Union[str, Callable] = None,
        optimization_time: str = "constrained",
        dynamic_info: Dict = None,
        config_file: str = None,
        ignore_compilers: List[str] = None,
        ignore_compressors: List[str] = None,
        store_latencies: bool = False,
        **kwargs,
    ) -> OptimizeInferenceResult:
        """Benchmark, convert and optimize ``model``.

        Args:
            model: The model to optimize. Must not be ``None``.
            input_data: Non-empty input data; dataloaders/datasets are
                converted through ``DataManager.from_dataloader``.
            metric_drop_ths: Accepted metric drop. Values ``<= 0`` are
                treated like ``None``.
            metric: Metric name or callable. Names are resolved through
                ``QUANTIZATION_METRIC_MAP``; when a threshold is given
                without a metric, ``"numeric_precision"`` is used.
            optimization_time: Value converted to ``OptimizationTime``.
            dynamic_info: Dynamic shape information forwarded to
                ``extract_info_from_data``.
            config_file: Not referenced in this method body.
            ignore_compilers: Compilers to skip, mapped to ``ModelCompiler``.
            ignore_compressors: Compressors to skip, mapped to
                ``ModelCompressor``.
            store_latencies: Not referenced in this method body.
            **kwargs: Forwarded to ``HuggingFaceAdapter`` when the data is
                recognized as HuggingFace data.

        Returns:
            An ``OptimizeInferenceResult`` with the original model, the
            lowest-latency optimized model (``None`` if there is none) and
            the hardware setup.

        Raises:
            ValueError: If the inputs are invalid or the dataloader/data
                format is not supported.
        """

        self._check_inputs(model, input_data)
        check_dependencies(self.device)

        ignore_compilers = map_compilers_and_compressors(
            ignore_compilers, ModelCompiler
        )
        ignore_compressors = map_compilers_and_compressors(
            ignore_compressors, ModelCompressor
        )

        optimization_time = OptimizationTime(optimization_time)

        data = input_data

        if isinstance(data, (TorchDataLoader, tf.data.Dataset)):
            try:
                data = DataManager.from_dataloader(data)
            except Exception:
                raise ValueError(
                    "The provided dataloader does not match the expected "
                    "format.\n"
                    "Speedster supports dataloaders that return tuples in "
                    "the\n"
                    "following formats: \n"
                    "Single input: (input,  label)\n"
                    "Multiple inputs: ((input1, input2, ...),  label) or "
                    "(input1, input2, ...,  label)\n"
                    "Inputs and labels should be either tensors or numpy "
                    "arrays,\n"
                    "depending on the framework used.\n"
                )

        # Setup adapters
        model_adapter: Optional[ModelAdapter] = None
        if is_diffusion_model_pipe(model):
            self.logger.info(
                "The provided model is a diffusion model. "
                "Speedster will optimize the UNet part of the model."
            )
            model_adapter = DiffusionAdapter(model, data, self.device)
        elif is_huggingface_data(data[0]):
            model_adapter = HuggingFaceAdapter(
                model, data, self.device, **kwargs
            )
            if dynamic_info is None:
                self.logger.warning(
                    "Dynamic shape info has not been provided for the "
                    "HuggingFace model. The resulting optimized model "
                    "will be usable only with a fixed input shape. "
                    "To optimize the model for dynamic shapes, please "
                    "look here: https://nebuly.gitbook.io/nebuly/modules/"
                    "speedster/how-to-guides"
                    "#using-dynamic-shape."
                )

        # Adapt data and model
        if model_adapter is not None:
            data = model_adapter.adapted_data
            model = model_adapter.adapted_model

        data = self._as_data_manager(data)
        dl_framework = get_dl_framework(model)

        # A non-positive threshold disables the metric-drop budget; a
        # threshold without a metric falls back to "numeric_precision".
        if metric_drop_ths is not None and metric_drop_ths <= 0:
            metric_drop_ths = None
        elif metric_drop_ths is not None and metric is None:
            metric = "numeric_precision"
        if isinstance(metric, str):
            metric = QUANTIZATION_METRIC_MAP.get(metric)

        model_params: ModelParams = extract_info_from_data(
            model=model,
            input_data=data,
            dl_framework=dl_framework,
            dynamic_info=dynamic_info,
            device=self.device,
            is_diffusion=is_diffusion_model(model),
        )

        data.split(TRAIN_TEST_SPLIT_RATIO)

        # -------- Benchmark original model --------
        original_latency_op = LatencyOriginalModelMeasure().to(self.device)
        orig_model_benchmark: BenchmarkOriginalModelResult = (
            original_latency_op.execute(
                model=model,
                input_data=data.get_split("test"),
                dl_framework=dl_framework,
            )
        )
        original_model = OriginalModel(
            model=model,
            latency_seconds=orig_model_benchmark.latency_seconds,
            name=get_model_name(model),
            size_mb=get_model_size_mb(model),
            framework=dl_framework,
            throughput=get_throughput(
                latency=orig_model_benchmark.latency_seconds,
                # Normal models have batch size B, diffusion
                # models have batch size 2B
                batch_size=model_params.batch_size
                if not is_diffusion_model(model)
                else model_params.batch_size / 2,
            ),
        )
        # ------------------------------------------

        with TemporaryDirectory() as tmp_dir:
            tmp_dir = Path(tmp_dir) / "fp32"
            tmp_dir.mkdir(parents=True, exist_ok=True)

            # Convert model to all available frameworks
            conversion_op = get_conversion_op(dl_framework)
            conversion_op.to(self.device).set_state(model, data).execute(
                save_path=tmp_dir,
                model_params=model_params,
            )

            # Optimize models
            optimized_models: List[OptimizedModel] = []
            # Computed before the loop because ``model`` is rebound to each
            # converted model by the loop variable below.
            is_diffusion = is_diffusion_model(model)
            for i, model in enumerate(conversion_op.get_result()):
                optimized_models += self._optimize(
                    model=model,
                    input_data=data,
                    model_outputs=orig_model_benchmark.model_outputs,
                    optimization_time=optimization_time,
                    metric_drop_ths=metric_drop_ths,
                    metric=metric,
                    model_params=model_params,
                    ignore_compilers=ignore_compilers,
                    ignore_compressors=ignore_compressors,
                    source_dl_framework=dl_framework,
                    pipeline_idx=i + 1,
                    len_pipelines=len(conversion_op.get_result()),
                    is_diffusion=is_diffusion,
                )

        optimized_models.sort(key=lambda x: x.latency_seconds, reverse=False)

        # Check if at least one optimized model has been created
        no_optimized_models = len(optimized_models) < 1
        no_inference_learners = all(
            o.inference_learner is None for o in optimized_models
        )
        if no_optimized_models or no_inference_learners:
            self.logger.warning(
                "No optimized model has been created. This is likely "
                "due to a bug during optimization. Please open an issue "
                "and report in details your use case."
            )

        # Extract lowest-latency model
        lowest_latency = self._extract_lowest_latency_model(optimized_models)

        if model_adapter is not None:
            original_model = model_adapter.adapt_original_model(original_model)
            lowest_latency = model_adapter.adapt_inference_learner(
                lowest_latency
            )

        return OptimizeInferenceResult(
            original_model=original_model,
            optimized_model=lowest_latency,
            hardware_setup=get_hw_setup(),
        )

    def _optimize(
        self,
        model: Any,
        model_outputs: Iterable,
        input_data: types.InputData,
        optimization_time: OptimizationTime,
        metric_drop_ths: float,
        metric: Callable,
        model_params: ModelParams,
        ignore_compilers: List[ModelCompiler],
        ignore_compressors: List[ModelCompressor],
        source_dl_framework: DeepLearningFramework,
        pipeline_idx: int,
        len_pipelines: int,
        is_diffusion: bool,
    ) -> List[OptimizedModel]:
        """Run the optimization pipeline matching the type of ``model``.

        ``torch.nn.Module`` instances use the PyTorch optimizer,
        ``tf.Module`` instances the TensorFlow one and everything else the
        ONNX one. ``pipeline_idx`` / ``len_pipelines`` are only used for the
        progress log message.

        Returns:
            The optimized models produced by the selected optimizer.
        """
        if isinstance(model, torch.nn.Module):
            optimization_op = self.torch_optimization_op
            self.logger.info(
                f"[{pipeline_idx}/{len_pipelines}] Running PyTorch "
                f"Optimization Pipeline"
            )
        elif isinstance(model, tf.Module):
            optimization_op = self.tensorflow_optimization_op
            self.logger.info(
                f"[{pipeline_idx}/{len_pipelines}] Running TensorFlow "
                f"Optimization Pipeline"
            )
        else:
            optimization_op = self.onnx_optimization_op
            self.logger.info(
                f"[{pipeline_idx}/{len_pipelines}] Running ONNX "
                f"Optimization Pipeline"
            )

        # Run optimization
        optimized_models = optimization_op.to(self.device).execute(
            model=model,
            input_data=input_data,
            optimization_time=optimization_time,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            model_params=model_params,
            model_outputs=model_outputs,
            ignore_compilers=ignore_compilers,
            ignore_compressors=ignore_compressors,
            source_dl_framework=source_dl_framework,
            is_diffusion=is_diffusion,
        )

        # Only the PyTorch pipeline calls ``free_model_gpu`` on the model
        # once the optimization is done.
        if isinstance(model, torch.nn.Module):
            optimization_op.free_model_gpu(model)

        return optimized_models

    @staticmethod
    def _extract_lowest_latency_model(
        models: List[OptimizedModel],
    ) -> Optional[OptimizedModel]:
        """Return the fastest model that has an inference learner.

        Returns:
            The model with the smallest ``latency_seconds`` among those whose
            ``inference_learner`` is not ``None``, or ``None`` if there is
            no such model.
        """
        # fmt: off
        inference_learner_models = [
            m for m in models
            if m.inference_learner is not None
        ]
        # fmt: on
        if len(inference_learner_models) == 0:
            return None
        return min(inference_learner_models, key=lambda m: m.latency_seconds)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/base.py
================================================
import abc
from tempfile import TemporaryDirectory
from typing import Any, Callable, Dict, List, Tuple, Type, Union

from nebullvm.config import ACTIVATION_METRIC_DROP_THS
from nebullvm.core.models import (
    OptimizedModel,
    OptimizationTime,
    ModelParams,
    ModelCompiler,
    ModelCompressor,
    DeepLearningFramework,
    DeviceType,
    QuantizationType,
)
from nebullvm.operations.base import Operation
from nebullvm.operations.inference_learners.base import (
    BuildInferenceLearner,
)
from nebullvm.operations.inference_learners.builders import (
    DeepSparseBuildInferenceLearner,
    FasterTransformerBuildInferenceLearner,
    IntelNeuralCompressorBuildInferenceLearner,
    ONNXApacheTVMBuildInferenceLearner,
    ONNXBuildInferenceLearner,
    ONNXTensorRTBuildInferenceLearner,
    OpenVINOBuildInferenceLearner,
    PyTorchApacheTVMBuildInferenceLearner,
    PyTorchTensorRTBuildInferenceLearner,
    TensorflowBuildInferenceLearner,
    TFLiteBuildInferenceLearner,
    TorchNeuronBuildInferenceLearner,
    TorchXLABuildInferenceLearner,
    TorchDynamoBuildInferenceLearner,
    TorchScriptBuildInferenceLearner,
)
from nebullvm.operations.measures.measures import MetricDropMeasure
from nebullvm.operations.measures.utils import (
    compute_optimized_running_time,
    compute_relative_difference,
)
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.deepsparse import (
    DeepSparseCompiler,
)
from nebullvm.operations.optimizations.compilers.faster_transformer import (
    FasterTransformerCompiler,
)
from nebullvm.operations.optimizations.compilers.intel_neural_compressor import (  # noqa: E501
    IntelNeuralCompressorCompiler,
)
from nebullvm.operations.optimizations.compilers.onnxruntime import (
    ONNXCompiler,
)
from nebullvm.operations.optimizations.compilers.openvino import (
    OpenVINOCompiler,
)
from nebullvm.operations.optimizations.compilers.tensor_rt import (
    ONNXTensorRTCompiler,
    PyTorchTensorRTCompiler,
)
from nebullvm.operations.optimizations.compilers.tensorflow import (
    TensorflowBackendCompiler,
    TFLiteBackendCompiler,
)
from nebullvm.operations.optimizations.compilers.torch_dynamo import (
    TorchDynamoCompiler,
)
from nebullvm.operations.optimizations.compilers.torch_neuron import (
    TorchNeuronCompiler,
)
from nebullvm.operations.optimizations.compilers.torch_xla import (
    TorchXLACompiler,
)
from nebullvm.operations.optimizations.compilers.torchscript import (
    TorchScriptCompiler,
)
from nebullvm.operations.optimizations.compilers.tvm import (
    ONNXApacheTVMCompiler,
    PyTorchApacheTVMCompiler,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
from nebullvm.tools.utils import get_throughput


class Optimizer(Operation, abc.ABC):
    """Base operation that tries every applicable compiler on a model.

    Subclasses pick the candidate compilers for their pipeline framework by
    implementing ``_select_compilers_from_hardware``. Each selected compiler
    is run once per candidate quantization type, and every inference learner
    that passes the metric-drop check is stored in ``optimized_models``.
    """

    def __init__(self):
        super().__init__()
        self.optimized_models = []
        self.source_dl_framework = None
        self.pipeline_dl_framework = None
        # Both dictionaries are keyed by ModelCompiler and are filled by
        # ``execute`` through ``_load_compilers``.
        self.compiler_ops = {}
        self.build_inference_learner_ops = {}
        self.validity_check_op = MetricDropMeasure()

    def execute(
        self,
        model: Any,
        input_data: DataManager,
        optimization_time: OptimizationTime,
        metric_drop_ths: float,
        metric: Callable,
        model_params: ModelParams,
        model_outputs: List[Tuple[Any, ...]],
        ignore_compilers: List[ModelCompiler],
        ignore_compressors: List[ModelCompressor],
        source_dl_framework: DeepLearningFramework,
        is_diffusion: bool = False,
    ) -> List[OptimizedModel]:
        """Run the selected compilers and return the valid optimized models.

        ``ignore_compilers`` is modified in place: entries found in
        ``MULTI_FRAMEWORK_COMPILERS`` are replaced by their
        framework-specific variants, and ``_optimize`` may append further
        compilers to it. ``ignore_compressors`` is accepted but not used by
        this method.

        Returns:
            The list stored in ``self.optimized_models``.
        """
        self.source_dl_framework = source_dl_framework

        # TODO: implement and select compressors from hardware

        compilers = self._select_compilers_from_hardware()

        # Expand framework-agnostic entries (e.g. TENSOR_RT) into their
        # framework-specific variants. The removals and additions are
        # collected first so the list is not changed while iterating it.
        remove_compiler_list = []
        add_compiler_list = []
        for compiler in ignore_compilers:
            if compiler in MULTI_FRAMEWORK_COMPILERS:
                add_compiler_list += MULTI_FRAMEWORK_COMPILERS[compiler]
                remove_compiler_list.append(compiler)

        for c in remove_compiler_list:
            ignore_compilers.remove(c)

        ignore_compilers += add_compiler_list

        (
            self.compiler_ops,
            self.build_inference_learner_ops,
        ) = self._load_compilers(
            ignore_compilers=ignore_compilers,
            compilers=compilers,
        )
        self._optimize(
            model=model,
            input_data=input_data,
            optimization_time=optimization_time,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            model_params=model_params,
            model_outputs=model_outputs,
            ignore_compilers=ignore_compilers,
            is_diffusion=is_diffusion,
        )

        return self.optimized_models

    @abc.abstractmethod
    def _select_compilers_from_hardware(self):
        """Return the list of ModelCompiler candidates for this optimizer."""
        raise NotImplementedError()

    @staticmethod
    def _load_compilers(
        ignore_compilers: List[ModelCompiler],
        compilers: List[ModelCompiler],
    ):
        """Instantiate the compiler and learner-builder ops to be run.

        A compiler is kept only if it is not ignored and has an entry in
        BOTH ``COMPILER_TO_OPTIMIZER_MAP`` and
        ``COMPILER_TO_INFERENCE_LEARNER_MAP``. Building both dictionaries
        from the same key list guarantees identical keys in identical order,
        which ``_optimize`` relies on when pairing them.

        Returns:
            A ``(compiler_ops, build_inference_learner_ops)`` tuple of
            dictionaries keyed by ModelCompiler.
        """
        selected = [
            compiler
            for compiler in compilers
            if compiler not in ignore_compilers
            and compiler in COMPILER_TO_OPTIMIZER_MAP
            and compiler in COMPILER_TO_INFERENCE_LEARNER_MAP
        ]
        compiler_ops = {
            compiler: COMPILER_TO_OPTIMIZER_MAP[compiler]()
            for compiler in selected
        }
        build_inference_learner_ops = {
            compiler: COMPILER_TO_INFERENCE_LEARNER_MAP[compiler]()
            for compiler in selected
        }

        return compiler_ops, build_inference_learner_ops

    def free_model_gpu(self, model: Any):
        """Best-effort release of GPU memory before a compilation step.

        Does nothing unless ``self.device`` is a GPU. Both steps are
        attempted independently and any error is deliberately ignored.
        """
        # Free gpu memory
        if self.device.type is DeviceType.GPU:
            try:
                model.cpu()
            except Exception:
                # Best effort: the model may not expose ``cpu()``.
                pass
            try:
                with torch.cuda.device(self.device.to_torch_format()):
                    torch.cuda.empty_cache()
            except Exception:
                # Best effort: never let cache cleanup abort the run.
                pass

    def _optimize(
        self,
        model: Union[torch.nn.Module, tf.Module, str],
        input_data: DataManager,
        optimization_time: OptimizationTime,
        metric_drop_ths: float,
        metric: Callable,
        model_params: ModelParams,
        model_outputs: List[Tuple[Any, ...]],
        ignore_compilers: List[ModelCompiler],
        is_diffusion: bool = False,
    ):
        """Compile, build, validate and benchmark every loaded compiler.

        For each compiler in ``self.compiler_ops`` and each candidate
        quantization type, the model is compiled, wrapped in an inference
        learner and checked with ``self.validity_check_op``. Valid learners
        are appended to ``self.optimized_models``. A failure of any single
        attempt is logged and recorded with latency ``-1`` without stopping
        the remaining attempts. The collected records are handed to
        ``self.feedback_collector`` when one is set.

        With ``OptimizationTime.CONSTRAINED`` a compiler that produced a
        valid model is appended to ``ignore_compilers`` (in place).
        """
        # The un-quantized attempt (None) always runs; quantized attempts
        # are added according to the accepted metric drop.
        q_types = [None]
        if metric_drop_ths is not None:
            if metric_drop_ths > 0:
                q_types.append(QuantizationType.HALF)
            if metric_drop_ths > ACTIVATION_METRIC_DROP_THS:
                q_types.append(QuantizationType.DYNAMIC)
                if input_data is not None:
                    q_types.append(QuantizationType.STATIC)

        optimization_info = []
        # ``_load_compilers`` builds both dictionaries from the same key
        # list, so positional pairing matches each compiler with its builder.
        for (compiler, compiler_op), build_inference_learner_op in zip(
            self.compiler_ops.items(),
            self.build_inference_learner_ops.values(),
        ):
            for q_type in q_types:
                input_tfms = MultiStageTransformation([])

                self.free_model_gpu(model)

                with TemporaryDirectory() as tmp_dir:
                    try:
                        compiler_op.to(self.device).execute(
                            model=model,
                            input_data=input_data,
                            model_params=model_params,
                            metric_drop_ths=metric_drop_ths
                            if q_type is not None
                            else None,
                            quantization_type=q_type,
                            input_tfms=input_tfms,
                            onnx_output_path=tmp_dir,
                            is_diffusion=is_diffusion,
                        )

                        compiled_model = compiler_op.get_result()
                        if compiled_model is None:
                            continue

                        build_inference_learner_op.to(self.device).execute(
                            model=compiled_model,
                            model_orig=getattr(
                                compiler_op, "model_orig", None
                            ),
                            model_params=model_params,
                            input_tfms=input_tfms,
                            source_dl_framework=self.source_dl_framework,
                            quantization_type=q_type,
                        )
                        inference_learner = (
                            build_inference_learner_op.get_result()
                        )
                        if inference_learner is None:
                            continue

                        test_input_data, ys = input_data.get_split(
                            "test"
                        ).get_list(with_ys=True)

                        # Un-quantized attempts are always compared with the
                        # relative difference instead of the user metric.
                        self.validity_check_op.execute(
                            inference_learner,
                            test_input_data,
                            model_outputs,
                            metric_drop_ths,
                            metric_func=metric
                            if q_type is not None
                            else compute_relative_difference,
                            ys=ys,
                        )

                        if self.validity_check_op.valid:
                            latency = compute_optimized_running_time(
                                inference_learner, input_data
                            )
                            self.logger.info(
                                f"Optimized model latency: {latency} "
                                f"sec/iter"
                            )

                            if (
                                compiler not in ignore_compilers
                                and optimization_time
                                is OptimizationTime.CONSTRAINED
                            ):
                                ignore_compilers.append(compiler)

                            metric_drop = self.validity_check_op.measure_result
                            # Normal models have batch size B, diffusion
                            # models have batch size 2B
                            batch_size = (
                                model_params.batch_size / 2
                                if is_diffusion
                                else model_params.batch_size
                            )

                            self.optimized_models.append(
                                OptimizedModel(
                                    inference_learner=inference_learner,
                                    metric_drop=metric_drop,
                                    compiler=compiler,
                                    technique=q_type.name
                                    if q_type is not None
                                    else "None",
                                    latency_seconds=latency,
                                    throughput=get_throughput(
                                        latency, batch_size
                                    ),
                                    size_mb=inference_learner.get_size() / 1e6,
                                )
                            )

                            opt_info_dict = {
                                "compiler": (
                                    f"{self.pipeline_dl_framework.value}"
                                    f"_{compiler.value}"
                                ),
                                "technique": q_type.value
                                if q_type
                                else "none",
                                "latency": latency,
                            }
                            if (
                                metric_drop_ths is not None
                                and q_type is not None
                            ):
                                opt_info_dict["metric_loss"] = metric_drop
                                opt_info_dict["metric"] = metric.__name__
                            optimization_info.append(opt_info_dict)
                        else:
                            self.logger.warning(
                                "The optimized model will be "
                                "discarded due to poor results "
                                "obtained with the given metric."
                            )

                        if self.device.type in [
                            DeviceType.GPU,
                            DeviceType.TPU,
                        ]:
                            inference_learner.free_gpu_memory()
                    except Exception as ex:
                        self.logger.warning(
                            f"Optimization failed with "
                            f"{self.pipeline_dl_framework} "
                            f"interface of {compiler}. Got error {ex}. "
                            f"If possible the compilation will be re-scheduled"
                            f" with another interface. Please consult the "
                            f"documentation for further info or open an issue "
                            f"on GitHub for receiving assistance."
                        )
                        # NOTE(review): unlike the success record, the
                        # compiler name is not prefixed with the pipeline
                        # framework here -- confirm this is intentional.
                        optimization_info.append(
                            {
                                "compiler": compiler.value,
                                "technique": q_type.value
                                if q_type
                                else "none",
                                "latency": -1,
                            }
                        )
        if self.feedback_collector is not None:
            self.feedback_collector.store_info(
                key="optimizations",
                value=optimization_info,
            )


# Maps a framework-agnostic compiler to its framework-specific variants.
# Optimizer.execute uses it to replace such entries in ``ignore_compilers``.
MULTI_FRAMEWORK_COMPILERS = {
    ModelCompiler.TENSOR_RT: [
        ModelCompiler.TENSOR_RT_TORCH,
        ModelCompiler.TENSOR_RT_ONNX,
    ],
    ModelCompiler.APACHE_TVM: [
        ModelCompiler.APACHE_TVM_TORCH,
        ModelCompiler.APACHE_TVM_ONNX,
    ],
}

# Compiler operation class to instantiate for each supported ModelCompiler.
# Optimizer._load_compilers skips compilers that have no entry here.
COMPILER_TO_OPTIMIZER_MAP: Dict[ModelCompiler, Type[Compiler]] = {
    ModelCompiler.TORCHSCRIPT: TorchScriptCompiler,
    ModelCompiler.DEEPSPARSE: DeepSparseCompiler,
    ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorCompiler,
    ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTCompiler,
    ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTCompiler,
    ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMCompiler,
    ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMCompiler,
    ModelCompiler.ONNX_RUNTIME: ONNXCompiler,
    ModelCompiler.OPENVINO: OpenVINOCompiler,
    ModelCompiler.TFLITE: TFLiteBackendCompiler,
    ModelCompiler.XLA: TensorflowBackendCompiler,
    ModelCompiler.TORCH_NEURON: TorchNeuronCompiler,
    ModelCompiler.TORCH_XLA: TorchXLACompiler,
    ModelCompiler.TORCH_DYNAMO: TorchDynamoCompiler,
    ModelCompiler.FASTER_TRANSFORMER: FasterTransformerCompiler,
}

# Inference-learner builder class to instantiate for each ModelCompiler.
# The keys currently mirror those of COMPILER_TO_OPTIMIZER_MAP; keep the two
# tables in sync when adding a compiler.
COMPILER_TO_INFERENCE_LEARNER_MAP: Dict[
    ModelCompiler, Type[BuildInferenceLearner]
] = {
    ModelCompiler.TORCHSCRIPT: TorchScriptBuildInferenceLearner,
    ModelCompiler.DEEPSPARSE: DeepSparseBuildInferenceLearner,
    ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorBuildInferenceLearner,  # noqa: E501
    ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTBuildInferenceLearner,
    ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTBuildInferenceLearner,
    ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMBuildInferenceLearner,
    ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMBuildInferenceLearner,
    ModelCompiler.ONNX_RUNTIME: ONNXBuildInferenceLearner,
    ModelCompiler.OPENVINO: OpenVINOBuildInferenceLearner,
    ModelCompiler.TFLITE: TFLiteBuildInferenceLearner,
    ModelCompiler.XLA: TensorflowBuildInferenceLearner,
    ModelCompiler.TORCH_NEURON: TorchNeuronBuildInferenceLearner,
    ModelCompiler.TORCH_XLA: TorchXLABuildInferenceLearner,
    ModelCompiler.TORCH_DYNAMO: TorchDynamoBuildInferenceLearner,
    ModelCompiler.FASTER_TRANSFORMER: FasterTransformerBuildInferenceLearner,
}


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/optimizers.py
================================================
import platform

from nebullvm.core.models import (
    DeepLearningFramework,
    DeviceType,
    ModelCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import Optimizer
from nebullvm.operations.optimizations.compilers.utils import (
    tvm_is_available,
    bladedisc_is_available,
    deepsparse_is_available,
    intel_neural_compressor_is_available,
    torch_tensorrt_is_available,
    onnxruntime_is_available,
    tensorrt_is_available,
    openvino_is_available,
    torch_neuron_is_available,
    torch_xla_is_available,
    faster_transformer_is_available,
)
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.utils import (
    torch_is_available,
    tensorflow_is_available,
    onnx_is_available,
)
from nebullvm.tools.utils import check_module_version


class PytorchOptimizer(Optimizer):
    """Optimizer whose pipeline framework is PyTorch."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.PYTORCH

    def _select_compilers_from_hardware(self):
        """Return the compilers to try for the current ``self.device``.

        Returns an empty list when torch is not available. TPU and NEURON
        devices each map to a single dedicated compiler; every other device
        gets TorchScript plus whichever optional backends are available.

        Raises:
            RuntimeError: if the device is a TPU or a NEURON device and the
                corresponding torch backend is not available.
        """
        compilers = []
        if torch_is_available():
            if self.device.type is DeviceType.TPU:
                if torch_xla_is_available():
                    compilers.append(ModelCompiler.TORCH_XLA)
                else:
                    raise RuntimeError(
                        "Torch XLA is not available on your platform. "
                        "Please install torch-xla by following the readme "
                        "at this link: https://github.com/pytorch/xla"
                    )
            elif self.device.type is DeviceType.NEURON:
                if torch_neuron_is_available():
                    compilers.append(ModelCompiler.TORCH_NEURON)
                else:
                    raise RuntimeError(
                        "Torch Neuron is not available on your platform. "
                        "Please install torch-neuron by following "
                        "this guide: https://awsdocs-neuron"
                        ".readthedocs-hosted.com/en/latest/general/"
                        "quick-start/torch-neuron.html."
                    )
            else:
                compilers.append(ModelCompiler.TORCHSCRIPT)
                if (
                    check_module_version(torch, min_version="2.0.0")
                    and platform.system() != "Windows"
                    and False
                ):  # Deactivated because save and load methods are
                    # not implemented
                    compilers.append(ModelCompiler.TORCH_DYNAMO)
                if tvm_is_available():
                    compilers.append(ModelCompiler.APACHE_TVM_TORCH)
                if bladedisc_is_available():
                    compilers.append(ModelCompiler.BLADEDISC)

                # Backends below are specific to one device type.
                if self.device.type is DeviceType.CPU:
                    if deepsparse_is_available():
                        compilers.append(ModelCompiler.DEEPSPARSE)
                    if intel_neural_compressor_is_available():
                        compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)
                elif self.device.type is DeviceType.GPU:
                    if torch_tensorrt_is_available():
                        compilers.append(ModelCompiler.TENSOR_RT_TORCH)
                    if faster_transformer_is_available():
                        compilers.append(ModelCompiler.FASTER_TRANSFORMER)
        return compilers


class TensorflowOptimizer(Optimizer):
    """Optimizer whose pipeline framework is TensorFlow."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.TENSORFLOW

    def _select_compilers_from_hardware(self):
        """Return XLA and TFLite when TensorFlow is available, else nothing."""
        if not tensorflow_is_available():
            return []
        return [ModelCompiler.XLA, ModelCompiler.TFLITE]


class ONNXOptimizer(Optimizer):
    """Optimizer for ONNX models; its pipeline framework is NUMPY."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.NUMPY

    def _select_compilers_from_hardware(self):
        """Return the ONNX-based compilers usable on ``self.device``.

        Nothing is selected when onnx itself is not available.
        """
        selected = []
        if not onnx_is_available():
            return selected

        if onnxruntime_is_available():
            selected.append(ModelCompiler.ONNX_RUNTIME)
        if tvm_is_available():
            selected.append(ModelCompiler.APACHE_TVM_ONNX)

        # TensorRT is only considered on GPU, OpenVINO only on CPU.
        device_type = self.device.type
        if device_type is DeviceType.GPU and tensorrt_is_available():
            selected.append(ModelCompiler.TENSOR_RT_ONNX)
        if device_type is DeviceType.CPU and openvino_is_available():
            selected.append(ModelCompiler.OPENVINO)
        return selected


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_deepsparse.py
================================================
from tempfile import TemporaryDirectory

import pytest
import torch

from nebullvm.config import CONSTRAINED_METRIC_DROP_THS
from nebullvm.core.models import (
    Device,
    DeviceType,
    DeepLearningFramework,
    ModelCompiler,
)
from nebullvm.operations.inference_learners.deepsparse import (
    DEEPSPARSE_INFERENCE_LEARNERS,
)
from nebullvm.operations.measures.measures import MetricDropMeasure
from nebullvm.operations.measures.utils import compute_relative_difference
from nebullvm.operations.optimizations.compilers.deepsparse import (
    DeepSparseCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import (
    deepsparse_is_available,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import initialize_model
from nebullvm.operations.inference_learners.utils import load_model

# Device on which the test in this module builds and runs the model.
device = Device(DeviceType.CPU)


@pytest.mark.parametrize(
    ("output_library", "dynamic"),
    [
        # The dynamic variant is disabled: see the note at the end of the
        # test body.
        (DeepLearningFramework.PYTORCH, False),
    ],
)
@pytest.mark.skipif(
    not deepsparse_is_available(),
    reason="Can't test deepsparse if it's not installed.",
)
def test_deepsparse(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type=None,
):
    """Compile with DeepSparse, then check save/load, outputs and validity."""
    with TemporaryDirectory() as tmp_dir:
        bundle = initialize_model(dynamic, None, output_library, device)
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = bundle

        # Compilation step.
        compiler_op = DeepSparseCompiler()
        compiler_op.to(device).execute(
            model=model,
            onnx_output_path=tmp_dir,
            model_params=model_params,
            quantization_type=None,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model in an inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.DEEPSPARSE
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()

        expected_cls = DEEPSPARSE_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, expected_cls)
        assert isinstance(optimized_model.get_size(), int)

        # Round-trip through save and load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(
            loaded_model, DEEPSPARSE_INFERENCE_LEARNERS[output_library]
        )

        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the same outputs.
        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        )

        # Metric-drop check on the test split.
        test_split = input_data.get_split("test")
        test_input_data, ys = test_split.get_list(with_ys=True)

        if quantization_type is not None:
            metric_func = metric
        else:
            metric_func = compute_relative_difference

        validity_check_op = MetricDropMeasure()
        validity_check_op.execute(
            optimized_model,
            test_input_data,
            model_outputs,
            CONSTRAINED_METRIC_DROP_THS,
            metric_func=metric_func,
            ys=ys,
        )
        assert validity_check_op.get_result()

        # Dynamic batch size is currently not supported from deepsparse,
        # so there is no smaller-batch check here.


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_intel_neural_compressor.py
================================================
from tempfile import TemporaryDirectory

import pytest
import torch

from nebullvm.core.models import (
    DeviceType,
    Device,
    QuantizationType,
    DeepLearningFramework,
    ModelCompiler,
)
from nebullvm.operations.inference_learners.neural_compressor import (
    NEURAL_COMPRESSOR_INFERENCE_LEARNERS,
)
from nebullvm.operations.optimizations.compilers.intel_neural_compressor import (  # noqa: E501
    IntelNeuralCompressorCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import (
    intel_neural_compressor_is_available,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model

# Device on which the test in this module builds and runs the model.
device = Device(DeviceType.CPU)


@pytest.mark.parametrize(
    ("output_library", "dynamic", "metric_drop_ths", "quantization_type"),
    [
        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.DYNAMIC),
        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.DYNAMIC),
        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.STATIC),
        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.STATIC),
    ],
)
@pytest.mark.skipif(
    not intel_neural_compressor_is_available(),
    reason="Can't test neural compressor if it's not installed.",
)
def test_neural_compressor(
    output_library: DeepLearningFramework,
    dynamic: bool,
    metric_drop_ths: float,
    quantization_type: QuantizationType,
):
    """Quantize with Intel Neural Compressor and check the built learner."""
    with TemporaryDirectory() as tmp_dir:
        bundle = initialize_model(dynamic, None, output_library, device)
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = bundle

        # Compilation (quantization) step.
        compiler_op = IntelNeuralCompressorCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model in an inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.INTEL_NEURAL_COMPRESSOR
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()

        expected_cls = NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, expected_cls)

        # Round-trip through save and load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(
            loaded_model, NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library]
        )

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the same outputs.
        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        )

        # Metric-drop check.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if dynamic:
            # Re-run with half of the batch.
            # NOTE(review): both ``res`` and ``res_orig`` below come from the
            # original ``model``, so the optimized learner is not exercised
            # on the smaller batch -- confirm whether ``optimized_model`` was
            # intended for ``res``.
            half_batch = [
                tensor[: len(tensor) // 2] for tensor in inputs_example
            ]
            res = model(*half_batch)
            assert res is not None

            res_orig = tuple(model(*half_batch))
            assert all(
                torch.allclose(out, out_orig, rtol=1e-01)
                for out, out_orig in zip(res, res_orig)
            )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_onnxruntime.py
================================================
import sys
from pathlib import Path
from tempfile import TemporaryDirectory

import onnx
import pytest
import torch

from nebullvm.core.models import (
    Device,
    DeviceType,
    DeepLearningFramework,
    QuantizationType,
    ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS
from nebullvm.operations.optimizations.compilers.onnxruntime import (
    ONNXCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available

# Run the tests in this module on the GPU when one is available, otherwise
# on the CPU.
device = (
    Device(DeviceType.GPU) if gpu_is_available() else Device(DeviceType.CPU)
)


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None, True),
        (DeepLearningFramework.PYTORCH, True, None, None, None, False),
        (DeepLearningFramework.PYTORCH, False, None, None, None, False),
    ],
)
def test_onnxruntime(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Run the ONNX Runtime pipeline end to end without quantization.

    The model is exported with ``PytorchConverter``, compiled with
    ``ONNXCompiler`` and wrapped into an inference learner. The learner is
    then saved, reloaded, validated with ``check_model_validity`` and, for
    dynamic-shape cases, compared with the original model on half a batch.
    When ``external_data_format`` is set, the exported ONNX file is first
    re-saved with its tensors stored as external data.
    """
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, metric, output_library, device)
        (model, input_data, model_params,
         input_tfms, model_outputs, metric) = setup

        # Export the source model into a dedicated sub-folder.
        export_dir = Path(tmp_dir, "fp32")
        export_dir.mkdir(parents=True)

        converter = PytorchConverter()
        configured = converter.to(device).set_state(model, input_data)
        configured.execute(export_dir, model_params)

        converted_models = converter.get_result()
        assert len(converted_models) > 1

        # The first Path entry is the exported file given to the compiler.
        exported_paths = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(exported_paths[0])

        if external_data_format:
            # Exercise the ONNX external data format (used for large models)
            # by re-saving the exported model in place.
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler = ONNXCompiler()
        compile_kwargs = dict(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiler.to(device).execute(**compile_kwargs)
        compiled_model = compiler.get_result()

        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]
        builder = builder_cls()
        builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        optimized_model = builder.get_result()

        learner_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Round-trip the learner through save / load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the in-memory learner outputs.
        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Validity check of the optimized model against reference outputs.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic shapes: repeat the check with a smaller batch size.
        if torch.cuda.is_available():
            torch_device = torch.device("cuda")
        else:
            torch_device = torch.device("cpu")

        half_batch = [
            tensor[: len(tensor) // 2].to(torch_device)
            for tensor in inputs_example
        ]
        res = optimized_model(*half_batch)
        assert res is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        close_to_original = [
            torch.allclose(out, out_orig, rtol=2e-01)
            for out, out_orig in zip(res, res_orig)
        ]
        assert all(close_to_original)


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
            False,
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
            False,
        ),
    ],
)
@pytest.mark.skipif(
    torch.cuda.is_available(),
    reason="onnxruntime with int8 precision is very slow on GPU",
)
def test_onnxruntime_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Run the ONNX Runtime pipeline with dynamic and static quantization.

    Follows the same steps as the non-quantized test: export with
    ``PytorchConverter``, compile with ``ONNXCompiler``, build the inference
    learner, save / reload it, validate it with ``check_model_validity`` and
    compare against the original model on half a batch. The test is skipped
    when CUDA is available (see the skip reason).
    """
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, metric, output_library, device)
        (model, input_data, model_params,
         input_tfms, model_outputs, metric) = setup

        # Export the source model into a dedicated sub-folder.
        export_dir = Path(tmp_dir, "fp32")
        export_dir.mkdir(parents=True)

        converter = PytorchConverter()
        configured = converter.to(device).set_state(model, input_data)
        configured.execute(export_dir, model_params)

        converted_models = converter.get_result()
        assert len(converted_models) > 1

        # The first Path entry is the exported file given to the compiler.
        exported_paths = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(exported_paths[0])

        if external_data_format:
            # Exercise the ONNX external data format (used for large models)
            # by re-saving the exported model in place.
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler = ONNXCompiler()
        compile_kwargs = dict(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiler.to(device).execute(**compile_kwargs)
        compiled_model = compiler.get_result()

        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]
        builder = builder_cls()
        builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        optimized_model = builder.get_result()

        learner_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Round-trip the learner through save / load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the in-memory learner outputs.
        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Validity check of the optimized model against reference outputs.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic shapes: repeat the check with a smaller batch size.
        if torch.cuda.is_available():
            torch_device = torch.device("cuda")
        else:
            torch_device = torch.device("cpu")

        half_batch = [
            tensor[: len(tensor) // 2].to(torch_device)
            for tensor in inputs_example
        ]
        res = optimized_model(*half_batch)
        assert res is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        close_to_original = [
            torch.allclose(out, out_orig, rtol=2e-01)
            for out, out_orig in zip(res, res_orig)
        ]
        assert all(close_to_original)


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
            False,
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
            True,
        ),
    ],
)
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="onnxruntime with half precision on windows does not work",
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="onnxruntime with half precision is very slow on CPU",
)
def test_onnxruntime_half(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Run the ONNX Runtime pipeline with half-precision quantization.

    The model is exported, optionally re-saved in the ONNX external data
    format, compiled with ``ONNXCompiler`` and wrapped into an inference
    learner. Here the learner is reloaded through the learner class ``load``
    method. Outputs are cast to float before being compared with the
    original model. Skipped on Windows and when CUDA is unavailable.
    """
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, metric, output_library, device)
        (model, input_data, model_params,
         input_tfms, model_outputs, metric) = setup

        # Export the source model into a dedicated sub-folder.
        export_dir = Path(tmp_dir, "fp32")
        export_dir.mkdir(parents=True)

        converter = PytorchConverter()
        configured = converter.to(device).set_state(model, input_data)
        configured.execute(export_dir, model_params)

        converted_models = converter.get_result()
        assert len(converted_models) > 1

        # The first Path entry is the exported file given to the compiler.
        exported_paths = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(exported_paths[0])

        if external_data_format:
            # Exercise the ONNX external data format (used for large models)
            # by re-saving the exported model in place.
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler = ONNXCompiler()
        compile_kwargs = dict(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiler.to(device).execute(**compile_kwargs)
        compiled_model = compiler.get_result()

        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]
        builder = builder_cls()
        builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        optimized_model = builder.get_result()

        learner_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Round-trip through save and the learner class' own loader.
        optimized_model.save(tmp_dir)
        loaded_model = learner_cls.load(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the in-memory learner outputs.
        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Validity check of the optimized model against reference outputs.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic shapes: repeat the check with a smaller batch size.
        if torch.cuda.is_available():
            torch_device = torch.device("cuda")
        else:
            torch_device = torch.device("cpu")

        half_batch = [
            tensor[: len(tensor) // 2].to(torch_device)
            for tensor in inputs_example
        ]
        res = optimized_model(*half_batch)
        assert res is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        # Cast the optimized outputs to float before comparing them.
        close_to_original = [
            torch.allclose(out.float(), out_orig, rtol=1e-01)
            for out, out_orig in zip(res, res_orig)
        ]
        assert all(close_to_original)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_openvino.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory

import cpuinfo
import pytest
import torch

from nebullvm.core.models import (
    DeepLearningFramework,
    QuantizationType,
    Device,
    DeviceType,
    ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.openvino import (
    OPENVINO_INFERENCE_LEARNERS,
)
from nebullvm.operations.optimizations.compilers.openvino import (
    OpenVINOCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_openvino(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the OpenVINO pipeline end to end on CPU.

    The model is exported with ``PytorchConverter``, compiled with
    ``OpenVINOCompiler`` and wrapped into an inference learner. The learner
    is then saved, reloaded, validated with ``check_model_validity`` and,
    for dynamic-shape cases, compared with the original model on half a
    batch. Skipped when the CPU brand string does not contain "intel".
    """
    device = Device(DeviceType.CPU)
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, metric, output_library, device)
        (model, input_data, model_params,
         input_tfms, model_outputs, metric) = setup

        # Export the source model into a dedicated sub-folder.
        export_dir = Path(tmp_dir, "fp32")
        export_dir.mkdir(parents=True)

        converter = PytorchConverter()
        configured = converter.to(device).set_state(model, input_data)
        configured.execute(export_dir, model_params)

        converted_models = converter.get_result()
        assert len(converted_models) > 1

        # The first Path entry is the exported file given to the compiler.
        exported_paths = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(exported_paths[0])

        compiler = OpenVINOCompiler()
        compile_kwargs = dict(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiler.to(device).execute(**compile_kwargs)
        compiled_model = compiler.get_result()

        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.OPENVINO
        ]
        builder = builder_cls()
        builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = builder.get_result()

        learner_cls = OPENVINO_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Round-trip the learner through save / load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the in-memory learner outputs.
        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Validity check of the optimized model against reference outputs.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic shapes: repeat the check with a smaller batch size.
        half_batch = [
            tensor[: len(tensor) // 2] for tensor in inputs_example
        ]
        res = optimized_model(*half_batch)
        assert res is not None

        res_orig = tuple(model(*half_batch))
        # Cast the optimized outputs to float before comparing them.
        close_to_original = [
            torch.allclose(out.float(), out_orig, rtol=2e-01)
            for out, out_orig in zip(res, res_orig)
        ]
        assert all(close_to_original)


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tensor_rt.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest
import torch

from nebullvm.core.models import (
    Device,
    DeviceType,
    DeepLearningFramework,
    QuantizationType,
    ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.tensor_rt import (
    TENSOR_RT_INFERENCE_LEARNERS,
    PytorchTensorRTInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.tensor_rt import (
    ONNXTensorRTCompiler,
    PyTorchTensorRTCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import check_module_version

# The TensorRT tests in this module always target the GPU device; each of
# them is skipped when CUDA is not available.
device = Device(DeviceType.GPU)


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
def test_tensorrt_onnx(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the ONNX-based TensorRT pipeline end to end.

    The model is exported with ``PytorchConverter``, compiled with
    ``ONNXTensorRTCompiler`` and wrapped into an inference learner. The
    learner is then saved, reloaded, validated with
    ``check_model_validity`` and, for dynamic-shape cases, compared with the
    original model on half a batch. Skipped when CUDA is not available.
    """
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, metric, output_library, device)
        (model, input_data, model_params,
         input_tfms, model_outputs, metric) = setup

        # Export the source model into a dedicated sub-folder.
        export_dir = Path(tmp_dir, "fp32")
        export_dir.mkdir(parents=True)

        converter = PytorchConverter()
        configured = converter.to(device).set_state(model, input_data)
        configured.execute(export_dir, model_params)

        converted_models = converter.get_result()
        assert len(converted_models) > 1

        # The first Path entry is the exported file given to the compiler.
        exported_paths = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(exported_paths[0])

        compiler = ONNXTensorRTCompiler()
        compile_kwargs = dict(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiler.to(device).execute(**compile_kwargs)
        compiled_model = compiler.get_result()

        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TENSOR_RT_ONNX
        ]
        builder = builder_cls()
        builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = builder.get_result()

        learner_cls = TENSOR_RT_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Round-trip the learner through save / load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = tuple(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the in-memory learner outputs.
        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Validity check of the optimized model against reference outputs.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic shapes: repeat the check with a smaller batch size.
        if torch.cuda.is_available():
            torch_device = torch.device("cuda")
        else:
            torch_device = torch.device("cpu")

        half_batch = [
            tensor[: len(tensor) // 2].to(torch_device)
            for tensor in inputs_example
        ]
        res = optimized_model(*half_batch)
        assert res is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        # Cast the optimized outputs to float before comparing them.
        close_to_original = [
            torch.allclose(out.float(), out_orig, rtol=1e-01)
            for out, out_orig in zip(res, res_orig)
        ]
        assert all(close_to_original)


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
@pytest.mark.skipif(
    not check_module_version(torch, max_version="1.13.1+cu117"),
    reason="Skip because torch version is not supported.",
)
def test_tensorrt_torch(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the Torch-TensorRT pipeline end to end.

    The PyTorch model is compiled directly with ``PyTorchTensorRTCompiler``
    (no ONNX export step) and wrapped into a
    ``PytorchTensorRTInferenceLearner``. The learner is then saved, reloaded
    through the learner class ``load`` method, validated with
    ``check_model_validity`` and, for dynamic-shape cases, compared with the
    original model on half a batch. Skipped when CUDA is not available or
    when ``check_module_version`` rejects the installed torch version.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler_op = PyTorchTensorRTCompiler()
        compiler_op.to(device).execute(
            model=model,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )

        compiled_model = compiler_op.get_result()

        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TENSOR_RT_TORCH
        ]()

        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            # Forward the original model only if the compiler exposes it.
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )

        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(optimized_model, PytorchTensorRTInferenceLearner)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = PytorchTensorRTInferenceLearner.load(tmp_dir)
        assert isinstance(loaded_model, PytorchTensorRTInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = tuple(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the in-memory learner outputs.
        res_loaded = loaded_model(*inputs_example)
        assert all(
            [
                torch.allclose(res_tensor, res_loaded_tensor)
                for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)
            ]
        )

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if dynamic:  # Check also with a smaller batch size
            torch_device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )

            inputs_example = [
                input_[: len(input_) // 2].to(torch_device)
                for input_ in inputs_example
            ]
            res = optimized_model(*inputs_example)
            assert res is not None

            # Run the reference forward pass without autograd tracking, as
            # the other TensorRT and ONNX Runtime tests do.
            with torch.inference_mode():
                res_orig = tuple(model(*inputs_example))
            assert all(
                [
                    torch.allclose(
                        res_tensor.float(), res_orig_tensor, rtol=1e-01
                    )
                    for (res_tensor, res_orig_tensor) in zip(res, res_orig)
                ]
            )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tensorflow.py
================================================
from tempfile import TemporaryDirectory

import pytest

from nebullvm.core.models import (
    DeepLearningFramework,
    QuantizationType,
    Device,
    DeviceType,
    ModelCompiler,
)
from nebullvm.operations.inference_learners.tensorflow import (
    TensorflowBackendInferenceLearner,
    TFLiteBackendInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.tensorflow import (
    TensorflowBackendCompiler,
    TFLiteBackendCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.TENSORFLOW, False, None, None, None),
        (DeepLearningFramework.TENSORFLOW, True, None, None, None),
    ],
)
def test_tensorflow_backend(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the TensorFlow backend pipeline end to end.

    The model is compiled with ``TensorflowBackendCompiler`` and wrapped
    into a ``TensorflowBackendInferenceLearner`` through the builder
    registered for ``ModelCompiler.XLA``. The learner is saved, reloaded,
    run through ``predict``, validated with ``check_model_validity`` and,
    for dynamic-shape cases, run again on half a batch.
    """
    # Use the GPU when one is available, otherwise the CPU.
    device = Device(
        DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    )
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, metric, output_library, device)
        (model, input_data, model_params,
         input_tfms, model_outputs, metric) = setup

        compiler = TensorflowBackendCompiler()
        compile_kwargs = dict(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiler.to(device).execute(**compile_kwargs)
        compiled_model = compiler.get_result()

        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[ModelCompiler.XLA]
        builder = builder_cls()
        builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            dl_framework=output_library,
        )
        optimized_model = builder.get_result()

        assert isinstance(optimized_model, TensorflowBackendInferenceLearner)

        # Round-trip the learner through save / load.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, TensorflowBackendInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model.predict(*inputs_example)
        assert res is not None

        # Validity check of the optimized model against reference outputs.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic shapes: run the prediction again on a smaller batch size.
        half_batch = [
            tensor[: len(tensor) // 2] for tensor in inputs_example
        ]
        res = optimized_model.predict(*half_batch)
        assert res is not None


@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (
            DeepLearningFramework.TENSORFLOW,
            is_dynamic,
            quantization,
            drop_ths,
            "numeric_precision",
        )
        for is_dynamic, quantization, drop_ths in (
            (False, None, 0.1),
            (True, None, 0.1),
            (True, QuantizationType.DYNAMIC, 2),
            (True, QuantizationType.HALF, 2),
            (True, QuantizationType.STATIC, 2),
        )
    ],
)
def test_tf_lite(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the test model with the TFLite compiler and check the learner.

    The test always runs on CPU. It builds the inference learner from the
    compiled model, then checks the save/load round trip, the reported size,
    a prediction on the example inputs and the metric validity. For dynamic
    models the learner is also run on a half-size batch.
    """
    device = Device(DeviceType.CPU)
    with TemporaryDirectory() as tmp_dir:
        fixture = initialize_model(dynamic, metric, output_library, device)
        model, input_data, model_params, input_tfms = fixture[:4]
        model_outputs, metric = fixture[4:]

        # Compilation step.
        compiler_op = TFLiteBackendCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model into its inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[ModelCompiler.TFLITE]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(optimized_model, TFLiteBackendInferenceLearner)

        # Save/load round trip.
        optimized_model.save(tmp_dir)
        loaded_model = TFLiteBackendInferenceLearner.load(tmp_dir)
        assert isinstance(loaded_model, TFLiteBackendInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        assert optimized_model.predict(*inputs_example) is not None

        # Metric validity of the optimized model.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic models must also accept a smaller batch size.
        half_batch = [
            sample[: len(sample) // 2] for sample in inputs_example
        ]
        assert optimized_model.predict(*half_batch) is not None


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_torch_dynamo.py
================================================
import platform
from tempfile import TemporaryDirectory

import pytest
import torch

from nebullvm.core.models import (
    DeviceType,
    Device,
    DeepLearningFramework,
    QuantizationType,
    ModelCompiler,
)
from nebullvm.operations.inference_learners.torch_dynamo import (
    TorchDynamoInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.torch_dynamo import (
    TorchDynamoCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.tools.utils import gpu_is_available, check_module_version

# Run the tests on GPU when one is available, otherwise on CPU.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)


def run_test_torch_dynamo(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the test model with TorchDynamo and validate the learner.

    The compiled model is wrapped into its inference learner, called on its
    own example inputs and checked with ``check_model_validity``. When
    ``dynamic`` is true the learner is also run on a half-size batch and its
    outputs are compared with those of the original model.
    """
    with TemporaryDirectory() as tmp_dir:  # noqa: F841
        fixture = initialize_model(dynamic, metric, output_library, device)
        model, input_data, model_params, input_tfms = fixture[:4]
        model_outputs, metric = fixture[4:]

        # Compilation step.
        compiler_op = TorchDynamoCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
            model_params=model_params,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model into its inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TORCH_DYNAMO
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(optimized_model, TorchDynamoInferenceLearner)

        # NOTE: the save/load round trip (optimized_model.save(tmp_dir) then
        # load_model(tmp_dir), followed by a torch.allclose comparison of
        # the outputs) is disabled for this learner, so tmp_dir is unused.

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # Metric validity of the optimized model.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic models must also accept a smaller batch size.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        torch_device = torch.device(device_name)
        inputs_example = [
            sample[: len(sample) // 2].to(torch_device)
            for sample in inputs_example
        ]
        res = optimized_model(*inputs_example)
        assert res is not None

        res_orig = tuple(model(*inputs_example))
        assert all(
            torch.allclose(optimized_out.float(), original_out, rtol=2e-01)
            for optimized_out, original_out in zip(res, res_orig)
        )


@pytest.mark.parametrize(
    "output_library,dynamic,quantization_type,metric_drop_ths,metric",
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
    ],
)
@pytest.mark.skipif(
    not check_module_version(torch, min_version="2.0.0"),
    reason="Torch version is not supported",
)
@pytest.mark.skipif(
    platform.system() == "Windows",
    reason="Torch compile() is not currently supported on windows",
)
def test_torch_dynamo_fp32(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the TorchDynamo scenario without quantization.

    Skipped when the installed torch is older than 2.0.0 or on Windows.
    """
    run_test_torch_dynamo(
        output_library=output_library,
        dynamic=dynamic,
        quantization_type=quantization_type,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_torchscript.py
================================================
from tempfile import TemporaryDirectory

import pytest
import torch

from nebullvm.core.models import (
    DeviceType,
    Device,
    DeepLearningFramework,
    QuantizationType,
    ModelCompiler,
)
from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.torchscript import (
    TorchScriptCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available

# Run the tests on GPU when one is available, otherwise on CPU.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)


def run_test_torchscript(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the test model with TorchScript and validate the learner.

    Checks the learner type, the save/load round trip (including equality
    of the outputs of the saved and reloaded learners), the reported size
    and the metric validity. When ``dynamic`` is true the learner is also
    run on a half-size batch and compared with the original model.
    """
    with TemporaryDirectory() as tmp_dir:
        fixture = initialize_model(dynamic, metric, output_library, device)
        model, input_data, model_params, input_tfms = fixture[:4]
        model_outputs, metric = fixture[4:]

        # Compilation step.
        compiler_op = TorchScriptCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model into its inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TORCHSCRIPT
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(optimized_model, TorchScriptInferenceLearner)

        # Save/load round trip.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, TorchScriptInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the outputs of the saved one.
        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(saved_out, loaded_out)
            for saved_out, loaded_out in zip(res, res_loaded)
        )

        # Metric validity of the optimized model.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic models must also accept a smaller batch size.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        torch_device = torch.device(device_name)
        inputs_example = [
            sample[: len(sample) // 2].to(torch_device)
            for sample in inputs_example
        ]
        res = optimized_model(*inputs_example)
        assert res is not None

        res_orig = tuple(model(*inputs_example))
        assert all(
            torch.allclose(optimized_out.float(), original_out, rtol=2e-01)
            for optimized_out, original_out in zip(res, res_orig)
        )


@pytest.mark.parametrize(
    "output_library,dynamic,quantization_type,metric_drop_ths,metric",
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
    ],
)
def test_torchscript_no_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the TorchScript scenario without quantization."""
    run_test_torchscript(
        output_library=output_library,
        dynamic=dynamic,
        quantization_type=quantization_type,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
    )


@pytest.mark.parametrize(
    "output_library,dynamic,quantization_type,metric_drop_ths,metric",
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        )
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Half quantization is not available on CPU",
)
def test_torchscript_half_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the TorchScript scenario with half quantization.

    Skipped when CUDA is not available.
    """
    run_test_torchscript(
        output_library=output_library,
        dynamic=dynamic,
        quantization_type=quantization_type,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
    )


@pytest.mark.parametrize(
    "output_library,dynamic,quantization_type,metric_drop_ths,metric",
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            int8_quantization,
            2,
            "numeric_precision",
        )
        for int8_quantization in (
            QuantizationType.DYNAMIC,
            QuantizationType.STATIC,
        )
    ],
)
@pytest.mark.skipif(
    torch.cuda.is_available(),
    reason="INT8 quantization is not available on GPU",
)
def test_torchscript_int8_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the TorchScript scenario with dynamic and static quantization.

    Skipped when CUDA is available.
    """
    run_test_torchscript(
        output_library=output_library,
        dynamic=dynamic,
        quantization_type=quantization_type,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
    )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tvm.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest
import torch

from nebullvm.core.models import (
    Device,
    DeviceType,
    DeepLearningFramework,
    QuantizationType,
    ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.tvm import (
    PytorchApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.tvm import (
    ONNXApacheTVMCompiler,
    PyTorchApacheTVMCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import tvm_is_available
from nebullvm.operations.optimizations.optimizers.base import (
    COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
    initialize_model,
    check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available

# Run the tests on GPU when one is available, otherwise on CPU.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)


@pytest.mark.parametrize(
    "output_library,dynamic,quantization_type,metric_drop_ths,metric",
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        # The QuantizationType.STATIC case (True, 2, "numeric_precision")
        # is currently disabled.
    ],
)
@pytest.mark.skipif(
    not tvm_is_available(), reason="Apache TVM is not installed"
)
def test_tvm_onnx(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Convert the torch test model to ONNX and compile it with Apache TVM.

    Checks the learner type, the save/load round trip (including equality
    of the outputs of the saved and reloaded learners) and the reported
    size. When ``dynamic`` is true the learner is also run on a half-size
    batch and compared with the original model.
    """
    with TemporaryDirectory() as tmp_dir:
        # The benchmark outputs and the metric function returned by
        # initialize_model are not used by this test.
        fixture = initialize_model(dynamic, metric, output_library, device)
        model, input_data, model_params, input_tfms = fixture[:4]

        # Export the torch model to a dedicated sub-directory.
        onnx_dir = Path(tmp_dir) / "fp32"
        onnx_dir.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            onnx_dir, model_params
        )
        converted_models = converter_op.get_result()
        assert len(converted_models) > 1

        # The first Path entry of the conversion result is compiled.
        converted_paths = [
            converted
            for converted in converted_models
            if isinstance(converted, Path)
        ]
        model_path = str(converted_paths[0])

        # Compilation step.
        compiler_op = ONNXApacheTVMCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model into its inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.APACHE_TVM_ONNX
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)

        # Save/load round trip.
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, PytorchApacheTVMInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the outputs of the saved one.
        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(saved_out, loaded_out)
            for saved_out, loaded_out in zip(res, res_loaded)
        )

        if not dynamic:
            return

        # Dynamic models must also accept a smaller batch size.
        inputs_example = [
            sample[: len(sample) // 2] for sample in inputs_example
        ]
        res = optimized_model(*inputs_example)
        assert res is not None

        res_orig = tuple(model(*inputs_example))
        assert all(
            torch.allclose(optimized_out.float(), original_out, rtol=1e-01)
            for optimized_out, original_out in zip(res, res_orig)
        )


@pytest.mark.parametrize(
    "output_library,dynamic,quantization_type,metric_drop_ths,metric",
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        # The QuantizationType.STATIC case (True, 2, "numeric_precision")
        # is currently disabled.
    ],
)
@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_tvm_torch(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the torch test model directly with Apache TVM.

    Checks the learner type, the save/load round trip (including equality
    of the outputs of the saved and reloaded learners), the reported size
    and the metric validity. When ``dynamic`` is true the learner is also
    run on a half-size batch and compared with the original model.
    """
    with TemporaryDirectory() as tmp_dir:
        fixture = initialize_model(dynamic, metric, output_library, device)
        model, input_data, model_params, input_tfms = fixture[:4]
        model_outputs, metric = fixture[4:]

        # Compilation step.
        compiler_op = PyTorchApacheTVMCompiler()
        compiler_op.to(device).execute(
            model=model,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        # Wrap the compiled model into its inference learner.
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.APACHE_TVM_TORCH
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)

        # Save/load round trip.
        optimized_model.save(tmp_dir)
        loaded_model = PytorchApacheTVMInferenceLearner.load(tmp_dir)
        assert isinstance(loaded_model, PytorchApacheTVMInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None

        # The reloaded learner must reproduce the outputs of the saved one.
        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(saved_out, loaded_out)
            for saved_out, loaded_out in zip(res, res_loaded)
        )

        # Metric validity of the optimized model.
        assert check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )

        if not dynamic:
            return

        # Dynamic models must also accept a smaller batch size.
        inputs_example = [
            sample[: len(sample) // 2] for sample in inputs_example
        ]
        res = optimized_model(*inputs_example)
        assert res is not None

        res_orig = tuple(model(*inputs_example))
        assert all(
            torch.allclose(optimized_out.float(), original_out, rtol=1e-01)
            for optimized_out, original_out in zip(res, res_orig)
        )


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/utils.py
================================================
import os
from pathlib import Path
from typing import Any, Callable, Optional, Tuple

import tensorflow as tf
import tensorflow.keras as keras
import torch
from tensorflow.keras import Model, layers
from transformers import AlbertModel, AlbertTokenizer

from nebullvm.config import TRAIN_TEST_SPLIT_RATIO, CONSTRAINED_METRIC_DROP_THS
from nebullvm.core.models import (
    DeepLearningFramework,
    ModelParams,
    DataType,
    DeviceType,
    Device,
    QuantizationType,
)
from nebullvm.operations.conversions.huggingface import convert_hf_model
from nebullvm.operations.conversions.pytorch import convert_torch_to_onnx
from nebullvm.operations.measures.measures import (
    LatencyOriginalModelMeasure,
    MetricDropMeasure,
)
from nebullvm.operations.measures.utils import compute_relative_difference
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
from nebullvm.tools.utils import gpu_is_available, extract_info_from_data

# Per-sample input shape in channels-first order (channels, height, width);
# the TensorFlow code paths reorder it to channels-last.
INPUT_SHAPE = (3, 256, 256)
# Per-sample output shape declared in the test model parameters.
OUTPUT_SHAPE = (2,)
# Batch size used for the static-shape model parameters and input data.
STATIC_BATCH_SIZE = 1
# Batch size used for the dynamic-shape model parameters and input data.
DYNAMIC_BATCH_SIZE = 2


class TestModel(torch.nn.Module):
    """Two-input convolutional network used as the PyTorch test model.

    Both inputs go through the same conv/ReLU stack (the weights are
    shared), the resulting feature maps are summed, averaged over their two
    trailing dimensions and projected to two values by a linear layer.
    """

    def __init__(self):
        super().__init__()
        # Submodules are created in the order in which they are applied.
        self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3)
        self.relu1 = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv2d(64, 32, kernel_size=3)
        self.relu2 = torch.nn.ReLU()
        self.fcn = torch.nn.Linear(in_features=32, out_features=2)

    def forward(self, input_tensor_0, input_tensor_1):
        """Return the linear projection of the pooled, summed features."""
        hidden_0 = self.relu1(self.conv1(input_tensor_0))
        features_0 = self.relu2(self.conv2(hidden_0))
        hidden_1 = self.relu1(self.conv1(input_tensor_1))
        features_1 = self.relu2(self.conv2(hidden_1))
        # Average over the two trailing dimensions, one 32-vector per sample.
        pooled = (features_0 + features_1).mean(dim=(-2, -1))
        return self.fcn(pooled.view(-1, 32))


def tensorflow_model():
    """Build the two-input Keras model used as the TensorFlow test model.

    Each 256x256x3 input goes through its own pair of ReLU convolutions
    (64 then 32 filters); the two branches are summed and fed to a
    two-unit softmax dense layer.
    """

    def relu_conv(filters):
        return layers.Conv2D(filters, kernel_size=(3, 3), activation="relu")

    first_input = keras.Input(shape=(256, 256, 3))
    second_input = keras.Input(shape=(256, 256, 3))

    # Layers are created in the original order: both 64-filter
    # convolutions first, then both 32-filter ones.
    first_branch = relu_conv(64)(first_input)
    second_branch = relu_conv(64)(second_input)
    first_branch = relu_conv(32)(first_branch)
    second_branch = relu_conv(32)(second_branch)

    merged = first_branch + second_branch
    probabilities = layers.Dense(2, activation="softmax")(merged)
    return Model(inputs=[first_input, second_input], outputs=probabilities)


def _build_static_model(
    framework: DeepLearningFramework = DeepLearningFramework.PYTORCH,
) -> Tuple[torch.nn.Module, ModelParams]:
    """Return the test model of ``framework`` with static-shape parameters.

    The parameters describe two float32 inputs and one float32 output, all
    with batch size ``STATIC_BATCH_SIZE``.

    Raises:
        NotImplementedError: If ``framework`` is neither PyTorch nor
            TensorFlow.
    """
    input_size = (STATIC_BATCH_SIZE, *INPUT_SHAPE)
    model_params = ModelParams(
        batch_size=STATIC_BATCH_SIZE,
        input_infos=[
            {"size": input_size, "dtype": "float32"} for _ in range(2)
        ],
        output_sizes=[(STATIC_BATCH_SIZE, *OUTPUT_SHAPE)],
        output_types=[DataType.FLOAT32],
    )

    if framework == DeepLearningFramework.PYTORCH:
        return TestModel(), model_params
    if framework == DeepLearningFramework.TENSORFLOW:
        return tensorflow_model(), model_params
    raise NotImplementedError


def _build_dynamic_model(
    framework: DeepLearningFramework,
) -> Tuple[torch.nn.Module, ModelParams]:
    """Return the test model of ``framework`` with dynamic-batch parameters.

    The parameters describe two float32 inputs and one float32 output with
    batch size ``DYNAMIC_BATCH_SIZE``; axis 0 of every input and of the
    output is declared dynamic under the name ``"batch"``.

    Raises:
        NotImplementedError: If ``framework`` is neither PyTorch nor
            TensorFlow.
    """
    input_size = (DYNAMIC_BATCH_SIZE, *INPUT_SHAPE)
    batch_axis = {"name": "batch", "min_val": 1, "opt_val": 1, "max_val": 2}
    # Every input gets its own copy of the axis description.
    dynamic_info = {
        "inputs": [{0: dict(batch_axis)} for _ in range(2)],
        "outputs": [{0: "batch"}],
    }

    if framework == DeepLearningFramework.PYTORCH:
        model = TestModel()
    elif framework == DeepLearningFramework.TENSORFLOW:
        model = tensorflow_model()
    else:
        raise NotImplementedError()

    model_params = ModelParams(
        batch_size=DYNAMIC_BATCH_SIZE,
        input_infos=[
            {"size": input_size, "dtype": "float32"} for _ in range(2)
        ],
        output_sizes=[(DYNAMIC_BATCH_SIZE, *OUTPUT_SHAPE)],
        output_types=[DataType.FLOAT32],
        dynamic_info=dynamic_info,
    )
    return model, model_params


def get_torch_model(dynamic: bool = False):
    """Return ``(model, model_params)`` for the PyTorch test model.

    The dynamic-batch parameters are used when ``dynamic`` is true, the
    static ones otherwise.
    """
    build = _build_dynamic_model if dynamic else _build_static_model
    model, model_params = build(DeepLearningFramework.PYTORCH)
    return model, model_params


def get_tensorflow_model(dynamic: bool = False):
    """Return ``(model, model_params)`` for the TensorFlow test model.

    The dynamic-batch parameters are used when ``dynamic`` is true, the
    static ones otherwise.
    """
    build = _build_dynamic_model if dynamic else _build_static_model
    model, model_params = build(DeepLearningFramework.TENSORFLOW)
    return model, model_params


def get_huggingface_model(temp_dir: str, dl_framework: DeepLearningFramework):
    """Prepare an ALBERT model, benchmark it and export it to ONNX.

    Loads the ``albert-base-v1`` tokenizer and model, converts them with
    ``convert_hf_model`` using one tokenized sentence as input, benchmarks
    the converted model on the test split and writes an ONNX export to
    ``<temp_dir>/test_model.onnx``.

    Args:
        temp_dir: Directory in which ``test_model.onnx`` is written.
        dl_framework: Framework forwarded to the benchmark operation and to
            ``extract_info_from_data``.

    Returns:
        The tuple ``(model_path, model_params, output_structure,
        input_names, output_type, input_data, model_outputs)``.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1")

    # A single tokenized sentence is the only input sample.
    text = "Short text you wish to process"
    encoded_input = tokenizer(text, return_tensors="pt")
    device = (
        Device(DeviceType.GPU)
        if gpu_is_available()
        else Device(DeviceType.CPU)
    )

    # From here on ``model`` and ``input_data`` are the converted versions
    # returned by convert_hf_model.
    (
        model,
        input_data,
        input_names,
        output_structure,
        output_type,
    ) = convert_hf_model(model, [encoded_input], device=device)

    input_data = DataManager(input_data)
    input_data.split(TRAIN_TEST_SPLIT_RATIO)

    # Benchmark original model
    benchmark_orig_model_op = LatencyOriginalModelMeasure()
    benchmark_orig_model_op.to(device).execute(
        model=model,
        input_data=input_data.get_split("test"),
        dl_framework=dl_framework,
    )

    # NOTE(review): initialize_model in this module reads ``.model_outputs``
    # from the value returned by execute() instead of get_result()[0];
    # confirm that both access patterns are valid for this operation.
    model_outputs = benchmark_orig_model_op.get_result()[0]

    model_path = os.path.join(temp_dir, "test_model.onnx")

    model_params = extract_info_from_data(
        model, input_data, dl_framework, None, device
    )

    # ``device`` is rebound from a Device instance to a bare DeviceType.
    # NOTE(review): confirm convert_torch_to_onnx expects a DeviceType here.
    device = DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    convert_torch_to_onnx(
        model, input_data, model_params, Path(model_path), device
    )

    return (
        model_path,
        model_params,
        output_structure,
        input_names,
        output_type,
        input_data,
        model_outputs,
    )


def initialize_model(
    dynamic: bool,
    metric: Optional[str],
    output_library: DeepLearningFramework,
    device: Device,
):
    """Build a test model with random input data and benchmark it.

    Args:
        dynamic: Use the dynamic-batch model parameters and
            ``DYNAMIC_BATCH_SIZE`` when true, the static ones otherwise.
        metric: Name of the requested metric or ``None``. Any non-``None``
            value is replaced by ``compute_relative_difference``.
        output_library: Framework of the model to build (PyTorch or
            TensorFlow).
        device: Device on which the original model is benchmarked; torch
            input tensors are created on the matching torch device.

    Returns:
        The tuple ``(model, input_data, model_params, input_tfms,
        model_outputs, metric)``.

    Raises:
        NotImplementedError: If ``output_library`` is neither PyTorch nor
            TensorFlow.
    """
    torch_device = torch.device(
        "cuda" if device.type is DeviceType.GPU else "cpu"
    )
    batch_size = DYNAMIC_BATCH_SIZE if dynamic else STATIC_BATCH_SIZE

    if output_library == DeepLearningFramework.PYTORCH:
        model, model_params = get_torch_model(dynamic)

        input_data = DataManager(
            [
                (
                    (
                        torch.randn(batch_size, *INPUT_SHAPE).to(torch_device),
                        torch.randn(batch_size, *INPUT_SHAPE).to(torch_device),
                    ),
                    torch.zeros(batch_size, dtype=torch.long),
                )
            ]
        )
    elif output_library == DeepLearningFramework.TENSORFLOW:
        model, model_params = get_tensorflow_model(dynamic)
        # TensorFlow inputs are channels-last: (batch, height, width, ch).
        input_data = DataManager(
            [
                (
                    (
                        tf.random_normal_initializer()(
                            shape=(
                                batch_size,
                                *INPUT_SHAPE[1:],
                                INPUT_SHAPE[0],
                            )
                        ),
                        tf.random_normal_initializer()(
                            shape=(
                                batch_size,
                                *INPUT_SHAPE[1:],
                                INPUT_SHAPE[0],
                            )
                        ),
                    ),
                    [0 for _ in range(batch_size)],
                )
            ]
        )
    else:
        # Fail early with a clear error instead of hitting an unbound
        # ``input_data`` further down.
        raise NotImplementedError(
            f"Unsupported framework for the test model: {output_library}"
        )

    input_data.split(TRAIN_TEST_SPLIT_RATIO)
    input_tfms = MultiStageTransformation([])

    # Benchmark original model
    benchmark_orig_model_op = LatencyOriginalModelMeasure()
    benchmark_res = benchmark_orig_model_op.to(device).execute(
        model=model,
        input_data=input_data.get_split("test"),
        dl_framework=output_library,
    )

    model_outputs = benchmark_res.model_outputs

    # Whatever metric name was requested, the relative difference is used.
    if metric is not None:
        metric = compute_relative_difference

    return model, input_data, model_params, input_tfms, model_outputs, metric


def check_model_validity(
    optimized_model: Any,
    input_data: DataManager,
    model_outputs: Any,
    metric_drop_ths: float,
    quantization_type: QuantizationType,
    metric: Callable,
) -> bool:
    """Run a ``MetricDropMeasure`` on the "test" split of ``input_data``.

    Args:
        optimized_model: Model passed to the measure as first argument.
        input_data: Data manager; only its "test" split is used.
        model_outputs: Reference outputs forwarded to the measure.
        metric_drop_ths: Threshold forwarded to the measure; when None,
            ``CONSTRAINED_METRIC_DROP_THS`` is used instead.
        quantization_type: When None, ``compute_relative_difference`` is
            used as metric function regardless of ``metric``.
        metric: Metric function used when ``quantization_type`` is set.

    Returns:
        The first element of the measure's result. The second element is
        printed to stdout.
    """
    test_split = input_data.get_split("test")
    test_input_data, ys = test_split.get_list(with_ys=True)
    validity_check_op = MetricDropMeasure()

    # Resolve the fallbacks for the two optional settings.
    if metric_drop_ths is None:
        threshold = CONSTRAINED_METRIC_DROP_THS
    else:
        threshold = metric_drop_ths

    if quantization_type is None:
        metric_fn = compute_relative_difference
    else:
        metric_fn = metric

    validity_check_op.execute(
        optimized_model,
        test_input_data,
        model_outputs,
        threshold,
        metric_func=metric_fn,
        ys=ys,
    )

    print(validity_check_op.get_result()[1])

    return validity_check_op.get_result()[0]


================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/utils.py
================================================
from typing import Callable, List


def map_compilers_and_compressors(ignore_list: List, enum_class: Callable):
    """Convert every entry of ``ignore_list`` by calling ``enum_class`` on it.

    Args:
        ignore_list: Iterable of raw values, or None.
        enum_class: Callable applied to each entry.

    Returns:
        A new list with the converted entries; an empty list when
        ``ignore_list`` is None.
    """
    if ignore_list is None:
        return []
    return list(map(enum_class, ignore_list))


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/blade_disc.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: when torch_blade cannot be imported, bind the name
# to the DummyClass placeholder so importing this module does not raise
# ImportError.
try:
    import torch_blade
except ImportError:
    torch_blade = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/deepsparse.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: when deepsparse cannot be imported, both exported
# names are bound to the DummyClass placeholder.
try:
    from deepsparse import compile_model, cpu
except ImportError:
    compile_model = cpu = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/diffusers.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: diffusers. If any of the imports below raises
# ImportError, every exported name falls back to the DummyClass placeholder.
try:
    import diffusers  # noqa F401
    from diffusers import (
        StableDiffusionPipeline,
        DiffusionPipeline,
    )  # noqa F401
    from diffusers.models import (
        AutoencoderKL,
        UNet2DConditionModel,
    )  # noqa F401
    from diffusers.models.unet_2d import UNet2DOutput  # noqa F401
except ImportError:
    diffusers = DummyClass
    StableDiffusionPipeline = DummyClass
    DiffusionPipeline = DummyClass
    UNet2DConditionModel = DummyClass
    AutoencoderKL = DummyClass
    UNet2DOutput = DummyClass

# onnx_graphsurgeon is guarded separately so that it can be missing
# independently of diffusers.
try:
    import onnx_graphsurgeon  # noqa F401
except ImportError:
    onnx_graphsurgeon = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/dummy.py
================================================
class DummyClass:
    """Empty placeholder class.

    The optional-module wrappers bind it to names whose real package
    could not be imported.
    """


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/huggingface.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: transformers. If any import below raises
# ImportError, every exported name is bound to the DummyClass placeholder.
try:
    from transformers import PreTrainedModel, CLIPTextModel, CLIPTokenizer
    from transformers.tokenization_utils import PreTrainedTokenizer
    from transformers.models.bert.modeling_bert import (
        BertModel,
        BertEmbeddings,
        BertEncoder,
        BertPooler,
        BertPreTrainedModel,
    )
    from transformers import BertConfig, GPT2Tokenizer, GPT2LMHeadModel
except ImportError:
    # add placeholders for function definition
    PreTrainedModel = DummyClass
    CLIPTextModel = DummyClass
    CLIPTokenizer = DummyClass
    PreTrainedTokenizer = DummyClass
    BertModel = DummyClass
    BertEmbeddings = DummyClass
    BertEncoder = DummyClass
    BertPooler = DummyClass
    BertPreTrainedModel = DummyClass
    BertConfig = DummyClass
    GPT2Tokenizer = DummyClass
    GPT2LMHeadModel = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/neural_compressor.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: Intel Neural Compressor. When it cannot be imported
# the two private config converters fall back to None and the exported
# classes to the DummyClass placeholder.
try:
    import neural_compressor  # noqa F401
    from neural_compressor.adaptor.pytorch import (
        _cfg_to_qconfig as cfg_to_qconfig,
        _cfgs_to_fx_cfgs as cfgs_to_fx_cfgs,
    )
    from neural_compressor.experimental import (
        MixedPrecision,
        Quantization,
        Pruning,
    )
except (ImportError, ValueError):
    # ValueError is handled for MacOS; the fallback is the same as for a
    # missing package.
    cfg_to_qconfig = None
    cfgs_to_fx_cfgs = None
    MixedPrecision = DummyClass
    Quantization = DummyClass
    Pruning = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/onnx.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: onnx falls back to the DummyClass placeholder.
try:
    import onnx  # noqa F401
except ImportError:
    onnx = DummyClass

# Optional dependency: onnxmltools float16 converter.
# NOTE(review): only ``convert_float_to_float16_model_path`` gets a
# fallback; the name ``onnxmltools`` is not bound when the import fails.
try:
    import onnxmltools  # noqa F401
    from onnxmltools.utils.float16_converter import (  # noqa F401
        convert_float_to_float16_model_path,
    )

except ImportError:
    convert_float_to_float16_model_path = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/onnxruntime.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: onnxruntime and its quantization helpers.
try:
    import onnxruntime  # noqa F401
    from onnxruntime.quantization import (
        QuantType,
        quantize_static,
        quantize_dynamic,
        CalibrationDataReader,
    )
except ImportError:
    onnxruntime = DummyClass
    # NOTE(review): ``onnxruntime`` is the DummyClass object itself here, so
    # this sets ``SessionOptions`` on the placeholder class shared by every
    # optional-module wrapper — confirm this is intended.
    setattr(onnxruntime, "SessionOptions", None)
    QuantType = quantize_static = quantize_dynamic = None
    CalibrationDataReader = DummyClass
except FileNotFoundError:
    # Solves a colab issue
    # Only the quantization names get a fallback in this branch; the name
    # ``onnxruntime`` is not rebound here.
    QuantType = quantize_static = quantize_dynamic = None
    CalibrationDataReader = DummyClass

# The transformers optimizer is guarded separately from the base package.
try:
    # They require torch
    from onnxruntime.transformers import optimizer
    from onnxruntime.transformers.optimizer import MODEL_TYPES
except ImportError:
    MODEL_TYPES = DummyClass
    optimizer = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/onnxsim.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: when onnxsim cannot be imported, bind the name to
# the DummyClass placeholder.
try:
    import onnxsim
except ImportError:
    onnxsim = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/openvino.py
================================================
import logging

from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: OpenVINO runtime and post-training optimization
# tools. On ImportError the classes fall back to the DummyClass placeholder
# and the helper functions to None.
try:
    from openvino.runtime import Core, Model, CompiledModel, InferRequest
    from openvino.tools.pot import DataLoader
    from openvino.tools.pot import IEEngine
    from openvino.tools.pot import load_model, save_model
    from openvino.tools.pot import compress_model_weights
    from openvino.tools.pot import create_pipeline
except ImportError:
    Model = CompiledModel = InferRequest = Core = DummyClass
    DataLoader = IEEngine = DummyClass
    load_model = save_model = compress_model_weights = create_pipeline = None

# Fix openvino issue with logging
# It adds a second handler to the root logger that cause issues
# The check runs whether or not the import succeeded: whenever the root
# logger has more than one handler, the last one is removed.
if len(logging.getLogger().handlers) > 1:
    logging.getLogger().removeHandler(logging.getLogger().handlers[-1])


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/tensor_rt.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: TensorRT falls back to the DummyClass placeholder.
try:
    import tensorrt
    from tensorrt import IInt8EntropyCalibrator2
except ImportError:
    tensorrt = DummyClass
    IInt8EntropyCalibrator2 = DummyClass

# Optional dependency: polygraphy. ``polygraphy`` here is an alias of
# ``polygraphy.cuda``, not of the top-level package.
try:
    import polygraphy.cuda as polygraphy
    from polygraphy.logger import G_LOGGER

    # Set the polygraphy logger severity before importing the ONNX loader.
    G_LOGGER.module_severity = 40
    from polygraphy.backend.onnx.loader import fold_constants
except ImportError:
    # ``G_LOGGER`` gets no fallback binding in this branch.
    polygraphy = DummyClass
    fold_constants = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/tensorflow.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Best effort: lower absl logging verbosity to ERROR. Any failure
# (including absl not being installed) is deliberately ignored.
try:
    import absl.logging

    absl.logging.set_verbosity(absl.logging.ERROR)
except Exception:
    pass


class Keras:
    # Placeholder for the ``keras`` attribute of the fallback ``Tensorflow``
    # namespace; only ``Model`` is provided.
    Model = DummyClass


class data:
    # Placeholder for the ``data`` attribute of the fallback ``Tensorflow``
    # namespace; only ``Dataset`` is provided.
    Dataset = DummyClass


class dtypes:
    # Placeholder for the ``dtypes`` attribute of the fallback ``Tensorflow``
    # namespace; only ``DType`` is provided.
    DType = DummyClass


class Tensorflow:
    """Fallback namespace bound to ``tensorflow`` when the import fails.

    It exposes the attributes defined below as placeholders: types map to
    ``DummyClass`` and the sub-namespaces to the small placeholder classes
    defined above in this module.
    """

    Module = DummyClass
    Tensor = DummyClass
    keras = Keras()
    data = data
    dtypes = dtypes
    float16 = DummyClass
    float32 = DummyClass
    int32 = DummyClass
    int64 = DummyClass

    @staticmethod
    def function(**kwargs):
        """Return a decorator that hands back its argument unchanged.

        All keyword arguments are accepted and ignored.
        """
        return lambda x: x


# Optional dependency: TensorFlow. On success, memory growth is enabled on
# every GPU returned by list_physical_devices("GPU") and logging is reduced;
# on ImportError or AttributeError the name is bound to the ``Tensorflow``
# placeholder namespace defined above.
try:
    import tensorflow  # noqa F401

    physical_devices = tensorflow.config.experimental.list_physical_devices(
        "GPU"
    )
    if len(physical_devices) > 0:
        for physical_device in physical_devices:
            tensorflow.config.experimental.set_memory_growth(
                physical_device, True
            )

    tensorflow.get_logger().setLevel("ERROR")
    tensorflow.autograph.set_verbosity(0)
except (ImportError, AttributeError):
    tensorflow = Tensorflow


# Optional dependency: tf2onnx, with its logging reduced to ERROR.
# NOTE(review): the fallback is the builtin ``object`` rather than the
# ``DummyClass`` placeholder used by the other wrappers — confirm whether
# this difference is intentional.
try:
    import tf2onnx  # noqa F401

    tf2onnx.logging.set_level("ERROR")
    tf2onnx.logging.set_tf_verbosity("ERROR")
except ImportError:
    tf2onnx = object


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: PyTorch. When any import below raises ImportError, a
# minimal ``torch`` placeholder namespace is defined and the other exported
# names fall back to DummyClass or None.
try:
    import torch  # noqa F401
    from torch.nn import Module  # noqa F401
    from torch.jit import ScriptModule  # noqa F401
    from torch.fx import GraphModule
    from torch.utils.data import DataLoader, Dataset  # noqa F401
    from torch.quantization.quantize_fx import (  # noqa F401
        prepare_fx,
        convert_fx,
    )

    from torch.ao.quantization.stubs import QuantStub, DeQuantStub
    from torch.fx import symbolic_trace
    from torch.quantization import default_dynamic_qconfig
    import torch.distributed as torch_distributed
except ImportError:

    # Placeholder sub-namespaces referenced by the ``torch`` class below.
    class nn:
        Module = DummyClass

    class jit:
        ScriptModule = DummyClass

    class fx:
        GraphModule = DummyClass

    class torch:
        float = half = int8 = DummyClass
        float16 = float32 = int32 = int64 = DummyClass
        Tensor = DummyClass
        dtype = DummyClass
        nn = nn
        jit = jit
        Generator = DummyClass
        FloatTensor = DummyClass
        fx = fx

        # NOTE(review): both helpers return ``lambda x: None``, so using them
        # as decorators replaces the decorated function with None.
        @staticmethod
        def no_grad():
            return lambda x: None

        @staticmethod
        def inference_mode():
            return lambda x: None

    Dataset = DummyClass
    Module = DummyClass
    ScriptModule = DummyClass
    GraphModule = DummyClass
    DataLoader = DummyClass
    symbolic_trace = None
    QuantStub = DeQuantStub = DummyClass
    default_dynamic_qconfig = prepare_fx = convert_fx = None
    # ``Generator`` and ``FloatTensor`` are bound at module level only in
    # this fallback branch; the try branch does not import them.
    Generator = DummyClass
    FloatTensor = DummyClass
    torch_distributed = None


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch_neuron.py
================================================
import logging

from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: torch_neuron, with torch_neuronx as second choice.
# Whichever imports first also lowers the "Neuron" logger to WARNING.
# NOTE(review): when only torch_neuronx is importable, the name
# ``torch_neuron`` is never bound in this module — confirm that no caller
# imports ``torch_neuron`` from here in that configuration.
try:
    import torch_neuron  # noqa F401

    logging.getLogger("Neuron").setLevel(logging.WARNING)
except ImportError:
    try:
        import torch_neuronx  # noqa F401

        logging.getLogger("Neuron").setLevel(logging.WARNING)
    except ImportError:
        torch_neuron = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch_tensorrt.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: on ImportError, torch_tensorrt falls back to the
# DummyClass placeholder and DataLoaderCalibrator to None.
try:
    import torch_tensorrt
    from torch_tensorrt.ptq import DataLoaderCalibrator  # noqa F401
except ImportError:
    torch_tensorrt = DummyClass
    DataLoaderCalibrator = None


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch_xla.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: on ImportError, both torch_xla and its ``xm`` alias
# (torch_xla.core.xla_model) fall back to the DummyClass placeholder.
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
except ImportError:
    torch_xla = DummyClass
    xm = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/tvm.py
================================================
from nebullvm.optional_modules.dummy import DummyClass

# Optional dependency: Apache TVM. If any import below raises ImportError,
# every exported name is bound to the DummyClass placeholder.
try:
    import tvm
    from tvm import IRModule
    from tvm.runtime.ndarray import NDArray
    from tvm.autotvm.tuner import XGBTuner
    from tvm import autotvm
    import tvm.relay as relay
    from tvm.relay.transform import ToMixedPrecision
    from tvm.contrib.graph_executor import GraphModule
    from tvm.runtime import Module
    from tvm.relay.backend.executor_factory import ExecutorFactoryModule
except ImportError:
    tvm = DummyClass
    IRModule = DummyClass
    NDArray = DummyClass
    XGBTuner = DummyClass
    ExecutorFactoryModule = DummyClass
    autotvm = DummyClass
    relay = DummyClass
    ToMixedPrecision = DummyClass
    GraphModule = DummyClass
    Module = DummyClass


================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/utils.py
================================================
import cpuinfo
from loguru import logger

from nebullvm.core.models import Device, DeviceType
from nebullvm.operations.optimizations.compilers.utils import (
    bladedisc_is_available,
    deepsparse_is_available,
    faster_transformer_is_available,
    intel_neural_compressor_is_available,
    onnxruntime_is_available,
    openvino_is_available,
    tensorrt_is_available,
    torch_tensorrt_is_available,
    torch_neuron_is_available,
    torch_xla_is_available,
    tvm_is_available,
)
from nebullvm.tools.utils import gpu_is_available, check_module_version


def torch_is_available() -> bool:
    """Tell whether a supported PyTorch can be imported.

    Returns:
        False when importing torch raises ImportError or when
        ``check_module_version`` rejects it for ``min_version="1.10.0"``
        (a warning is logged in the latter case); True otherwise. A warning
        is also logged when a GPU is detected but ``torch.cuda`` reports
        it as unavailable; this does not change the result.
    """
    try:
        import torch  # noqa F401

        cuda_ready = torch.cuda.is_available()
        if not cuda_ready and gpu_is_available():
            logger.warning(
                "Installed PyTorch does not have cuda support. "
                "Please ensure that torch.cuda.is_available() "
                "returns True by installing the proper version "
                "of PyTorch. "
            )

        version_ok = check_module_version(torch, min_version="1.10.0")
        if not version_ok:
            logger.warning(
                "torch module version must be >= 1.10.0. "
                "Please update it if you want to use it."
            )
            return False
        return True
    except ImportError:
        return False


def tensorflow_is_available() -> bool:
    """Tell whether a supported TensorFlow can be imported.

    Returns:
        False when importing tensorflow raises ImportError or when
        ``check_module_version`` rejects it for ``min_version="2.7.0"``
        (a warning is logged in the latter case); True otherwise.
    """
    try:
        import tensorflow  # noqa F401

        version_ok = check_module_version(tensorflow, min_version="2.7.0")
        if not version_ok:
            logger.warning(
                "tensorflow module version must be >= 2.7.0. "
                "Please update it if you want to use it."
            )
            return False
        return True
    except ImportError:
        return False


def onnx_is_available() -> bool:
    """Tell whether a supported onnx package can be imported.

    Returns:
        False when importing onnx raises ImportError or when
        ``check_module_version`` rejects it for ``min_version="1.10.0"``
        (a warning is logged in the latter case); True otherwise.
    """
    try:
        import onnx  # noqa F401

        if check_module_version(onnx, min_version="1.10.0"):
            return True

        logger.warning(
            "onnx module version must be >= 1.10.0. "
            "Please update it if you want to use it."
        )
        return False
    except ImportError:
        return False


def _onnxmltools_is_available():
    """Tell whether a supported onnxmltools package can be imported.

    Returns:
        False when importing onnxmltools raises ImportError or when
        ``check_module_version`` rejects it for ``min_version="1.11.0"``
        (a warning is logged in the latter case); True otherwise.
    """
    try:
        import onnxmltools  # noqa F401

        version_ok = check_module_version(onnxmltools, min_version="1.11.0")
        if version_ok:
            return True

        logger.warning(
            "onnxmltools module version must be >= 1.11.0. "
            "Please update it if you want to use the ONNX API "
            "or the ONNX pipeline for PyTorch and Tensorflow."
        )
        return False
    except ImportError:
        return False


def _onnxsim_is_available():
    try:
        import onnxsim  # noqa F401

        return True
    except ImportError:
        return False


def _polygraphy_is_available():
    try:
        import polygraphy.cuda  # noqa F401

        return True
    except ImportError:
        return False


def tf2onnx_is_available():
    """Return True when ``import tf2onnx`` succeeds, False on ImportError."""
    try:
        import tf2onnx  # noqa F401
    except ImportError:
        return False
    else:
        return True


def check_dependencies(device: Device):
    """Log warnings about packages that are missing for ``device``.

    The availability helpers are queried according to the device type and
    the missing packages are grouped into frameworks, suggested compilers
    and dependencies. One warning is logged per non-empty group.

    Args:
        device: Target device; its ``type`` selects which checks run.

    Returns:
        None. The result is only reported through ``logger.warning``.
    """
    missing_frameworks = []
    missing_suggested_compilers = []
    # Collected for completeness; this function does not report this group.
    missing_optional_compilers = []
    missing_dependencies = []

    # cpuinfo may not report "brand_raw" on every platform; treat a missing
    # or empty value as "not Intel" instead of failing with KeyError.
    processor = (cpuinfo.get_cpu_info().get("brand_raw") or "").lower()
    is_intel = "intel" in processor

    if device.type is DeviceType.TPU:
        if not torch_is_available():
            missing_frameworks.append("torch")
        if not torch_xla_is_available():
            missing_dependencies.append("torch_xla")
    elif device.type is DeviceType.NEURON:
        if not torch_is_available():
            missing_frameworks.append("torch")
        if not torch_neuron_is_available():
            missing_dependencies.append("torch_neuron")
    else:
        if not onnx_is_available():
            missing_frameworks.append("onnx")

        if not tvm_is_available():
            missing_optional_compilers.append("tvm")
        if not onnxruntime_is_available():
            missing_suggested_compilers.append("onnxruntime")
        elif not _onnxmltools_is_available():
            missing_dependencies.append("onnxmltools")
        if not faster_transformer_is_available():
            missing_optional_compilers.append("faster_transformer")
        if device.type is DeviceType.GPU:
            if not tensorrt_is_available():
                missing_suggested_compilers.append("tensorrt")
            elif not _onnxsim_is_available():
                missing_dependencies.append("onnxsim")
            elif not _polygraphy_is_available():
                missing_dependencies.append("polygraphy")
        if device.type is DeviceType.CPU:
            if not openvino_is_available() and is_intel:
                missing_suggested_compilers.append("openvino")

        if torch_is_available():
            # tvm has already been recorded above when it is missing, so it
            # is not checked a second time here.
            if not bladedisc_is_available():
                missing_optional_compilers.append("torch_blade")

            if device.type is DeviceType.CPU:
                if not deepsparse_is_available() and is_intel:
                    missing_suggested_compilers.append("deepsparse")
                if not intel_neural_compressor_is_available() and is_intel:
                    missing_suggested_compilers.append("neural_compressor")
            elif device.type is DeviceType.GPU:
                # The helper must be called: testing the function object
                # itself is always truthy and would never report it missing.
                if not torch_tensorrt_is_available():
                    missing_suggested_compilers.append("torch_tensorrt")
        else:
            missing_frameworks.append("torch")

        if tensorflow_is_available():
            if not tf2onnx_is_available():
                missing_dependencies.append("tf2onnx")
        else:
            missing_frameworks.append("tensorflow")

    missing_frameworks = ", ".join(missing_frameworks)
    if missing_frameworks:
        logger.warning(
            f"Missing Frameworks: {missing_frameworks}.\n "
            f"Please install them "
            "to include them in the optimization pipeline."
        )

    missing_suggested_compilers = ", ".join(missing_suggested_compilers)
    if missing_suggested_compilers:
        logger.warning(
            f"Missing Compilers: {missing_suggested_compilers}.\n "
            f"Please install them "
            "to include them in the optimization pipeline."
        )

    missing_dependencies = ", ".join(missing_dependencies)
    if missing_dependencies:
        logger.warning(
            f"Missing Dependencies: {missing_dependencies}.\n "
            f"Without them, some compilers "
            f"may not work properly."
        )


================================================
FILE: optimization/nebullvm/nebullvm/tools/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/tools/adapters.py
================================================
import abc
import copy
from abc import abstractmethod
import time
from typing import List, Any, Union

from loguru import logger

from nebullvm.core.models import (
    Device,
    DeviceType,
    OptimizedModel,
    OriginalModel,
)
from nebullvm.operations.conversions.huggingface import convert_hf_model
from nebullvm.operations.inference_learners.base import (
    BaseInferenceLearner,
)
from nebullvm.operations.inference_learners.huggingface import (
    DiffusionInferenceLearner,
)
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.diffusers import (
    get_unet_inputs,
    preprocess_diffusers,
    postprocess_diffusers,
)
from nebullvm.tools.pytorch import get_torch_model_size
from nebullvm.tools.utils import (
    is_huggingface_data,
    check_module_version,
    get_throughput,
)


class ModelAdapter(abc.ABC):
    """Interface of the adapters defined in this module.

    An adapter exposes an adapted model and adapted data, and
    post-processes the ``OptimizedModel`` / ``OriginalModel`` objects
    handed to it.
    """

    @property
    @abstractmethod
    def adapted_model(self):
        """Return the adapted model."""

    @property
    @abstractmethod
    def adapted_data(self):
        """Return the adapted data."""

    @abstractmethod
    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Adapt ``optimized_model`` and return it.

        The concrete adapters in this module return the ``OptimizedModel``
        they were given, after replacing its ``inference_learner``.
        """

    @abstractmethod
    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Adapt ``original_model`` and return it."""


class DiffusionAdapter(ModelAdapter):
    """Adapter for diffusion pipelines.

    The adapted model and data are built lazily, the first time
    ``adapted_model`` or ``adapted_data`` is read.
    """

    def __init__(
        self,
        original_pipeline: StableDiffusionPipeline,
        data: List,
        device: Device,
    ):
        """Store a deep copy of ``original_pipeline`` plus the data/device.

        Args:
            original_pipeline: Pipeline to adapt; it is deep-copied.
            data: Prompts; each entry is passed as ``prompt`` when building
                the adapted data and directly to the pipeline when
                benchmarking.
            device: Device the pipeline copies are moved to.
        """
        self.original_pipeline = copy.deepcopy(original_pipeline)
        self.original_data = data
        self.device = device
        self.__adapted = False
        self.__df_model = None
        self.__df_data = None

    @torch.no_grad()
    def __benchmark_pipeline(
        self,
        pipe: Union[StableDiffusionPipeline, BaseInferenceLearner],
        num_warmup_steps=2,
        num_steps=3,
    ):
        """Return the average seconds per call of ``pipe``.

        ``num_warmup_steps`` untimed calls are followed by ``num_steps``
        timed ones; prompts are taken cyclically from ``original_data``.
        """
        prompts = self.original_data

        # Warmup
        for i in range(num_warmup_steps):
            _ = pipe(prompts[i % len(prompts)]).images[0]

        # perf_counter is monotonic and high resolution, unlike time.time.
        start = time.perf_counter()
        # Benchmark
        for i in range(num_steps):
            _ = pipe(prompts[i % len(prompts)]).images[0]

        took = time.perf_counter() - start

        return took / num_steps

    @staticmethod
    def __non_unet_size(pipe):
        """Sum ``get_torch_model_size`` over every ``torch.nn.Module``
        attribute of ``pipe`` except ``"unet"``, divided by 1e6.

        The callers add the result to a ``size_mb`` field.
        """
        return (
            sum(
                get_torch_model_size(module)
                for (name, module) in pipe.__dict__.items()
                if isinstance(module, torch.nn.Module) and name != "unet"
            )
            / 1e6
        )

    def __adapt(self):
        """Build the adapted model and data from the stored pipeline.

        Raises:
            ValueError: If ``check_module_version`` rejects the installed
                torch for ``max_version="1.13.1+cu117"``.
        """
        if not check_module_version(torch, max_version="1.13.1+cu117"):
            raise ValueError(
                "Diffusion models are only supported in PyTorch "
                "versions <= 1.13.1. Please downgrade your PyTorch "
                "version and try again."
            )

        model = copy.deepcopy(self.original_pipeline)
        # Attached as a plain instance attribute (not a bound method), hence
        # the pipeline is passed explicitly as first argument below.
        model.get_unet_inputs = get_unet_inputs
        model.to(self.device.to_torch_format())
        # One (inputs, None) pair per prompt: None inputs are dropped and
        # 0-dimensional tensors are reshaped to shape (1,).
        self.__df_data = [
            (
                tuple(
                    d.reshape((1,)) if d.shape == torch.Size([]) else d
                    for d in model.get_unet_inputs(
                        model,
                        prompt=prompt,
                    )
                    if d is not None
                ),
                None,
            )
            for prompt in self.original_data
        ]
        self.__df_model = preprocess_diffusers(model)
        self.__adapted = True

    @property
    def adapted_model(self):
        """Model returned by ``preprocess_diffusers``, built on first use."""
        if self.__adapted is False:
            self.__adapt()
        return self.__df_model

    @property
    def adapted_data(self):
        """List of ``(inputs, None)`` pairs, built on first use."""
        if self.__adapted is False:
            self.__adapt()
        return self.__df_data

    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Wrap the optimized learner in a pipeline and benchmark it.

        Updates ``latency_seconds``, ``throughput``, ``inference_learner``
        and ``size_mb`` of ``optimized_model`` in place and returns it.
        """
        pipe = copy.deepcopy(self.original_pipeline)
        pipe.to(self.device.to_torch_format())
        if self.device.type is DeviceType.GPU:
            try:
                pipe.enable_xformers_memory_efficient_attention()
            except Exception:
                # Best effort: continue without xformers attention.
                pass

        pipe = postprocess_diffusers(
            optimized_model.inference_learner,
            pipe,
            self.device,
        )
        logger.info("Benchmarking optimized pipeline...")
        optimized_model.latency_seconds = self.__benchmark_pipeline(pipe)
        optimized_model.throughput = get_throughput(
            optimized_model.latency_seconds
        )
        optimized_model.inference_learner = DiffusionInferenceLearner(pipe)
        optimized_model.size_mb += self.__non_unet_size(pipe)
        return optimized_model

    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Benchmark a copy of the original pipeline.

        Updates ``latency_seconds``, ``throughput`` and ``size_mb`` of
        ``original_model`` in place and returns it.
        """
        pipe = copy.deepcopy(self.original_pipeline)
        pipe.to(self.device.to_torch_format())
        logger.info("Benchmarking original pipeline...")
        original_model.latency_seconds = self.__benchmark_pipeline(pipe)
        original_model.throughput = get_throughput(
            original_model.latency_seconds
        )
        original_model.size_mb += self.__non_unet_size(pipe)
        return original_model


class HuggingFaceAdapter(ModelAdapter):
    """Adapter for HuggingFace models.

    The conversion through ``convert_hf_model`` is performed lazily, the
    first time ``adapted_model`` or ``adapted_data`` is read.
    """

    def __init__(self, model: Any, data: List, device: Device, **kwargs):
        """Store the inputs; extra keyword arguments are kept as
        ``tokenizer_params`` and forwarded to ``convert_hf_model``."""
        self.original_model = model
        self.original_data = data
        self.device = device
        self.tokenizer_params = kwargs
        self.__adapted = False
        self.__hf_model = None
        self.__hf_data = None
        self.__hf_input_names = None
        self.__hf_output_type = None
        self.__hf_output_structure = None

    def __adapt_model(self):
        """Convert the stored model and data and cache every result.

        Raises:
            ValueError: If the first data entry is not HuggingFace data
                according to ``is_huggingface_data``.
        """
        first_sample = self.original_data[0]
        if not is_huggingface_data(first_sample):
            raise ValueError("Cannot convert non-HuggingFace data")

        converted = convert_hf_model(
            self.original_model,
            self.original_data,
            self.device,
            **self.tokenizer_params,
        )
        model, data, input_names, output_structure, output_type = converted

        self.__hf_model = model
        self.__hf_data = data
        self.__hf_input_names = input_names
        self.__hf_output_type = output_type
        self.__hf_output_structure = output_structure
        self.__adapted = True

    @property
    def adapted_model(self):
        """Converted model, computed on first access."""
        if not self.__adapted:
            self.__adapt_model()
        return self.__hf_model

    @property
    def adapted_data(self):
        """Converted data, computed on first access."""
        if not self.__adapted:
            self.__adapt_model()
        return self.__hf_data

    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Wrap the optimized learner in a ``HuggingFaceInferenceLearner``.

        The wrapper receives the input names, output type and output
        structure cached by the conversion. ``optimized_model`` is updated
        in place and returned.
        """
        from nebullvm.operations.inference_learners.huggingface import (
            HuggingFaceInferenceLearner,
        )

        wrapped_learner = HuggingFaceInferenceLearner(
            core_inference_learner=optimized_model.inference_learner,
            output_structure=self.__hf_output_structure,
            input_names=self.__hf_input_names,
            output_type=self.__hf_output_type,
        )
        optimized_model.inference_learner = wrapped_learner

        return optimized_model

    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Return ``original_model`` unchanged."""
        return original_model


================================================
FILE: optimization/nebullvm/nebullvm/tools/benchmark.py
================================================
import time
from abc import abstractmethod, ABC
from typing import Any, Dict, Type

import numpy as np
from loguru import logger
from tqdm import tqdm

from nebullvm.core.models import DeepLearningFramework, ModelParams, DeviceType
from nebullvm.operations.inference_learners.base import BaseInferenceLearner
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, DataLoader
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import create_model_inputs_onnx
from nebullvm.tools.pytorch import create_model_inputs_torch
from nebullvm.tools.tf import create_model_inputs_tf
from nebullvm.tools.utils import (
    check_input_data,
    extract_info_from_data,
    is_data_subscriptable,
    check_device,
)


def _get_dl_framework(model: Any):
    """Infer the deep learning framework of ``model``.

    The decision uses the model's type and the prefix of ``str(model)``:
    torch modules or a "Pytorch"/"Torch" prefix map to PYTORCH, tf modules
    or a "Tensorflow" prefix to TENSORFLOW, and ``str`` instances or a
    "Numpy" prefix to NUMPY. The checks run in that order.

    Raises:
        TypeError: If none of the rules matches.
    """
    if isinstance(model, torch.nn.Module) or str(model).startswith(
        ("Pytorch", "Torch")
    ):
        return DeepLearningFramework.PYTORCH

    is_tf_module = isinstance(model, tf.Module) and model is not None
    if is_tf_module or str(model).startswith("Tensorflow"):
        return DeepLearningFramework.TENSORFLOW

    if isinstance(model, str) or str(model).startswith("Numpy"):
        return DeepLearningFramework.NUMPY

    raise TypeError(f"Model type {type(model)} not supported.")


def _create_model_inputs(
    dl_framework: DeepLearningFramework, model_params: ModelParams
):
    """Create model inputs from ``model_params.input_infos``.

    The framework selects the helper: ``create_model_inputs_torch`` for
    PYTORCH, ``create_model_inputs_tf`` for TENSORFLOW and
    ``create_model_inputs_onnx`` for NUMPY.

    Raises:
        TypeError: If ``dl_framework`` is none of the three above.
    """
    if dl_framework == DeepLearningFramework.PYTORCH:
        return create_model_inputs_torch(model_params.input_infos)
    if dl_framework == DeepLearningFramework.TENSORFLOW:
        return create_model_inputs_tf(model_params.input_infos)
    if dl_framework == DeepLearningFramework.NUMPY:
        return create_model_inputs_onnx(model_params.input_infos)

    raise TypeError(f"Unknown framework {dl_framework}")


class BaseBenchmark(ABC):
    """Abstract base of the benchmarks: stores the model, its inputs, the
    device and the iteration counts; subclasses implement ``benchmark``."""

    def __init__(self, model, input_tensors, device, n_warmup=50, n_runs=1000):
        # What is benchmarked, with which inputs and on which device.
        self.model, self.input_tensors, self.device = (
            model,
            input_tensors,
            device,
        )
        # Number of warm-up and benchmark iterations.
        self.n_warmup, self.n_runs = n_warmup, n_runs

    @abstractmethod
    def benchmark(self):
        """Run the benchmark; must be overridden by subclasses."""
        raise NotImplementedError()


class PytorchBenchmark(BaseBenchmark):
    """Benchmark of a callable model fed with torch tensors."""

    def benchmark(self):
        """Time ``n_runs`` calls of the model after ``n_warmup`` warm-ups.

        Inputs are moved to the torch device first and picked cyclically.
        The batch size is read from dimension 0 of the first tensor of the
        first input. Results are also printed to stdout.

        Returns:
            Tuple ``(throughput, latency)`` where throughput is
            ``batch_size / mean_time`` and latency is
            ``mean_time / batch_size``.
        """
        torch_device = self.device.to_torch_format()
        on_gpu = self.device.type is DeviceType.GPU

        input_tensors = [
            [tensor.to(torch_device) for tensor in tensors]
            for tensors in self.input_tensors
        ]
        batch_size = input_tensors[0][0].shape[0]

        if isinstance(self.model, torch.nn.Module):
            self.model.to(torch_device).eval()

        warmup_cycle = min(self.n_warmup, len(input_tensors))
        with torch.no_grad():
            for i in tqdm(
                range(self.n_warmup),
                desc=f"Performing warm up on {self.n_warmup} iterations",
            ):
                self.model(*input_tensors[i % warmup_cycle])
        if on_gpu:
            # Make sure the warm-up work is finished before timing starts.
            torch.cuda.synchronize()

        run_cycle = min(self.n_runs, len(input_tensors))
        timings = []
        with torch.no_grad():
            for i in tqdm(
                range(1, self.n_runs + 1),
                desc=f"Performing benchmark on {self.n_runs} iterations",
            ):
                # perf_counter is monotonic and high resolution, which
                # matters for short per-iteration measurements.
                start_time = time.perf_counter()
                self.model(*input_tensors[i % run_cycle])
                if on_gpu:
                    # Wait for the GPU so the measured time covers the
                    # whole forward pass.
                    torch.cuda.synchronize()
                end_time = time.perf_counter()
                timings.append(end_time - start_time)

        print(f"Batch size: {batch_size}")

        mean_time = np.mean(timings)
        throughput = batch_size / mean_time
        latency = mean_time / batch_size

        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)

        return throughput, latency


class TensorflowBenchmark(BaseBenchmark):
    """Benchmark runner for models fed with TensorFlow tensors."""

    def benchmark(self):
        """Time the model and report average throughput and latency.

        `n_warmup` untimed calls are issued, then `n_runs` calls are timed
        one by one; every call happens inside a `tf.device` scope for the
        configured device.

        Returns:
            Tuple[float, float]: `(throughput, latency)`, i.e. samples per
            second and seconds per sample. The batch size is read from the
            first dimension of the first tensor of the first input.
        """
        batch_size = self.input_tensors[0][0].shape[0]
        n_inputs = len(self.input_tensors)
        tf_device = self.device.to_tf_format()

        for i in tqdm(
            range(self.n_warmup),
            desc=f"Performing warm up on {self.n_warmup} iterations",
        ):
            with tf.device(tf_device):
                self.model(
                    *self.input_tensors[i % min(self.n_warmup, n_inputs)]
                )

        timings = []
        for i in tqdm(
            range(1, self.n_runs + 1),
            desc=f"Performing benchmark on {self.n_runs} iterations",
        ):
            # perf_counter is monotonic and has the highest available
            # resolution, unlike the wall clock returned by time.time().
            start_time = time.perf_counter()
            with tf.device(tf_device):
                self.model(
                    *self.input_tensors[i % min(self.n_runs, n_inputs)]
                )
            timings.append(time.perf_counter() - start_time)

        print(f"Batch size: {batch_size}")

        mean_time = np.mean(timings)
        throughput = batch_size / mean_time
        latency = mean_time / batch_size

        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)

        return throughput, latency


class NumpyBenchmark(BaseBenchmark):
    """Benchmark runner for inference learners fed with numpy arrays."""

    def benchmark(self):
        """Time the model and report average throughput and latency.

        `n_warmup` untimed calls are issued, then `n_runs` calls are timed
        one by one.

        Returns:
            Tuple[float, float]: `(throughput, latency)`, i.e. samples per
            second and seconds per sample. The batch size is read from the
            first dimension of the first array of the first input.

        Raises:
            NotImplementedError: If the model is not a
                `BaseInferenceLearner`.
        """
        if not isinstance(self.model, BaseInferenceLearner):
            # TODO: Add support for original onnx models
            raise NotImplementedError(
                "Benchmark function doesn't support original " "onnx models."
            )
        batch_size = self.input_tensors[0][0].shape[0]
        n_inputs = len(self.input_tensors)

        for i in tqdm(
            range(self.n_warmup),
            desc=f"Performing warm up on {self.n_warmup} iterations",
        ):
            self.model(*self.input_tensors[i % min(self.n_warmup, n_inputs)])

        timings = []
        for i in tqdm(
            range(1, self.n_runs + 1),
            desc=f"Performing benchmark on {self.n_runs} iterations",
        ):
            # perf_counter is monotonic and has the highest available
            # resolution, unlike the wall clock returned by time.time().
            start_time = time.perf_counter()
            self.model(*self.input_tensors[i % min(self.n_runs, n_inputs)])
            timings.append(time.perf_counter() - start_time)

        print(f"Batch size: {batch_size}")

        mean_time = np.mean(timings)
        throughput = batch_size / mean_time
        latency = mean_time / batch_size

        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)

        return throughput, latency


def benchmark(
    model, input_data, device=None, random=False, n_warmup=50, n_runs=1000
):
    """Benchmark a model, whatever framework it was implemented with.

    Args:
        model (Any): The input model.
        input_data (Iterable or Sequence): Input data used for running the
            benchmark. It can be a PyTorch DataLoader, a TensorFlow
            Dataset, a `DataManager` or a collection of samples. Each
            sample must be a tuple whose first element is the tuple of
            `inputs` and whose second element is the `label`. The `inputs`
            must be a tuple even when the model takes a single input.
        device (str): Device to be used for running the benchmark. It is
            resolved through `check_device`, unless `model` is a
            `BaseInferenceLearner`, in which case the device of the model
            is used and this argument is ignored. Default: None.
        random (bool, optional): If set to true, the data used to benchmark
            the model will be computed randomly given the info extracted
            from the provided input_data.
        n_warmup (int, optional): Number of warmup iterations.
        n_runs (int, optional): Number of iterations performed to benchmark
            the model.

    Returns:
        The value returned by the framework-specific benchmark, i.e. the
        `(throughput, latency)` tuple.

    Raises:
        ValueError: If `input_data` cannot be converted to a `DataManager`.
    """
    if isinstance(model, BaseInferenceLearner):
        # An optimized model is already bound to a device.
        device = model.device
    else:
        device = check_device(device)

    logger.info(f"Running benchmark on {device.type.name}")

    dl_framework = _get_dl_framework(model)

    if isinstance(input_data, (DataLoader, tf.data.Dataset)):
        try:
            input_data = DataManager.from_dataloader(input_data)
        except Exception as err:
            # Keep the original failure attached as the explicit cause.
            raise ValueError(
                "The provided dataloader does not match the expected "
                "format.\n"
                "Speedster supports dataloaders that return tuples in "
                "the\n"
                "following formats: \n"
                "Single input: (input,  label)\n"
                "Multiple inputs: ((input1, input2, ...),  label) or "
                "(input1, input2, ...,  label)\n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            ) from err

    if not isinstance(input_data, DataManager):
        if not check_input_data(input_data):
            raise ValueError(
                "The provided data does not match the expected "
                "format.\n"
                "Speedster supports data in the following formats: \n"
                "- PyTorch DataLoader\n"
                "- TensorFlow Dataset\n"
                "- List of tuples: [((input_0, ... ), label), ...] \n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            )
        if is_data_subscriptable(input_data):
            input_data = DataManager(input_data)
        else:
            input_data = DataManager.from_iterable(input_data)

    if random:
        model_params = extract_info_from_data(
            model, input_data, dl_framework, None, device
        )
        input_data = _create_model_inputs(dl_framework, model_params)
    else:
        input_data = input_data.get_list()

    # Propagate the measurements instead of silently dropping them.
    return BENCHMARK_FUNCTIONS[dl_framework](
        model=model,
        input_tensors=input_data,
        device=device,
        n_warmup=n_warmup,
        n_runs=n_runs,
    ).benchmark()


# Dispatch table used by `benchmark` to pick the benchmark class that
# matches the deep learning framework detected for the model.
BENCHMARK_FUNCTIONS: Dict[DeepLearningFramework, Type[BaseBenchmark]] = {
    DeepLearningFramework.PYTORCH: PytorchBenchmark,
    DeepLearningFramework.TENSORFLOW: TensorflowBenchmark,
    DeepLearningFramework.NUMPY: NumpyBenchmark,
}


================================================
FILE: optimization/nebullvm/nebullvm/tools/data.py
================================================
from typing import Sequence, List, Tuple, Any, Union, Iterable

import numpy as np
from loguru import logger

from nebullvm.config import MIN_DIM_INPUT_DATA
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, Dataset, DataLoader
from nebullvm.tools.onnx import convert_to_numpy


class DataManager:
    """Class for managing the user data in nebullvm.

    Each sample returned by the reader is expected to be a tuple whose
    first element holds the model inputs and whose (optional) second
    element holds the label.

    Attributes:
        data_reader(Sequence): Object implementing the __getitem__, the
            __len__ and the __iter__/__next__ APIs. It should read the
            user data and return tuples of tensors for feeding the models.
        train_idxs: Indexes of the samples assigned to the train split by
            `split`.
        test_idxs: Indexes of the samples assigned to the test split by
            `split`.
    """

    def __init__(self, data_reader: Sequence):
        self._data_reader = data_reader
        self._pointer = 0
        self.train_idxs = []
        self.test_idxs = []

    def __getitem__(self, item):
        return self._data_reader[item]

    def __len__(self):
        return len(self._data_reader)

    def __iter__(self):
        # The manager is its own iterator: restarting the iteration simply
        # rewinds the shared pointer.
        self._pointer = 0
        return self

    def __next__(self):
        if self._pointer < len(self):
            data = self[self._pointer]
            self._pointer += 1
            return data
        else:
            raise StopIteration

    def get_numpy_list(
        self, n: int = None, shuffle: bool = False, with_ys: bool = False
    ) -> Union[
        List[Tuple[np.ndarray, ...]], Tuple[List[Tuple[np.ndarray, ...]], List]
    ]:
        """Same as `get_list`, with every input converted to numpy.

        Args:
            n (int, optional): Number of samples to return. Defaults to the
                number of samples stored in the manager.
            shuffle (bool, optional): Whether samples are drawn randomly.
            with_ys (bool, optional): Whether labels are returned as well.

        Returns:
            The list of input tuples, or the tuple `(inputs, labels)` when
            `with_ys` is true. Labels are returned unconverted.
        """
        if n is None:
            n = len(self)
        if not with_ys:
            return [
                tuple(convert_to_numpy(x) for x in tuple_)
                for tuple_ in self.get_list(n, shuffle)
            ]
        else:
            xs, ys = self.get_list(n, shuffle, with_ys=True)
            return [
                tuple(convert_to_numpy(x) for x in tuple_) for tuple_ in xs
            ], ys

    def get_list(
        self, n: int = None, shuffle: bool = False, with_ys: bool = False
    ) -> Union[List[Tuple[Any, ...]], Tuple[List[Tuple[Any, ...]], List]]:
        """Return `n` samples as a list.

        Args:
            n (int, optional): Number of samples to return. Defaults to the
                number of samples stored in the manager. When it exceeds
                that number, samples are repeated.
            shuffle (bool, optional): Whether samples are drawn randomly.
                When false, the stored samples are returned in order and
                the extra ones (if any) are drawn with a fixed seed.
            with_ys (bool, optional): Whether labels are returned as well.

        Returns:
            The list of inputs, or the tuple `(inputs, labels)` when
            `with_ys` is true. Samples without a label get `None`.
        """
        size = len(self)
        if n is None:
            n = size
        if shuffle:
            idx = np.random.choice(size, n, replace=n > size)
        else:
            idx = np.arange(0, min(n, size))
            if n > size:
                # A private generator seeded with 0 yields the same
                # deterministic padding as reseeding the global generator
                # would, without clobbering the caller's random state.
                rng = np.random.RandomState(0)
                idx = np.concatenate(
                    [idx, rng.choice(size, n - size, replace=True)]
                )
        if not with_ys:
            return [self[i][0] for i in idx]

        ys, xs = [], []
        for i in idx:
            sample = self[i]
            x, y = sample if len(sample) > 1 else (sample[0], None)
            xs.append(x)
            ys.append(y)
        return xs, ys

    @classmethod
    def from_iterable(cls, iterable: Iterable, max_length: int = 500):
        """Build a manager from the first `max_length` items of `iterable`.

        The iterable is not consumed beyond the items that are kept, so
        very long (or endless) iterables can be passed safely.
        """
        from itertools import islice

        return cls(list(islice(iterable, max(max_length, 0))))

    @classmethod
    def from_dataloader(
        cls,
        dataloader: Union[DataLoader, tf.data.Dataset],
        max_length: int = 500,
    ):
        """Build a manager from a PyTorch DataLoader or a TensorFlow Dataset.

        Batches are read until at least `max_length` samples have been
        collected and each of them is normalized to `(inputs, label)`.

        Args:
            dataloader: The source of the batches.
            max_length (int, optional): Maximum number of samples to read.

        Raises:
            ValueError: If the batch size exceeds `max_length` or a batch
                does not have a supported structure.
        """
        batch_size = (
            dataloader.batch_size
            if isinstance(dataloader, DataLoader)
            else dataloader._batch_size
        )

        if batch_size > max_length:
            # Use the value computed above: objects that are not a
            # DataLoader are not guaranteed to expose `batch_size`.
            raise ValueError(
                f"Batch size ({batch_size}) is greater than "
                f"max_length ({max_length})."
            )
        data_manager = []
        warning_label = False
        for i, batch in enumerate(dataloader):
            if i * batch_size >= max_length:
                break

            if isinstance(batch, (list, tuple)):
                if len(batch) == 1:
                    data_manager.append((batch, None))
                elif len(batch) == 2:
                    if isinstance(batch[0], tuple):
                        data_manager.append((batch[0], batch[1]))
                    elif isinstance(batch[0], (torch.Tensor, tf.Tensor)):
                        warning_label = True
                        data_manager.append(((batch[0],), batch[1]))
                    else:
                        raise ValueError(
                            "The first element of the batch should be a "
                            "tuple or a torch.Tensor"
                        )
                else:
                    # Flat batch: everything but the last item is an input.
                    warning_label = True
                    data_manager.append(
                        (tuple(t for t in batch[:-1]), batch[-1])
                    )
            elif isinstance(batch, (torch.Tensor, tf.Tensor)):
                data_manager.append(((batch,), None))
            else:
                raise ValueError(
                    "The batch should be a tuple, a list or a Tensor"
                )

        if warning_label:
            logger.warning(
                "The provided dataloader returns a tuple of tensors "
                "for each batch. The last tensor in the tuple will "
                "be considered as the label. "
                "To avoid this warning, the dataloader should return "
                "a tuple for each batch, where the first element is "
                "a tuple containing the inputs and the second element "
                "is a tensor containing the label."
            )

        return cls(data_manager)

    def get_split(self, split_type="train"):
        """Return a new manager holding the train split or the test split.

        Any `split_type` other than "train" selects the test split.
        """
        return (
            DataManager([self[i] for i in self.train_idxs])
            if split_type == "train"
            else DataManager([self[i] for i in self.test_idxs])
        )

    def split(self, split_pct: float, shuffle: bool = False):
        """Compute the train/test indexes.

        Args:
            split_pct (float): Fraction of the samples assigned to the
                train split.
            shuffle (bool, optional): Whether the samples are shuffled
                before being split.
        """
        if shuffle:
            idx = np.random.choice(len(self), len(self), replace=False)
        else:
            idx = np.arange(len(self))

        n = int(round(len(idx) * split_pct))

        if len(self) < MIN_DIM_INPUT_DATA:
            logger.warning(
                f"Not enough data for splitting the DataManager. "
                f"You should provide at least {MIN_DIM_INPUT_DATA} "
                f"data samples to allow a good split between train "
                f"and test sets. Compression, calibration and precision "
                f"checks will use the same data."
            )
            # Too few samples: both splits share the whole dataset.
            self.train_idxs = idx
            self.test_idxs = idx
        else:
            self.train_idxs = idx[:n]
            self.test_idxs = idx[n:]


class PytorchDataset(Dataset):
    """Item-level view over batched `(inputs, label)` samples.

    The batch size is read from the first dimension of the first input of
    the first batch and is used to map a flat item index to a batch and to
    a position inside that batch.
    """

    def __init__(self, input_data: DataManager, has_labels: bool = False):
        self.data = input_data
        self.has_labels = has_labels
        first_batch_inputs = input_data[0][0]
        self.batch_size = first_batch_inputs[0].shape[0]

    def __len__(self):
        # Sum the real size of every batch (they may not all be equal).
        total = 0
        for batch_inputs, _ in self.data:
            total += batch_inputs[0].shape[0]
        return total

    def __getitem__(self, idx):
        batch_idx = int(idx / self.batch_size)
        item_idx = idx % self.batch_size
        batch = self.data[batch_idx]
        sample = tuple(tensor[item_idx] for tensor in batch[0])

        if not self.has_labels:
            return sample

        labels = batch[1]
        if labels is None:
            # Unlabelled batch: hand back a placeholder label.
            return sample, torch.tensor([0])
        return sample, labels[item_idx]


================================================
FILE: optimization/nebullvm/nebullvm/tools/diffusers.py
================================================
# Based on https://github.com/NVIDIA/TensorRT/blob/main/demo/Diffusion/models.py
#
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Dict, Union, List, Optional, Any, Tuple

from nebullvm.core.models import Device
from nebullvm.optional_modules.diffusers import (
    DiffusionPipeline,
    UNet2DConditionModel,
    UNet2DOutput,
    AutoencoderKL,
    onnx_graphsurgeon as gs,
)
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.optional_modules.huggingface import CLIPTextModel, CLIPTokenizer
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.tensor_rt import fold_constants
from nebullvm.optional_modules.torch import torch


@torch.no_grad()
def get_unet_inputs(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 1,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
    """Build the inputs the pipeline would feed to its UNet at the first step.

    `self` is the pipeline object: its `check_inputs`, `_encode_prompt`,
    `prepare_latents`, `scheduler` and `unet` members are used to prepare
    the data, but the UNet itself is never called.

    Returns:
        The tuple `(latent_model_input, timestep, prompt_embeds,
        cross_attention_kwargs)` for the first scheduler timestep, or None
        when the scheduler yields no timesteps.
    """
    # Fall back to the UNet native resolution when no size is given.
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # Validate the arguments; the pipeline raises if they are inconsistent.
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # The batch size comes from the prompt(s) or, failing that, from the
    # pre-computed embeddings.
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    do_classifier_free_guidance = guidance_scale > 1.0

    # Text conditioning.
    prompt_embeds = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # Timesteps.
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # Initial latents.
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        self.unet.in_channels,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    for timestep in timesteps:
        # Classifier free guidance runs the UNet on a doubled batch.
        if do_classifier_free_guidance:
            unet_sample = torch.cat([latents] * 2)
        else:
            unet_sample = latents
        unet_sample = self.scheduler.scale_model_input(unet_sample, timestep)

        # Only the inputs of the first step are needed.
        return unet_sample, timestep, prompt_embeds, cross_attention_kwargs

    return None


class DiffusionUNetWrapper(torch.nn.Module):
    """Adapter exposing the wrapped UNet through positional inputs only."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *x, **kwargs):
        """Call the wrapped model and return its output values as a tuple.

        The first three positional arguments are forwarded as sample,
        timestep and `encoder_hidden_states`; any further positional or
        keyword argument is ignored.
        """
        sample = x[0]
        timestep = x[1]
        hidden_states = x[2]
        outputs = self.model(
            sample, timestep, encoder_hidden_states=hidden_states
        )
        return tuple(outputs.values())


class OptimizedDiffusionWrapper(torch.nn.Module):
    """Adapter giving an optimized model the calling convention of a UNet."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *x, **kwargs):
        """Run the optimized model and wrap its first output.

        The sample and the timestep are taken from the first two
        positional arguments, the text conditioning from the
        `encoder_hidden_states` keyword argument.
        """
        sample = x[0]
        timestep = x[1]
        if timestep.shape == torch.Size([]):
            # A scalar timestep is turned into a one-element tensor.
            timestep = timestep.reshape((1,))
        outputs = self.model(
            sample, timestep, kwargs["encoder_hidden_states"]
        )
        return UNet2DOutput(outputs[0])


def is_diffusion_model_pipe(model):
    """Return True when `model` is an instance of `DiffusionPipeline`."""
    return isinstance(model, DiffusionPipeline)


def get_default_dynamic_info(input_shape: List[Tuple[int, ...]]):
    """Build the default dynamic-axis description for the three UNet inputs.

    Every listed axis gets identical min/opt/max values, taken from the
    given shapes: axes 0, 2 and 3 of the first input and axis 0 of the
    third one. The second input has no dynamic axis.

    Args:
        input_shape: The shapes of the inputs, in order.

    Returns:
        A dictionary with the `inputs` and `outputs` dynamic axes.
    """

    def _pinned_axis(name, size):
        # An axis whose allowed range collapses to a single value.
        return {
            "name": name,
            "min_val": size,
            "opt_val": size,
            "max_val": size,
        }

    sample_axes = {
        0: _pinned_axis("2B", input_shape[0][0]),
        2: _pinned_axis("H", input_shape[0][2]),
        3: _pinned_axis("W", input_shape[0][3]),
    }
    timestep_axes = {}
    hidden_states_axes = {0: _pinned_axis("2B", input_shape[2][0])}

    return {
        "inputs": [sample_axes, timestep_axes, hidden_states_axes],
        "outputs": [{0: "2B", 2: "H", 3: "W"}],
    }


def preprocess_diffusers(pipe: DiffusionPipeline) -> torch.nn.Module:
    """Wrap the UNet of the pipeline so that nebullvm can optimize it.

    Args:
        pipe: The diffusion pipeline whose `unet` has to be optimized.

    Returns:
        A `DiffusionUNetWrapper` built around `pipe.unet`.
    """
    return DiffusionUNetWrapper(pipe.unet)


def postprocess_diffusers(
    optimized_model: Any,
    pipe: StableDiffusionPipeline,
    device: Device,
) -> StableDiffusionPipeline:
    """Put the optimized UNet back into the diffusion pipeline.

    The optimized model is wrapped in an `OptimizedDiffusionWrapper`, the
    `sample_size`, `in_channels` and `config` attributes of the original
    UNet are copied onto the wrapper and the wrapper replaces `pipe.unet`.

    Args:
        optimized_model: The optimized UNet.
        pipe: The pipeline the UNet was taken from; it is modified in
            place.
        device: The device the optimized model runs on.

    Returns:
        The same `pipe` object, now using the optimized UNet.
    """
    original_unet = pipe.unet
    final_model = OptimizedDiffusionWrapper(optimized_model)
    final_model.sample_size = original_unet.sample_size
    final_model.in_channels = original_unet.in_channels
    final_model.device = torch.device(device.to_torch_format())
    final_model.config = original_unet.config
    pipe.unet = final_model
    return pipe


class Optimizer:
    """Helper applying graph-level clean-ups to an ONNX model.

    The ONNX graph is imported through `gs.import_onnx` and kept in
    `self.graph`; the methods below update it in place.
    """

    def __init__(self, onnx_graph, verbose=False):
        self.verbose = verbose
        self.graph = gs.import_onnx(onnx_graph)

    def info(self, prefix):
        """Print the size of the graph, but only in verbose mode."""
        if not self.verbose:
            return
        print(
            f"{prefix} .. {len(self.graph.nodes)} nodes, {len(self.graph.tensors().keys())} tensors, {len(self.graph.inputs)} inputs, {len(self.graph.outputs)} outputs"
        )

    def cleanup(self, return_onnx=False):
        """Clean up and topologically sort the graph.

        Returns the exported ONNX graph when `return_onnx` is true,
        otherwise None.
        """
        self.graph.cleanup().toposort()
        return gs.export_onnx(self.graph) if return_onnx else None

    def select_outputs(self, keep, names=None):
        """Keep only the outputs at the `keep` positions, optionally renamed.

        `names`, when given, are assigned in order to the kept outputs.
        """
        current_outputs = self.graph.outputs
        self.graph.outputs = [current_outputs[index] for index in keep]
        if not names:
            return
        for position, new_name in enumerate(names):
            self.graph.outputs[position].name = new_name

    def fold_constants(self, return_onnx=False):
        """Fold the constants of the graph and re-import the result.

        Returns the folded ONNX graph when `return_onnx` is true,
        otherwise None.
        """
        exported = gs.export_onnx(self.graph)
        folded = fold_constants(
            exported,
            allow_onnxruntime_shape_inference=True,
        )
        self.graph = gs.import_onnx(folded)
        return folded if return_onnx else None

    def infer_shapes(self, return_onnx=False):
        """Run ONNX shape inference and re-import the result.

        Returns the inferred ONNX graph when `return_onnx` is true,
        otherwise None.

        Raises:
            TypeError: If the serialized model is larger than 2147483648
                bytes.
        """
        exported = gs.export_onnx(self.graph)
        if exported.ByteSize() > 2147483648:
            raise TypeError("ERROR: model size exceeds supported 2GB limit")
        inferred = onnx.shape_inference.infer_shapes(exported)

        self.graph = gs.import_onnx(inferred)
        return inferred if return_onnx else None


def get_path(version, inpaint=False):
    """Return the model repository id for a Stable Diffusion version.

    Args:
        version: One of "1.4", "1.5", "2.0-base", "2.0", "2.1" and
            "2.1-base".
        inpaint: Whether the inpainting variant is requested. Versions
            "2.1" and "2.1-base" map to the same repository either way.

    Raises:
        ValueError: If the version is not one of the supported ones.
    """
    # (version, regular repository, inpainting repository)
    repositories = (
        (
            "1.4",
            "CompVis/stable-diffusion-v1-4",
            "runwayml/stable-diffusion-inpainting",
        ),
        (
            "1.5",
            "runwayml/stable-diffusion-v1-5",
            "runwayml/stable-diffusion-inpainting",
        ),
        (
            "2.0-base",
            "stabilityai/stable-diffusion-2-base",
            "stabilityai/stable-diffusion-2-inpainting",
        ),
        (
            "2.0",
            "stabilityai/stable-diffusion-2",
            "stabilityai/stable-diffusion-2-inpainting",
        ),
        (
            "2.1",
            "stabilityai/stable-diffusion-2-1",
            "stabilityai/stable-diffusion-2-1",
        ),
        (
            "2.1-base",
            "stabilityai/stable-diffusion-2-1-base",
            "stabilityai/stable-diffusion-2-1-base",
        ),
    )
    for known_version, regular_repo, inpaint_repo in repositories:
        if version == known_version:
            return inpaint_repo if inpaint else regular_repo
    raise ValueError(f"Incorrect version {version}")


def get_embedding_dim(version):
    """Return the embedding size used for a Stable Diffusion version.

    Raises:
        ValueError: If the version is not one of the supported ones.
    """
    dims_by_versions = (
        (("1.4", "1.5"), 768),
        (("2.0", "2.0-base", "2.1", "2.1-base"), 1024),
    )
    for versions, embedding_dim in dims_by_versions:
        if version in versions:
            return embedding_dim
    raise ValueError(f"Incorrect version {version}")


class BaseModel:
    """Base description of a Stable Diffusion sub-model to be exported.

    It stores the export settings together with the supported ranges for
    the batch size and for the image/latent resolution (latents are 8
    times smaller than images). The getters defined here are placeholders
    that subclasses are expected to override.
    """

    def __init__(
        self,
        hf_token,
        fp16=False,
        device="cuda",
        verbose=False,
        path="",
        max_batch_size=16,
        embedding_dim=768,
        text_maxlen=77,
    ):
        self.name = "SD Model"
        self.hf_token = hf_token
        self.fp16 = fp16
        self.device = device
        self.verbose = verbose
        self.path = path

        self.min_batch = 1
        self.max_batch = max_batch_size
        self.min_image_shape = 256  # min image resolution: 256x256
        self.max_image_shape = 1024  # max image resolution: 1024x1024
        self.min_latent_shape = self.min_image_shape // 8
        self.max_latent_shape = self.max_image_shape // 8

        self.embedding_dim = embedding_dim
        self.text_maxlen = text_maxlen

    def get_model(self):
        pass

    def get_input_names(self):
        pass

    def get_output_names(self):
        pass

    def get_dynamic_axes(self):
        return None

    def get_sample_input(self, batch_size, image_height, image_width):
        pass

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        return None

    def get_shape_dict(self, batch_size, image_height, image_width):
        return None

    def optimize(self, onnx_graph):
        """Clean up, constant-fold and shape-infer the given ONNX graph.

        Returns:
            The optimized ONNX graph.
        """
        opt = Optimizer(onnx_graph, verbose=self.verbose)
        opt.info(self.name + ": original")
        opt.cleanup()
        opt.info(self.name + ": cleanup")
        opt.fold_constants()
        opt.info(self.name + ": fold constants")
        opt.infer_shapes()
        opt.info(self.name + ": shape inference")
        onnx_opt_graph = opt.cleanup(return_onnx=True)
        opt.info(self.name + ": finished")
        return onnx_opt_graph

    def check_dims(self, batch_size, image_height, image_width):
        """Validate the requested sizes and return the latent resolution.

        Returns:
            Tuple[int, int]: `(latent_height, latent_width)`.

        Raises:
            AssertionError: If the batch size or the latent resolution is
                out of range, or if either image side is not a multiple
                of 8.
        """
        assert self.min_batch <= batch_size <= self.max_batch, (
            f"batch_size must be in [{self.min_batch}, {self.max_batch}]"
        )
        # Both sides must be multiples of 8, otherwise the floor divisions
        # below would silently describe a smaller image.
        assert image_height % 8 == 0 and image_width % 8 == 0, (
            "image_height and image_width must be multiples of 8"
        )
        latent_height = image_height // 8
        latent_width = image_width // 8
        assert (
            self.min_latent_shape <= latent_height <= self.max_latent_shape
        ), "image_height is out of the supported range"
        assert (
            self.min_latent_shape <= latent_width <= self.max_latent_shape
        ), "image_width is out of the supported range"
        return (latent_height, latent_width)

    def get_minmax_dims(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the min/max values of every dimension of the profile.

        A static dimension has both bounds equal to the requested value; a
        dynamic one spans the whole range supported by the model.

        Returns:
            The 10-tuple `(min_batch, max_batch, min_image_height,
            max_image_height, min_image_width, max_image_width,
            min_latent_height, max_latent_height, min_latent_width,
            max_latent_width)`.
        """

        def _bounds(is_static, value, lower, upper):
            # Collapse the range onto `value` when the dimension is static.
            return (value, value) if is_static else (lower, upper)

        latent_height = image_height // 8
        latent_width = image_width // 8

        min_batch, max_batch = _bounds(
            static_batch, batch_size, self.min_batch, self.max_batch
        )
        min_image_height, max_image_height = _bounds(
            static_shape,
            image_height,
            self.min_image_shape,
            self.max_image_shape,
        )
        min_image_width, max_image_width = _bounds(
            static_shape,
            image_width,
            self.min_image_shape,
            self.max_image_shape,
        )
        min_latent_height, max_latent_height = _bounds(
            static_shape,
            latent_height,
            self.min_latent_shape,
            self.max_latent_shape,
        )
        min_latent_width, max_latent_width = _bounds(
            static_shape,
            latent_width,
            self.min_latent_shape,
            self.max_latent_shape,
        )
        return (
            min_batch,
            max_batch,
            min_image_height,
            max_image_height,
            min_image_width,
            max_image_width,
            min_latent_height,
            max_latent_height,
            min_latent_width,
            max_latent_width,
        )


class CLIP(BaseModel):
    """Export description of the CLIP text encoder."""

    def __init__(
        self, hf_token, device, verbose, path, max_batch_size, embedding_dim
    ):
        super().__init__(
            hf_token,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
        )
        self.name = "CLIP"

    def get_model(self):
        """Load the text encoder from `self.path` onto `self.device`."""
        text_encoder = CLIPTextModel.from_pretrained(
            self.path, subfolder="text_encoder", use_auth_token=self.hf_token
        )
        return text_encoder.to(self.device)

    def get_input_names(self):
        return ["input_ids"]

    def get_output_names(self):
        return ["text_embeddings", "pooler_output"]

    def get_dynamic_axes(self):
        batch_axis = {0: "B"}
        return {"input_ids": batch_axis, "text_embeddings": dict(batch_axis)}

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the (min, opt, max) shapes of `input_ids`."""
        self.check_dims(batch_size, image_height, image_width)
        bounds = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )
        # Only the batch bounds matter for the token ids.
        min_batch, max_batch = bounds[0], bounds[1]
        profile = [
            (batch, self.text_maxlen)
            for batch in (min_batch, batch_size, max_batch)
        ]
        return {"input_ids": profile}

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the shapes of the input ids and of the text embeddings."""
        self.check_dims(batch_size, image_height, image_width)
        ids_shape = (batch_size, self.text_maxlen)
        embeddings_shape = (batch_size, self.text_maxlen, self.embedding_dim)
        return {
            "input_ids": ids_shape,
            "text_embeddings": embeddings_shape,
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return an all-zero int32 batch of token ids on `self.device`."""
        self.check_dims(batch_size, image_height, image_width)
        return torch.zeros(
            batch_size, self.text_maxlen, dtype=torch.int32, device=self.device
        )

    def optimize(self, onnx_graph):
        """Optimize the graph, keeping only its first output.

        The surviving output is renamed to `text_embeddings`.
        """
        graph_opt = Optimizer(onnx_graph, verbose=self.verbose)
        graph_opt.info(self.name + ": original")
        # Drop graph output #1.
        graph_opt.select_outputs([0])
        graph_opt.cleanup()
        graph_opt.info(self.name + ": remove output[1]")
        graph_opt.fold_constants()
        graph_opt.info(self.name + ": fold constants")
        graph_opt.infer_shapes()
        graph_opt.info(self.name + ": shape inference")
        # Rename the network output.
        graph_opt.select_outputs([0], names=["text_embeddings"])
        graph_opt.info(self.name + ": remove output[0]")
        optimized_graph = graph_opt.cleanup(return_onnx=True)
        graph_opt.info(self.name + ": finished")
        return optimized_graph


def make_CLIP(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build the `CLIP` description for a Stable Diffusion version.

    The repository path and the embedding size are derived from `version`
    through `get_path` and `get_embedding_dim`.
    """
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    return CLIP(
        hf_token=hf_token,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
    )


class UNet(BaseModel):
    """Export/profile description of the diffusion UNet.

    Every batch dimension reported by this class is ``2 * batch_size`` (the
    corresponding dynamic axis is named ``"2B"``).
    """

    def __init__(
        self,
        hf_token,
        fp16=False,
        device="cuda",
        verbose=False,
        path="",
        max_batch_size=16,
        embedding_dim=768,
        text_maxlen=77,
        unet_dim=4,
    ):
        super().__init__(
            hf_token,
            fp16=fp16,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
            text_maxlen=text_maxlen,
        )
        # Channel count of the ``sample`` input.
        self.unet_dim = unet_dim
        self.name = "UNet"

    def get_model(self):
        """Load the ``unet`` subfolder of ``self.path`` onto ``self.device``.

        When ``self.fp16`` is set, the ``fp16`` revision is requested with
        ``torch.float16`` weights.
        """
        if self.fp16:
            model_opts = {"revision": "fp16", "torch_dtype": torch.float16}
        else:
            model_opts = {}
        unet = UNet2DConditionModel.from_pretrained(
            self.path,
            subfolder="unet",
            use_auth_token=self.hf_token,
            **model_opts,
        )
        return unet.to(self.device)

    def get_input_names(self):
        """Return the names of the model inputs, in positional order."""
        return ["sample", "timestep", "encoder_hidden_states"]

    def get_output_names(self):
        """Return the names of the model outputs."""
        return ["latent"]

    def get_dynamic_axes(self):
        """Return the dynamic axes (index -> symbolic name) per tensor."""
        spatial_axes = {0: "2B", 2: "H", 3: "W"}
        return {
            "sample": dict(spatial_axes),
            "encoder_hidden_states": {0: "2B"},
            "latent": dict(spatial_axes),
        }

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return ``[min, opt, max]`` shapes for each profiled input."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        # Only the batch bounds and the latent bounds of the 10-tuple are
        # needed; the four middle entries are ignored.
        (
            min_batch,
            max_batch,
            _,
            _,
            _,
            _,
            min_latent_height,
            max_latent_height,
            min_latent_width,
            max_latent_width,
        ) = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )

        def sample_shape(batch, height, width):
            return (2 * batch, self.unet_dim, height, width)

        def hidden_shape(batch):
            return (2 * batch, self.text_maxlen, self.embedding_dim)

        return {
            "sample": [
                sample_shape(min_batch, min_latent_height, min_latent_width),
                sample_shape(batch_size, latent_height, latent_width),
                sample_shape(max_batch, max_latent_height, max_latent_width),
            ],
            "encoder_hidden_states": [
                hidden_shape(min_batch),
                hidden_shape(batch_size),
                hidden_shape(max_batch),
            ],
        }

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete input and output shapes for one request."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        doubled_batch = 2 * batch_size
        return {
            "sample": (
                doubled_batch,
                self.unet_dim,
                latent_height,
                latent_width,
            ),
            "encoder_hidden_states": (
                doubled_batch,
                self.text_maxlen,
                self.embedding_dim,
            ),
            # The output always has 4 channels, independent of ``unet_dim``.
            "latent": (doubled_batch, 4, latent_height, latent_width),
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return random ``(sample, timestep, encoder_hidden_states)``.

        ``sample`` and ``timestep`` are always float32; only
        ``encoder_hidden_states`` follows ``self.fp16``.
        """
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        hidden_dtype = torch.float16 if self.fp16 else torch.float32
        doubled_batch = 2 * batch_size
        sample = torch.randn(
            doubled_batch,
            self.unet_dim,
            latent_height,
            latent_width,
            dtype=torch.float32,
            device=self.device,
        )
        timestep = torch.tensor(
            [1.0], dtype=torch.float32, device=self.device
        )
        encoder_hidden_states = torch.randn(
            doubled_batch,
            self.text_maxlen,
            self.embedding_dim,
            dtype=hidden_dtype,
            device=self.device,
        )
        return (sample, timestep, encoder_hidden_states)


def make_UNet(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build an fp16 ``UNet`` wrapper for the given model ``version``.

    The ``sample`` input gets 9 channels when ``inpaint`` is true and 4
    otherwise.
    """
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    sample_channels = 9 if inpaint else 4
    return UNet(
        hf_token=hf_token,
        fp16=True,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
        unet_dim=sample_channels,
    )


class VAE(BaseModel):
    """Export/profile description of the VAE decoder (latent -> images)."""

    def __init__(
        self, hf_token, device, verbose, path, max_batch_size, embedding_dim
    ):
        super().__init__(
            hf_token,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
        )
        self.name = "VAE decoder"

    def get_model(self):
        """Load the ``vae`` subfolder and route ``forward`` to ``decode``."""
        autoencoder = AutoencoderKL.from_pretrained(
            self.path, subfolder="vae", use_auth_token=self.hf_token
        )
        autoencoder = autoencoder.to(self.device)
        # Calling the module now runs the decoder only.
        autoencoder.forward = autoencoder.decode
        return autoencoder

    def get_input_names(self):
        """Return the names of the model inputs."""
        return ["latent"]

    def get_output_names(self):
        """Return the names of the model outputs."""
        return ["images"]

    def get_dynamic_axes(self):
        """Return the dynamic axes (index -> symbolic name) per tensor."""
        latent_axes = {0: "B", 2: "H", 3: "W"}
        image_axes = {0: "B", 2: "8H", 3: "8W"}
        return {"latent": latent_axes, "images": image_axes}

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the ``[min, opt, max]`` shapes of the ``latent`` input."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        # Only the batch bounds and the latent bounds of the 10-tuple are
        # needed; the four middle entries are ignored.
        (
            min_batch,
            max_batch,
            _,
            _,
            _,
            _,
            min_latent_height,
            max_latent_height,
            min_latent_width,
            max_latent_width,
        ) = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )
        smallest = (min_batch, 4, min_latent_height, min_latent_width)
        optimal = (batch_size, 4, latent_height, latent_width)
        largest = (max_batch, 4, max_latent_height, max_latent_width)
        return {"latent": [smallest, optimal, largest]}

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete input and output shapes for one request."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        latent_shape = (batch_size, 4, latent_height, latent_width)
        images_shape = (batch_size, 3, image_height, image_width)
        return {"latent": latent_shape, "images": images_shape}

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return a random float32 latent tensor on ``self.device``."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        latent_shape = (batch_size, 4, latent_height, latent_width)
        return torch.randn(
            *latent_shape, dtype=torch.float32, device=self.device
        )


def make_VAE(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build a ``VAE`` (decoder) wrapper for the given model ``version``."""
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    return VAE(
        hf_token=hf_token,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
    )


class TorchVAEEncoder(torch.nn.Module):
    """Module whose ``forward`` encodes images and samples a latent."""

    def __init__(self, token, device, path):
        super().__init__()
        self.path = path
        autoencoder = AutoencoderKL.from_pretrained(
            self.path, subfolder="vae", use_auth_token=token
        )
        self.vae_encoder = autoencoder.to(device)

    def forward(self, x):
        """Encode ``x`` and draw one sample from the latent distribution."""
        latent_dist = self.vae_encoder.encode(x).latent_dist
        return latent_dist.sample()


class VAEEncoder(BaseModel):
    """Export/profile description of the VAE encoder (images -> latent)."""

    def __init__(
        self, hf_token, device, verbose, path, max_batch_size, embedding_dim
    ):
        super(VAEEncoder, self).__init__(
            hf_token,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
        )
        self.name = "VAE encoder"

    def get_model(self):
        """Return a ``TorchVAEEncoder`` built from this model's settings."""
        return TorchVAEEncoder(self.hf_token, self.device, self.path)

    def get_input_names(self):
        """Return the names of the model inputs."""
        return ["images"]

    def get_output_names(self):
        """Return the names of the model outputs."""
        return ["latent"]

    def get_dynamic_axes(self):
        """Return the dynamic axes (index -> symbolic name) per tensor."""
        return {
            "images": {0: "B", 2: "8H", 3: "8W"},
            "latent": {0: "B", 2: "H", 3: "W"},
        }

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the ``[min, opt, max]`` shapes of the ``images`` input.

        Raises:
            AssertionError: if ``batch_size`` is outside
                ``[self.min_batch, self.max_batch]``.
        """
        assert batch_size >= self.min_batch and batch_size <= self.max_batch
        self.check_dims(batch_size, image_height, image_width)
        # The batch bounds come from ``get_minmax_dims``. Earlier local
        # values derived from ``static_batch`` were always overwritten by
        # this unpacking, so they are no longer computed.
        (
            min_batch,
            max_batch,
            min_image_height,
            max_image_height,
            min_image_width,
            max_image_width,
            _,
            _,
            _,
            _,
        ) = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )

        return {
            "images": [
                (min_batch, 3, min_image_height, min_image_width),
                (batch_size, 3, image_height, image_width),
                (max_batch, 3, max_image_height, max_image_width),
            ],
        }

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete input and output shapes for one request."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        return {
            "images": (batch_size, 3, image_height, image_width),
            "latent": (batch_size, 4, latent_height, latent_width),
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return a random float32 image batch on ``self.device``."""
        self.check_dims(batch_size, image_height, image_width)
        return torch.randn(
            batch_size,
            3,
            image_height,
            image_width,
            dtype=torch.float32,
            device=self.device,
        )


def make_VAEEncoder(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build a ``VAEEncoder`` wrapper for the given model ``version``."""
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    return VAEEncoder(
        hf_token=hf_token,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
    )


def make_tokenizer(version, hf_token):
    """Load the CLIP tokenizer from the ``tokenizer`` subfolder."""
    model_path = get_path(version)
    return CLIPTokenizer.from_pretrained(
        model_path, subfolder="tokenizer", use_auth_token=hf_token
    )


def is_diffusion_model(model) -> bool:
    """Tell whether ``model`` is a diffusion model.

    Returns ``False`` when ``diffusers`` cannot be imported. Otherwise the
    model qualifies if ``is_diffusion_model_pipe`` accepts it, if it is a
    ``UNet2DConditionModel`` or ``DiffusionUNetWrapper``, or if its ``model``
    attribute is a ``UNet2DConditionModel``.
    """
    try:
        from diffusers import UNet2DConditionModel
    except ImportError:
        return False

    unet_types = (UNet2DConditionModel, DiffusionUNetWrapper)
    if is_diffusion_model_pipe(model) or isinstance(model, unet_types):
        return True
    return hasattr(model, "model") and isinstance(
        model.model, UNet2DConditionModel
    )


================================================
FILE: optimization/nebullvm/nebullvm/tools/feedback_collector.py
================================================
import json
import os
from pathlib import Path
from typing import Any

import requests

from nebullvm.config import VERSION

# Path of a JSON file under the user's home directory. It is not referenced
# by the rest of this module.
NEBULLVM_METADATA_PATH = Path.home() / ".nebullvm/collect.json"


class FeedbackCollector:
    """Accumulate usage metadata and POST it to a telemetry endpoint.

    Telemetry is active unless the environment variable named by
    ``disable_telemetry_environ_var`` is set to something other than ``0``.
    """

    def __init__(
        self, url: str, disable_telemetry_environ_var: str, app_version: str
    ):
        self._disable_telemetry_environ_var = disable_telemetry_environ_var
        self._is_active = self._telemetry_enabled(
            disable_telemetry_environ_var
        )
        self._url = url
        self._metadata = {
            "nebullvm_version": VERSION,
            "app_version": app_version,
        }

    @staticmethod
    def _telemetry_enabled(environ_var: str) -> bool:
        """Return True unless ``environ_var`` asks to disable telemetry."""
        raw_value = os.getenv(environ_var, "0")
        try:
            return int(raw_value) == 0
        except ValueError:
            # A non-numeric value such as "true" still means the user set
            # the opt-out variable: honour it instead of raising.
            return False

    def _store_ip_address(self, timeout: int = 30):
        """Best-effort lookup of the public IP; "Unknown" on any failure."""
        try:
            # Without a timeout this request could block indefinitely.
            self._metadata["ip_address"] = requests.get(
                "https://api.ipify.org", timeout=timeout
            ).text
        except Exception:
            self._metadata["ip_address"] = "Unknown"

    @property
    def is_active(self):
        """Whether feedback is sent by ``send_feedback``."""
        return self._is_active

    def _inform_user(self):
        """Print the telemetry notice, including the opt-out variable."""
        message = (
            f"Nebuly collects anonymous usage statistics to help improve the "
            f"product. You can opt-out by setting the environment variable "
            f"{self._disable_telemetry_environ_var}=1."
        )
        print(message)

    def store_info(self, key: str, value: Any):
        """Store ``value`` under ``key``.

        If ``key`` already exists and ``value`` is a list, the list is
        appended to the stored value; otherwise the stored value is replaced.
        """
        if key in self._metadata and isinstance(value, list):
            # Build a new object instead of ``+=``: the stored value may be
            # the very list a caller passed earlier, and it must not be
            # mutated behind the caller's back.
            self._metadata[key] = self._metadata[key] + value
        else:
            self._metadata[key] = value

    def send_feedback(self, timeout: int = 30):
        """POST the collected metadata as JSON.

        Returns an empty dict when telemetry is inactive, otherwise the
        response object returned by ``requests.post``. ``timeout`` applies to
        both the IP lookup and the POST request.
        """
        if not self.is_active:
            return {}
        self._store_ip_address(timeout=timeout)
        request_body = self._metadata
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self._url,
            data=json.dumps(request_body),
            headers=headers,
            timeout=timeout,
        )
        return response

    def get(self, key: str, default: Any = None):
        """Return the stored value for ``key`` or ``default``."""
        return self._metadata.get(key, default)

    def reset(self, key: str):
        """Remove ``key`` from the stored metadata if present."""
        self._metadata.pop(key, None)


================================================
FILE: optimization/nebullvm/nebullvm/tools/hardware_utils.py
================================================
import os
import platform

import cpuinfo
import psutil

from nebullvm.core.models import HardwareSetup, Device, DeviceType
from nebullvm.optional_modules.torch_xla import xm
from nebullvm.optional_modules.utils import (
    torch_is_available,
    tensorflow_is_available,
)
from nebullvm.tools.pytorch import torch_get_device_name
from nebullvm.tools.tf import tensorflow_get_gpu_name
from nebullvm.tools.utils import (
    gpu_is_available,
    tpu_is_available,
    neuron_is_available,
)


def get_hw_setup(device: Device = None) -> HardwareSetup:
    """Describe the current machine as a ``HardwareSetup``.

    The accelerator name is resolved in the order GPU, TPU, Neuron. A kind is
    selected either because ``device`` has that type or because the matching
    availability probe succeeds. The accelerator is ``None`` if nothing
    matches.
    """
    requested_type = None if device is None else device.type

    if requested_type is DeviceType.GPU or gpu_is_available():
        accelerator = _get_gpu_name()
    elif requested_type is DeviceType.TPU or tpu_is_available():
        accelerator = _get_tpu_device_name()
    elif requested_type is DeviceType.NEURON or neuron_is_available():
        accelerator = _get_neuron_device_name()
    else:
        accelerator = None

    cpu_name = cpuinfo.get_cpu_info()["brand_raw"]
    os_name = platform.system()
    # Total memory in bytes converted to decimal gigabytes (1e9 bytes).
    total_memory_gb = round(psutil.virtual_memory().total * 1e-9, 2)
    return HardwareSetup(
        cpu=cpu_name,
        operating_system=os_name,
        memory_gb=total_memory_gb,
        accelerator=accelerator,
    )


def _get_gpu_name() -> str:
    """Return the GPU name via torch, else TensorFlow, else "Unknown"."""
    if torch_is_available():
        return torch_get_device_name()
    if tensorflow_is_available():
        return tensorflow_get_gpu_name()
    return "Unknown"


def _get_neuron_device_name() -> str:
    """Return the Neuron device name parsed from ``lshw -businfo``.

    The first output line containing "neuron" (case-insensitive) that has
    more than two space-separated fields is used; its last two fields are
    joined into the name. ``"Unknown Neuron"`` is returned when no such line
    exists.
    """
    # The context manager closes the pipe once the output has been read.
    with os.popen("lshw -businfo") as pipe:
        output = pipe.read()
    neuron_name = "Unknown Neuron"
    for line in output.splitlines():
        if "neuron" in line.lower():
            words = line.split(" ")
            if len(words) > 2:
                neuron_name = " ".join(words[-2:])
                break
    return neuron_name


def _get_tpu_device_name() -> str:
    """Return what ``xm.xla_device_hw`` reports for the current device."""
    xla_device = xm.xla_device()
    return xm.xla_device_hw(xla_device)


================================================
FILE: optimization/nebullvm/nebullvm/tools/huggingface.py
================================================
from collections import OrderedDict
from typing import (
    Union,
    Iterable,
    List,
    Dict,
    Tuple,
    Type,
    Any,
)

import numpy as np

from nebullvm.core.models import Device, DeviceType
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, Module

try:
    from transformers import (
        PreTrainedModel,
    )
    from transformers.tokenization_utils import PreTrainedTokenizer
except ImportError:
    # add placeholders for function definition
    PreTrainedModel = None
    PreTrainedTokenizer = None


class PyTorchTransformerWrapper(Module):
    """Wrap a Transformers model behind a positional-argument API.

    ``forward`` takes positional tensors, maps them onto the input names
    recorded from ``encoded_input`` (in order), calls the wrapped model with
    keyword arguments and returns its output flattened into a tuple.
    """

    def __init__(
        self,
        core_model: Module,
        encoded_input: Dict[str, torch.Tensor],
    ):
        super().__init__()
        self.core_model = core_model
        # Input name -> dtype, in the order of the example input.
        self.inputs_types = OrderedDict(
            (name, tensor.dtype) for name, tensor in encoded_input.items()
        )

    def forward(self, *args: torch.Tensor):
        named_inputs = dict(zip(self.inputs_types.keys(), args))
        outputs = self.core_model(**named_inputs)
        if isinstance(outputs, dict):
            outputs = outputs.values()
        return tuple(flatten_outputs(outputs))


class TensorFlowTransformerWrapper(tf.keras.Model):
    """Keras counterpart of the positional-argument transformer wrapper.

    ``call`` reads the input tensors from its first positional argument,
    maps them onto the recorded input names (in order), calls the wrapped
    model with keyword arguments and returns its output flattened into a
    tuple.
    """

    def __init__(
        self,
        core_model: tf.Module,
        encoded_input: Dict[str, tf.Tensor],
    ):
        super().__init__()
        self.core_model = core_model
        # Input name -> dtype, filled in the order of the example input.
        self.inputs_types = OrderedDict()
        for name in encoded_input.keys():
            self.inputs_types[name] = encoded_input[name].dtype

    def call(self, *args: tf.Tensor):
        tensors = args[0]
        named_inputs = dict(zip(self.inputs_types.keys(), tensors))
        outputs = self.core_model(**named_inputs)
        if isinstance(outputs, dict):
            outputs = outputs.values()
        return tuple(flatten_outputs(list(outputs)))


def flatten_outputs(
    outputs: Union[torch.Tensor, tf.Tensor, Iterable]
) -> List[Union[torch.Tensor, tf.Tensor]]:
    """Flatten an arbitrarily nested iterable of tensors into a flat list.

    Tensors are kept in depth-first, left-to-right order; anything that is
    not a torch/TensorFlow tensor is iterated recursively.
    """
    tensor_types = (torch.Tensor, tf.Tensor)
    flat = []
    for item in outputs:
        if not isinstance(item, tensor_types):
            flat.extend(flatten_outputs(item))
            continue
        flat.append(item)
    return flat


def get_size_recursively(
    tensor_tuple: Union[torch.Tensor, tf.Tensor, Tuple]
) -> List[int]:
    """Return the lengths of each nesting level, following element 0.

    The walk descends through the first element of each level until that
    first element is a tensor; only the first element is inspected.
    """
    tensor_types = (torch.Tensor, tf.Tensor)
    sizes = []
    level = tensor_tuple
    while True:
        first = level[0]
        sizes.append(len(level))
        if isinstance(first, tensor_types):
            return sizes
        level = first


def get_output_structure_from_text(
    text: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    tokenizer_args: Dict,
    device: Device,
) -> Tuple[OrderedDict, Type]:
    """Record the output structure of a transformers model for one text.

    The text is tokenized as a one-element batch and fed to the model. The
    returned ``OrderedDict`` maps each output name to ``None`` when the
    output is a tensor, or to its nested sizes otherwise. Tuple outputs are
    named ``output_<index>``. The second return value is the type of the
    raw model output.
    """
    encoded_input = tokenizer([text], **tokenizer_args)
    if isinstance(model, torch.nn.Module):
        encoded_input = encoded_input.to(device.to_torch_format())
    output = model(**encoded_input)

    if isinstance(output, tuple):
        named_values = (
            (f"output_{i}", value) for i, value in enumerate(output)
        )
    else:
        named_values = output.items()

    tensor_types = (torch.Tensor, tf.Tensor)
    structure = OrderedDict()
    for name, value in named_values:
        if isinstance(value, tensor_types):
            structure[name] = None
        else:
            structure[name] = get_size_recursively(value)
    return structure, type(output)


def get_output_structure_from_dict(
    input_example: Dict,
    model: PreTrainedModel,
    device: Device,
) -> Tuple[OrderedDict, Type]:
    """Record the output structure of a transformers model for one input.

    For torch models on a non-TPU device, the model and the input are moved
    to the target device first. The returned ``OrderedDict`` maps each output
    name to ``None`` when the output is a tensor, or to its nested sizes
    otherwise. Tuple outputs are named ``output_<index>``. The second return
    value is the type of the raw model output.
    """

    if (
        isinstance(model, torch.nn.Module)
        and device.type is not DeviceType.TPU
    ):
        torch_device = device.to_torch_format()
        model.to(torch_device)
        if hasattr(input_example, "to"):
            # Presumably a tokenizer output whose ``to`` moves its tensors in
            # place and returns itself; confirm against the callers. Using
            # the returned object also covers a ``to`` that returns a moved
            # copy instead.
            moved = input_example.to(torch_device)
            if moved is not None:
                input_example = moved
        else:
            # A plain dict has no ``to`` method: move its tensors one by one.
            input_example = {
                key: value.to(torch_device)
                if isinstance(value, torch.Tensor)
                else value
                for key, value in input_example.items()
            }

    output = model(**input_example)
    structure = OrderedDict()
    if isinstance(output, tuple):
        for i, value in enumerate(output):
            if isinstance(value, (torch.Tensor, tf.Tensor)):
                structure[f"output_{i}"] = None
            else:
                size = get_size_recursively(value)
                structure[f"output_{i}"] = size
    else:
        for key, value in output.items():
            if isinstance(value, (torch.Tensor, tf.Tensor)):
                structure[key] = None
            else:
                size = get_size_recursively(value)
                structure[key] = size
    return structure, type(output)


def restructure_output(
    output: Tuple[Union[torch.Tensor, tf.Tensor]],
    structure: OrderedDict,
    output_type: Any = None,
):
    """Rebuild the nested model output from its flattened form.

    ``structure`` maps each output name to ``None`` (a single tensor) or to
    a list of nested sizes. For a nested entry, the next ``prod(sizes)``
    tensors are stacked, reshaped to ``(*sizes, *tensor.shape[1:])`` and
    split along the first axis into a list. The result is a dict, or
    ``output_type(**dict)`` when ``output_type`` is given.
    """
    rebuilt = {}
    cursor = 0
    for name, nested_sizes in structure.items():
        head = output[cursor]
        if nested_sizes is None:
            rebuilt[name] = head
            cursor += 1
            continue

        # NOTE(review): the trailing shape drops the first dimension of each
        # tensor, so the reshape below only matches when that dimension is 1.
        # Confirm this is intended.
        trailing_shape = head.shape[1:]
        if isinstance(head, torch.Tensor):
            stack_fn, reshape_fn = torch.stack, torch.reshape
        else:
            stack_fn, reshape_fn = tf.stack, tf.reshape

        count = int(np.prod(nested_sizes))
        stacked = stack_fn(output[cursor : cursor + count])  # noqa E203
        rebuilt[name] = list(
            reshape_fn(stacked, (*nested_sizes, *trailing_shape))
        )
        cursor += count

    if output_type is None:
        return rebuilt
    return output_type(**rebuilt)


================================================
FILE: optimization/nebullvm/nebullvm/tools/logger.py
================================================
import logging
import os
import sys
import warnings
from typing import Any

from loguru import logger


# Maps the integer verbosity read from ``NEBULLVM_LOG_LEVEL`` (see
# ``setup_logger``) to the level name passed to ``logger.add``.
levels_map = {
    0: "ERROR",
    1: "WARNING",
    2: "INFO",
    3: "DEBUG",
}


def debug_mode_enabled():
    """Return True when the ``DEBUG_MODE`` env variable is greater than 0.

    An unset variable counts as ``0``.
    """
    flag = os.environ.get("DEBUG_MODE", "0")
    return int(flag) > 0


def setup_logger():
    """Configure the loguru logger to write to stdout.

    Python warnings are silenced unless debug mode is enabled. The verbosity
    comes from ``NEBULLVM_LOG_LEVEL`` (default ``2``), translated through
    ``levels_map``. Values outside the known range are clamped to the nearest
    known level, and a non-numeric value falls back to the default, so a
    misconfigured environment no longer raises here.
    """
    if not debug_mode_enabled():
        warnings.filterwarnings("ignore")

    default_level = 2
    try:
        logging_level = int(
            os.environ.get("NEBULLVM_LOG_LEVEL", str(default_level))
        )
    except ValueError:
        logging_level = default_level
    # Clamp to the keys of ``levels_map`` so the lookup below cannot fail.
    logging_level = min(max(logging_level, min(levels_map)), max(levels_map))

    logger.remove()
    logger.add(
        sys.stdout,
        colorize=True,
        format=(
            "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
            "<level>{level: <8}</level> | <level>{message}</level>"
        ),
        level=levels_map[logging_level],
    )
    logger.level("WARNING", color="<fg #d3d3d3>")


class LoggingContext(object):
    """Context manager that temporarily reconfigures a logger.

    Inside the ``with`` block the logger's ``disabled`` flag is set to
    ``disabled`` and ``handler`` (if given) is attached. On exit the handler
    is detached (and closed when ``close`` is true) and the ``disabled`` flag
    is restored to the value it had on entry, so nested contexts and loggers
    that were already disabled behave correctly.
    """

    def __init__(
        self,
        logger: logging.Logger,
        disabled: bool = False,
        handler: Any = None,
        close: bool = True,
    ):
        self.logger = logger
        self.disabled = disabled
        self.handler = handler
        self.close = close
        # ``disabled`` flag of the logger as found by ``__enter__``.
        self._previous_disabled = None

    def __enter__(self):
        # Remember the current state so ``__exit__`` can restore it rather
        # than unconditionally re-enabling the logger.
        self._previous_disabled = getattr(self.logger, "disabled", False)
        self.logger.disabled = self.disabled
        if self.handler:
            self.logger.addHandler(self.handler)
        return self

    def __exit__(self, et: Any, ev: Any, tb: Any):
        if self._previous_disabled is not None:
            self.logger.disabled = self._previous_disabled
            self._previous_disabled = None
        elif self.disabled is True:
            # ``__exit__`` called without ``__enter__``: re-enable the logger.
            self.logger.disabled = False
        if self.handler:
            self.logger.removeHandler(self.handler)
        if self.handler and self.close:
            self.handler.close()
        # implicit return of None => don't swallow exceptions


================================================
FILE: optimization/nebullvm/nebullvm/tools/onnx.py
================================================
from typing import List, Tuple, Any, Optional, Dict

import numpy as np
from loguru import logger

from nebullvm.config import ONNX_PROVIDERS
from nebullvm.core.models import (
    DeepLearningFramework,
    Device,
    DeviceType,
    InputInfo,
    DataType,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.onnxruntime import onnxruntime as ort
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch


def convert_to_numpy(tensor: Any):
    """Convert a torch/TensorFlow tensor or an int to a numpy array.

    A numpy array is returned unchanged and an ``int`` becomes a one-element
    array.

    Raises:
        TypeError: for any other input type.
    """
    if isinstance(tensor, torch.Tensor):
        return tensor.cpu().detach().numpy()
    if isinstance(tensor, tf.Tensor):
        return tensor.numpy()
    if isinstance(tensor, int):
        return np.array([tensor])
    if isinstance(tensor, np.ndarray):
        return tensor
    raise TypeError(f"Unsupported data type: {type(tensor)}")


def convert_to_target_framework(
    tensor: np.ndarray, framework: DeepLearningFramework
) -> Any:
    """Convert a numpy array to the tensor type of ``framework``.

    PyTorch and TensorFlow get their native tensor; for any other framework
    the array is returned unchanged.
    """
    if framework is DeepLearningFramework.PYTORCH:
        return torch.from_numpy(tensor)
    if framework is DeepLearningFramework.TENSORFLOW:
        return tf.convert_to_tensor(tensor)
    return tensor


def get_input_names(onnx_model: str):
    """Load the ONNX file and return the names listed in ``graph.input``."""
    graph = onnx.load(onnx_model).graph
    return [graph_input.name for graph_input in graph.input]


def get_output_names(onnx_model: str):
    """Load the ONNX file and return the names listed in ``graph.output``."""
    graph = onnx.load(onnx_model).graph
    return [graph_output.name for graph_output in graph.output]


def run_onnx_model(
    onnx_model: str, input_tensors: List[np.ndarray], device: Device
) -> List[np.ndarray]:
    """Run the ONNX model at ``onnx_model`` with onnxruntime.

    ``input_tensors`` are matched positionally to the graph input names. On
    GPU, when the CUDA provider list has three entries, its second entry is
    replaced in the shared ``ONNX_PROVIDERS`` config with a CUDA provider
    bound to ``device.idx``. The session is then created from the CUDA
    providers minus the first entry.
    """
    from nebullvm.optional_modules.onnxruntime import onnxruntime as ort

    on_gpu = device.type is DeviceType.GPU
    if on_gpu and len(ONNX_PROVIDERS["cuda"]) == 3:
        cuda_provider = ("CUDAExecutionProvider", {"device_id": device.idx})
        ONNX_PROVIDERS["cuda"][1] = cuda_provider

    if on_gpu:
        providers = ONNX_PROVIDERS["cuda"][1:]
    else:
        providers = ONNX_PROVIDERS["cpu"]
    session = ort.InferenceSession(onnx_model, providers=providers)

    input_feed = dict(zip(get_input_names(onnx_model), input_tensors))
    output_names = get_output_names(onnx_model)
    results = session.run(output_names=output_names, input_feed=input_feed)
    return list(results)


def _extract_dynamic_axis(
    onnx_model: str,
    data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]],
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Infer the dynamic axes of an ONNX model from sample data.

    At most ``max_data`` samples are inspected. For each one, the inputs and
    the outputs produced by running the model are passed to
    ``inspect_dynamic_size`` along with one dict per tensor.

    Returns:
        ``{"inputs": [...], "outputs": [...]}`` with one dict per tensor, or
        ``None`` when every dict is still empty after the scan.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # One distinct dict per input. ``[{}] * n`` would repeat a single shared
    # dict, so anything recorded for one tensor would show up for all.
    # (``inspect_dynamic_size`` is assumed to fill these dicts in place;
    # confirm against its implementation.)
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(data):
        input_tensors = input_data[0]
        if i >= max_data:
            break
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(
            run_onnx_model(onnx_model, list(input_tensors), device)
        )
        if i == 0:
            # The first sample defines the reference output sizes.
            dynamic_axis["outputs"] = [{} for _ in outputs]
            output_sizes = [tuple(output.shape[1:]) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None


def extract_info_from_np_data(
    onnx_model: str,
    data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]],
    dynamic_axis: Dict,
    device: Device,
    **kwargs,
):
    """Derive batch size, input sizes, input types and dynamic axes.

    The first sample of ``data`` supplies the batch size (first dimension of
    its first input), the full shape of every input and the type name of
    every input. Type names are ``"int32"``, ``"int64"`` or ``"float16"``;
    any other dtype is reported as ``"float32"``. A warning is logged when
    the inputs disagree on the batch size. Extra keyword arguments are
    accepted and ignored.

    Returns:
        ``(batch_size, input_sizes, input_types, dynamic_axis)``.
    """
    from nebullvm.tools.utils import ifnone

    def _dtype_name(array):
        # ``array.dtype`` is a ``np.dtype`` instance, so it has to be
        # compared with ``==``; an identity test against the scalar type
        # (``dtype is np.int32``) is always False.
        if array.dtype == np.int32:
            return "int32"
        if array.dtype == np.int64:
            return "int64"
        if array.dtype == np.float16:
            return "float16"
        return "float32"

    input_row = data[0][0]
    batch_size = int(input_row[0].shape[0])
    if not all(input_row[0].shape[0] == x.shape[0] for x in input_row):
        logger.warning("Detected not consistent batch size in the inputs.")

    input_sizes = [tuple(x.shape) for x in input_row]
    input_types = [_dtype_name(x) for x in input_row]
    # NOTE(review): the fallback is an argument expression, so the model is
    # run over the data even when ``dynamic_axis`` was supplied.
    dynamic_axis = ifnone(
        dynamic_axis,
        _extract_dynamic_axis(onnx_model, data, input_sizes, device),
    )
    return batch_size, input_sizes, input_types, dynamic_axis


def get_output_info_onnx(
    onnx_model: str, input_tensors: List[np.ndarray], device
) -> List[Tuple[Tuple[int, ...], DataType]]:
    """Run the model once and return ``(shape, DataType)`` per output."""
    outputs = run_onnx_model(onnx_model, input_tensors, device)
    output_infos = []
    for output in outputs:
        shape = tuple(output.shape)
        data_type = DataType.from_framework_format(output.dtype)
        output_infos.append((shape, data_type))
    return output_infos


def create_model_inputs_onnx(input_infos: List[InputInfo]) -> List[np.ndarray]:
    """Create one random numpy array per ``InputInfo``.

    FLOAT32 inputs are standard-normal float32 arrays. Every other dtype
    gets random integers in ``[min_value or 0, max_value or 100)``.
    """
    input_tensors = []
    for input_info in input_infos:
        if input_info.dtype is DataType.FLOAT32:
            tensor = np.random.randn(*input_info.size).astype(np.float32)
        else:
            tensor = np.random.randint(
                size=input_info.size,
                low=input_info.min_value or 0,
                high=input_info.max_value or 100,
            )
        input_tensors.append(tensor)
    return input_tensors


def onnx_is_gpu_available():
    """Return True when onnxruntime reports ``"GPU"`` as its device."""
    runtime_device = ort.get_device()
    return runtime_device == "GPU"


================================================
FILE: optimization/nebullvm/nebullvm/tools/pytorch.py
================================================
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Union, Sequence

from loguru import logger

from nebullvm.core.models import Device, DataType, DeviceType, InputInfo
from nebullvm.optional_modules.torch import torch, DataLoader
from nebullvm.tools.data import DataManager
from nebullvm.tools.diffusers import get_default_dynamic_info

# Class name passed to ``to_folder`` by ``save_with_torch_fx`` and evaluated
# again by ``load_with_torch_fx`` to instantiate the saved module.
FX_MODULE_NAME = "NebullvmFxModule"


def save_with_torch_fx(model: torch.nn.Module, path: Path):
    """Symbolically trace ``model`` with torch.fx and dump it to ``path``.

    The generated module class is named ``FX_MODULE_NAME``.
    """
    graph_module = torch.fx.symbolic_trace(model)
    graph_module.to_folder(path, FX_MODULE_NAME)


def load_with_torch_fx(
    path: Path, state_dict_name: str = "pruned_state_dict.pt"
):
    """Rebuild a module saved with torch.fx and load its state dict.

    ``path / "module.py"`` is executed in this module's global namespace,
    the class named ``FX_MODULE_NAME`` is instantiated, and the weights
    stored in ``path / state_dict_name`` are loaded into it.
    """
    # NOTE(security): this executes arbitrary Python from ``module.py`` and
    # loads a state dict with ``torch.load``. Only use it on directories
    # produced by a trusted source.
    module_source = (path / "module.py").read_text()
    exec(module_source, globals())
    model_class = eval(FX_MODULE_NAME)
    model = model_class()
    state_dict = torch.load(path / state_dict_name)
    model.load_state_dict(state_dict)
    return model


def get_output_info_torch(
    torch_model: torch.nn.Module,
    input_tensors: List[torch.Tensor],
    device: Device,
) -> List[Tuple[Tuple[int, ...], DataType]]:
    """Run the model once and return ``(shape, DataType)`` per output.

    On GPU, the inputs and the model are moved to the target device first.
    A single-tensor output yields a one-element list.
    """
    if device.type is DeviceType.GPU:
        input_tensors = [x.to(device.to_torch_format()) for x in input_tensors]
        torch_model.to(device.to_torch_format())

    with torch.no_grad():
        outputs = torch_model(*input_tensors)
        if isinstance(outputs, torch.Tensor):
            outputs = [outputs]
        return [
            (
                tuple(output.size()),
                DataType.from_framework_format(output.dtype),
            )
            for output in outputs
        ]


def create_model_inputs_torch(
    input_infos: List[InputInfo],
) -> List[torch.Tensor]:
    """Create one random torch tensor per ``InputInfo``.

    FLOAT32 inputs are standard-normal tensors. Every other dtype gets
    random integers in ``[min_value or 0, max_value or 100)``.
    """
    input_tensors = []
    for input_info in input_infos:
        if input_info.dtype is DataType.FLOAT32:
            tensor = torch.randn(*input_info.size)
        else:
            tensor = torch.randint(
                size=input_info.size,
                low=input_info.min_value or 0,
                high=input_info.max_value or 100,
            )
        input_tensors.append(tensor)
    return input_tensors


def run_torch_model(
    torch_model: torch.nn.Module,
    input_tensors: List[torch.Tensor],
    device: Device,
    dtype: torch.dtype = torch.float,
) -> List[torch.Tensor]:
    """Run ``torch_model`` in eval mode and return its outputs on the CPU.

    On GPU, the model and the inputs are moved to the target device. When
    ``dtype`` is ``torch.half``, float32 inputs are also cast to half. A
    single-tensor output yields a one-element list.
    """
    torch_model.eval()
    if device.type is DeviceType.GPU:
        torch_model.to(device.to_torch_format())
        cast_to_half = not (dtype != torch.half)

        def _prepare(tensor):
            moved = tensor.to(device.to_torch_format())
            if cast_to_half and tensor.dtype == torch.float:
                return moved.half()
            return moved

        # Kept lazy: the tensors are moved when the model call unpacks them.
        input_tensors = (_prepare(t) for t in input_tensors)

    with torch.no_grad():
        pred = torch_model(*input_tensors)
    if isinstance(pred, torch.Tensor):
        return [pred.cpu()]
    return [p.cpu() for p in pred]


def _extract_dynamic_axis(
    torch_model: torch.nn.Module,
    dataloader: DataManager,
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Detect which input/output axes change size across the dataset.

    The model is run on up to ``max_data`` samples. Input shapes are
    compared with ``input_sizes`` and output shapes with the shapes produced
    by the first sample; every mismatching axis is recorded by
    ``inspect_dynamic_size``.

    Args:
        torch_model: Model used to compute the outputs.
        dataloader: Iterable of samples whose first element is the tuple of
            input tensors.
        input_sizes: Reference shape of each model input.
        device: Device the model is run on.
        max_data: Maximum number of samples inspected.

    Returns:
        A dict with ``"inputs"`` and ``"outputs"`` lists (one dict per
        tensor, mapping axis index to a tag), or ``None`` when no dynamic
        axis was found.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # Each tensor needs its OWN dict: ``[{}] * n`` would repeat a single
    # dict object n times, so an axis found dynamic on one tensor would be
    # reported for all of them.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(dataloader):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(run_torch_model(torch_model, input_tensors, device))
        if i == 0:
            # The first sample defines the reference output shapes.
            dynamic_axis["outputs"] = [{} for _ in outputs]
            output_sizes = [tuple(output.shape) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None


def extract_info_from_torch_data(
    model: torch.nn.Module,
    dataloader: Union[DataLoader, Sequence],
    dynamic_axis: Dict,
    device: Device,
    is_diffusion: bool = False,
):
    """Collect batch size, input shapes/dtypes and dynamic-axis info.

    Everything except the dynamic axes is read from the first sample of
    ``dataloader``. When ``dynamic_axis`` is given, its axis keys are
    converted to ``int`` in place; otherwise the value inferred by
    ``_extract_dynamic_axis`` is used.

    Args:
        model: Model the data is meant for.
        dataloader: Samples whose first element is the tuple of inputs.
        dynamic_axis: User-provided dynamic-axis description, or ``None``.
        device: Device used when running the model.
        is_diffusion: Whether the model is a diffusion UNet.

    Returns:
        ``(batch_size, input_sizes, input_types, dynamic_axis)``.
    """
    from nebullvm.tools.utils import ifnone

    if isinstance(dataloader, Sequence):
        first_sample = dataloader[0]
    else:
        first_sample = next(iter(dataloader))
    input_row = first_sample[0]
    batch_size = int(input_row[0].shape[0])
    if any(input_row[0].shape[0] != x.shape[0] for x in input_row):
        logger.warning("Detected not consistent batch size in the inputs.")

    def _dtype_name(tensor):
        # Anything that is not int64/int32/float16 is reported as float32.
        on_cpu = tensor.cpu()
        if isinstance(on_cpu, torch.LongTensor):
            return "int64"
        if isinstance(on_cpu, torch.IntTensor):
            return "int32"
        if isinstance(on_cpu, torch.HalfTensor):
            return "float16"
        return "float32"

    input_sizes = [tuple(x.shape) for x in input_row]
    input_types = [_dtype_name(x) for x in input_row]

    # For the Stable Diffusion UNet we must provide dynamic axis
    # even when using static shapes, because otherwise the converted
    # onnx model will have size issues.
    if dynamic_axis is None and device.type is DeviceType.GPU and is_diffusion:
        dynamic_axis = get_default_dynamic_info(input_sizes)

    if dynamic_axis is not None:
        # Normalize axis keys to int, inputs first and then outputs.
        for section in ("inputs", "outputs"):
            dynamic_axis[section] = [
                {int(k): v for (k, v) in val.items()}
                for val in dynamic_axis[section]
            ]

    # NOTE: the inference below always runs, even when ``dynamic_axis`` is
    # already set and its result is discarded by ``ifnone``.
    inferred_axis = _extract_dynamic_axis(
        model, dataloader, input_sizes, device
    )
    dynamic_axis = ifnone(dynamic_axis, inferred_axis)
    return batch_size, input_sizes, input_types, dynamic_axis


def torch_is_gpu_available():
    """Return whether torch reports CUDA as available."""
    cuda_available = torch.cuda.is_available()
    return cuda_available


def torch_get_device_name():
    """Return the name torch reports for the CUDA device with index 0."""
    first_device_index = 0
    return torch.cuda.get_device_name(first_device_index)


def get_torch_model_size(
    model: Union[torch.nn.Module, torch.jit.ScriptModule, torch.fx.GraphModule]
):
    """Return the size in bytes of the model parameters plus its buffers.

    Args:
        model: Module exposing ``parameters()`` and ``buffers()``.

    Returns:
        Sum over every parameter and buffer of
        ``nelement() * element_size()``.
    """

    def _total_bytes(tensors):
        return sum(t.nelement() * t.element_size() for t in tensors)

    return _total_bytes(model.parameters()) + _total_bytes(model.buffers())


================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/__init__.py
================================================


================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/test_data.py
================================================
import tensorflow as tf
import torch

from nebullvm.tools.data import DataManager


def test_custom_input_data():
    """A list of ``((input,), label)`` tuples is exposed unchanged."""
    samples = [
        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1)) for _ in range(4)
    ]

    data_manager = DataManager(samples)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 1
    assert first_sample[0][0].shape == (2, 3, 10, 10)
    assert first_sample[1].shape == (2, 1)


def test_torch_dataloader_single_input_with_label():
    """Torch dataloader with one input and a label, batches of two."""
    inputs = torch.randn(8, 3, 10, 10)
    labels = torch.randn(8, 1)
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(inputs, labels), batch_size=2
    )
    data_manager = DataManager.from_dataloader(dataloader)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 1
    assert first_sample[0][0].shape == (2, 3, 10, 10)
    assert first_sample[1].shape == (2, 1)


def test_torch_dataloader_two_inputs_with_label():
    """Torch dataloader with two inputs and a label, batches of two."""
    tensors = (
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 1),
    )
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*tensors), batch_size=2
    )
    data_manager = DataManager.from_dataloader(dataloader)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 2
    assert first_sample[0][0].shape == (2, 3, 10, 10)
    assert first_sample[0][1].shape == (2, 3, 10, 10)
    assert first_sample[1].shape == (2, 1)


def test_torch_dataloader_three_inputs_with_label():
    """Torch dataloader with three inputs and a label, batches of two."""
    tensors = (
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 1),
    )
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*tensors), batch_size=2
    )
    data_manager = DataManager.from_dataloader(dataloader)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 3
    assert first_sample[0][0].shape == (2, 3, 10, 10)
    assert first_sample[0][1].shape == (2, 3, 10, 10)
    assert first_sample[0][2].shape == (2, 3, 10, 10)
    assert first_sample[1].shape == (2, 1)


def test_torch_dataloader_single_input_without_label():
    """Torch dataloader with a single input and no label.

    The sample is still expected to have two elements.
    """
    inputs = torch.randn(8, 3, 10, 10)
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(inputs), batch_size=2
    )
    data_manager = DataManager.from_dataloader(dataloader)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 1
    assert first_sample[0][0].shape == (2, 3, 10, 10)


def test_tensorflow_dataloader_single_input_with_label():
    """TensorFlow dataset with one input and a label, batches of two."""
    tensors = (tf.random.normal([8, 10, 10, 3]), tf.random.normal([8, 1]))
    batched = tf.data.Dataset.from_tensor_slices(tensors).batch(2)
    data_manager = DataManager.from_dataloader(batched)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 1
    assert first_sample[0][0].shape == (2, 10, 10, 3)
    assert first_sample[1].shape == (2, 1)


def test_tensorflow_dataloader_two_inputs_with_label():
    """TensorFlow dataset with two inputs and a label, batches of two."""
    tensors = (
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 1]),
    )
    batched = tf.data.Dataset.from_tensor_slices(tensors).batch(2)
    data_manager = DataManager.from_dataloader(batched)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 2
    assert first_sample[0][0].shape == (2, 10, 10, 3)
    assert first_sample[0][1].shape == (2, 10, 10, 3)
    assert first_sample[1].shape == (2, 1)


def test_tensorflow_dataloader_three_inputs_with_label():
    """TensorFlow dataset with three inputs and a label, batches of two."""
    tensors = (
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 1]),
    )
    batched = tf.data.Dataset.from_tensor_slices(tensors).batch(2)
    data_manager = DataManager.from_dataloader(batched)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 3
    assert first_sample[0][0].shape == (2, 10, 10, 3)
    assert first_sample[0][1].shape == (2, 10, 10, 3)
    assert first_sample[0][2].shape == (2, 10, 10, 3)
    assert first_sample[1].shape == (2, 1)


def test_tensorflow_dataloader_single_input_without_label():
    """TensorFlow dataset with a single input and no label.

    The sample is still expected to have two elements.
    """
    inputs = tf.random.normal([8, 10, 10, 3])
    batched = tf.data.Dataset.from_tensor_slices(inputs).batch(2)
    data_manager = DataManager.from_dataloader(batched)

    assert len(data_manager) == 4
    first_sample = data_manager[0]
    assert len(first_sample) == 2
    assert len(first_sample[0]) == 1
    assert first_sample[0][0].shape == (2, 10, 10, 3)


================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/test_hardware_utils.py
================================================
import unittest
from unittest.mock import patch

from nebullvm.tools import hardware_utils


class TestGetHwSetup(unittest.TestCase):
    """Checks on ``hardware_utils.get_hw_setup`` with patched detection."""

    def _assert_host_info_populated(self, setup):
        # CPU, OS and memory must be reported whatever the accelerator is.
        self.assertGreater(len(setup.cpu), 0)
        self.assertGreater(len(setup.operating_system), 0)
        self.assertGreater(setup.memory_gb, 0)

    @patch(
        "nebullvm.tools.hardware_utils.gpu_is_available", return_value=False
    )
    @patch(
        "nebullvm.tools.hardware_utils.tpu_is_available", return_value=False
    )
    @patch(
        "nebullvm.tools.hardware_utils.neuron_is_available", return_value=False
    )
    def test_hw_setup__gpu_not_available(self, *_):
        """With every accelerator check patched off, none is reported."""
        hw_setup = hardware_utils.get_hw_setup()
        self.assertIsNone(hw_setup.accelerator)
        self._assert_host_info_populated(hw_setup)

    @patch("nebullvm.tools.hardware_utils.gpu_is_available", return_value=True)
    @patch(
        "nebullvm.tools.hardware_utils._get_gpu_name", return_value="mock-gpu"
    )
    def test_hw_setup__gpu_is_available(self, *_):
        """With a GPU patched in, its (mocked) name is the accelerator."""
        hw_setup = hardware_utils.get_hw_setup()
        self.assertEqual("mock-gpu", hw_setup.accelerator)
        self._assert_host_info_populated(hw_setup)


================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/test_utils.py
================================================
import unittest
from unittest.mock import patch

from nebullvm.core.models import DeviceType
from nebullvm.tools import utils


class TestGetThroughput(unittest.TestCase):
    """Checks on ``utils.get_throughput``."""

    def test_latency_is_zero(self):
        """A zero latency yields the -1 sentinel."""
        throughput = utils.get_throughput(0, 10)
        self.assertEqual(-1, throughput)


class TestCheckDevice(unittest.TestCase):
    """Checks on ``utils.check_device`` with patched accelerator detection."""

    def _assert_device(self, device, expected_type, expected_idx):
        # Shared assertion pair: resolved device type and device index.
        self.assertEqual(expected_type, device.type)
        self.assertEqual(device.idx, expected_idx)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_none_no_device_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=True)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_none_gpu_is_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.GPU, 0)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=True)
    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_none_tpu_is_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.TPU, 0)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=True)
    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_none_neuron_is_available(self, *_):
        self._assert_device(utils.check_device(), DeviceType.NEURON, 0)

    def test_device_is_cpu(self):
        self._assert_device(utils.check_device("cpu"), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    def test_device_is_gpu_no_gpu_available(self, _):
        # Every GPU spelling falls back to CPU index 0.
        self._assert_device(utils.check_device("gpu"), DeviceType.CPU, 0)
        self._assert_device(utils.check_device("cuda"), DeviceType.CPU, 0)
        self._assert_device(utils.check_device("cuda:1"), DeviceType.CPU, 0)
        self._assert_device(utils.check_device("gpu:2"), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=True)
    def test_device_is_gpu_gpu_is_available(self, _):
        # The index after the colon is kept; it is 0 when omitted.
        self._assert_device(utils.check_device("gpu"), DeviceType.GPU, 0)
        self._assert_device(utils.check_device("cuda"), DeviceType.GPU, 0)
        self._assert_device(utils.check_device("cuda:1"), DeviceType.GPU, 1)
        self._assert_device(utils.check_device("gpu:2"), DeviceType.GPU, 2)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_tpu_no_tpu_available(self, _):
        self._assert_device(utils.check_device("tpu"), DeviceType.CPU, 0)
        self._assert_device(utils.check_device("tpu:1"), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=True)
    def test_device_is_tpu_tpu_is_available(self, _):
        self._assert_device(utils.check_device("tpu"), DeviceType.TPU, 0)
        self._assert_device(utils.check_device("tpu:1"), DeviceType.TPU, 1)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_neuron_no_neuron_available(self, _):
        self._assert_device(utils.check_device("neuron"), DeviceType.CPU, 0)
        self._assert_device(utils.check_device("neuron:1"), DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=True)
    def test_device_is_neuron_neuron_is_available(self, _):
        self._assert_device(
            utils.check_device("neuron"), DeviceType.NEURON, 0
        )
        self._assert_device(
            utils.check_device("neuron:1"), DeviceType.NEURON, 1
        )


================================================
FILE: optimization/nebullvm/nebullvm/tools/tf.py
================================================
from typing import Union, List, Tuple, Any, Optional, Dict

import numpy as np
from loguru import logger

from nebullvm.core.models import Device, DataType, InputInfo
from nebullvm.optional_modules.tensorflow import tensorflow as tf


def get_output_info_tf(
    tf_model: Union[tf.Module, tf.keras.Model],
    input_tensors: List[tf.Tensor],
    device: Device,
) -> List[Tuple[Tuple[int, ...], DataType]]:
    """Run the model once and describe each of its outputs.

    Args:
        tf_model: Model to execute.
        input_tensors: Inputs, passed to the model as a single argument.
        device: Device the model is run on.

    Returns:
        One ``(shape, DataType)`` pair per output. A single-tensor output
        yields a one-element list.
    """
    with tf.device(device.to_tf_format()):
        outputs = tf_model(input_tensors)
    if isinstance(outputs, tf.Tensor) and outputs is not None:
        # Normalize a lone tensor so both cases share the code below.
        outputs = [outputs]
    return [
        (tuple(out.shape), DataType.from_framework_format(out.dtype))
        for out in outputs
    ]


def create_model_inputs_tf(input_infos: List[InputInfo]) -> List[tf.Tensor]:
    """Create one random TensorFlow tensor for each input description.

    The requested ``size`` is rearranged by moving its second axis to the
    end (``(s0, s1, s2, ...)`` becomes ``(s0, s2, ..., s1)``). FLOAT32
    inputs are drawn with ``tf.random_normal_initializer``; all other dtypes
    get uniform ``int32`` values in
    ``[min_value or 0, max_value or 100)``.

    Args:
        input_infos: Input descriptions; ``size``, ``dtype``, ``min_value``
            and ``max_value`` are read from each one.

    Returns:
        The generated tensors, in the same order as ``input_infos``.
    """
    tensors = []
    for info in input_infos:
        shape = (info.size[0], *info.size[2:], info.size[1])
        if info.dtype is DataType.FLOAT32:
            tensor = tf.random_normal_initializer()(shape=shape)
        else:
            tensor = tf.random.uniform(
                shape=shape,
                minval=info.min_value or 0,
                maxval=info.max_value or 100,
                dtype=tf.int32,
            )
        tensors.append(tensor)
    return tensors


def run_tf_model(
    model: tf.Module,
    input_tensors: Tuple[tf.Tensor],
    device: Device,
) -> Tuple[tf.Tensor]:
    """Run ``model`` on ``input_tensors`` under the given device scope.

    Args:
        model: Model to execute.
        input_tensors: Inputs, passed to the model as a single argument.
        device: Device the model is run on.

    Returns:
        The model output; a single tensor is wrapped in a one-element
        tuple, anything else is returned as produced by the model.
    """
    with tf.device(device.to_tf_format()):
        prediction = model(input_tensors)
    return (prediction,) if isinstance(prediction, tf.Tensor) else prediction


def _extract_dynamic_axis(
    tf_model: tf.Module,
    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Detect which input/output axes change size across the dataset.

    The model is run on up to ``max_data`` samples. Input shapes are
    compared with ``input_sizes`` and output shapes with the shapes produced
    by the first sample; every mismatching axis is recorded by
    ``inspect_dynamic_size``.

    Args:
        tf_model: Model used to compute the outputs.
        dataset: Samples whose first element is the tuple of input tensors.
        input_sizes: Reference shape of each model input.
        device: Device the model is run on.
        max_data: Maximum number of samples inspected.

    Returns:
        A dict with ``"inputs"`` and ``"outputs"`` lists (one dict per
        tensor, mapping axis index to a tag), or ``None`` when no dynamic
        axis was found.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # Each tensor needs its OWN dict: ``[{}] * n`` would repeat a single
    # dict object n times, so an axis found dynamic on one tensor would be
    # reported for all of them.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(dataset):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(run_tf_model(tf_model, input_tensors, device))
        if i == 0:
            dynamic_axis["outputs"] = [{} for _ in outputs]
            # Keep the FULL shape (batch axis included):
            # ``inspect_dynamic_size`` pairs these sizes with
            # ``tensor.shape`` axis by axis, so dropping the batch axis
            # would shift every comparison by one position.
            output_sizes = [tuple(output.shape) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None


def extract_info_from_tf_data(
    tf_model: tf.Module,
    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],
    dynamic_axis: Dict,
    device: Device,
    **kwargs,
):
    """Collect batch size, input shapes/dtypes and dynamic-axis info.

    Everything except the dynamic axes is read from the inputs of the first
    sample. When ``dynamic_axis`` is ``None`` the value inferred by
    ``_extract_dynamic_axis`` is used instead.

    Args:
        tf_model: Model the data is meant for.
        dataset: Samples whose first element is the tuple of inputs.
        dynamic_axis: User-provided dynamic-axis description, or ``None``.
        device: Device used when running the model.
        **kwargs: Accepted and ignored.

    Returns:
        ``(batch_size, input_sizes, input_types, dynamic_axis)``.
    """
    from nebullvm.tools.utils import ifnone

    input_row = dataset[0][0]
    batch_size = int(input_row[0].shape[0])
    if any(input_row[0].shape[0] != x.shape[0] for x in input_row):
        logger.warning("Detected not consistent batch size in the inputs.")

    def _dtype_name(tensor):
        # Anything that is not int32/int64/float16 is reported as float32.
        if tensor.dtype in [tf.int32, np.int32]:
            return "int32"
        if tensor.dtype in [tf.int64, np.int64]:
            return "int64"
        if tensor.dtype in [tf.float16, np.float16]:
            return "float16"
        return "float32"

    input_sizes = [tuple(x.shape) for x in input_row]
    input_types = [_dtype_name(x) for x in input_row]

    # NOTE: the inference below always runs, even when ``dynamic_axis`` is
    # already set and its result is discarded by ``ifnone``.
    inferred_axis = _extract_dynamic_axis(
        tf_model, dataset, input_sizes, device
    )
    dynamic_axis = ifnone(dynamic_axis, inferred_axis)
    return batch_size, input_sizes, input_types, dynamic_axis


def tensorflow_is_gpu_available():
    """Return whether TensorFlow lists at least one physical GPU."""
    gpu_devices = tf.config.list_physical_devices("GPU")
    return len(gpu_devices) > 0


def tensorflow_get_gpu_name():
    """Return the name of the first GPU visible to TensorFlow.

    Returns:
        The ``"device_name"`` entry of the first GPU's device details, or
        ``"Unknown GPU"`` when no GPU is listed or the details do not
        contain a name.
    """
    gpu_devices = tf.config.list_physical_devices("GPU")
    if not gpu_devices:
        return "Unknown GPU"
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    # The key may be missing from the details dict, so fall back to the
    # default instead of indexing it (which would raise KeyError).
    return details.get("device_name", "Unknown GPU")


================================================
FILE: optimization/nebullvm/nebullvm/tools/transformations.py
================================================
import copy
from abc import ABC, abstractmethod
from typing import List, Any, Dict

import numpy as np

from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch


class BaseTransformation(ABC):
    """Abstract base for callable input transformations.

    Subclasses implement ``_transform``; instances are then called directly
    on the data to transform.
    """

    @abstractmethod
    def _transform(self, _input: Any, **kwargs) -> Any:
        """Transform ``_input``; must be overridden by subclasses."""
        raise NotImplementedError()

    def __call__(self, _input: Any, **kwargs):
        """Apply the transformation by delegating to ``_transform``."""
        transformed = self._transform(_input, **kwargs)
        return transformed

    def to_dict(self):
        """Describe the transformation by its class module and name."""
        klass = self.__class__
        return dict(module=klass.__module__, name=klass.__name__)

    @classmethod
    def from_dict(cls, tfm_dict: Dict):
        """Build an instance; ``tfm_dict`` is not used by this base class."""
        return cls()


class MultiStageTransformation(BaseTransformation):
    """Sequence of transformations applied one after the other."""

    def __init__(self, transformations: List[BaseTransformation]):
        """Store ``transformations``; the list is kept by reference."""
        self._tfms = transformations

    def _transform(self, _input: Any, **kwargs) -> Any:
        """Feed ``_input`` through every stage, in order."""
        for tfm in self._tfms:
            _input = tfm(_input, **kwargs)
        return _input

    def append(self, __tfm: BaseTransformation):
        """Add a single stage at the end of the pipeline."""
        self._tfms.append(__tfm)

    def extend(self, tfms: List[BaseTransformation]):
        """Add several stages at the end of the pipeline."""
        self._tfms += tfms

    def to_dict(self) -> Dict:
        """Serialize every stage under the ``"tfms"`` key."""
        return {"tfms": [tfm.to_dict() for tfm in self._tfms]}

    def to_list(self):
        """Return the internal list of stages (not a copy)."""
        return self._tfms

    @classmethod
    def from_dict(cls, tfms_dict: Dict):
        """Rebuild a pipeline from the output of ``to_dict``.

        Each entry of ``tfms_dict["tfms"]`` names a class through its
        ``"module"`` and ``"name"`` keys; the class is imported and its own
        ``from_dict`` is called with the entry.

        Raises:
            ImportError: If the module cannot be imported or does not
                define the requested name.
        """
        # Resolve classes with importlib instead of exec/eval: this never
        # executes text taken from the serialized dict and does not rely
        # on exec() being able to create local variables.
        import importlib

        tfms = []
        for tfm_dict in tfms_dict["tfms"]:
            module = importlib.import_module(tfm_dict["module"])
            try:
                tfm_class = getattr(module, tfm_dict["name"])
            except AttributeError as err:
                # Same exception type a failing ``from m import n`` raises.
                raise ImportError(
                    f"cannot import name {tfm_dict['name']!r} "
                    f"from {tfm_dict['module']!r}"
                ) from err
            tfms.append(tfm_class.from_dict(tfm_dict))
        return cls(tfms)

    def copy(self):
        """Return a new pipeline holding deep copies of the stages."""
        new_list = copy.deepcopy(self._tfms)
        return self.__class__(new_list)

    def __len__(self):
        """Return the number of stages."""
        return len(self._tfms)


class HalfPrecisionTransformation(BaseTransformation):
    """Cast float32 numpy / torch / TensorFlow data to float16.

    Inputs of a supported type with any other dtype are returned unchanged.
    """

    @staticmethod
    def _transform_numpy(_input: np.ndarray) -> np.ndarray:
        """Cast a numpy array to float16."""
        return _input.astype(dtype=np.float16)

    @staticmethod
    def _transform_tf(_input: tf.Tensor) -> tf.Tensor:
        """Cast a TensorFlow tensor to float16."""
        return tf.cast(_input, tf.float16)

    @staticmethod
    def _transform_torch(_input: torch.Tensor) -> torch.Tensor:
        """Cast a torch tensor to half precision."""
        return _input.half()

    def _transform(self, _input: Any, **kwargs) -> Any:
        """Dispatch on the input type and cast only float32 data.

        Raises:
            TypeError: If ``_input`` is not a numpy array, a torch tensor
                or a TensorFlow tensor.
        """
        if isinstance(_input, np.ndarray):
            if _input.dtype == np.float32:
                return self._transform_numpy(_input)
            return _input
        if isinstance(_input, torch.Tensor):
            if _input.dtype == torch.float32:
                return self._transform_torch(_input)
            return _input
        if isinstance(_input, tf.Tensor) and _input is not None:
            if _input.dtype == tf.float32:
                return self._transform_tf(_input)
            return _input
        raise TypeError(
            f"The given input type is not currently supported. "
            f"Got {type(_input)}, expected one between (np.ndarray, "
            f"torch.Tensor, tf.Tensor)"
        )


class NoOp(BaseTransformation):
    """Transformation that hands its input back untouched."""

    def _transform(self, _input: Any, **kwargs):
        """Return ``_input`` as is; keyword arguments are ignored."""
        unchanged = _input
        return unchanged


class VerifyContiguity(BaseTransformation):
    """Make sure torch tensors are contiguous in memory."""

    def _transform(self, _input: Any, **kwargs) -> Any:
        """Return a contiguous version of non-contiguous torch tensors.

        Non-torch inputs and already contiguous tensors are returned as
        they are.
        """
        if isinstance(_input, torch.Tensor) and not _input.is_contiguous():
            return _input.contiguous()
        return _input


================================================
FILE: optimization/nebullvm/nebullvm/tools/utils.py
================================================
import os
import subprocess
import sys
import uuid
from pathlib import Path
from types import ModuleType
from typing import (
    Tuple,
    Any,
    List,
    Dict,
    Union,
    Iterable,
    Sequence,
    Optional,
    Callable,
)

import numpy as np
from loguru import logger
from packaging import version

from nebullvm.core.models import (
    DeepLearningFramework,
    Device,
    ModelParams,
    DeviceType,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import (
    extract_info_from_np_data,
    get_output_info_onnx,
)
from nebullvm.tools.pytorch import (
    extract_info_from_torch_data,
    get_output_info_torch,
)
from nebullvm.tools.tf import (
    extract_info_from_tf_data,
    get_output_info_tf,
)


def get_model_size_mb(model: Any) -> float:
    """Return the size of ``model`` in megabytes, rounded to 2 decimals.

    Args:
        model: A file path (``str`` or ``Path``), a ``torch.nn.Module``, or
            any object exposing ``count_params()``.

    Returns:
        Size in MB (bytes * 1e-6). For paths it is the file size, for torch
        modules the size of the parameters, otherwise
        ``count_params() * 4``.
    """
    if isinstance(model, str):
        n_bytes = os.stat(model).st_size
    elif isinstance(model, Path):
        n_bytes = os.path.getsize(model.as_posix())
    elif isinstance(model, torch.nn.Module):
        n_bytes = 0
        for param in model.parameters():
            n_bytes += param.nelement() * param.element_size()
    else:
        # we assume it is a tf_model
        # assuming full precision 32 bit
        n_bytes = model.count_params() * 4
    return round(n_bytes * 1e-6, 2)


def get_model_name(model: Any) -> str:
    """Return a name for ``model``.

    Strings are returned unchanged, ``Path`` objects as POSIX strings, and
    any other object is named after its class.
    """
    if isinstance(model, Path):
        return model.as_posix()
    return model if isinstance(model, str) else model.__class__.__name__


def generate_model_id(model: Any) -> str:
    """Return ``"<random uuid4>_<hash of the model name>"`` for ``model``."""
    name_hash = hash(get_model_name(model))
    random_part = str(uuid.uuid4())
    return f"{random_part}_{name_hash}"


def get_throughput(latency: float, batch_size: int = 1) -> float:
    """Return ``batch_size`` items per ``latency`` unit of time.

    Returns:
        ``(1 / latency) * batch_size``, or ``-1`` when ``latency`` is zero.
    """
    return -1 if latency == 0 else (1 / latency) * batch_size


def ifnone(target, new_value):
    """Return ``target`` unless it is ``None``, else ``new_value``."""
    return new_value if target is None else target


def inspect_dynamic_size(
    tensors: Tuple[Any, ...],
    sizes: List[Tuple[int, ...]],
    axis_list: List[Dict],
):
    """Record in ``axis_list`` every axis whose size differs from ``sizes``.

    For the tensor at position ``i``, each axis whose dimension differs
    from the reference is stored in ``axis_list[i]`` under the axis index:
    axis 0 is tagged ``"batch_size"``, the others
    ``"val_<actual>_<reference>"``.

    Args:
        tensors: Objects exposing a ``shape`` attribute.
        sizes: Reference shape for each tensor.
        axis_list: One dict per tensor, updated in place.
    """
    for tensor_idx, (tensor, reference) in enumerate(zip(tensors, sizes)):
        pairs = zip(tensor.shape, reference)
        for axis, (actual_dim, reference_dim) in enumerate(pairs):
            if actual_dim == reference_dim:
                continue
            axis_list[tensor_idx][axis] = (
                "batch_size"
                if axis == 0
                else f"val_{actual_dim}_{reference_dim}"
            )


def gpu_is_available():
    """Return ``True`` when running ``nvidia-smi`` succeeds."""
    try:
        subprocess.check_output("nvidia-smi")
    except Exception:
        return False
    else:
        return True


def neuron_is_available():
    """Return ``True`` when running ``neuron-ls`` succeeds."""
    try:
        subprocess.check_output("neuron-ls")
    except Exception:
        return False
    else:
        return True


def tpu_is_available():
    """Return whether the default XLA device reports ``"TPU"`` hardware.

    Any failure (``torch_xla`` not installed included) counts as "not
    available".
    """
    try:
        import torch_xla  # noqa: F401
        import torch_xla.core.xla_model as xm

        default_device = xm.xla_device()
        hardware = xm.xla_device_hw(default_device)
        return hardware == "TPU"
    except Exception:
        return False


def check_module_version(
    module: ModuleType, min_version: str = None, max_version: str = None
) -> bool:
    """Check that ``module.__version__`` lies within the given bounds.

    Args:
        module: Module exposing ``__version__``.
        min_version: Inclusive lower bound, or ``None`` for no lower bound.
        max_version: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        ``False`` when the installed version is below ``min_version`` or
        above ``max_version``, ``True`` otherwise.
    """
    installed_version = module.__version__

    # Versions are parsed only when the corresponding bound is given.
    too_old = min_version is not None and version.parse(
        installed_version
    ) < version.parse(min_version)
    if too_old:
        return False

    too_new = max_version is not None and version.parse(
        installed_version
    ) > version.parse(max_version)
    return not too_new


def is_python_version_3_10():
    """Return whether the running interpreter is Python 3.10."""
    running = f"{sys.version_info.major}.{sys.version_info.minor}"
    return running == "3.10"


def get_dl_framework(model: Any):
    """Return the ``DeepLearningFramework`` matching the type of ``model``.

    Returns:
        ``PYTORCH`` for ``torch.nn.Module``, ``TENSORFLOW`` for
        ``tf.Module`` and ``NUMPY`` for a string pointing to an existing
        file.

    Raises:
        FileNotFoundError: If ``model`` is a string but not an existing
            file.
        TypeError: If the type of ``model`` is none of the above.
    """
    if isinstance(model, torch.nn.Module):
        return DeepLearningFramework.PYTORCH
    if isinstance(model, tf.Module) and model is not None:
        return DeepLearningFramework.TENSORFLOW
    if not isinstance(model, str):
        raise TypeError(f"Model type {type(model)} not supported.")
    if not Path(model).is_file():
        raise FileNotFoundError(
            f"No file '{model}' found, please provide a valid path to "
            f"a model."
        )
    return DeepLearningFramework.NUMPY


def check_input_data(input_data: Union[Iterable, Sequence]):
    """Tell whether ``input_data`` has the expected sample layout.

    The first sample must be a tuple whose first element is a tuple of
    inputs starting with a numpy array, torch tensor or TensorFlow tensor.
    If the sample has a second element (the label) it must be one of those
    types, an ``int``, a ``float`` or ``None``.

    Explicit checks are used instead of ``assert`` statements so the
    validation still happens when Python runs with ``-O``.

    Args:
        input_data: Data to validate; only the first sample is inspected.

    Returns:
        ``True`` when the layout is valid, ``False`` otherwise (including
        when ``input_data`` is empty or does not support ``len`` /
        indexing).
    """
    try:
        if not len(input_data) > 0:
            return False
        first_sample = input_data[0]
        if not isinstance(first_sample, tuple):
            return False
        if not isinstance(first_sample[0], tuple):
            return False
        tensor_types = (np.ndarray, torch.Tensor, tf.Tensor)
        if not isinstance(first_sample[0][0], tensor_types):
            return False
        if len(first_sample) > 1:
            label_types = tensor_types + (int, float, type(None))
            if not isinstance(first_sample[1], label_types):
                return False
    except Exception:
        # Any failure while probing the data means the layout is invalid.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return False
    return True


def is_data_subscriptable(input_data: Union[Iterable, Sequence]):
    """Tell whether ``input_data[0]`` can be evaluated.

    Args:
        input_data: Object to probe.

    Returns:
        ``True`` when indexing with ``0`` succeeds, ``False`` when it raises
        (not subscriptable, empty, ...).
    """
    try:
        input_data[0]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not swallowed and reported as "False".
        return False
    return True


def check_dynamic_info_inputs(
    dynamic_info: Optional[Dict], input_sample: Tuple[Any]
):
    """Validate a user-provided dynamic info dict against a sample.

    Nothing is checked when ``dynamic_info`` is ``None``.

    Args:
        dynamic_info: Dict expected to hold ``"inputs"`` and ``"outputs"``
            entries, or ``None``.
        input_sample: Inputs of one sample; only its length (and the
            ``shape`` of its items, for the error message) is used.

    Raises:
        AssertionError: If ``"inputs"`` or ``"outputs"`` is missing, or if
            the number of ``"inputs"`` entries differs from the number of
            inputs in ``input_sample``.
    """
    if dynamic_info is None:
        return

    assert dynamic_info.get("inputs") is not None, (
        "Dynamic info must contain an 'inputs' key with a list of "
        "dictionaries as value."
    )

    num_dynamic_inputs = len(dynamic_info["inputs"])
    num_model_inputs = len(input_sample)
    assert num_dynamic_inputs == num_model_inputs, (
        f"The number of dynamic inputs provided in the dynamic info "
        f"dict ({num_dynamic_inputs}) is not equal to the number "
        f"of inputs of the model ({num_model_inputs}). Detected model "
        f"input shapes are: {[input.shape for input in input_sample]} "
    )

    assert dynamic_info.get("outputs") is not None, (
        "Dynamic info must contain an 'outputs' key with a list of "
        "dictionaries as value."
    )


def extract_info_from_data(
    model: Any,
    input_data: DataManager,
    dl_framework: DeepLearningFramework,
    dynamic_info: Optional[Dict],
    device: Device,
    is_diffusion: bool = False,
):
    """Build the ``ModelParams`` describing ``model`` from sample data.

    The framework-specific helpers registered in ``INFO_EXTRACTION_DICT``
    and ``OUTPUT_INFO_COMPUTATION_DICT`` provide the input and output
    information respectively.

    Args:
        model: Model to describe.
        input_data: Data used to probe the model.
        dl_framework: Framework of ``model``; selects the helpers.
        dynamic_info: Optional user-provided dynamic axes, validated first.
        device: Device the model is run on.
        is_diffusion: Forwarded to the info-extraction helper.

    Returns:
        The ``ModelParams`` built from the collected information.
    """
    first_inputs = input_data.get_list(1)[0]
    check_dynamic_info_inputs(dynamic_info, first_inputs)

    extract_info = INFO_EXTRACTION_DICT[dl_framework]
    batch_size, input_sizes, input_types, dynamic_info = extract_info(
        model,
        input_data,
        dynamic_axis=dynamic_info,
        device=device,
        is_diffusion=is_diffusion,
    )

    compute_output_info = OUTPUT_INFO_COMPUTATION_DICT[dl_framework]
    output_infos = compute_output_info(model, input_data[0][0], device)

    input_infos = []
    for size, dtype in zip(input_sizes, input_types):
        input_infos.append({"size": size, "dtype": dtype})

    return ModelParams(
        batch_size=batch_size,
        input_infos=input_infos,
        output_sizes=[info[0] for info in output_infos],
        output_types=[info[1] for info in output_infos],
        dynamic_info=dynamic_info,
    )


def is_huggingface_data(data_sample: Any) -> bool:
    """Tell whether ``data_sample`` is dict-like, a string, or starts with one.

    The checks run in that order and stop at the first match, so
    ``data_sample[0]`` is only evaluated for non-dict, non-string samples.
    """
    return (
        is_dict_type(data_sample)
        or isinstance(data_sample, str)
        or isinstance(data_sample[0], str)
    )


def is_dict_type(data_sample: Any):
    """Tell whether ``data_sample.items()`` can be called.

    Returns:
        ``False`` when the call raises ``AttributeError``, ``True``
        otherwise.
    """
    try:
        data_sample.items()
        return True
    except AttributeError:
        return False


def _get_idx(device: str) -> int:
    device_info = device.split(":")
    if len(device_info) == 2 and device_info[1].isdigit():
        idx = int(device_info[1])
    else:
        idx = 0
    return idx


def _set_device(
    accelerator_is_available: bool, device_type: DeviceType, idx: int
) -> Device:
    """Return the requested device, or the CPU when it is unavailable.

    Args:
        accelerator_is_available: Whether the requested accelerator was
            detected.
        device_type: Requested accelerator type.
        idx: Requested accelerator index.

    Returns:
        ``Device(device_type, idx=idx)`` when available; otherwise a
        warning is logged and ``Device(DeviceType.CPU)`` is returned.
    """
    if accelerator_is_available:
        return Device(device_type, idx=idx)

    logger.warning(
        f"Selected {device_type.name} device but no available "
        f"{device_type.name} found on this platform. CPU will "
        f"be used instead. Please make sure that the "
        f"{device_type.name} is installed and can be used by your "
        "framework."
    )
    return Device(DeviceType.CPU)


def check_device(device: Optional[str] = None) -> Device:
    """Resolve a device string (or None) into a ``Device`` object.

    When ``device`` is None the accelerators are probed in the order GPU,
    Neuron, TPU and the first available one is selected, with CPU as the
    fallback. Otherwise the string is matched case-insensitively against
    "cuda"/"gpu", "neuron" and "tpu" (in that order); any other string
    selects the CPU. When an accelerator is explicitly requested but not
    available, ``_set_device`` logs a warning and returns a CPU device.

    Args:
        device (Optional[str]): Device description such as ``"cuda:1"``,
            or None for automatic selection.

    Returns:
        Device: The resolved device.
    """
    if device is None:
        # Automatic selection: the first available accelerator wins.
        if gpu_is_available():
            return Device(DeviceType.GPU)
        if neuron_is_available():
            return Device(DeviceType.NEURON)
        if tpu_is_available():
            return Device(DeviceType.TPU)
        return Device(DeviceType.CPU)

    requested = device.lower()
    if "cuda" in requested or "gpu" in requested:
        availability_check, device_type = gpu_is_available, DeviceType.GPU
    elif "neuron" in requested:
        availability_check, device_type = (
            neuron_is_available,
            DeviceType.NEURON,
        )
    elif "tpu" in requested:
        availability_check, device_type = tpu_is_available, DeviceType.TPU
    else:
        return Device(DeviceType.CPU)

    return _set_device(
        accelerator_is_available=availability_check(),
        device_type=device_type,
        idx=_get_idx(device),
    )


def get_gpu_compute_capability(gpu_idx: int) -> float:
    """Query ``nvidia-smi`` for the compute capability of one GPU.

    Args:
        gpu_idx (int): Index of the line to pick in the ``nvidia-smi``
            output (one line is printed per GPU).

    Returns:
        float: The compute capability parsed from the selected line.
    """
    query = ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"]
    raw_output = subprocess.check_output(query)
    lines = raw_output.decode("utf-8").split("\n")
    return float(lines[gpu_idx])


# Per-framework dispatch table: maps each supported framework to the function
# that extracts the batch size, input sizes, input types and dynamic-axis
# info from a model and its input data.
INFO_EXTRACTION_DICT: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: extract_info_from_torch_data,
    DeepLearningFramework.TENSORFLOW: extract_info_from_tf_data,
    DeepLearningFramework.NUMPY: extract_info_from_np_data,
}

# Per-framework dispatch table: maps each supported framework to the function
# that computes the output infos of a model; callers read each entry as
# (size, type).
# NOTE(review): NUMPY is mapped to the ONNX helper — presumably because numpy
# inputs are used with ONNX models; confirm.
OUTPUT_INFO_COMPUTATION_DICT: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: get_output_info_torch,
    DeepLearningFramework.TENSORFLOW: get_output_info_tf,
    DeepLearningFramework.NUMPY: get_output_info_onnx,
}


================================================
FILE: optimization/nebullvm/nebullvm/tools/venv.py
================================================
import subprocess
import tempfile
import venv

from loguru import logger


class EnvBuilder(venv.EnvBuilder):
    """``venv.EnvBuilder`` that remembers the context of the created env.

    After ``create()`` has run, the creation context (which exposes, among
    others, the path of the environment's interpreter) is available through
    the ``context`` attribute.
    """

    def __init__(self, *builder_args, **builder_kwargs):
        """Forward all arguments to ``venv.EnvBuilder``."""
        super().__init__(*builder_args, **builder_kwargs)
        # Filled in by post_setup once the environment has been created.
        self.context = None

    def post_setup(self, context):
        """Hook called by ``venv`` after creation: store the context."""
        self.context = context


def run_in_different_venv(
    requirements_file: str,
    script_path: str,
    use_gpu: bool,
    *args,
):
    """Execute a Python script inside a throw-away virtual environment.

    A temporary virtual environment is created, a pinned torch/torchvision
    pair is installed in it, the requirements file is installed on top and
    finally the script is executed with the environment's interpreter.
    It is roughly equivalent to creating and activating a new environment
    and running
    > pip install <torch and torchvision>
    > pip install -r $requirements_file
    > python script_path *args

    Args:
        requirements_file (str): File (.txt) containing the list of
            requirements.
        script_path (str): Path to the script that must be run.
        use_gpu (bool): If True the CUDA 11.1 builds of torch 1.9.1 and
            torchvision 0.10.1 are installed, otherwise versions up to
            torch 1.9.1 / torchvision 0.10.1 are installed from the default
            index.
        args: Arguments of the script.

    Raises:
        subprocess.CalledProcessError: If any pip command or the script
            exits with a non-zero status.
    """
    logger.debug(f"Debug: Running script {script_path} in a new virtual env.")
    with tempfile.TemporaryDirectory() as target_dir_path:
        logger.debug("Debug: Creating virtual environment...")
        venv_builder = EnvBuilder(with_pip=True)
        venv_builder.create(str(target_dir_path))
        python_exe = venv_builder.context.env_exe

        logger.debug("Debug: Installing requirements...")
        # Every install goes through the pip of the new environment.
        pip_install = [python_exe, "-m", "pip", "install"]

        if use_gpu:
            torch_packages = [
                "torch==1.9.1+cu111",
                "torchvision==0.10.1+cu111",
                "-f",
                "https://download.pytorch.org/whl/torch_stable.html",
            ]
        else:
            torch_packages = ["torch<=1.9.1", "torchvision<=0.10.1"]
        subprocess.check_call(pip_install + torch_packages)

        subprocess.check_call(pip_install + ["-r", requirements_file])

        logger.debug("Debug: Executing script...")
        subprocess.check_call([python_exe, script_path, *args])


================================================
FILE: optimization/nebullvm/nebullvm.toml
================================================
[build-system]
requires = [
    "setuptools>=42",
    "wheel"
]
build-backend = "setuptools.build_meta"

================================================
FILE: optimization/nebullvm/requirements-dev.txt
================================================
pytest
pytest-mock
torchvision
sentencepiece


================================================
FILE: optimization/nebullvm/requirements.txt
================================================
numpy>=1.21.0, <1.24.0
packaging>=21.3
py-cpuinfo==8.0.0
PyYAML>=6.0
psutil>=5.0.0
requests>=2.26.1
tqdm>=4.36.0
loguru>=0.5.3

================================================
FILE: optimization/nebullvm/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages


# Runtime dependencies, passed to setuptools as ``install_requires``.
REQUIREMENTS = [
    "numpy>=1.21.0, <1.24.0",
    "py-cpuinfo>=8.0.0",
    "PyYAML>=6.0",
    "psutil>=5.0.0",
    "requests>=2.26.0",
    "tqdm>=4.36.0",
    "packaging>=21.3",
    "loguru>=0.5.3",
]

# The README next to this file is used as the package long description.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf8")

setup(
    name="nebullvm",
    version="0.10.0",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    long_description=long_description,
    include_package_data=True,
    long_description_content_type="text/markdown",
)


================================================
FILE: optimization/open_alpha_tensor/README.md
================================================
# 🐉 OpenAlphaTensor
OpenAlphaTensor provides an open-source implementation of Deepmind's AlphaTensor algorithm.

With OpenAlphaTensor, you can increase the computational performance of an AI model with custom-generated matrix multiplication algorithms. You can train your own AlphaTensor algorithm for a specific matrix size or fine-tune a pre-trained AlphaTensor model to produce optimized kernels for specific hardware.

OpenAlphaTensor is based on Deepmind's paper [Discovering Faster Matrix Multiplication Algorithms with Reinforcement Learning](https://www.nature.com/articles/s41586-022-05172-4).

If you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers)

## 🧑‍🏫 Installation
You can install the package by cloning the repository and running the following commands:
```bash
git clone https://github.com/nebuly-ai/nebullvm.git
cd nebullvm/apps/accelerate/open_alpha_tensor
pip install -e .
```

## 🚀 Get started
For training your AlphaTensor model, you can execute the following command:
```bash
python main.py 
```
Model parameters can be given either as command line arguments or as a JSON file. The `config.json` file contains the default parameters for training a model for matrix size 4x4x4.

Alternatively, if you want more fine-grained control over the training process, you can use the Python API:
```python
from open_alpha_tensor import train_alpha_tensor

cardinality_vector = 5  # The actions can have values in range [-2, 2]
N_bar = 100  # parameter for smoothing the temperature while adjusting the probability distribution
matrix_size = 5
input_size = matrix_size**2
n_steps = 15
n_actions = cardinality_vector ** (3 * input_size // n_steps)
action_memory = 7

train_alpha_tensor(
    tensor_length=action_memory + 1,
    input_size=input_size,
    scalars_size=1,
    emb_dim=2048,
    n_steps=n_steps,
    n_logits=n_actions,
    n_samples=32,
    device="cuda",
    len_data=2048,
    n_synth_data=1000000,
    pct_synth=0.7,
    batch_size=32,
    epochs=600000,
    lr=1e-4,
    lr_decay_factor=0.5,
    lr_decay_steps=5000,
    weight_decay=1e-5,
    optimizer_name="adamw",
    loss_params=(1, 1),
    limit_rank=150,
    checkpoint_dir="path/to/checkpoint/dir",
    checkpoint_data_dir="path/where/to/save/data/generated/by/the/model",
    n_actors=1,
    mc_n_sim=200,
    n_cob=100000,
    cob_prob=0.9983,
    data_augmentation=True,
    N_bar=N_bar,
    random_seed=42,
    extra_devices=None,
    save_dir="path/to/save/final/model",
)
```

## 🧪 Missing features
- [ ] Release weights of pre-trained models. **Coming out soon**.
- [ ] Add compilation of Alpha Tensor kernels in OpenAI's Triton and JAX/XLA.
- [ ] Add support for fine-tuning on target hardware.
- [ ] Support training on Multiple GPUs (it allows training on a larger batch size).
- [ ] Add support for other compilers (e.g. llvm).
- [ ] Reduce memory footprint of the Acting Agent.
- [ ] Improve acting speed.

## 💫 Contributing

We welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved.

A special thanks to [BrianPulfer](https://github.com/BrianPulfer) for his awesome contribution to the OpenAlphaTensor module.


================================================
FILE: optimization/open_alpha_tensor/config.json
================================================
{
    "batch_size": 16,
    "max_epochs": 600000,
    "action_memory": 7,
    "optimizer": "adamw",
    "weight_decay": 1e-5,
    "lr": 1e-4,
    "lr_decay_factor": 0.1,
    "lr_decay_steps": 500000,
    "device": "cuda:0",
    "len_data": 2048,
    "pct_synth": 0.9,
    "n_synth_data": 100000,
    "limit_rank": 125,
    "alpha": 1.0,
    "beta": 1.0,
    "matrix_size": 4,
    "embed_dim": 1024,
    "actions_sampled": 32,
    "n_actors": 1,
    "mc_n_sim": 200,
    "n_cob": 100000,
    "cob_prob": 0.9983,
    "cardinality_vector": 5,
    "n_bar": 100
}

================================================
FILE: optimization/open_alpha_tensor/main.py
================================================
import json
import os
from argparse import ArgumentParser
from pathlib import Path

from open_alpha_tensor import train_alpha_tensor


def _compute_largest_divisor(n: int) -> int:
    """Compute the largest divisor of n."""
    for i in range(n // 2, 0, -1):
        if n % i == 0:
            return i
    return 1


def main():
    """Command line entry point for training an AlphaTensor model.

    Default values are read from the JSON file pointed to by the
    ``CONFIG_FILE`` environment variable ("config.json" when unset), if that
    file exists; values given on the command line take precedence. The
    derived quantities (input size, number of steps, number of logits) are
    then computed and ``train_alpha_tensor`` is called.
    """
    config_file = Path(os.getenv("CONFIG_FILE", "config.json"))
    config = {}
    if config_file.exists():
        with open(config_file) as f:
            config = json.load(f)

    parser = ArgumentParser()
    # (flag, type, default) for every plain typed option, in help order.
    # NOTE: a "--half" store_true flag is currently disabled.
    typed_options = (
        ("--batch_size", int, 1),
        ("--max_epochs", int, 1),
        ("--action_memory", int, 1),
        ("--optimizer", str, "adamw"),
        ("--weight_decay", float, 1e-5),
        ("--lr", float, 1e-4),
        ("--lr_decay_factor", float, 0.5),
        ("--lr_decay_steps", int, 5000),
        ("--device", str, "cuda"),
        ("--len_data", int, 100),
        ("--pct_synth", float, 0.5),
        ("--n_synth_data", int, 100),
        ("--limit_rank", int, 15),
        ("--alpha", float, 1.0),
        ("--beta", float, 1.0),
        ("--random_seed", int, None),
        ("--checkpoint_dir", str, None),
        ("--checkpoint_data_dir", str, None),
        ("--matrix_size", int, 3),
        ("--embed_dim", int, 1024),
        ("--actions_sampled", int, 10),
        ("--n_actors", int, 1),
        ("--mc_n_sim", int, 100),
        ("--n_cob", int, 100000),
        ("--cob_prob", float, 0.9983),  # 1 - 0.0017
    )
    for flag, value_type, default_value in typed_options:
        parser.add_argument(flag, type=value_type, default=default_value)
    parser.add_argument("--data_augmentation", action="store_true")
    parser.add_argument("--cardinality_vector", type=int, default=5)
    parser.add_argument(
        "--n_bar",
        type=int,
        default=100,
        help="N_bar parameter for policy temperature.",
    )
    parser.add_argument("--save_dir", type=str, default=None)
    parser.add_argument("extra_devices", nargs="*", type=str, default=[])
    # Values from the config file replace the parser defaults.
    parser.set_defaults(**config)
    args = parser.parse_args()

    input_size = args.matrix_size**2
    n_steps = _compute_largest_divisor(input_size)
    n_actions = args.cardinality_vector ** (3 * input_size // n_steps)

    train_alpha_tensor(
        tensor_length=args.action_memory + 1,
        input_size=input_size,
        scalars_size=1,
        emb_dim=args.embed_dim,
        n_steps=n_steps,
        n_logits=n_actions,
        n_samples=args.actions_sampled,
        device=args.device,
        len_data=args.len_data,
        n_synth_data=args.n_synth_data,
        pct_synth=args.pct_synth,
        batch_size=args.batch_size,
        epochs=args.max_epochs,
        lr=args.lr,
        lr_decay_factor=args.lr_decay_factor,
        lr_decay_steps=args.lr_decay_steps,
        weight_decay=args.weight_decay,
        optimizer_name=args.optimizer,
        loss_params=(args.alpha, args.beta),
        limit_rank=args.limit_rank,
        random_seed=args.random_seed,
        checkpoint_dir=args.checkpoint_dir,
        checkpoint_data_dir=args.checkpoint_data_dir,
        n_actors=args.n_actors,
        mc_n_sim=args.mc_n_sim,
        n_cob=args.n_cob,
        cob_prob=args.cob_prob,
        data_augmentation=args.data_augmentation or False,
        N_bar=args.n_bar,
        extra_devices=args.extra_devices,
        save_dir=args.save_dir,
    )


# Run the training entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/__init__.py
================================================
from open_alpha_tensor.api.functions import train_alpha_tensor  # noqa: F401


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/api/__init__.py
================================================


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/api/functions.py
================================================
from typing import List, Tuple

from open_alpha_tensor.root_op import TrainAlphaTensorRootOp


def train_alpha_tensor(
    tensor_length: int,
    input_size: int,
    scalars_size: int,
    emb_dim: int,
    n_steps: int,
    n_logits: int,
    n_samples: int,
    optimizer_name: str,
    lr: float,
    lr_decay_factor: float,
    lr_decay_steps: int,
    weight_decay: float,
    loss_params: Tuple[float, float],
    checkpoint_dir: str,
    checkpoint_data_dir: str,
    epochs: int,
    batch_size: int,
    len_data: int,
    n_synth_data: int,
    pct_synth: float,
    limit_rank: int,
    n_actors: int,
    mc_n_sim: int,
    N_bar: int,
    device: str,
    save_dir: str,
    random_seed: int,
    n_cob: int,
    cob_prob: float,
    data_augmentation: bool,
    extra_devices: List[str],
):
    """Train an AlphaTensor model to learn more efficient matrix
    multiplications.

    All the arguments are forwarded unchanged, as keyword arguments, to
    ``TrainAlphaTensorRootOp.execute``.

    Args:
        tensor_length (int): Number of tensors kept as history.
        input_size (int): Flattened size of the matrices to be multiplied.
        scalars_size (int): Size of the scalar vectors fed to the torso
            model.
        emb_dim (int): Embedding dimension.
        n_steps (int): Number of steps used to get a single action out of a
            triplet.
        n_logits (int): Number of logits output by the policy head.
        n_samples (int): Number of samples used by the policy head at
            evaluation time.
        optimizer_name (str): Name of the optimizer used.
        lr (float): Learning rate.
        lr_decay_factor (float): Learning rate's decay factor.
        lr_decay_steps (int): Number of learning rate's decay steps.
        weight_decay (float): Weight decay used by the optimizer.
        loss_params (Tuple[float, float]): Alpha and Beta parameters used in
            the loss function.
        checkpoint_dir (str): Directory used to store model checkpoints.
        checkpoint_data_dir (str): Directory used to store games as JSON
            files.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size.
        len_data (int): Number of training samples used (both actor
            generated and synthetic).
        n_synth_data (int): Number of synthetic training samples.
        pct_synth (float): Initial percentage of synthetic samples used for
            training.
        limit_rank (int): Maximum number of steps per episode and maximum
            rank for synthetically-generated matrices.
        n_actors (int): Number of actors, each playing a single game at
            each training step.
        mc_n_sim (int): Number of simulations during Monte Carlo tree
            search.
        N_bar (int): N_bar parameter used to compute tau when improving the
            policy.
        device (str): The name of the torch device used for training.
        save_dir (str): Directory where the final trained model will be
            stored.
        random_seed (int): Seed for the random number generators.
        n_cob (int): Number of changes of basis (cob) used for a single
            training sample.
        cob_prob (float): Probability of applying a change of basis.
        data_augmentation (bool): Whether to randomly swap the last
            operation of an episode with another operation.
        extra_devices (List[str]): Names of the extra devices used for
            multi-GPU training.

    Returns:
        The value returned by ``get_result()`` of the root operation after
        ``execute`` has run.
    """
    training_options = {
        "tensor_length": tensor_length,
        "input_size": input_size,
        "scalars_size": scalars_size,
        "emb_dim": emb_dim,
        "n_steps": n_steps,
        "n_logits": n_logits,
        "n_samples": n_samples,
        "optimizer_name": optimizer_name,
        "lr": lr,
        "lr_decay_factor": lr_decay_factor,
        "lr_decay_steps": lr_decay_steps,
        "weight_decay": weight_decay,
        "loss_params": loss_params,
        "checkpoint_dir": checkpoint_dir,
        "checkpoint_data_dir": checkpoint_data_dir,
        "epochs": epochs,
        "batch_size": batch_size,
        "len_data": len_data,
        "n_synth_data": n_synth_data,
        "pct_synth": pct_synth,
        "limit_rank": limit_rank,
        "n_actors": n_actors,
        "mc_n_sim": mc_n_sim,
        "N_bar": N_bar,
        "device": device,
        "save_dir": save_dir,
        "random_seed": random_seed,
        "n_cob": n_cob,
        "cob_prob": cob_prob,
        "data_augmentation": data_augmentation,
        "extra_devices": extra_devices,
    }
    root_op = TrainAlphaTensorRootOp()
    root_op.execute(**training_options)
    return root_op.get_result()


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/config.py
================================================
# Base directory names — presumably the default locations for model
# checkpoints and for stored game data; confirm against their usage.
BASE_CHECKPOINT_DIR = "checkpoints"
BASE_CHECKPOINT_DATA_DIR = "games"


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/__init__.py
================================================


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/actors/__init__.py
================================================


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/actors/stage.py
================================================
from typing import Dict, List

import torch

from open_alpha_tensor.core.data.utils import (
    get_scalars,
    map_action_to_triplet,
)
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel


def game_is_finished(state):
    """Check whether every entry of the state is zero.

    Args:
        state (torch.Tensor): The state of the game, expected with size
            (1, S, S, S).

    Returns:
        The result of ``.all()`` on the element-wise comparison with zero.
    """
    is_zero = state == 0
    return is_zero.all()


def remove_duplicates(reducing_tensor: torch.Tensor):
    """Drop repeated entries along the second axis of a tensor.

    Entries are compared in order; the first occurrence of each distinct
    entry is kept and later identical entries are recorded as its
    duplicates.

    Args:
        reducing_tensor (torch.Tensor): The tensor to remove duplicates
            from, with shape (1, N_mc, S, S, S).

    Returns:
        A tuple ``(unique_tensor, old_idx_to_new_idx, idx_map, indexes)``:
        the tensor restricted to the kept entries, a dict mapping every
        original index to the position of its representative among the kept
        entries, a dict mapping each kept index to the list of original
        indexes duplicating it, and the list of kept original indexes.
    """
    # reducing tensor has shape (1, N_mc, S, S, S)
    kept_indexes = []
    duplicates_of = {}
    for candidate in range(reducing_tensor.shape[1]):
        candidate_entry = reducing_tensor[:, candidate]
        for kept in kept_indexes:
            if (reducing_tensor[:, kept] - candidate_entry == 0).all():
                duplicates_of[kept].append(candidate)
                break
        else:
            # No kept entry matches (or none is kept yet): new unique entry.
            kept_indexes.append(candidate)
            duplicates_of[candidate] = []

    # Map every original index to the position of its representative.
    old_idx_to_new_idx_map = {}
    for position, (kept, duplicates) in enumerate(duplicates_of.items()):
        for original_idx in (kept, *duplicates):
            old_idx_to_new_idx_map[original_idx] = position

    unique_tensor = reducing_tensor[:, kept_indexes]
    return unique_tensor, old_idx_to_new_idx_map, duplicates_of, kept_indexes


def extract_children_states_from_actions(
    state: torch.Tensor,
    actions: torch.Tensor,
    vec_cardinality: int = 5,
):
    """Extract the children states from the actions.

    Every action is mapped to a triplet (u, v, w), whose outer product is
    subtracted from the current tensor to obtain the child's present state.
    Duplicated outer products are removed before building the children.

    Args:
        state (torch.Tensor): The state of the game, with shape
            (1, T, S, S, S); entry 0 along the second axis is the present
            tensor, the remaining entries are the history.
        actions (torch.Tensor): The actions to apply to the state, with
            shape (1, K, N_steps).
        vec_cardinality (int, optional): The cardinality of the vectors,
            i.e. the number of values an entry of u, v or w can take.

    Returns:
        A tuple ``(children, old_idx_to_new_idx, repetition_map,
        not_duplicate_indexes)`` where ``children`` is a list of K tensors
        and the other three items are the ones returned by
        ``remove_duplicates``. Only the first ``len(not_duplicate_indexes)``
        children are built from a unique outer product; the list is still K
        long because downstream code relies on that length.
    """
    # Each action is mapped in a unique way to a triplet (u, v, w) where
    # each vector has size S, with N_logits = |F|^(3S/N_steps).
    bs, k, n_steps = actions.shape[:3]
    vec_dim = state.shape[2]
    len_token = 3 * vec_dim // n_steps
    actions = map_action_to_triplet(actions, vec_cardinality, len_token)
    actions = actions.reshape(bs, k, n_steps * len_token)
    u = actions[:, :, :vec_dim].reshape(bs, k, vec_dim, 1, 1)
    v = actions[:, :, vec_dim : 2 * vec_dim].reshape(  # noqa E203
        bs, k, 1, vec_dim, 1
    )
    w = actions[:, :, 2 * vec_dim :].reshape(bs, k, 1, 1, vec_dim)  # noqa E203
    reducing_tensor = u * v * w
    (
        reducing_tensor,
        old_idx_to_new_idx,
        repetition_map,
        not_duplicate_indexes,
    ) = remove_duplicates(reducing_tensor)
    old_state = state[:, 0]
    new_state = old_state.unsqueeze(1) - reducing_tensor
    # Shift the history by one slot along the T axis and drop the oldest
    # entry (equivalent to state[:, 1:-1]). `dims` must be given explicitly:
    # without it torch.roll flattens the tensor and shifts it by a single
    # scalar, which scrambles the history instead of shifting it.
    rolling_states = torch.roll(state, 1, dims=1)[:, 2:]
    return (
        [
            torch.cat(
                [
                    new_state[:, i : i + 1],  # noqa E203
                    reducing_tensor[:, i : i + 1],  # noqa E203
                    rolling_states,
                ],
                dim=1,
            )
            for i in range(k)
        ],
        old_idx_to_new_idx,
        repetition_map,
        not_duplicate_indexes,
    )


def _reduce_memory_consumption_before_storing(
    possible_states: List[torch.Tensor],
):
    """Reduce the memory consumption before storing the states.

    Args:
        possible_states (List[torch.Tensor]): The possible states.
    """
    final_states = [state[:, 0:2] for state in possible_states]
    previous_actions = possible_states[0][:, 2:]
    storing_dict = {
        "final_states": final_states,
        "previous_actions": previous_actions,
    }
    return storing_dict


def _recompose_possible_states(reduced_memory_states_dict: Dict):
    """Recompose the possible states from the reduced memory states.

    Args:
        reduced_memory_states_dict (Dict): The reduced memory states.
    """
    final_states = reduced_memory_states_dict["final_states"]
    previous_actions = reduced_memory_states_dict["previous_actions"]
    possible_states = [
        torch.cat(
            [
                final_states[i],
                previous_actions,
            ],
            dim=1,
        )
        for i in range(len(final_states))
    ]
    return possible_states


def extract_present_state(state: torch.Tensor) -> torch.Tensor:
    """Return entry 0 along the second axis of the state."""
    return state[:, 0, ...]


def to_hash(tensor: torch.Tensor) -> str:
    """Serialize a tensor into a string usable as a dictionary key.

    The tensor is flattened, cast to integers and its values are joined
    with underscores.

    Args:
        tensor: The tensor to convert.

    Returns:
        str: The underscore-separated integer values.
    """
    flat_values = tensor.reshape(-1).long().detach().cpu().tolist()
    return "_".join(str(value) for value in flat_values)


def from_hash(hashable_tensor: str, shape: tuple) -> torch.Tensor:
    """Converts a hash string back to a tensor of the given shape.

    Args:
        hashable_tensor (str): The hash string: values separated by
            underscores. An empty string stands for an empty tensor.
        shape (tuple): The shape of the original tensor.

    Returns:
        torch.Tensor: A float tensor holding the parsed values.

    Raises:
        ValueError: If one of the values cannot be parsed as a float.
        RuntimeError: If the number of values does not match ``shape``.
    """
    # "".split("_") yields [""], which float() cannot parse: treat the empty
    # hash (produced by hashing an empty tensor) as "no values".
    if hashable_tensor:
        values = [float(x) for x in hashable_tensor.split("_")]
    else:
        values = []
    # reshape replaces the deprecated non in-place Tensor.resize.
    return torch.tensor(values).reshape(shape)


def record_action(tree_dict: Dict, state: str, action: str):
    """Append an action to the list of actions recorded for a state.

    The list is created when the state is not in the dictionary yet.

    Args:
        tree_dict (Dict): The tree dictionary, modified in place.
        state (str): The state as a hash string.
        action (str): The action as a hash string.
    """
    tree_dict.setdefault(state, []).append(action)


def select_future_state(
    possible_states: List[torch.Tensor],
    q_values: torch.Tensor,
    N_s_a: torch.Tensor,
    repetitions: Dict[int, list],
    c_1: float = 1.25,
    c_2: float = 19652,
    return_idx: bool = False,
) -> torch.Tensor:
    """Select the future state maximizing the upper confidence bound.

    Args:
        possible_states (List[torch.Tensor]): The candidate states.
        q_values (torch.Tensor): Action values, with shape (1, K, 1).
        N_s_a (torch.Tensor): Visit counts of the actions.
        repetitions (Dict[int, list]): For each kept action index, the list
            of indexes duplicating it; the length of the list is used as
            the prior weight of the action.
        c_1 (float): First exploration constant.
        c_2 (float): Second exploration constant.
        return_idx (bool): If True the index of the best state is returned
            instead of the state itself.

    Returns:
        The index (as a tensor) of the best state when ``return_idx`` is
        True, otherwise the best state.
    """
    # q_values (1, K, 1)
    duplicate_counts = []
    for state_idx in range(len(possible_states)):
        if state_idx in repetitions:
            duplicate_counts.append(len(repetitions[state_idx]))
    pi = torch.tensor(duplicate_counts).to(q_values.device)

    n_actions = N_s_a.shape[1]
    if pi.shape[0] != n_actions:
        # Size mismatch: report it and truncate the priors.
        print(pi)
        print(pi.shape, q_values.shape, N_s_a.shape)
        pi = pi[:n_actions]

    visit_term = torch.sqrt(torch.sum(N_s_a) / (1 + N_s_a))
    log_term = c_1 + torch.log((torch.sum(N_s_a) + c_2 + 1) / c_2)
    ucb = q_values.reshape(-1) + pi * visit_term * log_term

    best_idx = ucb.argmax()
    if return_idx:
        return best_idx
    return possible_states[best_idx]


@torch.no_grad()
def simulate_game(
    model,
    state: torch.Tensor,
    t_time: int,
    max_steps: int,
    game_tree: Dict,
    states_dict: Dict,
    horizon: int = 5,
):
    """Simulates a game from a given state.

    The simulation walks the already expanded part of the tree (selection),
    expands the first state that is not in the tree yet (expansion) and
    finally propagates the value of that leaf along the visited trajectory
    (backup). ``game_tree`` and ``states_dict`` are updated in place.

    Args:
        model: The model to use for the simulation.
        state (torch.Tensor): The initial state.
        t_time (int): The current time step.
        max_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree, mapping the hash of an expanded
            state to the hashes of its children.
        states_dict (Dict): The states dictionary, mapping the hash of an
            expanded state to its stored children, index maps, visit
            counts, q values and actions.
        horizon (int): The horizon to use for the simulation.
    """
    idx = t_time
    max_steps = min(max_steps, t_time + horizon)
    state_hash = to_hash(extract_present_state(state))
    trajectory = []
    # selection
    while state_hash in game_tree:
        (
            possible_states_dict,
            _,
            repetition_map,
            N_s_a,
            q_values,
            _,
        ) = states_dict[state_hash]
        possible_states = _recompose_possible_states(possible_states_dict)
        state_idx = select_future_state(
            possible_states, q_values, N_s_a, repetition_map, return_idx=True
        )
        trajectory.append((state_hash, state_idx))  # state_hash, action_idx
        state = possible_states[state_idx]
        state_hash = to_hash(extract_present_state(state))
        idx += 1

    # expansion
    if idx <= max_steps:
        trajectory.append((state_hash, None))
        if game_is_finished(extract_present_state(state)):
            # Terminal leaf: the tensor is fully decomposed, so no further
            # cost is expected. Without this assignment the backup below
            # would fail on an unbound `leaf_q_value`.
            leaf_q_value = 0
        else:
            state = state.to(model.device)
            scalars = get_scalars(state, idx).to(state.device)
            actions, probs, q_values = model(state, scalars)
            (
                possible_states,
                cloned_idx_to_idx,
                repetitions,
                not_dupl_indexes,
            ) = extract_children_states_from_actions(
                state,
                actions,
            )
            # Statistics are kept on CPU, one entry per unique action.
            not_dupl_actions = actions[:, not_dupl_indexes].to("cpu")
            not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to(
                "cpu"
            )
            N_s_a = torch.zeros_like(not_dupl_q_values).to("cpu")
            present_hash = to_hash(extract_present_state(state))
            states_dict[present_hash] = (
                _reduce_memory_consumption_before_storing(possible_states),
                cloned_idx_to_idx,
                repetitions,
                N_s_a,
                not_dupl_q_values,
                not_dupl_actions,
            )
            game_tree[present_hash] = [
                to_hash(extract_present_state(fut_state))
                for fut_state in possible_states
            ]
            leaf_q_value = q_values
    else:
        # Step budget exhausted: penalize with the summed matrix rank.
        leaf_q_value = -int(torch.linalg.matrix_rank(state).sum())
    # backup
    backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value)


def backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):
    """Backward pass of the Monte Carlo algorithm.

    The trajectory is walked from the leaf back to the root. The leaf
    contributes ``leaf_q_value`` to the running reward; every other node
    decreases the reward by one and updates, in place, the running mean of
    the q value and the visit count of the action taken from it.

    Args:
        trajectory: List of ``(state_hash, action_idx)`` pairs, where
            ``action_idx`` is None for the leaf.
        states_dict: The states dictionary; the visit counts (position 3)
            and q values (position 4) of the visited states are modified.
        leaf_q_value: Value assigned to the leaf of the trajectory.
    """
    reward = 0
    for state, action_idx in reversed(trajectory):
        if action_idx is None:  # leaf node
            reward += leaf_q_value
            continue

        node = states_dict[state]
        old_idx_to_new_idx, N_s_a, q_values = node[1], node[3], node[4]
        if isinstance(reward, torch.Tensor):
            reward = reward.to(q_values.device)
        action_idx = int(action_idx)
        # Duplicated actions share the statistics of their representative.
        not_dupl_index = old_idx_to_new_idx.get(action_idx, action_idx)
        reward -= 1
        visits = N_s_a[:, not_dupl_index]
        q_values[:, not_dupl_index] = (
            visits * q_values[:, not_dupl_index] + reward
        ) / (visits + 1)
        N_s_a[:, not_dupl_index] += 1


def monte_carlo_tree_search(
    model: torch.nn.Module,
    state: torch.Tensor,
    n_sim: int,
    t_time,
    n_steps: int,
    game_tree: Dict,
    state_dict: Dict,
):
    """Runs the Monte Carlo tree search algorithm.

    Simulations already recorded for the root state are deducted from the
    simulation budget. Once the remaining simulations have been played, the
    child state maximizing the upper confidence bound is returned.

    Args:
        model (torch.nn.Module): The model to use for the simulation.
        state (torch.Tensor): The initial state.
        n_sim (int): The number of simulations to run.
        t_time (int): The current time step.
        n_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree.
        state_dict (Dict): The dictionary containing the states.

    Returns:
        torch.Tensor: The selected next state.
    """
    # Note that game tree is not the full tree, but just the one having as
    #  root the current node (state).
    root_hash = to_hash(extract_present_state(state))
    if root_hash in state_dict:
        # Visits accumulated by previous searches count towards the budget.
        with torch.no_grad():
            visits_so_far = int(state_dict[root_hash][3].sum())
            n_sim = max(n_sim - visits_so_far, 0)

    for _ in range(n_sim):
        simulate_game(model, state, t_time, n_steps, game_tree, state_dict)

    # return next state
    root_record = state_dict[root_hash]
    stored_children, _, repetitions, N_s_a, q_values, _ = root_record
    candidates = _recompose_possible_states(stored_children)
    best_idx = select_future_state(
        candidates, q_values, N_s_a, repetitions, return_idx=True
    )
    return candidates[best_idx]


@torch.no_grad()
def compute_improved_policy(
    state_dict: Dict,
    states: List[str],
    model_n_steps: int,
    model_n_logits: int,
    N_bar: int,
):
    """Build the improved policy for every state hash in ``states``.

    For each state the visit counts ``N_s_a`` are turned into sample
    probabilities ``N_s_a ** (1 / tau) / (N_s_a ** (1 / tau)).sum()``, where
    ``tau = log(N_s_a.sum()) / log(N_bar)`` when ``N_s_a.sum() > N_bar`` and
    ``tau = 1`` otherwise. The probability of each sampled action sequence is
    then accumulated, step by step, on the logit selected at that step.

    Args:
        state_dict (Dict): Maps a state hash to its entry; index 3 of the
        entry holds the visit counts and index 5 the sampled actions.
        states (List[str]): State hashes to compute the policy for.
        model_n_steps (int): Size of the step dimension of the output.
        model_n_logits (int): Size of the logit dimension of the output.
        N_bar (int): Threshold on the total visit count used to compute tau.

    Returns:
        torch.Tensor: Tensor of shape
        ``(len(states), model_n_steps, model_n_logits)``.
    """
    policies = torch.zeros(len(states), model_n_steps, model_n_logits)
    N_bar = torch.tensor(N_bar)
    for state_pos, state_key in enumerate(states):
        entry = state_dict[state_key]
        visit_counts = entry[3]
        sampled_actions = entry[5]
        total_visits = visit_counts.sum()
        tau = 1
        if total_visits > N_bar:
            tau = (torch.log(total_visits) / torch.log(N_bar)).item()
        tempered = visit_counts ** (1 / tau)
        sample_probs = tempered / tempered.sum()
        # The first axis of both tensors is indexed at 0 only.
        for sample_id in range(sampled_actions.shape[1]):
            sample_prob = sample_probs[0, sample_id]
            sequence = sampled_actions[0, sample_id]
            for step_id, action_id in enumerate(sequence):
                policies[state_pos, step_id, action_id] += sample_prob
    return policies


def actor_prediction(
    model: AlphaTensorModel,
    input_tensor: torch.Tensor,
    maximum_rank: int,
    mc_n_sim: int,
    N_bar: int,
    return_actions: bool = False,
):
    """Play one game with Monte Carlo tree search and collect its data.

    Starting from ``input_tensor``, the next state is repeatedly chosen by
    ``monte_carlo_tree_search`` until the game is finished or
    ``maximum_rank`` moves have been played.

    Args:
        model (AlphaTensorModel): The model to use for the simulation.
        input_tensor (torch.Tensor): The initial state.
        maximum_rank (int): The maximum number of steps to simulate.
        mc_n_sim (int): The number of simulations to run.
        N_bar (int): The parameter used to compute the improved policy.
        return_actions (bool): If True, only actions are returned.

    Returns:
        If ``return_actions`` is True, the list of sampled actions stored for
        every visited state. Otherwise a tuple ``(states, policies,
        rewards)`` where the states have their leading batch axis removed.
    """
    # input_tensor has shape (1, T, S, S, S)
    current_state = input_tensor
    game_tree = {}
    state_dict = {}
    visited_hashes = []
    visited_states = []
    rank = 0
    while rank < maximum_rank:
        visited_states.append(current_state)
        visited_hashes.append(to_hash(extract_present_state(current_state)))
        current_state = monte_carlo_tree_search(
            model,
            current_state,
            mc_n_sim,
            rank,
            maximum_rank,
            game_tree,
            state_dict,
        )
        if game_is_finished(extract_present_state(current_state)):
            break
        rank += 1

    final_state = extract_present_state(current_state)
    policies = compute_improved_policy(
        state_dict, visited_hashes, model.n_steps, model.n_logits, N_bar
    )
    # Terminal term: 0 for a finished game, otherwise the summed matrix rank
    # of what is left of the tensor.
    if game_is_finished(final_state):
        reward = 0
    else:
        reward = int(torch.linalg.matrix_rank(final_state).sum())
    # Every move but the last contributes -1; the terminal term closes the
    # sequence and the cumulative sum gives the per-step reward.
    per_step = [-1] * (len(policies) - 1) + [reward]
    rewards = torch.cumsum(torch.tensor(per_step), dim=0)
    if return_actions:
        return [state_dict[state_key][5] for state_key in visited_hashes]
    # policies do not have the batch size, but states still have it
    unbatched_states = [visited.squeeze(0) for visited in visited_states]
    return unbatched_states, policies, rewards


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/__init__.py
================================================


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/basis_change.py
================================================
from pathlib import Path
from typing import Callable

import numpy as np
import torch


def get_change_basis_matrix(
    tensor_size: int,
    n_cob: int,
    entry_distribution: Callable = torch.randn,
    random_seed: int = None,
):
    """Lazily generate change of basis matrices.

    Each matrix is the product of an upper triangular and a lower triangular
    matrix, both with random +1/-1 diagonals. Their off-diagonal entries come
    from a single sample of ``entry_distribution``.

    Args:
        tensor_size (int): Size of the tensor.
        n_cob (int): Number of change of basis matrices.
        entry_distribution (Callable, optional): Distribution of the entries
        of the change of basis matrices. Called with the 2-tuple shape
        ``(tensor_size, tensor_size)``.
        random_seed (int, optional): Random seed for reproducibility.

    Yields:
        torch.Tensor: A ``(tensor_size, tensor_size)`` matrix.
    """

    def _random_signs():
        # Vector whose entries are +1 or -1.
        return 2 * (torch.rand(tensor_size) > 0.5).float() - 1

    if random_seed is not None:
        torch.random.manual_seed(random_seed)
    matrix_shape = (tensor_size, tensor_size)
    for _ in range(n_cob):
        upper_signs = _random_signs()
        lower_signs = _random_signs()
        entries = entry_distribution(matrix_shape)
        upper = torch.diag(upper_signs) + torch.triu(entries, diagonal=1)
        lower = torch.diag(lower_signs) + torch.tril(entries, diagonal=-1)
        yield torch.matmul(upper, lower)


def cob_entry_prob_distribution(size):
    """Sample a tensor of shape ``size`` with entries in {-1, 0, 1}.

    The values -1, 0 and 1 are drawn with probabilities 0.0075, 0.985 and
    0.0075 by comparing uniform samples against the cumulative distribution.

    Args:
        size: Shape of the tensor to sample.
    """
    n_entries = int(np.prod(size))
    support = torch.tensor([-1, 0, 1])
    cdf = torch.cumsum(
        torch.tensor([0.0075, 0.985, 0.0075]).unsqueeze(0), dim=-1
    )
    uniform_draws = torch.rand((n_entries, 1))
    # argmax returns the first bucket whose cumulative mass covers the draw.
    bucket = torch.argmax((uniform_draws <= cdf).int(), dim=1)
    return support[bucket].reshape(size)


class ChangeOfBasis:
    """Random change of basis applied to the leading slice of a state tensor.

    On construction ``n_cob`` matrices are generated and stored on disk under
    ``~/.data_alpha_tensor/cob_matrices``. On every call one of them is
    loaded at random and applied along each of the three axes of
    ``tensor[0, 0]``.
    """

    def __init__(
        self,
        tensor_size: int,
        n_cob: int,
        cob_prob: float,
        device: str,
        random_seed: int = None,
    ):
        """Builds a ChangeOfBasis object.

        Args:
            tensor_size (int): Size of the tensor.
            n_cob (int): Number of change of basis matrices.
            cob_prob (float): Probability of applying a change of basis.
            device (str): Name of the torch device to use.
            random_seed (int, optional): Random seed for reproducibility.
        """
        self.tmp_dir = Path.home() / ".data_alpha_tensor/cob_matrices"
        self.tmp_dir.mkdir(exist_ok=True, parents=True)
        # The matrices are written to disk and reloaded on demand in
        # ``__call__``; files already present with the same index are
        # overwritten.
        for i, cob_matrix in enumerate(
            get_change_basis_matrix(
                tensor_size, n_cob, cob_entry_prob_distribution, random_seed
            )
        ):
            torch.save(cob_matrix, f"{self.tmp_dir}/cob_matrix_{i}.pt")
        self.tensor_size = tensor_size
        self.n_cob = n_cob
        self.cob_prob = cob_prob
        self.device = device

    @torch.no_grad()
    def __call__(self, tensor: torch.Tensor, return_basis: bool = False):
        """Apply a change of basis to a tensor.

        With probability ``1 - cob_prob`` the tensor is returned untouched.
        Otherwise ``tensor[0, 0]`` is overwritten in place with its
        transformed version.

        Args:
            tensor (torch.Tensor): Tensor to apply the change of basis to.
            Only ``tensor[0, 0]`` is read and written; its three axes must
            have the same size.
            return_basis (bool, optional): Whether to return the change of
            basis matrix as well.

        Returns:
            ``tensor``, or ``(tensor, cob_matrix)`` when ``return_basis`` is
            True and the change of basis was applied. NOTE: when the change
            of basis is skipped only ``tensor`` is returned, even if
            ``return_basis`` is True.
        """
        cob_prob = torch.rand(1).item()
        if cob_prob > self.cob_prob:
            return tensor
        random_cob = torch.randint(low=0, high=self.n_cob, size=(1,))
        cob_matrix = torch.load(
            f"{self.tmp_dir}/cob_matrix_{int(random_cob)}.pt"
        ).to(self.device)

        # apply change of basis to each tensor dimension
        inner_tensor = tensor[0, 0]
        tensor_size = inner_tensor.shape[-1]
        original_shape = inner_tensor.shape
        cob_matrix = cob_matrix.transpose(0, 1)
        # Each pass multiplies the trailing axis by the matrix and then
        # permutes so the next untouched axis becomes the trailing one; the
        # last permutation restores the original axis order.
        for axes in ((0, 2, 1), (2, 1, 0), (2, 0, 1)):
            inner_tensor = (
                torch.matmul(inner_tensor.reshape(-1, tensor_size), cob_matrix)
                .reshape(original_shape)
                .permute(*axes)
            )
        tensor[0, 0] = inner_tensor
        if return_basis:
            return tensor, cob_matrix.transpose(0, 1)
        return tensor


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/dataset.py
================================================
import json
import os
import shutil
import tempfile
from pathlib import Path
from typing import List, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset

from open_alpha_tensor.core.data.generation import generate_synthetic_data
from open_alpha_tensor.core.data.utils import (
    get_scalars,
    map_triplet_to_action,
)

SAVE_DIR_SYNT = str(Path.home() / ".data_alpha_tensor/synthetic_data")


def compute_move(triplets: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
    """Return the outer product ``u x v x w`` of a triplet.

    This is the rank-one tensor that a move subtracts from the current
    state.

    Args:
        triplets (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Tensors u,
        v, and w.
    """
    u, v, w = triplets
    # Lay each factor along its own axis and let broadcasting do the rest.
    along_first = u.reshape(-1, 1, 1)
    along_second = v.reshape(1, -1, 1)
    along_third = w.reshape(1, 1, -1)
    return along_first * along_second * along_third


class SyntheticDataBuffer(Dataset):
    """Disk-backed dataset of synthetically generated demonstrations.

    Every demonstration is stored as two files in ``save_dir``: the target
    tensor and the list of triplets whose outer products sum to it. One
    dataset item is one step of one demonstration.
    """

    def __init__(
        self,
        tensor_size,
        n_data,
        limit_rank,
        prob_distr,
        n_prev_actions: int,
        device: str,
        n_steps: int,
        random_seed=None,
    ):
        """Builds a dataset of synthetic demonstrations.

        Demonstrations already present on disk are reused; only the missing
        ones are generated.

        Args:
            tensor_size (int): Size of the tensor.
            n_data (int): Number of demonstrations to generate.
            limit_rank (int): Maximum rank of the generated tensors.
            prob_distr (Callable): Probability distribution to use to generate
            the tensors.
            n_prev_actions (int): Number of previous actions to use as input.
            device (str): Name of the torch device to use.
            n_steps (int): Number of steps to perform in the environment.
            random_seed (int, optional): Random seed to use.
        """
        self.device = device
        self.len_data = 0
        self.n_prev_actions = n_prev_actions
        self.limit_rank = limit_rank
        self.n_steps = n_steps
        self.save_dir = os.path.join(SAVE_DIR_SYNT, f"size_{tensor_size}")
        storage = Path(self.save_dir)
        storage.mkdir(parents=True, exist_ok=True)
        # Two ``.pt`` files are written per demonstration.
        n_on_disk = len(list(storage.glob("*.pt"))) // 2
        if n_on_disk >= n_data:
            self.len_data = n_data
            return

        # Continue numbering after the demonstrations found on disk.
        self.len_data = n_on_disk
        missing = generate_synthetic_data(
            tensor_size,
            n_data - n_on_disk,
            limit_rank,
            prob_distr,
            random_seed,
        )
        for output_tensor, list_of_triplets in missing:
            tensor_file = os.path.join(
                self.save_dir, f"output_tensor_{self.len_data}.pt"
            )
            triplets_file = os.path.join(
                self.save_dir, f"list_of_triplets_{self.len_data}.pt"
            )
            torch.save(output_tensor, tensor_file)
            torch.save(list_of_triplets, triplets_file)
            self.len_data += 1

    def __len__(self):
        """Number of items: one per demonstration and per step."""
        return self.len_data * self.limit_rank

    @torch.no_grad()
    def __getitem__(self, idx):
        """Return the ``(state, scalar, policy, reward)`` tuple at ``idx``.

        ``idx`` is split into a demonstration index and a step ``j`` inside
        that demonstration. All returned tensors are moved to ``self.device``.
        """
        demo = idx // self.limit_rank
        step = idx % self.limit_rank
        output_tensor = torch.load(
            os.path.join(self.save_dir, f"output_tensor_{demo}.pt")
        )
        list_of_triplets = torch.load(
            os.path.join(self.save_dir, f"list_of_triplets_{demo}.pt")
        )
        # Subtract the triplets that come after ``step`` from the stored
        # tensor (skipped when ``step`` is the last index).
        if step != self.limit_rank - 1:
            later_moves = list_of_triplets[step + 1 :]  # noqa E203
            output_tensor = self._apply_moves(output_tensor, later_moves)
        triplet = list_of_triplets[step]
        # Action memory: outer products of the next ``n_prev_actions``
        # triplets, in reversed order, stacked after the current state.
        memory_window = list_of_triplets[
            step + 1 : step + 1 + self.n_prev_actions  # noqa E203
        ]
        memory = [compute_move(t) for t in reversed(memory_window)]
        output_tensor = torch.stack([output_tensor, *memory])
        n_missing = self.n_prev_actions + 1 - len(output_tensor)
        if n_missing > 0:
            # Pad with zero slices when fewer triplets were available.
            padding = torch.zeros(n_missing, *output_tensor.shape[1:])
            output_tensor = torch.cat([output_tensor, padding])
        policy = map_triplet_to_action(triplet, base=5, n_steps=self.n_steps)
        reward = torch.tensor([-(step + 1)])
        scalar = get_scalars(
            output_tensor, self.limit_rank - step, with_bs=False
        )
        on_device = tuple(
            item.to(self.device)
            for item in (output_tensor, scalar, policy, reward)
        )
        return on_device

    @staticmethod
    def _apply_moves(
        tensor: torch.Tensor,
        moves: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
    ):
        """Subtract the outer product of every move from a state.

        The input tensor is not modified; a new tensor is returned.

        Args:
            tensor (torch.Tensor): Initial state.
            moves (List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):
            List of moves.
        """
        for move in moves:
            u, v, w = move
            rank_one = (
                u.reshape(-1, 1, 1) * v.reshape(1, -1, 1) * w.reshape(1, 1, -1)
            )
            tensor = tensor - rank_one
        return tensor


class GameDataBuffer(Dataset):
    """Buffer to store the data from the games played by the MCTS agent.

    Each game is written to three files (states, policies, rewards) in a
    temporary directory; ``game_data`` maps the integer game slot to the
    number of states of that game.
    """

    def __init__(self, device: str, max_buffer_size: int):
        """Initializes the buffer.

        Args:
            device (str): Name of the torch device to use.
            max_buffer_size (int): Maximum size of the buffer.
        """
        self.num_games = 0
        self.temp_dir = tempfile.mkdtemp("game_data_buffer")
        self.game_data = {}
        self.max_buffer_size = max_buffer_size
        self.device = device

    def __del__(self):
        # ``temp_dir`` may be missing if ``__init__`` failed early, and the
        # directory may already be gone; cleanup must never raise.
        temp_dir = getattr(self, "temp_dir", None)
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def add_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Adds a played game to the buffer.

        Args:
            states (List[torch.Tensor]): Observed game states.
            policies (List[torch.Tensor]): List of policies.
            rewards (List[torch.Tensor]): Observed rewards.
        """
        self.game_data[self.num_games] = len(states)
        torch.save(
            states, os.path.join(self.temp_dir, f"states_{self.num_games}.pt")
        )
        torch.save(
            policies,
            os.path.join(self.temp_dir, f"policies_{self.num_games}.pt"),
        )
        torch.save(
            rewards,
            os.path.join(self.temp_dir, f"rewards_{self.num_games}.pt"),
        )
        self.num_games += 1
        if self.num_games >= self.max_buffer_size:
            # Wrap around so the next game overwrites the oldest slot. Note
            # that this is not thread safe: a lock should be added if
            # multiple threads are used.
            self.num_games = 0

    def __len__(self):
        """Total number of stored states across all games."""
        return sum(self.game_data.values())

    @torch.no_grad()
    def __getitem__(self, idx):
        """Return the ``(state, scalar, action, reward)`` tuple at ``idx``.

        ``idx`` is a flat index over the states of all games, walked in
        increasing slot order starting from slot 0.
        """
        i = 0
        while idx >= self.game_data[i]:
            idx -= self.game_data[i]
            i += 1
        states = torch.load(os.path.join(self.temp_dir, f"states_{i}.pt"))
        policies = torch.load(os.path.join(self.temp_dir, f"policies_{i}.pt"))
        rewards = torch.load(os.path.join(self.temp_dir, f"rewards_{i}.pt"))
        return (
            states[idx].to(self.device),
            get_scalars(states[idx], idx, with_bs=False).to(self.device),
            # The stored policy is reduced to its most likely logit per step.
            policies[idx].to(self.device).argmax(dim=-1),
            rewards[idx].to(self.device).reshape(1),
        )

    def save_game_data(self, path: str):
        """Copy the content of the temporary directory to ``path`` and save
        ``game_data`` there in json format.
        """
        shutil.copytree(self.temp_dir, path, dirs_exist_ok=True)
        with open(os.path.join(path, "game_data.json"), "w") as f:
            json.dump(self.game_data, f)

    def load_game_data(self, path: str):
        """Load ``game_data`` from json format and copy the content of
        ``path`` into the temporary directory.
        """
        with open(os.path.join(path, "game_data.json"), "r") as f:
            loaded = json.load(f)
        # JSON object keys are always strings, while the rest of the class
        # indexes ``game_data`` with integer slots.
        self.game_data = {int(slot): size for slot, size in loaded.items()}
        # The temporary directory already exists (created by ``mkdtemp``),
        # so the copy must be allowed to merge into it.
        shutil.copytree(path, self.temp_dir, dirs_exist_ok=True)
        self.num_games = len(self.game_data)
        if self.num_games >= self.max_buffer_size:
            # Keep the write cursor inside the buffer, as ``add_game`` does.
            self.num_games = 0


class TensorGameDataset(Dataset):
    """Dataset mixing synthetic demonstrations with actor generated games.

    Three buffers are held: synthetic data, played games and best played
    games. A boolean mask decides, for every index, whether the item comes
    from the synthetic buffer; the remaining indexes are served from the
    game buffers.
    """

    def __init__(
        self,
        len_data,
        pct_synth,
        tensor_size,
        n_synth_data,
        limit_rank,
        prob_distr,
        action_memory_len: int,
        device: str,
        n_steps: int,
        random_seed=None,
    ):
        """Build the buffers; initially every item is synthetic.

        ``len_data`` distinct indexes of the synthetic buffer are drawn
        without replacement.
        """
        self.synthetic_data_buffer = SyntheticDataBuffer(
            tensor_size,
            n_synth_data,
            limit_rank,
            prob_distr,
            action_memory_len,
            n_steps=n_steps,
            device=device,
            random_seed=random_seed,
        )
        self.game_data_buffer = GameDataBuffer(
            device=device, max_buffer_size=100000
        )
        self.best_game_data_buffer = GameDataBuffer(
            device=device, max_buffer_size=1000
        )
        self.len_data = len_data
        self.pct_synth = pct_synth
        self.pct_best_game = 0
        self.synth_bool = torch.ones(len_data, dtype=torch.bool)
        initial_choice = np.random.choice(
            len(self.synthetic_data_buffer), len_data, replace=False
        )
        self.synth_idx = torch.from_numpy(initial_choice)
        self.game_idx = None
        self.best_game_idx = None
        self.action_memory_len = action_memory_len
        self.tensor_size = tensor_size
        self.device = device

    def change_training_split(self, pct_synth, pct_best_game):
        """Set the fractions of synthetic and best-game samples."""
        self.pct_synth = pct_synth
        self.pct_best_game = pct_best_game

    def recompute_synthetic_indexes(self):
        """Redraw which items are synthetic and which buffer indexes serve
        each item. Does nothing while no game has been stored.
        """
        n_games = len(self.game_data_buffer)
        if n_games <= 0:
            return

        self.synth_bool = torch.rand(self.len_data) < self.pct_synth
        len_synth_data = self.synth_bool.sum().item()
        self.synth_idx = torch.from_numpy(
            np.random.choice(
                len(self.synthetic_data_buffer),
                len_synth_data,
                replace=False,
            )
        )

        n_best_games = len(self.best_game_data_buffer)
        use_best_games = n_best_games > 0 and self.pct_best_game > 0
        if use_best_games:
            len_game_data = int(
                (1 - self.pct_synth - self.pct_best_game) * self.len_data
            )
        else:
            len_game_data = self.len_data - len_synth_data

        # Sample with replacement only when the buffer is too small.
        self.game_idx = torch.from_numpy(
            np.random.choice(
                n_games,
                len_game_data,
                replace=len_game_data > n_games,
            )
        )
        if use_best_games:
            # Best games fill whatever is left after the other two sources.
            len_best_game_data = (
                self.len_data - len_synth_data - len_game_data
            )
            self.best_game_idx = torch.from_numpy(
                np.random.choice(
                    n_best_games,
                    len_best_game_data,
                    replace=len_best_game_data > n_best_games,
                )
            )

    def __getitem__(self, idx):
        """Return the item at ``idx`` from the buffer assigned to it."""
        # Number of synthetic items strictly before ``idx``.
        synth_before = self.synth_bool[:idx].sum()
        if self.synth_bool[idx]:
            return self.synthetic_data_buffer[self.synth_idx[synth_before]]

        # Position of ``idx`` among the non-synthetic items.
        game_pos = idx - synth_before
        if self.pct_best_game > 0 and self.best_game_idx is not None:
            n_best = len(self.best_game_idx)
            # Best games are served first, regular games afterwards.
            if game_pos < n_best:
                return self.best_game_data_buffer[
                    self.best_game_idx[game_pos]
                ]
            return self.game_data_buffer[self.game_idx[game_pos - n_best]]
        return self.game_data_buffer[self.game_idx[game_pos]]

    def __len__(self):
        return self.len_data

    def add_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Store a played game in the game buffer."""
        self.game_data_buffer.add_game(states, policies, rewards)

    def add_best_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Store a played game in the best-game buffer."""
        self.best_game_data_buffer.add_game(states, policies, rewards)

    def save_game_data(self, path):
        """Save both game buffers under ``path``."""
        game_dir = os.path.join(path, "game_data")
        self.game_data_buffer.save_game_data(game_dir)
        best_game_dir = os.path.join(path, "best_game_data")
        self.best_game_data_buffer.save_game_data(best_game_dir)

    def load_game_data(self, path):
        """Load both game buffers from ``path``."""
        game_dir = os.path.join(path, "game_data")
        self.game_data_buffer.load_game_data(game_dir)
        best_game_dir = os.path.join(path, "best_game_data")
        self.best_game_data_buffer.load_game_data(best_game_dir)

    @property
    def input_tensor(self) -> torch.Tensor:
        """Build a fresh game input for randomly drawn matrix dimensions.

        The tensor built by ``_build_tensor_game_input`` is written into the
        leading corner of a zero tensor of shape
        ``(1, action_memory_len + 1, tensor_size, tensor_size, tensor_size)``.
        """
        max_matrix_size = int(np.sqrt(self.tensor_size))
        # NOTE(review): the upper bound of torch.randint is exclusive, so
        # each dimension lies in [1, max_matrix_size - 1] - confirm this is
        # the intended range.
        dim_1, dim_k, dim_2 = torch.randint(1, max_matrix_size, (3,)).tolist()
        operation_tensor = self._build_tensor_game_input(
            dim_1, dim_k, dim_2, action_memory_len=self.action_memory_len
        )
        _, size_a, size_b, size_c = operation_tensor.shape

        input_tensor = torch.zeros(
            1,
            self.action_memory_len + 1,
            self.tensor_size,
            self.tensor_size,
            self.tensor_size,
        )
        input_tensor[0, :, :size_a, :size_b, :size_c] = operation_tensor
        return input_tensor.to(self.device)

    @staticmethod
    def _build_tensor_game_input(
        dim_1: int, dim_k: int, dim_2: int, action_memory_len: int
    ):
        """Build the input tensor for the game. The input tensor has shape
        (action_memory_len+1, dim_1*dim_k, dim_k*dim_2, dim_1*dim_2).
        The first slice is the matrix multiplication tensor which will be
        reduced by the TensorGame algorithm. The other slices represent the
        action memory and are left at zero.
        """
        input_tensor = torch.zeros(
            action_memory_len + 1, dim_1 * dim_k, dim_k * dim_2, dim_1 * dim_2
        )
        for row in range(dim_1):
            for col in range(dim_2):
                # Flat index of entry (row, col) of the product matrix.
                out_idx = row * dim_2 + col
                for k in range(dim_k):
                    input_tensor[
                        0, row * dim_k + k, k * dim_2 + col, out_idx
                    ] = 1
        return input_tensor

    def games_are_good(self):
        """Always returns False."""
        return False


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/generation.py
================================================
from typing import Callable

import torch


def generate_synthetic_data(
    tensor_size: int,
    n_data: int,
    limit_rank: int,
    prob_distr: Callable = torch.randn,
    random_seed: int = None,
):
    """Lazily generate synthetic demonstrations.

    Every demonstration is the sum of exactly ``limit_rank`` non-zero outer
    products ``u x v x w`` whose factors are sampled from ``prob_distr``.
    Triplets whose outer product is identically zero are resampled.

    Args:
        tensor_size (int): Size of the tensor.
        n_data (int): Number of demonstrations.
        limit_rank (int): Limit rank of each tensor.
        prob_distr (Callable, optional): Distribution of the entries of the
        tensor. Called with ``tensor_size`` as its only argument.
        random_seed (int, optional): Random seed for reproducibility.

    Yields:
        Tuple of the summed tensor and the list of ``(u, v, w)`` triplets
        that produced it.
    """
    if random_seed is not None:
        torch.random.manual_seed(random_seed)
    for _ in range(n_data):
        # The rank is fixed to the limit; it is not sampled.
        rank = limit_rank
        output_tensor = torch.zeros(tensor_size, tensor_size, tensor_size)
        list_of_triplets = []
        while len(list_of_triplets) < rank:
            u = prob_distr(tensor_size)
            v = prob_distr(tensor_size)
            w = prob_distr(tensor_size)
            candidate = (
                u.reshape(-1, 1, 1) * v.reshape(1, -1, 1) * w.reshape(1, 1, -1)
            )
            if (candidate == 0).all():
                # An all-zero move carries no information: draw again.
                continue
            list_of_triplets.append((u, v, w))
            output_tensor += candidate
        yield output_tensor, list_of_triplets


def f_prob_distribution(size):
    """Sample ``size`` integers from {-2, -1, 0, 1, 2}.

    The values are drawn with probabilities 0.001, 0.099, 0.8, 0.099 and
    0.001 respectively, i.e. a peak at 0 and thin tails at -2 and 2.

    Args:
        size (int): Number of values to sample.
    """
    support = torch.tensor([-2, -1, 0, 1, 2])
    weights = torch.tensor([0.001, 0.099, 0.8, 0.099, 0.001]).unsqueeze(0)
    cdf = torch.cumsum(weights, dim=-1)
    uniform_draws = torch.rand((size, 1))
    # argmax returns the first bucket whose cumulative mass covers the draw.
    bucket = torch.argmax((uniform_draws <= cdf).int(), dim=1)
    return support[bucket]


def z2_prob_distribution(size):
    """Sample a binary tensor where 0 and 1 are equally likely.

    Args:
        size (int): Number of values to sample.
    """
    uniform_draws = torch.rand(size)
    return uniform_draws.gt(0.5).int()


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/utils.py
================================================
from typing import Tuple

import torch


def get_scalars(input_tensor: torch.Tensor, t_step: int, with_bs: bool = True):
    """Build the scalar features holding the current time step.

    Args:
        input_tensor (torch.Tensor): Current state tensor. Only its leading
        dimension is read, and only when ``with_bs`` is True.
        t_step (int): Current time step.
        with_bs (bool, optional): Whether the batch size is present in the
        input tensor.

    Returns:
        torch.Tensor: Tensor of shape ``(batch_size, 1)`` filled with
        ``t_step`` when ``with_bs`` is True, otherwise ``t_step`` as a float
        tensor with one extra trailing dimension.
    """
    # scalars containing the iteration time
    if not with_bs:
        return torch.tensor(t_step).unsqueeze(-1).float()
    batch_size = input_tensor.shape[0]
    scalars = torch.zeros((batch_size, 1))
    scalars[:, 0] = t_step
    return scalars


def map_triplet_to_action(
    triplet: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
    base: int,
    n_steps: int,
    add_bias: bool = True,
):
    """Encode a triplet of tensors as a sequence of ``n_steps`` tokens.

    The three tensors are concatenated along the last axis and split into
    ``n_steps`` equally sized groups. Each group is read as a number written
    in the given base, least significant element first:
        token = sum(element * base**element_index)

    Args:
        triplet (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Triplet of
        tensors u, v, and w.
        base (int): Base used for the conversion.
        n_steps (int): Number of steps in the action.
        add_bias (bool, optional): Whether to add a bias to the action. The
        bias ``base // 2`` is added to every element before the conversion.

    Returns:
        torch.Tensor: Tokens with ``n_steps`` as last dimension. When ``u``
        is one-dimensional the leading batch axis is squeezed away.
    """
    u, v, w = triplet
    unbatched = u.ndim == 1
    elements = torch.cat((u, v, w), dim=-1)
    group_len = elements.shape[-1] // n_steps
    groups = elements.reshape(-1, n_steps, group_len)
    if unbatched:
        groups = groups.squeeze(0)
    if add_bias:
        groups = groups + base // 2
    place_values = torch.tensor(
        [base**position for position in range(groups.shape[-1])]
    )
    return (groups * place_values).sum(dim=-1)


# @torch.jit.script
def _single_action_to_triplet(
    action_val: int,
    basis: int,
    out_dim: int,
    bias: int,
    device: str,
):
    """Converts an action to the original triplet (u, v, w) that generated it.

    Args:
        action_val (int): Action to convert.
        basis (int): Basis used for the conversion.
        out_dim (int): Output dimension.
        bias (int): Bias to subtract from the action.
        device (str): Name of the torch device to use.
    """
    triplet = torch.zeros(out_dim).to(device)
    if action_val > 0:
        idx = int(
            torch.log(torch.tensor(action_val))
            // torch.log(torch.tensor(basis))
        )
    else:
        idx = 0
    while idx >= 0:
        temp = int(basis**idx)
        triplet[idx] = action_val // temp - bias
        action_val = action_val - temp
        idx -= 1
    return triplet


def map_action_to_triplet(
    action_tensor: torch.Tensor,
    cardinality: int = 5,
    vector_size: int = 5,
    add_bias: bool = True,
):
    """Decode a batch of action tokens into the elements they encode.

    Every token of ``action_tensor`` is converted independently by
    ``_single_action_to_triplet``.

    Args:
        action_tensor (torch.Tensor): Batch of actions.
        cardinality (int, optional): Cardinality of the action space, used
        as the base of the conversion.
        vector_size (int, optional): Size of the vector decoded from each
        token.
        add_bias (bool, optional): Whether to use bias. When True the bias
        is ``cardinality // 2``, otherwise 0.

    Returns:
        torch.Tensor: Tensor with the shape of ``action_tensor`` plus one
        trailing axis of size ``vector_size``.
    """
    # The action has shape (bs, n_steps) and contains the tokens needed to
    # rebuild u, v and w. Each token is a number between 0 and n_logits,
    # written in base ``cardinality``.
    leading_shape = action_tensor.shape
    flat_tokens = action_tensor.reshape(-1)
    bias = cardinality // 2 if add_bias else 0
    decoded = [
        _single_action_to_triplet(
            token,
            cardinality,
            vector_size,
            bias,
            flat_tokens.device,
        )
        for token in flat_tokens
    ]
    triplets = torch.stack(decoded)
    return triplets.reshape((*leading_shape, triplets.shape[-1]))


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/__init__.py
================================================


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/alpha_tensor.py
================================================
import torch

from open_alpha_tensor.core.modules.extras import (
    QuantileLoss,
    ValueRiskManagement,
)
from open_alpha_tensor.core.modules.heads import PolicyHead, ValueHead
from open_alpha_tensor.core.modules.torso import TorsoModel


class AlphaTensorModel(torch.nn.Module):
    """Torso + policy head + value head, together with their losses.

    When called with target actions the model returns the policy and value
    losses; when called without them it returns the sampled actions, their
    probabilities and a scalar value estimate.
    """

    def __init__(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        """Builds the sub-modules.

        Args:
            tensor_length (int): T, the length of the tensor history.
            input_size (int): S, the size of each tensor mode.
            scalars_size (int): s, the size of the scalar input.
            emb_dim (int): c, the embedding dimension produced by the torso.
            n_steps (int): Number of tokens in one action.
            n_logits (int): Number of possible values for a token.
            n_samples (int): Number of actions sampled at inference time.
        """
        super().__init__()
        self.tensor_length = tensor_length
        self.input_size = input_size
        self.emb_dim = emb_dim
        self.torso = TorsoModel(
            scalars_size, input_size, tensor_length, emb_dim
        )
        print("Build policy head")
        self.policy_head = PolicyHead(
            3 * input_size * input_size,
            emb_dim,
            n_steps,
            n_logits,
            n_samples,
        )
        print("Build value head")
        # 2048 is hard-coded; per the original note it depends on the
        # number of heads and projection size used by the policy head.
        self.value_head = ValueHead(2048)
        self.policy_loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")
        self.quantile_loss_fn = QuantileLoss()
        self.risk_value_management = ValueRiskManagement()

    @property
    def device(self):
        """Device of the first registered parameter."""
        first_param = next(self.parameters())
        return first_param.device

    def _train_forward(
        self,
        x: torch.Tensor,
        s: torch.Tensor,
        g_action: torch.Tensor,
        g_value: torch.Tensor,
    ):
        """Computes the policy and value losses for one batch.

        Expected shapes: ``x`` (N, T, S, S, S), ``s`` (N, s), ``g_action``
        (N, N_steps), ``g_value`` (N,).
        """
        embedding = self.torso(x, s)
        logits, first_step_features = self.policy_head(embedding, g_action)
        # Cross entropy wants (rows, classes) logits and flat targets.
        flat_logits = logits.reshape(-1, logits.shape[-1])
        flat_targets = g_action.reshape(-1)
        policy_loss = self.policy_loss_fn(flat_logits, flat_targets)
        quantiles = self.value_head(first_step_features)
        value_loss = self.quantile_loss_fn(quantiles, g_value.float())
        return policy_loss, value_loss

    def _eval_forward(self, x: torch.Tensor, s: torch.Tensor):
        """Samples actions and estimates the value of the given state."""
        embedding = self.torso(x, s)
        actions, probs, first_step_features = self.policy_head(embedding)
        quantiles = self.value_head(first_step_features)
        value = self.risk_value_management(quantiles)
        return actions, probs, value

    def forward(
        self,
        x: torch.Tensor,
        s: torch.Tensor,
        g_action: torch.Tensor = None,
        g_value: torch.Tensor = None,
    ):
        """Dispatches to the training or the inference path.

        The training path is taken whenever ``g_action`` is provided, in
        which case ``g_value`` is required as well.
        """
        if g_action is not None:
            assert g_value is not None
            return self._train_forward(x, s, g_action, g_value)
        return self._eval_forward(x, s)

    @property
    def n_logits(self):
        """Number of logits, as stored on the policy head."""
        return self.policy_head.n_logits

    @property
    def n_steps(self):
        """Number of steps, as stored on the policy head."""
        return self.policy_head.n_steps

    @property
    def n_samples(self):
        """Number of samples, as stored on the policy head."""
        return self.policy_head.n_samples


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/attention.py
================================================
import torch
from torch.nn import functional as F


class AttentionHead(torch.nn.Module):
    """Single scaled dot-product attention head.

    Queries are projected from ``x``; keys and values are projected from
    ``y``.
    """

    def __init__(self, x_size: int, y_size: int, proj_dim: int):
        """
        Args:
            x_size (int): Size of the last dimension of ``x`` (N_x).
            y_size (int): Size of the last dimension of ``y`` (N_y).
            proj_dim (int): Size of the query / key / value projections.
        """
        super().__init__()
        self.proj_dim = proj_dim
        # Scaling factor 1 / sqrt(proj_dim) applied to the raw scores.
        self.proj_dim_isqrt = 1 / torch.sqrt(torch.tensor(proj_dim))
        self.queries_proj_layer = torch.nn.Linear(x_size, proj_dim)
        self.keys_proj_layer = torch.nn.Linear(y_size, proj_dim)
        self.values_proj_layer = torch.nn.Linear(y_size, proj_dim)

    def forward(self, x: torch.Tensor, y: torch.Tensor, mask: bool = False):
        """Attends from ``x`` to ``y``.

        Args:
            x (torch.Tensor): Source of the queries.
            y (torch.Tensor): Source of the keys and values.
            mask (bool): When True only the attention weights strictly above
                the main diagonal are kept.

        Returns:
            torch.Tensor: Attention output whose last dimension is
            ``proj_dim``.
        """
        q = self.queries_proj_layer(x)
        k = self.keys_proj_layer(y)
        v = self.values_proj_layer(y)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.proj_dim_isqrt
        weights = F.softmax(scores, dim=-1)
        if mask:
            # NOTE(review): the mask is applied after the softmax, so masked
            # rows are not re-normalised and the last row becomes all zeros;
            # confirm this is the intended masking scheme.
            weights = torch.triu(weights, diagonal=1)
        return torch.matmul(weights, v)


class AttentionDenseBlock(torch.nn.Module):
    """Residual feed-forward block: LayerNorm -> Linear -> GELU -> Linear."""

    def __init__(self, inner_size: int, multiplier: int = 4):
        """
        Args:
            inner_size (int): Size of the input and output features.
            multiplier (int): Expansion factor of the hidden layer.
        """
        super().__init__()
        hidden_size = inner_size * multiplier
        self.norm = torch.nn.LayerNorm(inner_size)
        self.linear = torch.nn.Linear(inner_size, hidden_size)
        self.activation = torch.nn.GELU()
        self.linear_final = torch.nn.Linear(hidden_size, inner_size)

    def forward(self, x: torch.Tensor):
        """Returns ``x`` plus the feed-forward transform of its normed self."""
        hidden = self.linear(self.norm(x))
        hidden = self.activation(hidden)
        return x + self.linear_final(hidden)


class AlphaMultiHeadAttention(torch.nn.Module):
    """Multi-head attention from ``x`` to ``y`` followed by a dense block.

    Both inputs are layer-normalised, every head attends from the normed
    ``x`` to the normed ``y``, the concatenated head outputs are projected
    back to ``x_dim`` and added to ``x`` as a residual, and the result goes
    through an ``AttentionDenseBlock``.
    """

    def __init__(
        self,
        x_dim: int,
        y_dim: int,
        proj_dim: int = 32,
        n_heads: int = 16,
        multiplier: int = 4,
    ):
        """
        Args:
            x_dim (int): Size of the last dimension of ``x``.
            y_dim (int): Size of the last dimension of ``y``.
            proj_dim (int): Projection size of every attention head.
            n_heads (int): Number of attention heads.
            multiplier (int): Expansion factor of the final dense block.
        """
        super().__init__()
        self.norm_layer_x = torch.nn.LayerNorm(x_dim)
        self.norm_layer_y = torch.nn.LayerNorm(y_dim)
        self.module_list = torch.nn.ModuleList(
            [AttentionHead(x_dim, y_dim, proj_dim) for _ in range(n_heads)]
        )
        self.linear = torch.nn.Linear(n_heads * proj_dim, x_dim)

        self.dense = AttentionDenseBlock(x_dim, multiplier)

    def forward(
        self, x: torch.Tensor, y: torch.Tensor, mask: bool = False
    ) -> torch.Tensor:
        """Applies the attention block.

        Args:
            x (torch.Tensor): Query-side input, last dimension ``x_dim``.
            y (torch.Tensor): Key/value-side input, last dimension ``y_dim``.
            mask (bool): Forwarded to every attention head.

        Returns:
            torch.Tensor: Tensor with the same shape as ``x``.
        """
        # x.size = (Nx, c1), y.size = (Ny, c2)
        x_norm = self.norm_layer_x(x)
        y_norm = self.norm_layer_y(y)
        # Heads are concatenated along the feature dimension.
        temp = torch.cat(
            [layer(x_norm, y_norm, mask) for layer in self.module_list], dim=-1
        )
        x = x + self.linear(temp)
        return self.dense(x)


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/extras.py
================================================
import torch


class QuantileLoss(torch.nn.Module):
    """Quantile-regression loss built on top of the Huber loss.

    Every one of the ``n`` predicted quantiles is compared with the target
    through an element-wise Huber loss, weighted by ``|tau - 1[g > q]|``
    where ``tau = j / n`` for the j-th quantile.
    """

    def __init__(self, delta: float = 1.0):
        """
        Args:
            delta (float): Threshold of the underlying Huber loss.
        """
        super().__init__()
        self.huber_loss = torch.nn.HuberLoss(reduction="none", delta=delta)

    def forward(self, q: torch.Tensor, g: torch.Tensor):
        """Computes the mean quantile loss.

        Args:
            q (torch.Tensor): Predicted quantiles, last dimension ``n``.
            g (torch.Tensor): Targets. Either broadcastable to ``q`` as is
                (e.g. shape ``(N, 1)``) or missing the trailing quantile
                dimension (e.g. shape ``(N,)``).

        Returns:
            torch.Tensor: Scalar loss.
        """
        n = q.shape[-1]
        if g.dim() == q.dim() - 1:
            # A per-sample target of shape (N,) must be compared with every
            # quantile of its own sample. Without the trailing axis it would
            # be broadcast along the quantile dimension instead, which is
            # silently wrong when N == n and fails otherwise.
            g = g.unsqueeze(-1)
        # Broadcast explicitly so the Huber loss sees equally sized inputs.
        g, q = torch.broadcast_tensors(g, q)
        tau = torch.arange(n, device=q.device) / n
        h = self.huber_loss(g, q)
        k = torch.abs(tau - (g - q > 0).float())
        return torch.mean(h * k)


class ValueRiskManagement(torch.nn.Module):
    """Collapses predicted quantiles into one value per sample by averaging
    the quantiles from the ``u_q`` fraction upwards."""

    def __init__(self, u_q: float = 0.75):
        """
        Args:
            u_q (float): Fraction of the quantile axis that is skipped
                before averaging.
        """
        super().__init__()
        self.u_q = u_q

    def forward(self, q: torch.Tensor):
        """Averages the upper quantiles.

        Args:
            q (torch.Tensor): Quantiles of shape (N, n).

        Returns:
            torch.Tensor: Tensor of shape (N,).
        """
        n_quantiles = q.shape[-1]
        first_kept = int(self.u_q * n_quantiles)
        upper_tail = q[:, first_kept:]
        return upper_tail.mean(dim=-1)


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/heads.py
================================================
import math

import torch
import torch.nn.functional as F

from open_alpha_tensor.core.modules.attention import AlphaMultiHeadAttention


class PositionEncoding(torch.nn.Module):
    """Additive sinusoidal position encoding for sequence-first inputs.

    The table is stored in the ``pe`` buffer with shape
    ``(max_len, 1, d_model)``: even feature indexes hold sines and odd ones
    hold cosines.
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        """
        Args:
            d_model (int): Size of the embedding dimension.
            max_len (int): Number of positions stored in the table.
        """
        super().__init__()

        positions = torch.arange(max_len).unsqueeze(1)
        frequencies = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = positions * frequencies
        table = torch.zeros(max_len, 1, d_model)
        table[:, 0, 0::2] = torch.sin(angles)
        table[:, 0, 1::2] = torch.cos(angles)
        self.register_buffer("pe", table)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor with the encoding of the first ``seq_len`` positions
            added to ``x``.
        """
        seq_len = x.size(0)
        return x + self.pe[:seq_len]


class PolicyHeadDoubleAttention(torch.nn.Module):
    """Decoder layer: masked self-attention followed by attention over the
    torso embedding, each with layer norm, dropout and a residual sum."""

    def __init__(
        self,
        n_steps: int,
        n_heads: int,
        n_feat: int,
        emb_size: int,
        emb_dim: int,
    ):
        """
        Args:
            n_steps (int): Unused by this layer.
            n_heads (int): Number of heads; with ``n_feat`` it fixes the
                model width ``n_feat * n_heads``.
            n_feat (int): Number of features per head.
            emb_size (int): Unused by this layer.
            emb_dim (int): Size of the last dimension of the embedding the
                second attention attends to.
        """
        super().__init__()
        width = n_feat * n_heads
        self.layer_norm1 = torch.nn.LayerNorm(width)
        self.attention1 = AlphaMultiHeadAttention(width, width)
        self.drop1 = torch.nn.Dropout()
        self.layer_norm2 = torch.nn.LayerNorm(width)
        self.attention2 = AlphaMultiHeadAttention(width, emb_dim)
        self.drop2 = torch.nn.Dropout()

    def forward(self, x: torch.Tensor, e: torch.Tensor):
        """Runs the two attention stages.

        Args:
            x (torch.Tensor): Decoder features.
            e (torch.Tensor): Embedding attended to by the second stage.

        Returns:
            torch.Tensor: Updated decoder features.
        """
        # Stage 1: masked self-attention on the normed features.
        x = self.layer_norm1(x)
        x = x + self.drop1(self.attention1(x, x, mask=True))
        # Stage 2: unmasked attention over the embedding.
        x = self.layer_norm2(x)
        x = x + self.drop2(self.attention2(x, e, mask=False))
        return x


class PolicyHeadCore(torch.nn.Module):
    """Token decoder of the policy head.

    Tokens are embedded, position-encoded, passed through a stack of
    ``PolicyHeadDoubleAttention`` layers and finally projected to logits.
    """

    def __init__(
        self,
        emb_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_feat: int = 64,
        n_heads: int = 32,
        n_layers: int = 2,
    ):
        """
        Args:
            emb_size (int): Forwarded to the decoder layers.
            emb_dim (int): Size of the last dimension of the torso embedding.
            n_steps (int): Forwarded to the decoder layers.
            n_logits (int): Size of the token vocabulary / output logits.
            n_feat (int): Number of features per head.
            n_heads (int): Number of heads; the model width is
                ``n_feat * n_heads``.
            n_layers (int): Number of decoder layers.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(n_logits, n_feat * n_heads)
        self.position_encoding = PositionEncoding(n_feat * n_heads)
        self.decoders = torch.nn.ModuleList(
            [
                PolicyHeadDoubleAttention(
                    n_steps, n_heads, n_feat, emb_size, emb_dim
                )
                for _ in range(n_layers)
            ]
        )
        self.relu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(n_feat * n_heads, n_logits)

    def forward(self, a: torch.Tensor, e: torch.Tensor):
        """Decodes a batch of token sequences.

        Args:
            a (torch.Tensor): Integer tokens of shape (N, n_tokens).
            e (torch.Tensor): Torso embedding attended to by the decoders.

        Returns:
            Tuple of the logits, shape (N, n_tokens, n_logits), and the
            features they were computed from, shape
            (N, n_tokens, n_feat * n_heads).
        """
        tokens = self.embedding(a)
        # The embedded tokens are batch-first, whereas the position encoding
        # indexes positions along dim 0. Swap the first two dimensions around
        # the call so that the encoding follows the token position instead of
        # the row of the batch.
        x = self.position_encoding(tokens.transpose(0, 1)).transpose(0, 1)
        for layer in self.decoders:
            x = layer(x, e)
        o = self.linear2(self.relu(x))
        return o, x


def sample_from_logits(a):
    """Samples one class per row and returns it with its probability.

    The input holds unnormalised logits (the cross entropy used during
    training applies the softmax itself), so a softmax is applied here
    before sampling by inverting the cumulative distribution.

    Args:
        a (torch.Tensor): Logits of shape (batch, n_classes).

    Returns:
        Tuple ``(indexes, probs)``, both of shape (batch,): the sampled
        class indexes and the softmax probability of each sampled class.
    """
    pmf = F.softmax(a, dim=-1)
    cdf = torch.cumsum(pmf, dim=-1)
    random_vals = torch.rand(cdf.shape[0]).unsqueeze(-1).to(a.device)
    n_classes = a.shape[-1]
    # The first class whose cumulative probability exceeds the random draw.
    above = cdf > random_vals
    new_a_idx = torch.argmax(above.float(), dim=-1)
    # Rounding can leave the last cumulative value marginally below the
    # draw; in that case no entry is above it and the last class is meant.
    new_a_idx = torch.where(
        above.any(dim=-1),
        new_a_idx,
        torch.full_like(new_a_idx, n_classes - 1),
    )
    # Probability of the sampled class itself, not the cumulative one.
    probs = pmf.gather(-1, new_a_idx.unsqueeze(-1)).squeeze(-1)
    return new_a_idx, probs


class PolicyHead(torch.nn.Module):
    """Policy head: teacher-forced logits in training, sampling otherwise."""

    def __init__(
        self,
        emb_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        """
        Args:
            emb_size (int): Forwarded to ``PolicyHeadCore``.
            emb_dim (int): Forwarded to ``PolicyHeadCore``.
            n_steps (int): Number of tokens generated for one action.
            n_logits (int): Number of possible values of a token.
            n_samples (int): Number of actions sampled per input at
                inference time.
        """
        super().__init__()
        self.n_logits = n_logits
        self.n_samples = n_samples
        self.n_steps = n_steps
        self.core = PolicyHeadCore(emb_size, emb_dim, n_steps, n_logits)

    def _train_forward(self, e: torch.Tensor, g: torch.Tensor):
        """Computes the logits for the target actions.

        Args:
            e (torch.Tensor): Embedding of shape (N, m, c).
            g (torch.Tensor): Target action tokens of shape (N, N_steps).

        Returns:
            Tuple of the logits and the decoder features at position 0.
        """
        # The targets are rolled by one position along the step dimension
        # before being fed to the decoder, so that the token being predicted
        # is not the one given as input at the same position.
        shifted_targets = torch.roll(g, shifts=-1, dims=1)
        logits, features = self.core(shifted_targets, e)
        return logits, features[:, 0]

    def _eval_forward(self, e: torch.Tensor):
        """Samples ``n_samples`` actions for every embedding, token by token.

        Args:
            e (torch.Tensor): Embedding of shape (N, m, c).

        Returns:
            Tuple ``(actions, probs, features)``: the sampled tokens with
            shape (N, n_samples, n_steps), the product of the per-token
            probabilities with shape (N, n_samples), and the position-0
            decoder features of the last decoding pass averaged over the
            samples.
        """
        bs = e.shape[0]
        device = e.device
        tokens = torch.zeros((bs, self.n_samples, self.n_steps)).long()
        tokens = tokens.to(device).view(-1, self.n_steps)
        probs = torch.ones((bs, self.n_samples)).to(device).view(-1)
        # One copy of the embedding per sample, samples folded into the batch.
        e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)
        e = e.view(-1, e.shape[-2], e.shape[-1])

        for step in range(self.n_steps):
            logits, features = self.core(tokens[:, : step + 1], e)
            sampled, step_probs = sample_from_logits(logits[:, step])
            tokens[:, step] = sampled
            probs *= step_probs

        first_features = features[:, 0].view(
            bs, self.n_samples, *features.shape[2:]
        )
        return (
            tokens.view(bs, self.n_samples, self.n_steps),
            probs.view(bs, self.n_samples),
            first_features.mean(1),
        )

    def forward(self, e: torch.Tensor, g: torch.Tensor = None):
        """Uses the training path when targets ``g`` are given, else samples."""
        if g is not None:
            return self._train_forward(e, g)
        return self._eval_forward(e)


class ValueHeadCore(torch.nn.Module):
    """Linear layer followed by a ReLU."""

    def __init__(self, input_size: int, output_size: int):
        """
        Args:
            input_size (int): Number of input features.
            output_size (int): Number of output features.
        """
        super().__init__()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.relu = torch.nn.ReLU()

    def forward(self, x: torch.Tensor):
        """Returns ``relu(linear(x))``."""
        return self.relu(self.linear(x))


class ValueHead(torch.nn.Module):
    """Value head: three ``ValueHeadCore`` layers and a final linear layer."""

    def __init__(
        self, input_size: int, hidden_size: int = 512, output_size: int = 8
    ):
        """
        Args:
            input_size (int): Number of input features.
            hidden_size (int): Width of the hidden layers.
            output_size (int): Number of outputs (predicted quantiles).
        """
        super().__init__()
        # Every hidden layer is built separately: replicating a single
        # module with ``[module] * 2`` would register the same instance
        # twice and make the two layers share their weights. The state-dict
        # keys (layers.0 / layers.1 / layers.2) are unchanged.
        self.layers = torch.nn.Sequential(
            ValueHeadCore(input_size, hidden_size),
            ValueHeadCore(hidden_size, hidden_size),
            ValueHeadCore(hidden_size, hidden_size),
        )
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor):
        """Maps features of size ``input_size`` to ``output_size`` values."""
        return self.linear(self.layers(x))


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/torso.py
================================================
import torch

from open_alpha_tensor.core.modules.attention import AlphaMultiHeadAttention


class TorsoAttentiveModes(torch.nn.Module):
    """Mixes the three mode tensors pairwise through one shared attention."""

    def __init__(self, input_dim: int):
        """
        Args:
            input_dim (int): Channel size c of the mode tensors.
        """
        super().__init__()
        self.attention = AlphaMultiHeadAttention(
            input_dim,
            input_dim,
        )

    def forward(self, x1, x2, x3):
        """Updates the three modes pair by pair.

        Args:
            x1, x2, x3: Mode tensors, each of size (N, S, S, c) where N is
                the batch size.

        Returns:
            List with the three updated mode tensors.
        """
        size = x1.shape[-2]
        modes = [x1, x2, x3]
        # Pairs are processed in order; later pairs see earlier updates.
        for first, second in ((0, 1), (2, 0), (1, 2)):
            # Concatenated pair has size (N, S, 2S, c).
            pair = torch.cat([modes[first], modes[second]], dim=-2)
            attended = self.attention(pair, pair)
            modes[first] = attended[:, :, :size]
            modes[second] = attended[:, :, size:]
        return modes


class TorsoModel(torch.nn.Module):
    """Torso model of OpenAlphaTensor.

    Maps an input tensor of shape (N, T, S, S, S) to an embedding of shape
    (N, 3S*S, c), where:

        N is the batch size;
        T is the context size (size of the history + 1);
        S is the number of elements in each matrix to be multiplied;
        c is the output dimensionality.
    """

    def __init__(
        self,
        scalars_size: int,
        input_size: int,
        tensor_length: int,
        out_size: int,
    ):
        """
        Args:
            scalars_size (int): s, size of the scalar input.
            input_size (int): S, size of each tensor mode.
            tensor_length (int): T, context size.
            out_size (int): c, output dimensionality.
        """
        super().__init__()
        n_modes = 3
        # Project the scalars to one S x S plane per mode.
        self.linears_1 = torch.nn.ModuleList(
            [
                torch.nn.Linear(scalars_size, input_size * input_size)
                for _ in range(n_modes)
            ]
        )
        # Project the S*T tensor features plus the scalar plane to c.
        self.linears_2 = torch.nn.ModuleList(
            [
                torch.nn.Linear(input_size * tensor_length + 1, out_size)
                for _ in range(n_modes)
            ]
        )
        self.attentive_modes = torch.nn.ModuleList(
            [TorsoAttentiveModes(out_size) for _ in range(8)]
        )

    def forward(self, x: torch.Tensor, scalars: torch.Tensor):
        """Embeds the tensor history and the scalars.

        Args:
            x (torch.Tensor): Input of size (N, T, S, S, S).
            scalars (torch.Tensor): Scalars of size (N, s).

        Returns:
            torch.Tensor: Embedding of size (N, 3*S*S, c).
        """
        batch_size = x.shape[0]
        T = x.shape[1]
        S = x.shape[-1]
        # One axis ordering per mode; the history axis always goes last so
        # that it can be merged with the trailing tensor axis.
        axis_orders = ((0, 2, 3, 4, 1), (0, 4, 2, 3, 1), (0, 3, 4, 2, 1))
        modes = []
        for i, order in enumerate(axis_orders):
            grid = x.permute(*order).reshape(batch_size, S, S, S * T)
            scalar_plane = self.linears_1[i](scalars).reshape(
                batch_size, S, S, 1
            )
            features = torch.cat([grid, scalar_plane], dim=-1)
            modes.append(self.linears_2[i](features))
        x1, x2, x3 = modes
        for layer in self.attentive_modes:
            x1, x2, x3 = layer(x1, x2, x3)
        stacked = torch.stack([x1, x2, x3], dim=2)
        return stacked.reshape(batch_size, 3 * S * S, -1)


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/training.py
================================================
from pathlib import Path
from typing import Tuple, List

import torch.optim
import tqdm
from torch.utils.data import DataLoader

from open_alpha_tensor.config import (
    BASE_CHECKPOINT_DATA_DIR,
    BASE_CHECKPOINT_DIR,
)
from open_alpha_tensor.core.actors.stage import actor_prediction
from open_alpha_tensor.core.data.basis_change import ChangeOfBasis
from open_alpha_tensor.core.data.dataset import TensorGameDataset
from open_alpha_tensor.core.data.generation import f_prob_distribution
from open_alpha_tensor.core.data.utils import map_action_to_triplet
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel


@torch.no_grad()
def _single_act(
    actor_id: int,
    model: torch.nn.Module,
    input_tensor: torch.Tensor,
    device: str,
    mc_n_sim: int,
    N_bar: int,
    cob: ChangeOfBasis,
    max_rank: int,
):
    """Plays one episode with a single actor on the given device.

    The function is meant to be dispatched several times in parallel, each
    call with its own actor id. The model and the change of basis are moved
    to ``device`` and the produced game is moved back to the CPU.

    Args:
        actor_id (int): The id of the actor.
        model (torch.nn.Module): The model used to take the action.
        input_tensor (torch.Tensor): State of the game.
        device (str): The name of the torch device the actor runs on.
        mc_n_sim (int): Number of simulations during Monte Carlo tree search.
        N_bar (int): N_bar parameter used to compute tau when improving the
        policy.
        cob (ChangeOfBasis): The change of basis applied to the input
        tensor.
        max_rank (int): The maximum matrix rank achieved by the actor before
        tree search is stopped.

    Returns:
        Tuple ``(actor_id, states, policies, rewards)`` with all tensors on
        the CPU.
    """
    print(f"Acting with actor {actor_id}")
    model.to(device)
    cob.device = device
    input_tensor_cob = cob(input_tensor.to(device))
    states, policies, rewards = actor_prediction(
        model, input_tensor_cob, max_rank, mc_n_sim, N_bar
    )
    print(f"Actor {actor_id} finished")
    cpu_states = [state.to("cpu") for state in states]
    return actor_id, cpu_states, policies.to("cpu"), rewards.to("cpu")


def swap_data(
    states: List[torch.Tensor],
    actions: List[torch.Tensor],
):
    """Swaps the last action with a random earlier one and recomputes the
    states that follow the swapped position, for a single game.

    The inputs are left untouched: the swap is performed on copies, so that
    a game already stored elsewhere (e.g. as the best game) is not altered.

    Args:
        states (List[torch.Tensor]): All the states for a single game.
        actions (List[torch.Tensor]): All the actions through the game
            (a list of tensors or a single stacked tensor).

    Returns:
        Tuple ``(states, actions)`` holding the augmented game.
    """
    # Work on copies; the state tensors themselves are only replaced, never
    # modified in place, so a shallow copy of the list is enough.
    states = list(states)
    if isinstance(actions, torch.Tensor):
        actions = actions.clone()
    else:
        actions = list(actions)
    if len(states) < 2:
        # There is no earlier position to swap the last action with.
        return states, actions

    swap_index = torch.randint(0, len(states) - 1, (1,)).item()
    # Indexing a tensor returns a view: without the clone the saved last
    # action would be overwritten by the first assignment below.
    last_action = actions[-1].clone()
    actions[-1] = actions[swap_index]
    actions[swap_index] = last_action

    actual_state = states[swap_index]
    # states[i] is the result of applying actions[i - 1] to states[i - 1],
    # so only the states after the swapped position have to be rebuilt.
    for i in range(swap_index + 1, len(states)):
        prev_action = actions[i - 1]
        triplet = map_action_to_triplet(
            prev_action, vector_size=actual_state.shape[-1]
        )
        # NOTE(review): the slice width is derived from the state's last
        # dimension rather than from the decoded triplet - confirm that it
        # matches the layout returned by map_action_to_triplet.
        vector_size = actual_state.shape[-1] // 3
        bs = actual_state.shape[0]
        u = triplet[:, :vector_size].reshape(bs, -1, 1, 1)
        v = triplet[:, vector_size : 2 * vector_size].reshape(  # noqa E203
            bs, 1, -1, 1
        )
        w = triplet[:, 2 * vector_size :].reshape(bs, 1, 1, -1)  # noqa E203
        reduced_state = u * v * w
        # Keep the history axis so the result can be concatenated with the
        # remaining history entries along dim 1.
        fut_state = (actual_state[:, 0] - reduced_state).unsqueeze(1)
        new_state = actual_state[:, 1:].roll(1, dims=1)
        new_state[:, 0] = reduced_state
        actual_state = torch.cat([fut_state, new_state], dim=1)
        states[i] = actual_state
    return states, actions


class Trainer:
    """Trainer for the AlphaTensor model. The trainer does not require an
    explicit loss since the loss is computed by the model itself. The trainer
    is responsible for both the training step and the acting one, storing
    acting performance in a buffer.
    """

    def __init__(
        self,
        model: AlphaTensorModel,
        tensor_size: int,
        n_steps: int,
        batch_size: int,
        optimizer: torch.optim.Optimizer,
        device: str,
        len_data: int,
        pct_synth: float,
        n_synth_data: int,
        limit_rank: int,
        n_cob: int,
        cob_prob: float,
        data_augmentation: bool,
        loss_params: Tuple[float, float] = None,
        random_seed: int = None,
        checkpoint_dir: str = None,
        checkpoint_data_dir: Path = None,
        extra_devices: List[str] = None,
    ):
        """Initializes the trainer.

        Args:
            model (AlphaTensorModel): The model to train.
            tensor_size (int): Flattened size of the matrices to be multiplied.
            n_steps (int): Number of steps used to get a single action out of
            a triplet.
            batch_size (int): Batch size.
            optimizer (torch.optim.Optimizer): The optimizer used to train the
            model.
            device (str): The name of the torch device used for training.
            len_data (int): Number of training samples used (both actor
            generated and synthetic).
            pct_synth (float): Initial percentage of synthetic samples used
            for training.
            n_synth_data (int): Number of synthetic training samples.
            limit_rank (int): Maximum rank for synthetically-generated
            matrices.
            n_cob (int): Number of change of basis (cob) used for a single
            training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
            operation of an episode with another operation.
            loss_params (Tuple[float, float]): Alpha and Beta parameters used
            in the loss function. Both default to 1 when omitted.
            random_seed (int): Randomizing seed.
            checkpoint_dir (str): Directory used to store model checkpoints.
            Falls back to ``BASE_CHECKPOINT_DIR``.
            checkpoint_data_dir (str or Path): Directory used to store games
            as JSON files. Falls back to ``BASE_CHECKPOINT_DATA_DIR``.
            extra_devices (List[str]): Extra devices names used for multi-GPU
            training.
        """
        self.model = model
        self.optimizer = optimizer
        self.device = device
        self.dataset = TensorGameDataset(
            len_data,
            pct_synth,
            tensor_size,
            n_synth_data,
            limit_rank,
            f_prob_distribution,
            device=device,
            n_steps=n_steps,
            action_memory_len=(model.tensor_length - 1),
            random_seed=random_seed,
        )
        self.batch_size = batch_size
        self.max_rank = limit_rank
        if loss_params is None:
            self.alpha = 1
            self.beta = 1
        else:
            self.alpha, self.beta = loss_params
        self.checkpoint_dir = Path(
            checkpoint_dir if checkpoint_dir else BASE_CHECKPOINT_DIR
        )
        self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
        # Always coerce to Path, as done for checkpoint_dir: a plain string
        # is a documented input and has no ``mkdir`` method.
        self.checkpoint_data_dir = Path(
            checkpoint_data_dir
            if checkpoint_data_dir
            else BASE_CHECKPOINT_DATA_DIR
        )
        self.checkpoint_data_dir.mkdir(exist_ok=True, parents=True)
        self.change_of_basis = ChangeOfBasis(
            tensor_size, n_cob, cob_prob, device, random_seed
        )
        self.data_augmentation = data_augmentation
        self.extra_devices = extra_devices

    def train_step(self):
        """Executes a single training step by optimizing the current model
        parameters over one pass of the dataset."""
        self.dataset.recompute_synthetic_indexes()
        self.model.train()
        total_loss = 0
        dl = DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)
        print("Training AlphaTensor")
        for states, scalars, policies, rewards in tqdm.tqdm(dl):
            # The model returns its own losses when targets are provided.
            loss_policy, loss_value = self.model(
                states, scalars, policies, rewards
            )
            loss = self.alpha * loss_policy + self.beta * loss_value
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
        print(f"Total loss: {total_loss}")

    @torch.no_grad()
    def act_step(
        self,
        input_tensor: torch.Tensor,
        n_games: int,
        mc_n_sim: int,
        N_bar: int,
    ):
        """Runs actors to generate multiple games starting from the same
        input tensor and stores them in the dataset. Actors run in parallel
        on the extra devices when these are configured, sequentially on the
        training device otherwise.

        Args:
            input_tensor (torch.Tensor): The input tensor used to generate the
            games.
            n_games (int): Number of games to generate / actors to be run in
            parallel.
            mc_n_sim (int): Number of simulations used in the Monte Carlo tree
            search.
            N_bar (int): N_bar parameter used to compute tau when improving
            the policy.
        """
        self.model.eval()
        best_reward = -1e10
        best_game = None

        if self.extra_devices:
            from joblib import Parallel, delayed

            # this means that there is an empty GPU available
            # thus we can use it to parallelize the acting step
            # use joblib to parallelize the acting step
            # we should use _single_act as a function to be parallelized
            # Assign one device per game, cycling through the extra devices.
            extra_devices = (
                self.extra_devices * (n_games // len(self.extra_devices))
                + self.extra_devices[: n_games % len(self.extra_devices)]
            )
            self.model.to("cpu")
            input_tensor = input_tensor.to("cpu")

            print(f"Starting acting phase with {n_games} games")
            results = Parallel(n_jobs=len(self.extra_devices))(
                delayed(_single_act)(
                    actor_id,
                    self.model,
                    input_tensor,
                    extra_devices[actor_id],
                    mc_n_sim,
                    N_bar,
                    self.change_of_basis,
                    self.max_rank,
                )
                for actor_id in range(n_games)
            )
            self.model.to(self.device)

            for actor_id, states, policies, rewards in results:
                if rewards[-1] > best_reward:
                    print(f"New best actor! Actor: {actor_id}")
                    best_reward = rewards[-1]
                    best_game = (states, policies, rewards)
                self.dataset.add_game(states, policies, rewards)
                if self.data_augmentation:
                    states, policies = swap_data(states, policies)
                    self.dataset.add_game(states, policies, rewards)
            if best_game is not None:
                self.dataset.add_best_game(*best_game)
        else:
            for actor_id in range(n_games):
                input_tensor_cob = self.change_of_basis(input_tensor).to(
                    self.device
                )
                print(f"Running actor {actor_id} / {n_games}")
                states, policies, rewards = actor_prediction(
                    self.model,
                    input_tensor_cob,
                    self.max_rank,
                    mc_n_sim,
                    N_bar,
                )
                print(
                    f"Actor {actor_id} finished. Final reward: {rewards[-1]}"
                )
                if rewards[-1] > best_reward:
                    print("New best actor!")
                    best_reward = rewards[-1]
                    best_game = (states, policies, rewards)
                self.dataset.add_game(states, policies, rewards)
                if self.data_augmentation:
                    states, policies = swap_data(states, policies)
                    self.dataset.add_game(states, policies, rewards)
            if best_game is not None:
                self.dataset.add_best_game(*best_game)

    def train(
        self,
        n_epochs: int,
        n_games: int,
        mc_n_sim: int,
        N_bar: int,
        initial_lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        starting_epoch: int = 0,
    ):
        """Trains the model for a given number of epochs.

        Every epoch runs one training step; an acting step is run every 10
        epochs and a checkpoint is written every 100 epochs. Training stops
        early as soon as the dataset reports that the games are good.

        Args:
            n_epochs (int): Number of training epochs.
            n_games (int): Number of games to generate / actors to be run in
            parallel at each step.
            mc_n_sim (int): Number of simulations used in the Monte Carlo tree
            search at each step.
            N_bar (int): N_bar parameter used to compute tau when improving
            the policy.
            initial_lr (float): Initial learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            starting_epoch (int, optional): Epoch from which to start / resume
            training.
        """
        self.model = self.model.to(self.device)
        # When resuming past the thresholds below, apply right away the
        # changes that the loop would otherwise have applied on the way.
        if starting_epoch + 1 > n_epochs // 50:
            self.dataset.change_training_split(0.7, 0.05)
        if (
            starting_epoch + 1 > n_epochs // 10
        ):  # when restarting from a checkpoint
            mc_n_sim = mc_n_sim * 4
        for epoch in range(starting_epoch, n_epochs):
            if epoch + 1 == n_epochs // 50:
                self.dataset.change_training_split(0.7, 0.05)
            if epoch + 1 == n_epochs // 10:
                mc_n_sim = mc_n_sim * 4
            # apply learning rate decay each epoch if epoch < lr_decay_steps
            if 0 < epoch < lr_decay_steps - 1:
                lr = initial_lr * lr_decay_factor ** (epoch / lr_decay_steps)
                for param_group in self.optimizer.param_groups:
                    param_group["lr"] = lr

            print(f"Epoch {epoch} / {n_epochs}")
            self.train_step()
            if epoch % 10 == 0:
                self.act_step(
                    self.dataset.input_tensor, n_games, mc_n_sim, N_bar
                )
            # save checkpoint
            if (epoch + 1) % 100 == 0:
                checkpoint_name = f"checkpoint_{epoch + 1}.pt"
                checkpoint = {
                    "model_state_dict": self.model.state_dict(),
                    "optimizer_state_dict": self.optimizer.state_dict(),
                }
                torch.save(
                    checkpoint,
                    self.checkpoint_dir / checkpoint_name,
                )
                self.dataset.save_game_data(self.checkpoint_data_dir)
            # exit strategy
            if self.dataset.games_are_good():
                break
        print("Training finished")


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/__init__.py
================================================


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/checkpoint_op.py
================================================
from pathlib import Path
from typing import Any

import torch
from nebullvm.operations.base import Operation

from open_alpha_tensor.config import (
    BASE_CHECKPOINT_DATA_DIR,
    BASE_CHECKPOINT_DIR,
)
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
from open_alpha_tensor.core.training import Trainer


def optimizer_to(optim: torch.optim.Optimizer, device: str):
    """Moves the tensors held in an optimizer's state to ``device`` in place.

    Handles tensors stored directly in ``optim.state`` as well as tensors
    stored one level down, inside per-parameter state dictionaries. The
    gradients attached to those tensors, if any, are moved as well.

    Args:
        optim (torch.optim.Optimizer): Optimizer whose state is relocated.
        device (str): Name of the target torch device.
    """

    def _relocate(tensor: torch.Tensor):
        # Move the storage (and the gradient, when present) in place.
        tensor.data = tensor.data.to(device)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(device)

    for entry in optim.state.values():
        # Not sure there are any global tensors in the state dict
        if isinstance(entry, torch.Tensor):
            _relocate(entry)
        elif isinstance(entry, dict):
            for value in entry.values():
                if isinstance(value, torch.Tensor):
                    _relocate(value)


class LoadCheckPointOp(Operation):
    """An operation which loads a checkpoint during training of an
    OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._last_epoch = None
        self._model = None
        self._optimizer = None

    @staticmethod
    def _epoch_from_path(checkpoint_path: Path):
        """Return the epoch encoded in a ``<prefix>_<epoch>.pt`` file name.

        Returns ``None`` when the last ``_``-separated token of the stem is
        not an integer, i.e. when the file is not an epoch checkpoint.
        """
        try:
            return int(checkpoint_path.stem.split("_")[-1])
        except ValueError:
            return None

    def execute(
        self,
        model: AlphaTensorModel,
        optimizer: torch.optim.Optimizer,
        checkpoint_dir: str,
    ):
        """Load the most recent checkpoint found in a directory.

        The checkpoint with the highest epoch number in its file name is
        restored into ``model`` and ``optimizer``; both are kept on the
        device the model was on before loading. When no epoch checkpoint is
        available the model and optimizer are left untouched and the last
        epoch is set to 0.

        Args:
            model: The model to load the checkpoint into.
            optimizer: The optimizer to load the checkpoint into.
            checkpoint_dir: The directory to load the checkpoint from. A
                falsy value selects ``BASE_CHECKPOINT_DIR``.
        """
        checkpoint_dir = Path(checkpoint_dir or BASE_CHECKPOINT_DIR)

        # Collect (epoch, path) pairs, ignoring ``.pt`` files that are not
        # epoch checkpoints (e.g. a ``final_model.pt`` saved in the same
        # directory), which would otherwise break the integer parsing.
        checkpoints = []
        if checkpoint_dir.exists():
            for path in checkpoint_dir.glob("*.pt"):
                epoch = self._epoch_from_path(path)
                if epoch is not None:
                    checkpoints.append((epoch, path))

        if checkpoints:
            last_epoch, checkpoint_path = sorted(
                checkpoints, key=lambda item: item[0]
            )[-1]
            print(f"Loading checkpoint from {checkpoint_path}")
            old_device = model.device
            # Map the stored tensors straight onto the model's device so a
            # checkpoint written on another device (e.g. a GPU that is not
            # available here) can still be deserialized.
            checkpoint = torch.load(checkpoint_path, map_location=old_device)
            model.load_state_dict(checkpoint["model_state_dict"])
            model.to(old_device)
            print(f"Loaded model to {old_device}")
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            optimizer_to(optimizer, old_device)
        else:
            last_epoch = 0

        self._last_epoch = last_epoch
        self._model = model
        self._optimizer = optimizer

    def get_last_epoch(self) -> int:
        """Returns the last epoch of the loaded checkpoint (0 if none)."""
        return self._last_epoch

    def get_model(self) -> AlphaTensorModel:
        """Returns the model loaded from the checkpoint."""
        return self._model

    def get_optimizer(self) -> torch.optim.Optimizer:
        """Returns the optimizer loaded from the checkpoint."""
        return self._optimizer

    def get_result(self) -> Any:
        """This operation exposes its outputs through the dedicated
        getters; no aggregated result is returned."""
        pass


class LoadCheckpointDataOp(Operation):
    """An operation which loads the games played while training an
    OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._loaded = False

    def execute(self, games_store_dir: Path, trainer: Trainer):
        """Load the games played while training an OpenAlphaTensor model.

        Games are loaded only when ``game_data.json`` exists inside the
        games directory; otherwise the trainer's dataset is left untouched.

        Args:
            games_store_dir: The directory where the games are stored. A
                falsy value selects ``BASE_CHECKPOINT_DATA_DIR``.
            trainer: The trainer to load the games into.
        """
        # Normalise to Path so that both str and Path inputs (and a str
        # default constant) support the path operations below.
        games_store_dir = Path(games_store_dir or BASE_CHECKPOINT_DATA_DIR)
        # The data file can only exist if its parent directory exists, so a
        # single check covers both conditions.
        has_games = (games_store_dir / "game_data.json").exists()
        if has_games:
            trainer.dataset.load_games(games_store_dir)
        # Record what actually happened instead of unconditionally
        # reporting success.
        self._loaded = has_games

    def get_result(self) -> bool:
        """Returns whether the games were loaded or not."""
        return self._loaded


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/model_op.py
================================================
import json
from pathlib import Path
from typing import Any

import torch
from nebullvm.operations.base import Operation

from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel


class BuildModelOp(Operation):
    """Operation that instantiates an OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._model = None

    def execute(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        """Create the OpenAlphaTensor model and keep it on the operation.

        Every argument is forwarded unchanged to ``AlphaTensorModel``.

        Args:
            tensor_length (int): Number of tensors used as history.
            input_size (int): Flattened size of the matrices to be multiplied.
            scalars_size (int): Size of the scalar vectors fed to the torso
            model.
            emb_dim (int): Embedding dimension.
            n_steps (int): Number of steps used to get a single action out of
            a triplet.
            n_logits (int): Number of logits output by the policy head.
            n_samples (int): Number of samples used by the policy head at
            evaluation time.
        """
        model_kwargs = dict(
            tensor_length=tensor_length,
            input_size=input_size,
            scalars_size=scalars_size,
            emb_dim=emb_dim,
            n_steps=n_steps,
            n_logits=n_logits,
            n_samples=n_samples,
        )
        self._model = AlphaTensorModel(**model_kwargs)

    def get_model(self) -> AlphaTensorModel:
        """Returns the model created by ``execute`` (``None`` before it
        has run)."""
        return self._model

    def get_result(self) -> Any:
        """No aggregated result is exposed; use ``get_model`` instead."""
        return None


class BuildOptimizerOp(Operation):
    """Operation that creates the optimizer used to train an OpenAlphaTensor
    model."""

    def __init__(self):
        super().__init__()
        self._optimizer = None

    def execute(
        self,
        optimizer_name: str,
        model: AlphaTensorModel,
        lr: float,
        weight_decay: float,
    ):
        """Create the optimizer for the OpenAlphaTensor model.

        Args:
            optimizer_name (str): Name of the optimizer used. One of
            "adam", "adamw" or "sgd".
            model (AlphaTensorModel): OpenAlphaTensor model to be trained.
            lr (float): Learning rate.
            weight_decay (float): Weight decay used by the optimizer. It is
            only forwarded when the optimizer is "adamw".

        Raises:
            ValueError: If ``optimizer_name`` is not a supported optimizer.
        """
        # Reject unknown names first, before touching the model parameters.
        if optimizer_name not in ("adam", "adamw", "sgd"):
            raise ValueError(f"Optimizer {optimizer_name} not supported")

        if optimizer_name == "adamw":
            built = torch.optim.AdamW(
                model.parameters(), lr=lr, weight_decay=weight_decay
            )
        else:
            # Adam and SGD are configured with the learning rate only.
            if optimizer_name == "adam":
                optimizer_cls = torch.optim.Adam
            else:
                optimizer_cls = torch.optim.SGD
            built = optimizer_cls(model.parameters(), lr=lr)
        self._optimizer = built

    def get_optimizer(self) -> torch.optim.Optimizer:
        """Returns the optimizer created by ``execute`` (``None`` before it
        has run)."""
        return self._optimizer

    def get_result(self) -> Any:
        """No aggregated result is exposed; use ``get_optimizer`` instead."""
        return None


class SaveModelOp(Operation):
    """An operation which saves an OpenAlphaTensor model.
    The model parameters are stored in a json file, while the model weights
    are stored in a .pt file."""

    def get_result(self) -> Any:
        """This operation only writes files; it has no result."""
        pass

    def execute(
        self,
        model: AlphaTensorModel,
        save_dir: str,
    ):
        """Saves the OpenAlphaTensor model.

        Two files are written in ``save_dir`` (created if missing):
        ``final_model.pt`` with the model's state dict and
        ``model_params.json`` with the constructor parameters.

        Args:
            model (AlphaTensorModel): OpenAlphaTensor model to be saved.
            save_dir (str): Directory where the model will be saved. A falsy
            value selects the current working directory.
        """
        save_dir = Path(save_dir if save_dir else ".")
        save_dir.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), save_dir / "final_model.pt")
        model_params = {
            "input_size": model.input_size,
            "tensor_length": model.tensor_length,
            # Use the model's own value when it exposes one, so a model
            # built with a non-default scalars size is described correctly;
            # fall back to the historical value 1 otherwise.
            "scalars_size": getattr(model, "scalars_size", 1),
            "emb_dim": model.emb_dim,
            "n_steps": model.n_steps,
            "n_logits": model.n_logits,
            "n_samples": model.n_samples,
        }
        # save parameters in a json file
        with open(save_dir / "model_params.json", "w") as f:
            json.dump(model_params, f)


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/training_op.py
================================================
from pathlib import Path
from typing import Tuple, Any, List

import torch.optim
from nebullvm.operations.base import Operation

from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
from open_alpha_tensor.core.training import Trainer
from open_alpha_tensor.operations.checkpoint_op import LoadCheckpointDataOp


class TrainingOperation(Operation):
    """Operation that runs the training of an AlphaTensor model so that it
    learns more efficient matrix multiplications."""

    def __init__(self):
        super().__init__()
        self._trained_model = None

        self._load_checkpoint_data_op = LoadCheckpointDataOp()

    def execute(
        self,
        model: AlphaTensorModel,
        input_size: int,
        n_steps: int,
        batch_size: int,
        optimizer: torch.optim.Optimizer,
        device: str,
        len_data: int,
        pct_synth: float,
        n_synth_data: int,
        limit_rank: int,
        max_epochs: int,
        n_actors: int,
        mc_n_sim: int,
        N_bar: int,
        last_epoch: int,
        lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        loss_params: Tuple[float, float] = None,
        random_seed: int = None,
        checkpoint_dir: str = None,
        checkpoint_data_dir: str = None,
        n_cob: int = 0,
        cob_prob: float = 0.0,
        data_augmentation: bool = False,
        extra_devices: List[str] = None,
    ):
        """Build a ``Trainer``, restore previously stored games and train.

        The operation runs three steps in order: it creates the trainer,
        asks the checkpoint-data operation to load stored games from the
        games directory, and finally runs the training. The trainer's model
        is then kept as the trained model.

        Args:
            model (AlphaTensorModel): The model to be trained.
            input_size (int): Flattened size of the matrices to be multiplied.
            n_steps (int): Number of steps used to get a single action out of
            a triplet.
            batch_size (int): Batch size.
            optimizer (torch.optim.Optimizer): The optimizer used for training.
            device (str): The name of the torch device used for training.
            len_data (int): Number of training samples used (both actor
            generated and synthetic).
            pct_synth (float): Initial percentage of synthetic samples used
            for training.
            n_synth_data (int): Number of synthetic training samples.
            limit_rank (int): Maximum rank for synthetically-generated
            matrices.
            max_epochs (int): Number of training epochs.
            n_actors (int): Number of actors playing a game at each training
            step.
            mc_n_sim (int): Number of simulations during Monte Carlo tree
            search.
            N_bar (int): N_bar parameter used to compute tau when improving
            the policy.
            last_epoch (int): Latest epoch reached during training from which
            checkpoint data will be loaded.
            lr (float): Learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            loss_params (Tuple[float, float]): Alpha and Beta parameters used
            in the loss function.
            random_seed (int): Randomizing seed.
            checkpoint_dir (str): Directory used to store model checkpoints.
            checkpoint_data_dir (str): Directory used to store games as JSON
            files. A falsy value selects the "games" directory.
            n_cob (int): Number of change of basis (cob) used for a single
            training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
            operation of an episode with another operation.
            extra_devices (List[str]): Extra devices names used for multi-GPU
            training.
        """
        games_dir = Path(checkpoint_data_dir or "games")

        # Step 1: create the trainer.
        trainer_config = dict(
            model=model,
            tensor_size=input_size,
            n_steps=n_steps,
            batch_size=batch_size,
            optimizer=optimizer,
            device=device,
            len_data=len_data,
            pct_synth=pct_synth,
            n_synth_data=n_synth_data,
            limit_rank=limit_rank,
            loss_params=loss_params,
            random_seed=random_seed,
            checkpoint_dir=checkpoint_dir,
            checkpoint_data_dir=games_dir,
            data_augmentation=data_augmentation,
            cob_prob=cob_prob,
            n_cob=n_cob,
            extra_devices=extra_devices,
        )
        trainer = Trainer(**trainer_config)

        # Step 2: restore the games stored by a previous run, if any.
        self._load_checkpoint_data_op.execute(
            games_store_dir=games_dir,
            trainer=trainer,
        )

        # Step 3: run the training, resuming from ``last_epoch``.
        training_schedule = dict(
            n_epochs=max_epochs,
            n_games=n_actors,
            mc_n_sim=mc_n_sim,
            N_bar=N_bar,
            starting_epoch=last_epoch,
            initial_lr=lr,
            lr_decay_factor=lr_decay_factor,
            lr_decay_steps=lr_decay_steps,
        )
        trainer.train(**training_schedule)
        self._trained_model = trainer.model

    def get_trained_model(self):
        """Returns the trained model (``None`` before ``execute`` has
        completed)."""
        return self._trained_model

    def get_result(self) -> Any:
        """No aggregated result is exposed; use ``get_trained_model``."""
        return None


================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/root_op.py
================================================
from typing import Tuple, List

from nebullvm.operations.base import Operation

from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
from open_alpha_tensor.operations.checkpoint_op import LoadCheckPointOp
from open_alpha_tensor.operations.model_op import (
    BuildModelOp,
    SaveModelOp,
    BuildOptimizerOp,
)
from open_alpha_tensor.operations.training_op import TrainingOperation


class TrainAlphaTensorRootOp(Operation):
    """Root operation orchestrating the training of an AlphaTensor model:
    model and optimizer creation, checkpoint loading, training and saving
    of the final model."""

    def __init__(self):
        super().__init__()
        self._model = None
        self._optimizer = None

        self._build_model_op = BuildModelOp()
        self._build_optimizer_op = BuildOptimizerOp()
        self._load_checkpoint_op = LoadCheckPointOp()
        self._training_op = TrainingOperation()
        self._save_model_op = SaveModelOp()

    def execute(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
        optimizer_name: str,
        lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        weight_decay: float,
        loss_params: Tuple[float, float],
        checkpoint_dir: str,
        checkpoint_data_dir: str,
        epochs: int,
        batch_size: int,
        len_data: int,
        n_synth_data: int,
        pct_synth: float,
        limit_rank: int,
        n_actors: int,
        mc_n_sim: int,
        N_bar: int,
        device: str,
        save_dir: str,
        random_seed: int,
        n_cob: int,
        cob_prob: float,
        data_augmentation: bool,
        extra_devices: List[str],
    ):
        """Run the whole AlphaTensor training pipeline.

        The stages are chained: each one runs only if the previous stage
        produced its output (model -> optimizer -> checkpoint -> training ->
        save).

        Args:
            tensor_length (int): Number of step tensors fed to the model
            (history and current state).
            input_size (int): Flattened size of the matrices to be multiplied.
            scalars_size (int): Size of the scalar vectors fed to the torso
            model.
            emb_dim (int): Embedding dimension.
            n_steps (int): Number of steps used to get a single action out of
            a triplet.
            n_logits (int): Number of logits output by the policy head.
            n_samples (int): Number of samples used by the policy head at
            evaluation time.
            optimizer_name (str): Name of the optimizer used.
            lr (float): Learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            weight_decay (float): Weight decay used by the optimizer.
            loss_params (Tuple[float, float]): Alpha and Beta parameters used
            in the loss function.
            checkpoint_dir (str): Directory used to store model checkpoints.
            checkpoint_data_dir (str): Directory used to store games as JSON
            files.
            epochs (int): Number of training epochs.
            batch_size (int): Batch size.
            len_data (int): Number of training samples used (both actor
            generated and synthetic).
            n_synth_data (int): Number of synthetic training samples.
            pct_synth (float): Initial percentage of synthetic samples used
            for training.
            limit_rank (int): Maximum rank for synthetically-generated
            matrices.
            n_actors (int): Number of actors playing a game at each training
            step.
            mc_n_sim (int): Number of simulations during Monte Carlo tree
            search.
            N_bar (int): N_bar parameter used to compute tau when improving
            the policy.
            device (str): The name of the torch device used for training.
            save_dir (str): Directory where the final trained model will be
            stored.
            random_seed (int): Randomizing seed.
            n_cob (int): Number of change of basis (cob) used for a single
            training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
            operation of an episode with another operation.
            extra_devices (List[str]): Extra devices names used for multi-GPU
            training.
        """
        # Stage 1: create the model once and move it to the target device.
        if self._model is None:
            self._build_model_op.execute(
                tensor_length=tensor_length,
                input_size=input_size,
                scalars_size=scalars_size,
                emb_dim=emb_dim,
                n_steps=n_steps,
                n_logits=n_logits,
                n_samples=n_samples,
            )
            self._model = self._build_model_op.get_model().to(device)

        # Stage 2: create the optimizer on the model held by the build op.
        built_model = self._build_model_op.get_model()
        if built_model is not None:
            self._build_optimizer_op.execute(
                optimizer_name=optimizer_name,
                model=built_model,
                lr=lr,
                weight_decay=weight_decay,
            )
            self._optimizer = self._build_optimizer_op.get_optimizer()

        # Stage 3: restore the latest checkpoint (if any) into both.
        if not (self._model is None or self._optimizer is None):
            self._load_checkpoint_op.execute(
                self._model, self._optimizer, checkpoint_dir
            )

        # Stage 4: train, resuming from the epoch of the loaded checkpoint.
        if self._load_checkpoint_op.get_model() is not None:
            self._model = self._load_checkpoint_op.get_model()
            self._optimizer = self._load_checkpoint_op.get_optimizer()
            starting_epoch = self._load_checkpoint_op.get_last_epoch()
            training_kwargs = dict(
                model=self._model,
                input_size=input_size,
                n_steps=n_steps,
                batch_size=batch_size,
                optimizer=self._optimizer,
                device=device,
                len_data=len_data,
                pct_synth=pct_synth,
                n_synth_data=n_synth_data,
                limit_rank=limit_rank,
                max_epochs=epochs,
                n_actors=n_actors,
                mc_n_sim=mc_n_sim,
                N_bar=N_bar,
                last_epoch=starting_epoch,
                lr=lr,
                lr_decay_factor=lr_decay_factor,
                lr_decay_steps=lr_decay_steps,
                loss_params=loss_params,
                random_seed=random_seed,
                checkpoint_dir=checkpoint_dir,
                checkpoint_data_dir=checkpoint_data_dir,
                n_cob=n_cob,
                cob_prob=cob_prob,
                data_augmentation=data_augmentation,
                extra_devices=extra_devices,
            )
            self._training_op.execute(**training_kwargs)

        # Stage 5: persist the trained model.
        trained_model = self._training_op.get_trained_model()
        if trained_model is not None:
            self._model = trained_model
            self._save_model_op.execute(
                model=self._model,
                save_dir=save_dir,
            )

    def get_result(self) -> AlphaTensorModel:
        """Returns the trained torch model"""
        return self._model


================================================
FILE: optimization/open_alpha_tensor/resources/open_alpha_tensor.md
================================================

# Open Source Implementation of DeepMind’s AlphaTensor


Matrix multiplication is a fundamental operation used in many systems, from neural networks to scientific computing routines. Finding efficient and provably correct algorithms for matrix multiplication can have a huge impact on making computation faster and more efficient, but is a very challenging task. The space of possible algorithms is enormous, and traditional methods for discovering algorithms, such as human-designed heuristics or combinatorial search, are often suboptimal.

[DeepMind](https://www.deepmind.com/) recently proposed an AI-based solution for automated search that goes far beyond human intuition. The solution consists of a deep reinforcement learning agent called AlphaTensor, built on top of [AlphaZero](https://www.deepmind.com/blog/alphazero-shedding-new-light-on-chess-shogi-and-go). This agent is trained to play a single-player game, TensorGame, where the goal is to discover computationally efficient algorithms for matrix multiplication.

AlphaTensor is particularly good at handling large matrices by decomposing large matrix multiplications into smaller multiplications. Moreover, AlphaTensor can be used to achieve state-of-the-art performance for matrix multiplication once fine-tuned on a specific hardware device.

AlphaTensor has great potential for accelerating deep learning computing. In deep learning, many time-consuming operations can be mapped to matrix multiplications. By using AlphaTensor to optimize these operations, the overall performance of deep learning models can be significantly improved. 

In this article, we will explore DeepMind's AlphaTensor architecture and algorithm and how it discovers new efficient algorithms by playing the TensorGame. Next, we will examine the [first open-source implementation of AlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor), and unresolved challenges to potentially revolutionize the computational performance of deep learning models with AlphaTensors.

![deepmind-4QVqSh4VvP4-unsplash](https://user-images.githubusercontent.com/83510798/221407730-77526b8f-b363-4716-9945-6ccd518632e5.jpg)

Photo by [DeepMind](https://unsplash.com/@deepmind?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText) on [Unsplash](https://unsplash.com/photos/4QVqSh4VvP4)

# What is DeepMind’s AlphaTensor?

AlphaTensor is a reinforcement learning algorithm based on the AlphaZero algorithm and trained to play a simple one-player game: the TensorGame. This game consists in finding the tensor decomposition of a three-dimensional tensor representing the matrix multiplication.

### Matrix Multiplication Tensor

For non-experts in Matrix Multiplication optimization, it may not be straightforward to understand how an operation, such as a matrix multiplication, can be mapped in a three-dimensional tensor. I will try to explain it in simple words and with examples.

Let’s consider the product `C = A*B`, where for simplicity both A and B are square matrices of size N. The multiplication operation can be mapped to a 3D tensor of shape `(N^2, N^2, N^2)`. The first tensor dimension represents the flattened matrix A, the second dimension the flattened matrix B and the third dimension the flattened matrix C.

The tensor has only binary values (either 1 or 0) for each entry. Note that the tensor represents the multiplication operation, so it is independent of the values of the matrices A and B.

Every entry of the tensor corresponds to the coefficient of the operation. For example, to compute C[1,1], it is necessary to multiply A[1,1] and B[1,1]. Therefore, the tensor entry [0,0,0], which corresponds to A[1,1], B[1,1] and C[1,1], will have value 1. In contrast, to compute C[1,1], A[2,1] is not needed. Thus, the tensor row T[N, :, 0] (A[2,1] has index N in the zero-based, row-major flattening of A) will contain only zeros.

The image below from [DeepMind’s paper](https://www.nature.com/articles/s41586-022-05172-4) shows an example of a tensor for N=2.

<img width="972" alt="Screen Shot 2023-02-26 at 12 33 26 PM" src="https://user-images.githubusercontent.com/83510798/221408016-9228ec6e-1cd6-44f7-a34c-45ad293989fe.png">

As shown in (b) and (c) in the figure above, it is possible to implement an algorithm for computing the product using a decomposition of the 3D tensor. More specifically, the algorithm below can be used for converting a tensor decomposition (the matrices U, V, W) in a matrix multiplication algorithm.

<img width="637" alt="Screen Shot 2023-02-26 at 1 36 10 PM" src="https://user-images.githubusercontent.com/83510798/221410847-74a7a115-4de6-42d6-9969-51124c2e986b.png">

## The TensorGame

The problem of finding efficient algorithms for matrix multiplication is extremely challenging because the number of possible algorithms to consider is much larger than the number of atoms in the universe, even for small instances of matrix multiplication. 

DeepMind converted this problem into a single-player game, and called it the TensorGame. In this game, the player chooses how to combine different entries of matrices to multiply them. A score is assigned based on the number of operations required to achieve the correct multiplication result. The game ends when the zero tensor is reached or when the maximum number of moves has been made. The final factorization is evaluated based on an estimation of the residual rank and certain optimization criteria, such as asymptotic time complexity or practical runtime.

The initial position in the TensorGame corresponds to the Matrix Multiplication Tensor expressed on some random basis.

In each step $t$ of the game, the player writes down three vectors $\vec{u}(t), \vec{v}(t), \vec{w}(t)$, which specify the rank-1 tensor $\vec{u} \otimes \vec{v} \otimes \vec{w}$. The state of the game is updated by subtracting the rank-1 tensor built from the vectors selected by the player:

$$
\tilde{S}_{t+1} = \tilde{S}_{t} - \vec{u} \otimes \vec{v} \otimes \vec{w}
$$

where $\tilde{S}_0$ is the Matrix Multiplication Tensor.

If the game ends in p steps, this means that the Matrix Multiplication Tensor $\tilde S_0$ can be decomposed into p rank-1 tensors $\vec{u} \otimes \vec{v} \otimes \vec{w}$, i.e. it has at most rank p (the rank being the smallest number of rank-1 tensors that sum to the tensor).

The TensorGame can then be interpreted as a rank decomposition algorithm and AlphaTensor can be seen as an algorithm for estimating the rank of the tensor.

## AlphaTensor Architecture

So far we have learned about the TensorGame and clarified how its solution can be seen as a matrix multiplication algorithm. Let’s now explore the main concepts of AlphaTensor, the algorithm used for the game.

AlphaTensor architecture is basically an encoder-decoder Transformer architecture where: 

- the encoder takes as input the game state $\tilde S_t$, the n previous actions taken by the model (usually n=7) and the time index $t$ of the current action. Information is stacked together in a tensor with shape `(n+1, N^2, N^2, N^2)`. This tensor is then reshaped and transformed (using three linear layers) into a tensor of shape `(N^2, N^2, c)` where c is the inner dimension of the model.
- the decoder generates the `n_steps` actions from the embedded vector given by the encoder in an auto-regressive way.  Each action corresponds to a token of the triplets $(\vec{u}, \vec{v}, \vec{w})$ representing one of the triplets decomposing the game tensor (i.e. reducing its rank)

The model is trained by alternating back-propagation and model acting. Model acting is used to generate data that is then used to train the model. In practice, the model is trained with a mixture of synthetically generated data and data generated by the model during acting. The acting step is done by taking a 3D tensor corresponding to a matrix operation and playing `n_actors` games on it. Each actor plays a game either on the standard basis or on an alternative basis (the change of basis is applied with a given probability). The results are then collected and can be used in the training step with the synthetic data.

The acting step is based on AlphaZero's Monte Carlo Tree Search (MCTS), modified to support large action spaces. In short, before choosing the action, `n_sims` paths are explored from the model output with a maximum future exploration of 5 steps. The probabilities generated by the model are then adjusted taking into account the generated paths. Then the action with the most promising future path(s) is chosen to continue the game.

While training the model, the reward is actually a negative reward (penalty). Its absolute value increases with each additional step required to solve the game. If the model takes `m` steps to solve a TensorGame, the reward associated with the game is `r=-m`. If the model is not able to solve the TensorGame in `max_rank` steps, the reward is computed by estimating the rank of the remaining tensor. The rank is estimated as the sum of the ranks of the matrices that compose the tensor. The estimate is an upper bound on the true rank of the tensor.

When fine-tuning the model, the penalty reward at the terminal state should also take into account the latency of the algorithm produced by the model.  The reward formula becomes `rt'=rt+λbt`, where `rt` is the reward scheme described earlier, `bt` is the benchmark reward (non-zero only at the terminal state), and *`λ`* is a user-specified coefficient.

<img width="1347" alt="Screen Shot 2023-02-26 at 1 37 12 PM" src="https://user-images.githubusercontent.com/83510798/221410915-7c57c029-e181-4030-8fb3-f4bd544f6beb.png">

The image above, from DeepMind’s paper, shows the speed-ups (%) of AlphaTensor-discovered algorithms tailored for a GPU and a TPU. Speed-ups are measured relative to standard (e.g. cuBLAS for the GPU) matrix multiplication on the same hardware and compared to the Strassen-square algorithm.

# The Open Source Implementation of DeepMind’s AlphaTensor

[OpenAlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor) is the first open source implementation of AlphaTensor and was developed by [Diego Fiori](https://www.linkedin.com/in/diego-fiori-/). 

Let's discover more about the implementation.

As we discussed earlier, the AlphaTensor architecture is fairly straightforward, based on a standard transformer with an encoder-decoder architecture. The most interesting components of AlphaTensor are the first layer in the encoder part and the way the actions are sampled.

Let’s start with the first encoding layer.

```python
# x.size = (N, T, S, S, S)
# scalars.size = (N, s)
batch_size = x.shape[0]
S = x.shape[-1]
T = x.shape[1]
x1 = x.permute(0, 2, 3, 4, 1).reshape(batch_size, S, S, S * T)
x2 = x.permute(0, 4, 2, 3, 1).reshape(batch_size, S, S, S * T)
x3 = x.permute(0, 3, 4, 2, 1).reshape(batch_size, S, S, S * T)
input_list = [x1, x2, x3]
for i in range(3):
    temp = self.linears_1[i](scalars).reshape(batch_size, S, S, 1)
    input_list[i] = torch.cat([input_list[i], temp], dim=-1)
    input_list[i] = self.linears_2[i](input_list[i])
x1, x2, x3 = input_list
```

In the snippet above, we show how the input tensor is decomposed into three tensors, which are then used as query, key and value inputs of the transformer-layer.

1. Across the three tensor dimensions representing the flattened matrices (A, B, C), the input tensor is flattened along each dimension together with the dimension representing the previous actions. In this way, in each flattened-copy of the input tensor, the selected dimension is an aggregation of the last T-1 values and the actual value, for all the S values of the selected dimension, where S=N^2. Philosophically, it is as if, for each dimension, we focus on what happened in the previous actions in that dimension.
2. The scalars are mapped in three different spaces of dimension S^2, and then reshaped to be concatenated with the tensors obtained at the previous point. Conceptually, the scalars are mapped to an embedding space of dimension S^2, and then the embedded information is chunked into S vectors and stacked together, similar to what happens to text when tokenized.
3. Scalar tokens are concatenated with the restructured input tensor and then given as input to a linear layer for mapping the scalars+channel-history focus information in the internal dimension of the model.

These three steps can be interpreted as a way of giving to the model both information about the scalars (as in the TensorGame time step) and the focus on the previous actions for each channel.

Regarding the way the actions are produced, it is interesting to note that AlphaTensor generates as output the triplet u, v, w, which aims to reduce the tensor rank. The three vectors have size S and since they are concatenated the model has to produce a vector of size 3*S.  AlphaTensor is trained with a RL algorithm, so all possible actions must be expressed in terms of probabilities in an enumerated space, i.e. the model produces a probability over the different actions. This means that each vector in the 3S space should be mapped to a different action. This results in an action space of size |F|^(3S), where |F| is the number of different values that the element of u, v, w can take. Usually the values are restricted to (-2, -1, 0, 1, 2), resulting in a cardinality of 5 elements.

Here comes a major challenge: to generate the action probabilities for a matrix product of matrices of size 5 we would need a memory of 5^75 * 4 bytes, which would mean `~10^44 GB` of memory. Clearly we cannot manage such a large action space. 

How do we solve the problem? To reduce the memory footprint of the action probabilities we can split the triplets into smaller chunks, “tokenize” them, and treat the chunks as generated tokens in the transformer architecture, i.e. the tokens are given as input to the decoder in an auto-regressive way.  In the example above we can split the triplets into 15 chunks, reducing the memory consumption to `15 * 5^(75/15) * 4` bytes, i.e. `187.5 KB`.

```python
def _eval_forward(self, e: torch.Tensor):
    """Sample ``n_samples`` actions token by token with the decoder.

    Args:
        e (torch.Tensor): Encoder output. It is used here as a 3-D tensor
            whose first dimension is the batch size.

    Returns:
        A tuple ``(future_g, ps, z)`` where ``future_g`` has shape
        ``(bs, n_samples, n_steps)`` and holds the sampled token ids,
        ``ps`` has shape ``(bs, n_samples)`` and holds the product of the
        per-step values returned by ``sample_from_logits``, and ``z`` is
        ``z_s[:, 0]`` from the last decoder call, averaged over the
        samples.
    """
    bs = e.shape[0]
    # Token buffer: starts as all zeros and is filled one step at a time.
    future_g = (
        torch.zeros((bs, self.n_samples, self.n_steps)).long().to(e.device)
    )
    # Running product of the per-step probabilities of each sample.
    ps = torch.ones((bs, self.n_samples)).to(e.device)
    # Replicate the encoder output once per sample.
    e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)

    # Fold the sample dimension into the batch dimension so the decoder
    # processes every (batch, sample) pair as an independent sequence.
    future_g = future_g.view(-1, self.n_steps)
    ps = ps.view(-1)
    e = e.view(-1, e.shape[-2], e.shape[-1])
    for i in range(self.n_steps):
        # Feed the tokens generated so far and sample the i-th token from
        # the output at position i.
        o_s, z_s = self.core(future_g[:, : i + 1], e)
        future_g[:, i], p_i = sample_from_logits(o_s[:, i])
        ps *= p_i
    # Restore the separate batch and sample dimensions.
    future_g = future_g.view(bs, self.n_samples, self.n_steps)
    ps = ps.view(bs, self.n_samples)
    return (
        future_g,
        ps,
        # NOTE(review): presumably the features of the first step, later
        # used to compute the value -- confirm against the value head.
        z_s[:, 0].view(bs, self.n_samples, *z_s.shape[2:]).mean(1),
    )

```

Above we show the code snippet for generating the full action. In the code, `self.core` contains the decoder layer and the tensor `e` represents the output of the encoder layer. Zero can be considered as the `<eos>` token in NLP models and the `n_steps` actions representing the `n_steps` chunks are generated in a progressive way. 

The model returns three quantities:

1. The generated actions
2. The probability associated with the full action
3. The logits produced for generating the first action (the first chunk) that will be used for computing the model value.

It is worth spending a few words on the `n_samples` parameter. The parameter is used for the acting step and it allows the model to generate different versions of the triplets which will then be used for exploring the action space in the Monte Carlo Tree Search algorithm used in the Acting process. The `n_samples` different actions are sampled according to the policy generated by the model.

## Acting Step

The trickiest part of the whole algorithm is probably the Acting step used for solving the TensorGame. The algorithm is not explained in depth in the AlphaTensor paper, since it is based on several previous DeepMind papers which are only cited and assumed to be known. Here, I’ll put together all the missing pieces and explain our implementation step by step.

We can organize the acting steps in three different components:

- The Monte-Carlo Tree Search
- The game simulation
- The Improved policy computation

### Monte-Carlo Tree Search (MCTS)

Monte Carlo Tree Search (MCTS) is a widely used artificial intelligence technique for game playing, particularly in board games and video games. The algorithm creates a game tree that simulates potential moves and outcomes and uses random sampling to evaluate the expected reward for each move. The algorithm then repeatedly selects the move with the highest expected reward and continues simulating outcomes until it reaches a terminal state or a specified stopping condition. The simulations are used to estimate the probability of winning for each move and guide the decision-making process. MCTS has been shown to be effective in complex games where the number of possible moves and outcomes is large, and it has been used in successful game-playing AI systems, such as AlphaGo.

In AlphaTensor a modified version of the original MCTS is used. In particular, instead of randomly selecting the action from the whole action space, the action is selected among a subset generated directly by the model (through the `n_samples` presented before). The correction to the policy upgrade is then applied in the **Improved Policy computation** step.

In our implementation, we decided to keep all the information about the Monte-Carlo tree in a dictionary having as key the hash-version of the TensorGame state and as values the information associated with the state itself. Each Monte-Carlo step starts from a node and simulates `n_sim` mini-games, exploring the future with a horizon of 5 moves. If the node has already been explored in previous simulations, `n_sim` is reduced by the number of previous explorations. For each node the number of visits is stored in the `N_s_a` tensor, since this tensor contains the number of visits per node child action (among the ones sampled by the model).

```python
def monte_carlo_tree_search(
    model: torch.nn.Module,
    state: torch.Tensor,
    n_sim: int,
    t_time: int,
    n_steps: int,
    game_tree: Dict,
    state_dict: Dict,
):
    """Runs the monte carlo tree search algorithm.

    Args:
        model (torch.nn.Module): The model to use for the simulation.
        state (torch.Tensor): The initial state.
        n_sim (int): The number of simulations to run.
        t_time (int): The current time step.
        n_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree.
        state_dict (Dict): The dictionary containing the states.

    Returns:
        The child state of ``state`` picked by ``select_future_state``
        once the simulations have been run.
    """
    state_hash = to_hash(extract_present_state(state))
    if state_hash in state_dict:
        # The node was already visited by earlier searches: only run the
        # simulations still missing to reach the requested budget.
        with torch.no_grad():
            N_s_a = state_dict[state_hash][3]
            n_sim -= int(N_s_a.sum())
            n_sim = max(n_sim, 0)

    for _ in range(n_sim):
        simulate_game(model, state, t_time, n_steps, game_tree, state_dict)
    # return next state
    possible_states_dict, _, repetitions, N_s_a, q_values, _ = state_dict[
        state_hash
    ]
    possible_states = _recompose_possible_states(possible_states_dict)
    next_state_idx = select_future_state(
        possible_states, q_values, N_s_a, repetitions, return_idx=True
    )
    next_state = possible_states[next_state_idx]
    return next_state

```

The code above shows our implementation of the algorithm. For the sake of code simplicity, the policy correction is performed in the `simulate_game` function.

### Game Simulation

The `simulate_game` function is responsible for exploring the tree, whose nodes each represent a particular state of the TensorGame. It also runs the model whenever a leaf node is encountered and it stores all node information in the `states_dict` dictionary. Let’s take a closer look at its implementation:

```python
@torch.no_grad()
def simulate_game(
    model,
    state: torch.Tensor,
    t_time: int,
    max_steps: int,
    game_tree: Dict,
    states_dict: Dict,
    horizon: int = 5,
):
    """Simulates a game from a given state.

    The simulation goes through three phases: selection over the nodes
    already stored in ``game_tree``, expansion of the reached leaf using
    the model, and backup of the leaf value along the visited trajectory.

    Args:
        model: The model to use for the simulation.
        state (torch.Tensor): The initial state.
        t_time (int): The current time step.
        max_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree. Maps a state hash to the hashes
            of its children states.
        states_dict (Dict): The states dictionary. Maps a state hash to
            the tuple (possible states, index mapping, repetitions,
            N_s_a, q_values, actions).
        horizon (int): The horizon to use for the simulation.
    """
    idx = t_time
    # Never look further than `horizon` moves ahead of the current step.
    max_steps = min(max_steps, t_time + horizon)
    state_hash = to_hash(extract_present_state(state))
    trajectory = []
    # selection
    while state_hash in game_tree:
        (
            possible_states_dict,
            old_idx_to_new_idx,
            repetition_map,
            N_s_a,
            q_values,
            actions,
        ) = states_dict[state_hash]
        possible_states = _recompose_possible_states(possible_states_dict)
        state_idx = select_future_state(
            possible_states, q_values, N_s_a, repetition_map, return_idx=True
        )
        trajectory.append((state_hash, state_idx))  # state_hash, action_idx
        future_state = extract_present_state(possible_states[state_idx])
        state = possible_states[state_idx]
        state_hash = to_hash(future_state)
        idx += 1

    # expansion
    if idx <= max_steps:
        trajectory.append((state_hash, None))
        # NOTE(review): when the game is already finished at this leaf,
        # `leaf_q_value` is never assigned before the backup below --
        # confirm how this case is meant to be handled.
        if not game_is_finished(extract_present_state(state)):
            state = state.to(model.device)
            scalars = get_scalars(state, idx).to(state.device)
            actions, probs, q_values = model(state, scalars)
            (
                possible_states,
                cloned_idx_to_idx,
                repetitions,
                not_dupl_indexes,
            ) = extract_children_states_from_actions(
                state,
                actions,
            )
            # Statistics are kept on CPU and only for the distinct actions.
            not_dupl_actions = actions[:, not_dupl_indexes].to("cpu")
            not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to(
                "cpu"
            )
            N_s_a = torch.zeros_like(not_dupl_q_values).to("cpu")
            present_state = extract_present_state(state)
            states_dict[to_hash(present_state)] = (
                _reduce_memory_consumption_before_storing(possible_states),
                cloned_idx_to_idx,
                repetitions,
                N_s_a,
                not_dupl_q_values,
                not_dupl_actions,
            )
            game_tree[to_hash(present_state)] = [
                to_hash(extract_present_state(fut_state))
                for fut_state in possible_states
            ]
            leaf_q_value = q_values
    else:
        # Horizon exceeded: use minus the summed matrix rank of the state
        # as the leaf value.
        leaf_q_value = -int(torch.linalg.matrix_rank(state).sum())
    # backup
    backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value)
```

Each simulation is divided in three parts:

- Selection
- Expansion
- Backup

In the `selection` part the simulation is run on the already generated tree-nodes, and the following node is selected using the following function:

```python
def select_future_state(
    possible_states: List[torch.Tensor],
    q_values: torch.Tensor,
    N_s_a: torch.Tensor,
    repetitions: Dict[int, list],
    c_1: float = 1.25,
    c_2: float = 19652,
    return_idx: bool = False,
) -> torch.Tensor:
    """Select the future state maximizing the upper confidence bound.

    Args:
        possible_states (List[torch.Tensor]): The candidate children states.
        q_values (torch.Tensor): The q values of the candidates; flattened
            before use.
        N_s_a (torch.Tensor): The visit counter of each candidate.
        repetitions (Dict[int, list]): For each candidate index, the list
            of sampled actions that produced it; its length is used as the
            prior weight of the candidate.
        c_1 (float): First exploration constant of the bound.
        c_2 (float): Second exploration constant of the bound.
        return_idx (bool): Whether to return the index of the selected
            state instead of the state itself.

    Returns:
        The selected state, or its index (as a tensor) if ``return_idx``
        is True.
    """
    # q_values (1, K, 1)
    pi = torch.tensor(
        [
            len(repetitions[i])
            for i in range(len(possible_states))
            if i in repetitions
        ]
    ).to(q_values.device)
    total_visits = torch.sum(N_s_a)
    exploration_weight = c_1 + torch.log((total_visits + c_2 + 1) / c_2)
    ucb = (
        q_values.reshape(-1)
        + pi * torch.sqrt(total_visits / (1 + N_s_a)) * exploration_weight
    )
    best_idx = ucb.argmax()
    if return_idx:
        return best_idx
    return possible_states[best_idx]
```

In practice, the action maximizing the `ucb` function

$$
Q(a,s) + \pi(a,s) * \sqrt{\frac{\sum_i{N(s, a_i)}}{1+N(s,a)}} * \left[c_1 + \log\left(\frac{1+c_2+\sum_i{N(s, a_i)}}{c_2}\right)\right]
$$

for the given state is selected. Here Q represents the Q values generated by the model and π represents the random distribution over the actions sampled using the model policy (in the code, the number of times each distinct action was sampled). `N(s, a)` represents the number of times action a has been visited from node s.

Once the selection phase reaches a leaf node, if the simulation has not reached a terminal condition (in terms of either maximum exploration, i.e. future horizon, or game ending), the model is then used for selecting `n_samples` alternative nodes (they will be leaf nodes in the successive iteration). This is called the `expansion` phase, since new nodes are added to the tree. Then, no further node is explored in the current simulation, but the leaf q_value is sent to the following simulation step: the `backup`.

Backup is the final stage of each simulation. During backup, if the leaf node is a terminal state the final reward is computed; otherwise the leaf q value is used as an estimated reward. The reward is then back-propagated along the simulation trajectory, updating both the q_values of the states and the visit counter `N(s, a)`. In the snippet below we show the code for the reward back-propagation.

```python
def backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):
    """Backward pass of the montecarlo algorithm.

    Walks the trajectory from the leaf back to the root, decreasing the
    reward by one at every traversed node and updating in place the running
    mean of the q value and the visit counter of the selected action.

    Args:
        trajectory: List of ``(state_hash, action_idx)`` pairs; the leaf
            entry has ``action_idx`` set to None.
        states_dict: Dictionary mapping each state hash to its stored
            tuple; position 1 is the index mapping, position 3 is
            ``N_s_a`` and position 4 is ``q_values``.
        leaf_q_value (torch.Tensor): The value assigned to the leaf node.
    """
    reward = 0
    for state, action_idx in reversed(trajectory):
        if action_idx is None:  # leaf node
            reward += leaf_q_value
            continue
        (
            _,
            old_idx_to_new_idx,
            _,
            N_s_a,
            q_values,
            _,
        ) = states_dict[state]
        if isinstance(reward, torch.Tensor):
            reward = reward.to(q_values.device)
        action_idx = int(action_idx)
        # Duplicated actions share the statistics of their representative.
        not_dupl_index = old_idx_to_new_idx.get(action_idx, action_idx)
        reward -= 1
        # Incremental mean over the visits of the selected action.
        q_values[:, not_dupl_index] = (
            N_s_a[:, not_dupl_index] * q_values[:, not_dupl_index] + reward
        ) / (N_s_a[:, not_dupl_index] + 1)
        N_s_a[:, not_dupl_index] += 1
```

### Improved Policy Computation

Once all the simulations have been run and the MCTS offers an interesting snapshot of the near future, it is time to update the policy associated with the predicted nodes and return them, so that they can be used during training. The improved policy, following the method described in [Hubert et al](https://arxiv.org/pdf/2104.06303.pdf), is used for managing large action spaces. In fact, for a small search space it is possible during MCTS to sample an action randomly from the action space and evaluate its impact. A similar approach in a much larger action space would lead all trajectories to diverge along different paths, and it would need an infinite number of trajectories to get meaningful statistics and then update the policy. Since here we are using sample-MCTS to avoid this dispersion, i.e. `n_samples` actions are sampled according to the model policy and then MCTS just selects one of the sampled actions while exploring the tree, we need to take into account the sample-correction when computing the final updated policy that will be used while training the model.

In practice the improved policy is computed as

$$
I\pi\left(s, a\right) = \frac{N^{1/\tau(s)}(s, a)}{\sum_iN^{1/\tau(s)}(s, a_i)}
$$

where $\tau(s) = \frac{\log\left(\sum_iN(s, a_i)\right)}{\log\left(\bar{N}\right)}$ if $\sum_iN(s, a_i) > \bar{N}$ else $\tau(s) = 1$.

```python
def compute_improved_policy(
    state_dict: Dict,
    states: List[str],
    model_n_steps: int,
    model_n_logits: int,
    N_bar: int,
):
    """Compute the improved policy given the state_dict, the list of states.

    The improved policy is computed as
    N_s_a^(1/tau) / sum(N_s_a^(1/tau)), where tau is
    (log(N_s_a.sum()) / log(N_bar)) if N_s_a.sum() > N_bar else 1.

    Args:
        state_dict (Dict): Dictionary mapping each state to its stored
            tuple; position 3 is ``N_s_a`` and position 5 is the tensor of
            sampled actions.
        states (List[str]): The states to compute the policy for.
        model_n_steps (int): The number of tokens composing an action.
        model_n_logits (int): The number of values a token can take.
        N_bar (int): The visit threshold above which the visit counts are
            tempered.

    Returns:
        A tensor of shape ``(len(states), model_n_steps, model_n_logits)``
        where the probability of each sampled action is accumulated on the
        token it selected at every step.
    """
    policies = torch.zeros(len(states), model_n_steps, model_n_logits)
    N_bar = torch.tensor(N_bar)
    for idx, state in enumerate(states):
        N_s_a = state_dict[state][3]
        actions = state_dict[state][5]
        if N_s_a.sum() > N_bar:
            tau = (torch.log(N_s_a.sum()) / torch.log(N_bar)).item()
        else:
            tau = 1
        N_s_a = N_s_a ** (1 / tau)
        improved_policy = N_s_a / N_s_a.sum()
        # Map the per-sample probabilities back onto the token space.
        for sample_id in range(actions.shape[1]):
            action_ids = actions[0, sample_id]
            for step_id, action_id in enumerate(action_ids):
                policies[idx, step_id, action_id] += improved_policy[
                    0, sample_id
                ]
    return policies
```

Note that in our implementation after having computed the policy from the `N_s_a` tensor we have to map it back to the original action tensor. In fact `N_s_a` just considers the actions sampled by the model, while the final policy must contain probabilities also for the not-explored actions.

### Differences with respect to the ChatGPT training algorithm

AlphaTensor is the latest member of the AlphaGo/AlphaZero family of artificial intelligence methods by DeepMind. These methods are based on the Monte Carlo Tree Search (MCTS) algorithm, which has been refined and enhanced by DeepMind to tackle increasingly complex tasks. Another AI system, OpenAI's ChatGPT, which has caused a lot of buzz for its remarkable performance, was trained with a different approach, called Reinforcement Learning with Human Feedback (RLHF).

RLHF is a fine-tuning technique used to tune language models to follow a set of written instructions. It uses human preferences as a reward signal to fine-tune the model, thereby aligning the behavior of the language model with the stated preferences of a specific group of people, rather than some broader notion of ‘human values’.

In contrast, MCTS is a tree-based search algorithm used to determine the optimal moves in games. It simulates potential moves and updates the values of each move based on their outcomes, guiding the selection of the best move.

RLHF collects data from human-written demonstrations and human-labelled comparisons between AI models, and trains a reward model to predict the preferences of a given group of people. The reward model is then used to fine-tune the AI models. MCTS, on the other hand, uses simulations and evaluations to determine the best decision.

Although they are different approaches, RLHF and MCTS also have similarities. Both artificial intelligence techniques use decision-making and problem-solving methods, and both use a trial-and-error approach to explore different options and make decisions based on available information. Both are also iterative processes that improve over time as more information and experience are gathered.

The choice between RLHF and MCTS depends on the task at hand. RLHF is ideal when there is no clear metric for evaluating the model performance, while MCTS has proven effective in game-like tasks where knowledge and exploration of the future give the model a significant advantage.

## Code Optimization for AlphaTensor training

Implementing the AlphaTensor training algorithm requires finding the perfect compromise between training speed and memory consumption. As seen in the Model section, simply considering the action tokenization can save a lot of memory, but an overly aggressive action space reduction can lead to both drop in accuracy and slower performance. The latter happens because all tokens are generated sequentially in an autoregressive way by the model decoder. Therefore, the inference time grows linearly with the number of tokens per action once the softmax on the action space is not the bottleneck anymore.

When setting up AlphaTensor training, the main difficulties were found in dealing with the acting process. If the tensors are not stored in the correct format, the MCTS can easily cause uncontrolled memory usage growth. On the other hand, if the number of tensors stored during each simulation is reduced too much, the MCTS can spend an infinite amount of time re-computing the required states.

Let's take an example of the game simulation step, where the game is explored by looking at possible future scenarios. For each state, if we don't save the actions generated by the model and we decide to save only the random seed used to sample the actions from the policy, then each time we explore a tree node we would have to recompute the policy and then sample the actions. Clearly, we decided to store the sampled actions to save time and to avoid having to manage model sharing between different processes in the case of MCTS exploration parallelization.
However, just saving the actions was not enough to get a sufficiently efficient acting step. In fact, the time for converting the n_steps actions into the (u, v, w) triplet, reducing the game tensor state and creating the new 3D tensors from the n_samples actions would easily be a bottleneck for the whole training.
Secondly, we didn't want to store all possible future states for each sampled action, as this would have a huge impact on the memory used by the algorithm. Suppose we set n_samples=32, n=7 and N=5, and let's remember that N is the size of the square matrix product we want to reduce and n is the number of previous actions remembered by the model. In this situation, each state tensor would have the form (8, 25, 25, 25), which multiplied by 32 would result in 32*8*25*25*25*4 bytes for each node in the graph. Now, considering that each simulation in the expansion phase generates a new node (and n_sim=200), we would have a final memory consumption of 200*32*8*25*25*25*4 = 3.2GB for the first MCTS node alone. In the worst case scenario, while exploring acting max_rank nodes (where `max_rank=150`), this would result in a total memory consumption of 150 * 3.2GB = 480GB in RAM memory (or GPU memory if all tensors were stored on the GPU). We ran the training on our workstation with 128 GB of RAM and 48 GB of GPU memory, so we had to reduce the memory consumption.

Since we didn't want to increase the execution time, we adopted an optimization that exploits the redundancy in the state tensors produced. In fact, the tensors have n-1 previous actions in common, which can then be stored once and not repeated for each stored tensor. This reduces the memory footprint to roughly 2/7 ≈ 28% of the original, meaning that in the worst case about 137 GB must be stored. At this point, by simply pruning the unused part of the tree (such as the unselected trajectories) and storing the tensors in CPU memory, we were able to avoid any memory error during training.

# Next Steps

With AlphaTensor now being open source, several exciting avenues for further development open up.

A natural next step is to fine-tune AlphaTensor on specific hardware devices and benchmark performance. At the time of writing, fine-tuning was in progress.

Another important advance would be the support for remote compilation, allowing users to build algorithms optimized for edge devices. This can be achieved by storing the AlphaTensor model on a server, while the matrix multiplication algorithm is evaluated on different hardware.

It could also be important to extend support for different compilers to compute the latency-based reward correction. Different compilers can lead to different optimized algorithms on a given hardware. For example, the DeepMind paper showed promising results using JAX and the XLA compiler on TPU and Nvidia GPUs. It would be interesting to evaluate this using NCCL on Nvidia or llvm on CPUs.

Finally, extending the model and training algorithm to support larger matrix sizes remains a major open challenge. Currently, AlphaTensor supports a maximum matrix size of 5, but it can be applied by splitting larger matrix multiplications into groups of tiny MMs with a size smaller than 5. This approach is suboptimal, and performing the reduction directly on the large tensor corresponding to the full MM could theoretically lead to better results.

## Speedster integration of AlphaTensor

AlphaTensor opens the doors for further improvements to Speedster. [Speedster](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster) is an open source module designed to speed up AI inference with just a few lines of code. The library automatically applies the best set of SOTA optimization techniques to achieve maximum inference speed-up.

Within Speedster, AlphaTensor will use its optimized kernels for matrix multiplication to find the optimal set of sub-operations for each layer in the AI model that involve matrix multiplication, including linear layers, attention layers, and convolution layers. The matrix multiplications will be decomposed into sub-matrix multiplications up to the maximum size supported by AlphaTensor, and the fastest decomposition will be selected for each layer. This optimization process will be applied to all layers in the neural network, resulting in a dramatically improved model.

We expect to see significant speed-ups especially in transformer models, where large matrix multiplications become the computational bottleneck at larger sizes. We also plan to support AlphaTensor algorithm generation for reduced precision formats, such as fp16 and int8, in addition to fp32.


================================================
FILE: optimization/open_alpha_tensor/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages


# Runtime dependencies installed together with the package.
REQUIREMENTS = ["nebullvm", "torch", "tqdm"]

# The README shipped next to this file is used as the long description.
this_directory = Path(__file__).parent
readme_path = this_directory / "README.md"
long_description = readme_path.read_text(encoding="utf8")

setup(
    name="OpenAlphaTensor",
    version="0.0.1",
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIREMENTS,
    long_description=long_description,
    long_description_content_type="text/markdown",
)


================================================
FILE: optimization/optimate/README.md
================================================
# 🧉 OptiMate (WIP)
Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.

If you like this library, give us a star to show your support for the project ⭐

## 📖 Description
The OptiMate module is targeted at a sophisticated and savvy type of users, who need to squeeze out every last drop of performance out of a given hardware. 

The module is designed to help users to optimize their deep-learning models through the use of profilers and advanced optimization techniques. It also includes a smart assistant that guides the user through the optimization process and provides suggestions to improve the performance of the model. 

Each temporary optimization is tracked in a detailed version history, allowing users to revert to their preferred version at the end of the optimization process.

First, the module leverages profilers to gather information about the model, such as the amount of time it takes for the model to make predictions and the amount of memory used. This information helps in identifying bottlenecks and other inefficiencies in the model.

Then, the module uses various optimization techniques to improve inference performance. These techniques include, among others, model compression, pruning, and quantization, which can help reduce the size and computational demand of the model.

Throughout the process, the smart assistant provides guidance and suggestions to the user. For example, it might suggest which optimization techniques to try out or provide guidance on how to adjust the model parameters to improve its performance.

Overall, the module provides a user-friendly but sophisticated interface to get the most out of any model / hardware setup. Try it out today, and reach out if you have any feedback!


================================================
FILE: optimization/speedster/README.md
================================================
# 💥 Speedster

`Speedster` reduces inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs). The idea is to make AI inference way cheaper in just a few lines of code.

`Speedster` makes it easy to combine optimization techniques across the whole software-to-hardware stack, delivering best-in-class cost savings. If you like the idea, give us a star to support the project ⭐

![speedster](https://user-images.githubusercontent.com/53374883/225599469-f1a626f0-c001-42bd-bc8b-ec0e966ddad6.png)

The core `Speedster` workflow consists of 3 steps:

- [x]  **Select**: input your model in your preferred DL framework and express your preferences regarding:
    - Accuracy loss: do you want to trade off a little accuracy for significant cost savings?
    - Optimization time: achieving great savings can be time-consuming. Can you wait, or do you need an instant answer?
- [x]  **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware.
- [x]  **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just cheaper 🚀).

# Installation

Install `Speedster` and its base requirements:
```
pip install speedster
```

Then make sure to install all the available deep learning compilers.
```
python -m nebullvm.installers.auto_installer --compilers all
```
> :warning: For **MacOS** with **ARM processors**, please use a conda environment.
> Moreover, if you want to optimize a **PyTorch model**, PyTorch must be pre-installed 
> on your environment before proceeding to the next step, please install it from this 
> [link](https://pytorch.org/get-started/locally/).

For more details on how to install Speedster, please visit our [Installation](https://docs.nebuly.com/Speedster/installation/) guide.

# Quick start

Only one line of code - that’s what you need to accelerate your model! Find below your getting started guide for 5 different input model frameworks:

<details>
<summary>🔥 PyTorch </summary>
    
In this section, we will learn about the 4 main steps needed to optimize PyTorch models:

1) Input your model and data
2) Run the optimization
3) Save your optimized model 
4) Load and run your optimized model in production

```python
import torch
import torchvision.models as models
from speedster import optimize_model, save_model

#1 Provide input model and data (we support PyTorch Dataloaders and custom input, see the docs to learn more)
model = models.resnet50()  
input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]

#2 Run Speedster optimization
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained",
    metric_drop_ths=0.05
)

#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```

Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.

```python
#4 Load and run your PyTorch accelerated model in production
from speedster import load_model

optimized_model = load_model("model_save_path")

output = optimized_model(input_sample)
```
For more details, please visit [Getting Started with PyTorch Optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/).
    
</details>
<details>
<summary>🤗 Hugging Face Transformers </summary>
    
In this section, we will learn about the 4 main steps needed to optimize 🤗 Hugging Face Transformer models:

1) Input your model and data
2) Run the optimization
3) Save your optimized model 
4) Load and run your optimized model in production

* <details><summary><b>✅ For Decoder-only or Encoder-only architectures (Bert, GPT, etc)</b></summary>

    ```python
    from transformers import AlbertModel, AlbertTokenizer
    from speedster import optimize_model, save_model

    #1a. Provide input model: Load Albert as an example
    model = AlbertModel.from_pretrained("albert-base-v1")
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")

    #1b. Dictionary input format (also string format is accepted, see the docs to learn more)
    text = "This is an example text for the huggingface model."
    input_dict = tokenizer(text, return_tensors="pt")
    input_data = [input_dict for _ in range(100)]

    #2 Run Speedster optimization (if input data is in string format, also the tokenizer 
    # should be given as input argument, see the docs to learn more)
    optimized_model = optimize_model(
        model, 
        input_data=input_data, 
        optimization_time="constrained",
        metric_drop_ths=0.05
    )

    #3 Save the optimized model
    save_model(optimized_model, "model_save_path")
    ```

    Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.

    ```python
    #4 Load and run your Huggingface accelerated model in production
    from speedster import load_model

    optimized_model = load_model("model_save_path")

    output = optimized_model(**input_sample)
    ```
    For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).

    </details>

* <details><summary><b>✅ For Encoder-Decoder architectures (T5 etc)</b></summary>


    ```python
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    from speedster import optimize_model, save_model

    #1a. Provide input model: Load T5 as an example
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    tokenizer = T5Tokenizer.from_pretrained("t5-small") 

    #1b. Dictionary input format
    question = "What's the meaning of life?"
    answer = "The answer is:"
    input_dict = tokenizer(question, return_tensors="pt")
    input_dict["decoder_input_ids"] = tokenizer(answer, return_tensors="pt").input_ids
    input_data = [input_dict for _ in range(100)]

    #2 Run Speedster optimization (if input data is in string format, also the tokenizer 
    # should be given as input argument, see the docs to learn more)
    optimized_model = optimize_model(
        model, 
        input_data=input_data, 
        optimization_time="constrained",
        metric_drop_ths=0.05
    )

    #3 Save the optimized model
    save_model(optimized_model, "model_save_path")
    ```

    Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.

    ```python
    #4 Load and run your Huggingface accelerated model in production
    from speedster import load_model

    optimized_model = load_model("model_save_path")

    output = optimized_model(**input_sample)
    ```
    For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).

    </details>
    
</details>

<details>
<summary>🧨 Hugging Face Diffusers </summary>

> :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. For additional details, please see the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).

In this section, we will learn about the 4 main steps needed to optimize Stable Diffusion models from the Diffusers library:

1) Input your model and data
2) Run the optimization
3) Save your optimized model 
4) Load and run your optimized model in production

```python
import torch
from diffusers import StableDiffusionPipeline
from speedster import optimize_model, save_model

#1 Provide input model and data
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # On GPU we load by default the model in half precision, because it's faster and lighter.
    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)
else:
    pipe = StableDiffusionPipeline.from_pretrained(model_id)

# Create some example input data
input_data = [
    "a photo of an astronaut riding a horse on mars",
    "a monkey eating a banana in a forest",
    "white car on a road surrounded by palm trees",
    "a fridge full of bottles of beer",
    "madara uchiha throwing asteroids against people"
]

#2 Run Speedster optimization
optimized_model = optimize_model(
    model=pipe,
    input_data=input_data,
    optimization_time="unconstrained",
    ignore_compilers=["torch_tensor_rt", "tvm"],
    metric_drop_ths=0.1,
)

#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```

Once the optimization is completed, start using the accelerated model (on steroids 🚀).

```python
#4 Load and run your Stable Diffusion accelerated model in production
from speedster import load_model

optimized_model = load_model("model_save_path", pipe=pipe)

test_prompt = "futuristic llama with a cyberpunk city on the background"
output = optimized_model(test_prompt).images[0]
```
For more details, please visit [Getting Started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).
    
</details>

<details>
<summary>🌊 TensorFlow/Keras </summary>
    
In this section, we will learn about the 4 main steps needed to optimize TensorFlow/Keras models:

1) Input your model and data
2) Run the optimization
3) Save your optimized model 
4) Load and run your optimized model in production

```python
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from speedster import optimize_model, save_model

#1 Provide input model and data (we support Keras dataset and custom input, see the docs to learn more)
model = ResNet50() 
input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)]

#2 Run Speedster optimization
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained",
    metric_drop_ths=0.05
)

#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```

Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.

```python
#4 Load and run your TensorFlow accelerated model in production
from speedster import load_model

optimized_model = load_model("model_save_path")

output = optimized_model(input_sample)
```
For more details, please visit [Getting Started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/).

</details>
<details>
    
<summary> ⚡ ONNX </summary>

In this section, we will learn about the 4 main steps needed to optimize ONNX models:

1) Input your model and data
2) Run the optimization
3) Save your optimized model 
4) Load and run your optimized model in production

```python
import numpy as np
from speedster import optimize_model, save_model

#1 Provide input model and data
# Model was downloaded from here: 
# https://github.com/onnx/models/tree/main/vision/classification/resnet
model = "resnet50-v1-12.onnx" 
input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)]

#2 Run Speedster optimization
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained",
    metric_drop_ths=0.05
)

#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```

Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.

```python
#4 Load and run your ONNX accelerated model in production
from speedster import load_model

optimized_model = load_model("model_save_path")

output = optimized_model(input_sample)
```
For more details, please visit [Getting Started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/).
    
</details>

# **Documentation**

- [Installation](https://docs.nebuly.com/Speedster/installation/)
- [Getting started with PyTorch optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/)
- [Getting started with Hugging Face optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/)
- [Getting started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/)
- [Getting started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/)
- [Getting started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/)
- [Key concepts](https://docs.nebuly.com/Speedster/key_concepts/)
- [Notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster)
- [Advanced options](https://docs.nebuly.com/Speedster/advanced_options/)
- [Benchmarks](https://docs.nebuly.com/Speedster/benchmarks/)


# **Key concepts**

Speedster's design reflects our mission to automatically master each and every existing AI acceleration technique to deliver the most cost-efficient AI ever. As a result, `Speedster` leverages available enterprise-grade open-source optimization tools. If these tools and  communities already exist, and are distributed under a permissive license (Apache, MIT, etc), we integrate them and happily contribute to their communities. However, many tools do not exist yet, in which case we implement them and open-source the code so that our community can benefit from it.

`Speedster` is shaped around **4 building blocks** and leverages a modular design to foster scalability and integration of new acceleration components across the software to hardware stack.

- [x]  **Converter:** converts the input model from its original framework to the framework backends supported by `Speedster`, namely PyTorch, ONNX and TensorFlow. This allows the Compressor and Compiler modules to apply any optimization technique to the model.
- [x]  **Compressor:** applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training.
- [x]  **Compiler:** converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files.
- [x]  **Inference Learner:** takes the best performing compiled model and converts it back into the same interface as the original input model.

![speedster_blocks](https://user-images.githubusercontent.com/42771598/213177175-a76908a2-5eef-4e82-9d54-0fc812131463.png)

The **compressor** stage leverages the following open-source projects:

- [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance.
- [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models.

The **compiler stage** leverages the following open-source projects:

- [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators.
- [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads.
- [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs.
- [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference.
- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator
- [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.
- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models.


# **Community**
We’re developing `Speedster` for and together with our community, so please get in touch on GitHub or Discord. 

• **[GitHub issues](https://github.com/nebuly-ai/nebullvm/issues)**: suggest new acceleration components, request new features, and report bugs and improvements.

• **[Discord](https://discord.gg/RbeQMu886J)**: learn about AI acceleration, share exciting projects and hang out with our global community.

The best way to get started is to pick a good-first issue. Please read our [contribution guidelines](https://docs.nebuly.com/contributions/) for a deep dive into how to best contribute to our project!

Don't forget to leave a star ⭐ to support the project and happy acceleration 🚀


================================================
FILE: optimization/speedster/docs/en/docs/advanced_options.md
================================================
# Advanced options

If you’re new to the library, you may want to start with the **Getting started** section.

The user guide here shows more advanced workflows and how to use the library in different ways. We are going to show some examples of more advanced usages of `Speedster`, that we hope will give you a deeper insight of how `Speedster` works. 

In particular, we will overview:

- [`optimize_model`](#optimize_model-api) API
- [Acceleration suggestions](#acceleration-suggestions)
- [Selecting which device](#selecting-which-device-to-use-cpu-gpu-and-other-accelerators) to use: CPU, GPU and other accelerators
- [Optimization Time: constrained vs unconstrained](#optimization-time-constrained-vs-unconstrained)
- [Selecting specific compilers/compressors](#select-specific-compilerscompressors)
- [Using dynamic shape](#using-dynamic-shape)
- [Enable TensorrtExecutionProvider for ONNXRuntime on GPU](#enable-tensorrtexecutionprovider-for-onnxruntime-on-gpu)
- [Custom models](#custom-models)
- [Store the performances of all the optimization techniques](#store-the-performances-of-all-the-optimization-techniques)
- [Set number of threads](#set-number-of-threads)

## `optimize_model` API

The `optimize_model` function optimizes a model from one of the supported frameworks (PyTorch, HuggingFace, TensorFlow, ONNX) and returns an optimized model that can be used with the same interface as the original model.

```python
def optimize_model(
        model: Any,
        input_data: Union[Iterable, Sequence],
        metric_drop_ths: Optional[float] = None,
        metric: Union[str, (...) -> Any, None] = None,
        optimization_time: str = "constrained",
        dynamic_info: Optional[dict] = None,
        config_file: Optional[str] = None,
        ignore_compilers: Optional[List[str]] = None,
        ignore_compressors: Optional[List[str]] = None,
        store_latencies: bool = False,
        device: str = None,
        **kwargs: Any
) -> Any
```

**Arguments**

`model`: Any

The input model can belong to one of the following frameworks: PyTorch, TensorFlow, ONNX, HuggingFace. In the ONNX case, `model` is a string with the path to the saved onnx model. In the other cases, it is a torch.nn.Module or a tf.Module.

`input_data`: Iterable or Sequence

Input data needed to test the optimization performances (latency, throughput, accuracy loss, etc). It can consist of one or more data samples. Note that if `optimization_time` is set to "unconstrained," it would be preferable to provide at least 100 data samples to also activate `Speedster` techniques that require more data (pruning, etc.). See the Getting started section to learn more about the `input_data` depending on your input framework:

- [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md#1-input-model-and-data)
- [Getting started with 🤗 HuggingFace optimization](getting_started/hf_getting_started.md#1-input-model-and-data)
- [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md#1-input-model-and-data)
- [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md#1-input-model-and-data)
- [Getting started with ONNX optimization](getting_started/onnx_getting_started.md#1-input-model-and-data)

`metric_drop_ths`: float, optional

Maximum drop in your preferred metric (see "metric" section below). All the optimized models having a larger error with respect to the `metric_drop_ths` will be discarded. 

Default: 0.

`metric`: Union[str, Callable], optional

Metric to be used for estimating the error that may arise from using optimization techniques and for evaluating if the error exceeds the `metric_drop_ths`.  `metric` accepts as input a string, a user-defined metric, or None. Metric accepts a string containing the name of the metric; it currently supports:

- "numeric_precision"
- "accuracy". 
- user-defined metric: function that takes as input the output of the original model and the one of the optimized model, and, if available, the original label. The function calculates and returns the reduction in the metric due to the optimization. 

Default: "numeric_precision". 

`optimization_time`: OptimizationTime, optional

The optimization time mode. It can be "constrained" or "unconstrained". In "constrained" mode, Speedster takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation. Note that most techniques activated in "unconstrained" mode require fine-tuning, and therefore it is recommended to provide at least 100 samples as input_data. 

Default: "constrained".

`dynamic_info`: Dict, optional

Dictionary containing dynamic axis information. It should contain as keys both "inputs" and "outputs" and as values two lists of dictionaries, where each dictionary represents dynamic axis information for an input/output tensor. The inner dictionary should have an integer as a key, i.e. the dynamic axis (also considering the batch size) and a string as a value giving it a tag, e.g., "batch_size".

Default: None.

`config_file`: str, optional

Configuration file containing the parameters needed to define the CompressionStep in the pipeline. 

Default: None.

`ignore_compilers`: List[str], optional

List of DL compilers ignored during optimization execution. The compiler name should be one among tvm, tensor_rt, openvino, onnxruntime, deepsparse, tflite, bladedisc, torchscript, intel_neural_compressor.

Default: None.

`ignore_compressors`: List[str], optional

List of DL compressors ignored during the compression stage. The compressor name should be one among sparseml and intel_pruning. 

Default: None.

`store_latencies`: bool, optional

Parameter that enables storing the latency of each compiler used by Speedster in a JSON file. The JSON file is created in the working directory.

Default: False.

`device`: str, optional

Device used for inference, it can be cpu or gpu/cuda (both gpu and cuda options are supported). A specific gpu can be selected using notation gpu:1 or cuda:1. gpu will be used if available, otherwise cpu. 

Default: None.

**Returns: Inference Learner**

Optimized version with the same interface as the input model. For example, optimizing a PyTorch model will return an InferenceLearner object that can be called exactly like a PyTorch model (either with model.forward(input) or model(input)). The optimized model will therefore take torch.Tensors as input and return torch.Tensors.

## Acceleration suggestions

If the speedup you obtained with the first optimization with `Speedster` is not enough, we suggest the following actions:

- Include more backends for optimization, i.e. set `--backend all`
- Increase the `metric_drop_ths` by 5%, if possible: see [Optimize_model API](#optimize_model-api)
- Verify that your device is supported by your version of speedster: see [Supported hardware](hardware.md)
- Try to accelerate your model on a different hardware or consider using the CloudSurfer module to automatically understand which is the best hardware for your model: see [CloudSurfer](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/cloud_surfer) module.

## Selecting which device to use: CPU, GPU and other accelerators.

Speedster currently supports the following devices: `CPUs`, `GPUs`, `TPUs` and `AWS Inferentia chips`.

The `device` parameter selects which device to use for inference. By default, `Speedster` will use the accelerator if one is available on the machine; otherwise it will use the cpu. If we are running on a machine with an available accelerator and we want to optimize the model for cpu inference, we can use:

```python
from speedster import optimize_model

optimized_model = optimize_model(
  model, input_data=input_data, device="cpu"
)
```

If we are working on a multi-gpu machine and we want to use a specific gpu, we can use:

```python
from speedster import optimize_model

optimized_model = optimize_model(
  model, input_data=input_data, device="cuda:1"  # also device="gpu:1" is supported
)
```

The same applies also for TPUs and AWS Inferentia chips: 

```python
from speedster import optimize_model

optimized_model = optimize_model(
  model, input_data=input_data, device="tpu:1"  # use tpu #1
)

optimized_model = optimize_model(
  model, input_data=input_data, device="neuron:1"  # use Inferentia chip #1
)
```

## Optimization Time: constrained vs unconstrained

One of the first options that can be customized in `Speedster` is the `optimization_time` parameter. In order to optimize the model, `Speedster` will try a list of compilers which allow to keep the same accuracy of the original model. In addition to compilers, it can also use other techniques such as pruning, quantization, and other compression techniques which can lead to a little drop in accuracy and may require some time to complete. 

We defined two scenarios:

- **constrained**: only compilers and precision reduction techniques are used, so the compression step (the most time consuming one) is skipped. Moreover, in some cases the same compiler could be available for more than one pipeline, for example tensor RT is available both with PyTorch and ONNX backends. In the constrained scenario, each compiler will be used only once, so if for example we optimize a PyTorch model and tensor RT in the PyTorch pipeline manages to optimize the model, it won't be used again in the ONNX pipeline.

- **unconstrained**: in this scenario, `Speedster` will use all the compilers available, even if they appear in more than one backend. It also allows the usage of more time consuming techniques such as pruning and distillation. Note that for using many of the sophisticated techniques in the 'unconstrained' optimization, a small fine-tuning of the model will be needed. Thus, we highly recommend to provide as input_data at least 100 samples when selecting 'unconstrained' optimization.


##  Select specific compilers/compressors

The `optimize_model` function also accepts the parameters `ignore_compilers` and `ignore_compressors`, which allow you to skip specific compilers or compressors.
The full list of available options is the following:
- _ignore_compilers_: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `onnx_tensor_rt`, `torchscript`, `onnxruntime`, `tflite`, `tvm`, `onnx_tvm`, `torch_tvm`, `bladedisc`, `openvino`, `intel_neural_compressor`, `torch_xla`, `torch_neuron`.
- _ignore_compressors_: `sparseml`, `intel_pruning`.

Some compilers, such as tensor RT, are available for both PyTorch and ONNX backends. For this reason in the list of compilers we have `tensor_rt` which skips both the PyTorch and ONNX pipelines, and `torch_tensor_rt` and `onnx_tensor_rt` which skip only the PyTorch and ONNX pipelines respectively.

If we want to skip the `tvm` and `bladedisc` optimizers, we could write:

```python
from speedster import optimize_model

optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    ignore_compilers=["tvm", "bladedisc"]
)
```

## Using dynamic shape

By default, a model optimized with `Speedster` will have a static shape. This means that it can be used in inference only with the same shape of the inputs provided to the `optimize_model` function during the optimization. The dynamic shape however is fully supported, and can be enabled with the `dynamic_info` parameter (see the [optimize_model API](#optimize_model-api) arguments to see how this parameter is defined.)

For each dynamic axis in the inputs, we need to provide the following information:
- the axis number (starting from 0, considering the batch size as the first axis)
- a tag that will be used to identify the axis
- the minimum, optimal and maximum sizes of the axis (some compilers will work also for shapes that are not in the range [min, max], but the performance may be worse)

Let's see an example of a model that takes a single input, where the batch size must be dynamic, as well as the size on the third and fourth dimensions.

```python
import torch
import torchvision.models as models
from speedster import optimize_model

# Load a resnet as example
model = models.resnet50()

# Provide an input data for the model
input_data = [((torch.randn(1, 3, 256, 256),), torch.tensor([0])) for _ in range(100)]

# Set dynamic info
dynamic_info = {
    "inputs": [
        {
            0: {
                "name": "batch",
                "min_val": 1,
                "opt_val": 1,
                "max_val": 8,
            }, 
            2: {
                "name": "dim_image",
                "min_val": 128,
                "opt_val": 256,
                "max_val": 512,
            }, 
            3: {
                "name": "dim_image",
                "min_val": 128,
                "opt_val": 256,
                "max_val": 512,
            }, 
        }
    ],
    "outputs": [
        {0: "batch", 1: "out_dim"}
    ]
}

# Run Speedster optimization in one line of code
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained", 
    dynamic_info=dynamic_info
)
```

## Enable TensorrtExecutionProvider for ONNXRuntime on GPU

By default, `Speedster` will use the `CUDAExecutionProvider` for ONNXRuntime on GPU. If you want to use the `TensorrtExecutionProvider` instead, you must add the TensorRT installation path to the env variable LD_LIBRARY_PATH.
If you installed TensorRT through the nebullvm auto_installer, you can do it by running the following command in the terminal:

```bash
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"/<PATH_TO_PYTHON_FOLDER>/site-packages/tensorrt"
```

## Custom models

`Speedster` is designed to optimize models that take as inputs and return in output only tensors or np.ndarrays (and dictionaries/strings for huggingface). Some models may require instead a custom input, for example a dictionary where the keys are the names of the inputs and the values are the input tensors, or may return a dictionary as output. We can optimize such models with `Speedster` by defining a model wrapper.

Let's take the example of the detectron2 model which takes as input a tuple of tensors but returns a dictionary as output:

```python
class BaseModelWrapper(torch.nn.Module):
    def __init__(self, core_model, output_dict):
        super().__init__()
        self.core_model = core_model
        self.output_names = [key for key in output_dict.keys()]
    
    def forward(self, *args, **kwargs):
        res = self.core_model(*args, **kwargs)
        return tuple(res[key] for key in self.output_names)


class OptimizedWrapper(torch.nn.Module):
    def __init__(self, optimized_model, output_keys):
        super().__init__()
        self.optimized_model = optimized_model
        self.output_keys = output_keys
    
    def forward(self, *args):
        res = self.optimized_model(*args)
        return {key: value for key, value in zip(self.output_keys, res)}

input_data = [((torch.randn(1, 3, 256, 256)), torch.tensor([0]))]

# Compute the original output of the model (in dict format) 
res = model_backbone(torch.randn(1, 3, 256, 256))

# Pass the model and the output sample to the wrapper
backbone_wrapper = BaseModelWrapper(model_backbone, res)

# Optimize the model wrapper
optimized_model = optimize_model(backbone_wrapper, input_data=input_data)

# Wrap the optimized model with a new wrapper to restore the original model output format
optimized_backbone = OptimizedWrapper(optimized_model, backbone_wrapper.output_names)

```

You can find other examples in the [notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) section available on GitHub.

## Store the performances of all the optimization techniques

`Speedster` internally tries all the techniques available on the target hardware and automatically chooses the fastest one. If you need more details on the inference times of each compiler, you can set the `store_latencies` parameter to `True`. A json file will be created in the working directory, listing all the results of the applied techniques and of the original model itself.

```python
# Run Speedster optimization in one line of code
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    store_latencies=True
)
```

## Set number of threads
When running multiple replicas of the model in parallel, it would be useful for CPU-optimized algorithms to limit the number of threads to use for each model. In `Speedster`, it is possible to set the maximum number of threads a single model can use with the environment variable `NEBULLVM_THREADS_PER_MODEL`. 

For instance, you can run:

```bash
export NEBULLVM_THREADS_PER_MODEL=2
```

for using just two CPU threads per model at inference time and during optimization.

================================================
FILE: optimization/speedster/docs/en/docs/benchmarks.md
================================================
# Benchmarks

!!! info
    In this section you are going to learn how `Speedster` accelerates the inference of various models on different hardware architecture.

Here we provide a preview of the following accelerated models:

- [Bert](#bert)
- [YoloV5](#yolov5)
- [EfficientNet](#efficientnet)
- [GPT2](#gpt2)
- [ResNet](#resnet)
- [Roberta](#roberta)

The above models are tested on very popular hardware architecture and instances:

- AWS - c5n.2xlarge
- AWS - c5.12xlarge
- AWS - c6i.12xlarge
- AWS - m6i.24xlarge
- NVIDIA T4
- NVIDIA V100
- NVIDIA 3090

## Bert
![bert](images/bert.png)

## YoloV5
![yolo](images/yolov5.png)

## EfficientNet
![efficientnet](images/efficientnet.png)

## GPT2
![gpt2](images/gpt2.png)

## ResNet
![resnet](images/resnet.png)

## Roberta
![roberta](images/roberta.png)

================================================
FILE: optimization/speedster/docs/en/docs/getting_started/diffusers_getting_started.md
================================================
# Getting started with Stable Diffusion optimization
In this section, we will learn about the 5 main steps needed to optimize Stable Diffusion models from the `Diffusers` library:

1. [Environment Setup](#1-environment-setup-gpu-only)
2. [Input your model and data](#2-input-model-and-data)
3. [Run the optimization](#3-run-the-optimization)
4. [Save your optimized model](#4-save-your-optimized-model)
5. [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production)

## 1) Environment Setup (GPU only)
In order to optimize a Stable Diffusion model, you have to ensure that your environment is correctly set up according to these requirements: `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`.

From TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then upgrade tensorrt to 8.6.0 or above.

There should be a way to run TensorRT 8.6 with CUDA 11 as well, but it requires installing TensorRT in a different way; see this issue: https://github.com/NVIDIA/TensorRT/issues/2773. Otherwise, we highly recommend simply upgrading to CUDA 12.

For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully.

You can check your CUDA version with the following command:

```bash
nvidia-smi
```

If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads

You can check your TensorRT version with the following command:

```bash
python -c "import tensorrt; print(tensorrt.__version__)"
```

If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:
```
pip install -U tensorrt
```

You can finally check your PyTorch version  with the command
```bash
python -c "import torch; print(torch.__version__)"
```
If you have torch>=2.0.0, you can downgrade it by running:
```
pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
```

## 2) Input model and data

!!! info
    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). 


For Stable Diffusion models Speedster expects the input data to be a list of sentences: ```List[str]```

```python
import torch
from speedster import optimize_model
from diffusers import StableDiffusionPipeline


# Load Stable Diffusion 1.4 as example
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # On GPU we load by default the model in half precision, because it's faster and lighter.
    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)
else:
    pipe = StableDiffusionPipeline.from_pretrained(model_id)

# Create some example input data
input_data = [
    "a photo of an astronaut riding a horse on mars",
    "a monkey eating a banana in a forest",
    "white car on a road surrounded by palm trees",
    "a fridge full of bottles of beer",
    "madara uchiha throwing asteroids against people"
]
```

Now that your input model and data are ready, you can move on to the [Run the optimization](#3-run-the-optimization) section 🚀.

## 3) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 

The function takes the following arguments as inputs:

- `model`: model to be optimized in your preferred framework (A Diffusers pipe in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: if "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation 
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration

and returns the accelerated version of your model 🚀.

``` python
from speedster import optimize_model

# Run Speedster optimization
optimized_model = optimize_model(
    pipe, 
    input_data=input_data, 
    optimization_time="unconstrained",
    metric_drop_ths=0.05
)
```

Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.

At the end of the optimization, you are going to see the results in a summary table like the following:

![pt](../images/stable_diffusion.png)

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#4-save-your-optimized-model) section to save your model and use it in production.

If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.

## 4) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:

```python
from speedster import save_model

save_model(optimized_model, "model_save_path")
```

Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production) section.

## 5) Load and run your optimized model in production
Once the optimized model has been saved,  it can be loaded with the `load_model` function:
```python
from speedster import load_model

optimized_model = load_model("model_save_path", pipe=pipe)
```

In this case we must also provide the original pipe as an argument to the `load_model` function; Speedster will automatically load the optimized model and replace the original UNet inside the pipe.

The optimized model can be used for accelerated inference in the same way as the original model:

```python
# Use the accelerated version of your Stable Diffusion model in production
output = optimized_model(test_prompt).images[0]
```

!!! info
    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.

If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section.

================================================
FILE: optimization/speedster/docs/en/docs/getting_started/hf_getting_started.md
================================================
# Getting started with HuggingFace optimization
In this section, we will learn about the 4 main steps needed to optimize your 🤗 HuggingFace models:

1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)

## 1) Input model and data

!!! info
    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). 

For HuggingFace models we support different types of input data depending on the architecture of your input model.

- [x]  For Decoder-only or Encoder-only architectures (Bert, GPT, etc), we support:

    - Dictionary
    - String

- [x]  For Encoder-Decoder architectures (T5 etc), we support: 
    - Dictionary


=== "Decoder-only or Encoder-only (Bert, GPT, etc)"
    **Input as Dictionary**

    ```python
    from transformers import AlbertModel, AlbertTokenizer

    # Load Albert as example
    model = AlbertModel.from_pretrained("albert-base-v1")
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")

    # Case 1: dictionary input format
    text = "This is an example text for the huggingface model."
    input_dict = tokenizer(text, return_tensors="pt")
    input_data = [input_dict for _ in range(100)]
    ```
    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.


    **Input as String**

    In the string case, the HuggingFace tokenizer must be given as input to the `optimize_model` in addition to the `input_data`, and the arguments for the tokenizer can be passed using the param `tokenizer_args`.

    ```python
    from transformers import AlbertModel, AlbertTokenizer

    # Load Albert as example
    model = AlbertModel.from_pretrained("albert-base-v1")
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")

    # Case 2: strings input format
    input_data = [
        "This is a test.",
        "Hi my name is John.",
        "The cat is on the table.",
    ]
    tokenizer_args = dict(
        return_tensors="pt",
        padding="longest",
        truncation=True,
    )
    ```
    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.

=== "Encoder-Decoder architectures (T5 etc)"
    For encoder-decoder architectures we support only `input_data` as Dictionary:
    ```python
    from transformers import T5Tokenizer, T5ForConditionalGeneration

    # Load T5 as example
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    tokenizer = T5Tokenizer.from_pretrained("t5-small") 

    # Case 1: dictionary input format
    question = "What's the meaning of life?"
    answer = "The answer is:"
    input_dict = tokenizer(question, return_tensors="pt")
    input_dict["decoder_input_ids"] = tokenizer(answer, return_tensors="pt").input_ids
    input_data = [input_dict for _ in range(100)]
    ```
    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.


## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 

The function takes the following arguments as inputs:

- `model`: model to be optimized in your preferred framework (HuggingFace in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration

and returns the accelerated version of your model 🚀.

Depending on the format of your `input_data`, the `optimize_model` is as follows:

=== "Input as Dictionary"
    ```python
    from speedster import optimize_model

    # Run Speedster optimization
    optimized_model = optimize_model(
        model, 
        input_data=input_data, 
        optimization_time="constrained",
        metric_drop_ths=0.05
    )
    ```

=== "Input as String"
    ```python
    from speedster import optimize_model

    # Run Speedster optimization
    optimized_model = optimize_model(
        model, 
        input_data=input_data, 
        optimization_time="constrained", 
        metric_drop_ths=0.05,
        tokenizer=tokenizer,
        tokenizer_args={"return_tensors": "pt"}
    )
    ```

Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.

At the end of the optimization, you are going to see the results in a summary table like the following:

![pt](../images/pt_table.png)

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.

If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.

## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:

```python
from speedster import save_model

save_model(optimized_model, "model_save_path")
```

Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.

## 4) Load and run your optimized model in production
Once the optimized model has been saved,  it can be loaded with the `load_model` function:
```python
from speedster import load_model

optimized_model = load_model("model_save_path")
```

The optimized model can be used for accelerated inference in the same way as the original model:

```python
# Use the accelerated version of your HuggingFace model in production
output = optimized_model(**input_sample)
```

!!! info
    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.

If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section.

================================================
FILE: optimization/speedster/docs/en/docs/getting_started/onnx_getting_started.md
================================================
# Getting started with ONNX optimization
In this section, we will learn about the 4 main steps needed to optimize your ONNX models:

1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)

## 1) Input model and data

!!! info
    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). 

```python
import numpy as np

# Load a resnet as example
# Model was downloaded from here: 
# https://github.com/onnx/models/tree/main/vision/classification/resnet
model = "resnet50-v1-12.onnx"

# Provide input data for the model    
input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)]
```

Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.

## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 

The function takes the following arguments as inputs:

- `model`: model to be optimized in your preferred framework (ONNX in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration

and returns the accelerated version of your model 🚀.

``` python
from speedster import optimize_model

# Run Speedster optimization
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained",
    metric_drop_ths=0.05
)
```

Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.

At the end of the optimization, you are going to see the results in a summary table like the following:

![pt](../images/pt_table.png)

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.

If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.

## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:

```python
from speedster import save_model

save_model(optimized_model, "model_save_path")
```

Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.

## 4) Load and run your optimized model in production
Once the optimized model has been saved,  it can be loaded with the `load_model` function:
```python
from speedster import load_model

optimized_model = load_model("model_save_path")
```

The optimized model can be used for accelerated inference in the same way as the original model:

```python
# Use the accelerated version of your ONNX model in production
output = optimized_model(input_sample)
```

!!! info
    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.

If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section.

================================================
FILE: optimization/speedster/docs/en/docs/getting_started/pytorch_getting_started.md
================================================
# Getting started with PyTorch optimization
In this section, we will learn about the 4 main steps needed to optimize PyTorch models:

1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)

## 1) Input model and data

!!! info
    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). 


For PyTorch models we support two types of input data:

* Custom data format
* PyTorch DataLoader

=== "Custom Data Format"
    Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]```

    - Each element of the list is a tuple, which represents a batch of the dataset.
    - In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. The label is optional, so it can be omitted.

    ``` python
    import torch
    import torchvision.models as models

    # Load a resnet as example
    model = models.resnet50()

    # Provide input data for the model    
    input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]
    ```

    See below further examples with custom format:
    ``` python
    # Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 with labels
    input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]

    # Dataset for a model that takes 2 inputs, containing 100 batches of data with bs=5 with labels
    input_data = [((torch.randn(5, 3, 256, 256), torch.randn(5, 3, 256, 256), ), torch.tensor([0, 1, 0, 1, 1])) for _ in range(100)]

    # Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 without labels
    input_data = [((torch.randn(1, 3, 256, 256), ), ) for _ in range(100)]
    ```

    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.

=== "PyTorch DataLoader"
    We support the following DataLoader types:

    * Tensor only
    * Tensor and labels


    For models with multiple inputs, we support the following types:

    - input_1, input_2, ..., input_n, label
    - (input_1, input_2, ..., input_n), label

    ```python
    import torch
    import torchvision.models as models

    # Load a resnet as example
    model = models.resnet50()

    # Use your PyTorch DataLoader in any of the standard format
    input_data = <insert your PyTorch DataLoader here>
    ```

    Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section.

## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 

The function takes the following arguments as inputs:

- `model`: model to be optimized in your preferred framework (PyTorch in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration

and returns the accelerated version of your model 🚀.

``` python
from speedster import optimize_model

# Run Speedster optimization
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained",
    metric_drop_ths=0.05
)
```

Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.

At the end of the optimization, you are going to see the results in a summary table like the following:

![pt](../images/pt_table.png)

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.

If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.

## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:

```python
from speedster import save_model

save_model(optimized_model, "model_save_path")
```

Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.

## 4) Load and run your optimized model in production
Once the optimized model has been saved,  it can be loaded with the `load_model` function:
```python
from speedster import load_model

optimized_model = load_model("model_save_path")
```

The optimized model can be used for accelerated inference in the same way as the original model:

```python
# Use the accelerated version of your PyTorch model in production
output = optimized_model(input_sample)
```

!!! info
    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.

If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section.

================================================
FILE: optimization/speedster/docs/en/docs/getting_started/tf_getting_started.md
================================================
# Getting started with TensorFlow optimization
In this section, we will learn about the 4 main steps needed to optimize TensorFlow models:

1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)

## 1) Input model and data

!!! info
    In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc). 

For TensorFlow models we support two types of input data:

* Custom data format
* TensorFlow DataLoader

=== "Custom Data Format"
    Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]```

    - Each element of the list is a tuple, which represents a batch of the dataset.
    - In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. The label is optional, so it can be omitted.

    ``` python
    import tensorflow as tf
    from tensorflow.keras.applications.resnet50 import ResNet50

    # Load a resnet as example
    model = ResNet50()

    # Provide input data for the model    
    input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)]
    ```

    Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.

=== "TensorFlow DataLoader"
    We support the following DataLoader types:

    * Tensor only
    * Tensor and labels


    For models with multiple inputs, we support the following types:

    - input_1, input_2, ..., input_n, label
    - (input_1, input_2, ..., input_n), label

    ```python
    import tensorflow as tf
    from tensorflow.keras.applications.resnet50 import ResNet50

    # Load a resnet as example
    model = ResNet50()

    # Use your TensorFlow DataLoader in any of the standard format
    input_data = <insert your TensorFlow DataLoader here>
    ```

    Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section.

## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model. 

The function takes the following arguments as inputs:

- `model`: model to be optimized in your preferred framework (TensorFlow in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration

and returns the accelerated version of your model 🚀.

``` python
from speedster import optimize_model

# Run Speedster optimization
optimized_model = optimize_model(
    model, 
    input_data=input_data, 
    optimization_time="constrained",
    metric_drop_ths=0.05
)
```

Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.

At the end of the optimization, you are going to see the results in a summary table like the following:

![pt](../images/hf_table.png)

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.

If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.

## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:

```python
from speedster import save_model

save_model(optimized_model, "model_save_path")
```

Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.

## 4) Load and run your optimized model in production
Once the optimized model has been saved,  it can be loaded with the `load_model` function:

```python
from speedster import load_model

optimized_model = load_model("model_save_path")
```

The optimized model can be used for accelerated inference in the same way as the original model:

```python
# Use the accelerated version of your TensorFlow model in production
output = optimized_model(input_sample)
```

!!! info
    The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.

If you want to know more about how to squeeze more performance out of your models, please visit the [Advanced options](../advanced_options.md) section.


================================================
FILE: optimization/speedster/docs/en/docs/hardware.md
================================================
# Supported hardware

`Speedster` has been mostly tested on Nvidia GPUs and Intel/AMD CPUs. The library may also work on other hardware on which it has not been tested. Please let us know if you find that `Speedster` works well on other hardware or if you run into issues.

Fully supported hardware:

- Intel CPU
- Nvidia GPU

Hardware we are currently integrating:

- Apple M1
- AMD CPU
- Intel GPU (open issue 👩‍💻)

================================================
FILE: optimization/speedster/docs/en/docs/installation.md
================================================
# Installation
In this installation guide we will learn:

- [Quick installation](#quick-installation) of `Speedster` with pip **(Recommended)** 

- [Selective installation](#optional-selective-installation-of-speedster-requirements) of the requirements **(Optional)**

- [Installation](#optional-download-docker-images-with-frameworks-and-optimizers) with Docker **(Optional)** 

- [Set up Speedster on custom DL devices](#set-up-speedster-on-custom-dl-devices) to run models on Google TPUs and AWS Inferentia Chips


## Quick installation 
You can easily install `Speedster` using pip.

    pip install speedster

Then make sure to install all the available deep learning compilers:

    python -m nebullvm.installers.auto_installer --compilers all


!!! info
    If you want to optimize PyTorch or HuggingFace models, PyTorch must be pre-installed in the environment before using the auto-installer; please install it from [this](https://pytorch.org/get-started/locally/) link. Moreover, on Mac computers with M1/M2 processors, please use a conda environment; otherwise you may run into problems when installing some of the deep learning compilers.

Great, now you are ready to accelerate your model 🚀 Please visit the following pages to get started based on the DL framework of your input model:

- [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md)
- [Getting started with 🤗 Hugging Face optimization](getting_started/hf_getting_started.md)
- [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md)
- [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md)
- [Getting started with ONNX optimization](getting_started/onnx_getting_started.md)


## (Optional) Selective installation of Speedster requirements

By default, the `auto_installer` installs all the DL frameworks and compilers supported by `Speedster`. However, some of these may not be relevant to your use case. In this section, we explain how you can customize the installation of these libraries, avoiding those that are not needed.

To customize the libraries installation you have two options:

- [Use the auto-installer (recommended)](#use-the-auto-installer-recommended)
- [Install the libraries manually](#manual-installation)

### Use the auto-installer (recommended)
To understand how to selectively install your preferred libraries, let's examine the auto-installer API:

```bash
python -m nebullvm.installers.auto_installer \
    --frameworks <frameworks> \
    --extra-backends <backends> \
    --compilers <compilers>
```

!!! Description

    === "--frameworks"

        `frameworks` is used to specify the deep learning framework of your input model. The supported frameworks are `torch`, `tensorflow`, `onnx`, `huggingface` and `diffusers`.

        - if you want to optimize a model with a single DL framework, the code is as follows (example below for HuggingFace):
            
            ```python
            python -m nebullvm.installers.auto_installer --frameworks huggingface
            ```
            
            Please remember that for PyTorch optimization, you should pre-install PyTorch from the official [repo](https://pytorch.org/get-started/locally/).
                
        - if you want to optimize models in multiple input frameworks, you can include them separated with a space:
            ```python
            python -m nebullvm.installers.auto_installer --frameworks tensorflow torch
            ```

        - If you want to include all the frameworks, you can use `all` as the argument:

            ```python
            python -m nebullvm.installers.auto_installer --frameworks all
            ```

        Default: `all`.
    
    === "--extra-backends"

        After entering your input model, `Speedster` converts the input model from its original framework into an intermediate framework to be used during the optimization; we call these intermediate frameworks "backends." To learn more, see the section [Model Converter](https://docs.nebuly.com/Speedster/key_concepts/) in the docs. This conversion allows `Speedster` to apply all optimization techniques without being constrained by the input framework of your model.
            
        The supported backends are `torch`, `tensorflow` and `onnx`.
            
        You can specify multiple backends by separating them with a space. 
            
        - For example, if you want to install TensorFlow and ONNX as backends of a Hugging Face model, the code is as follows:
            
            ```python
            python -m nebullvm.installers.auto_installer --frameworks huggingface --extra-backends tensorflow onnx
            ```
            
        - If you want to install all the backends supported by the selected frameworks, you can use `all` as the argument.
        - If you don't want to install extra backends, you can set `--extra-backends none`.
            
        The extra-backends that you choose must be compatible with at least one of the input frameworks you previously selected with the argument `--frameworks`; please see the table below for the compatibility matrix.

        Default: `all`.    

    === "--compilers"

        `compilers` is used to specify the deep learning compilers to be installed. The supported compilers are: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `openvino` and `intel_neural_compressor`. The compilers must be compatible with at least one of the backends selected with the argument `--extra-backends`; please see the table below for the compatibility matrix.

        - You can specify multiple compilers by separating them with a space. For example:
            
            ```python
            --compilers deepsparse tensor_rt
            ```
            
            will install DeepSparse and TensorRT. 
            
        - If you want to install all the compilers supported by the selected frameworks/backends, you can use `all` as the argument.

        Speedster also supports `torchscript`, `tf_lite`, and `onnxruntime` as built-in; these are preinstalled with their respective backends, so there is no need to include them in the list. Speedster also supports `tvm`, which is currently not supported by the automatic installer and must be installed manually; see the next section if you wish to include it.

        Default: `all`.


Let's see an example of how to use these three arguments:

```bash
python -m nebullvm.installers.auto_installer \
    --frameworks torch \
    --extra-backends all \
    --compilers all
```

This command will set up your environment to optimize PyTorch models, and will install all the backends and compilers supported for PyTorch.

The following table shows the supported combinations of frameworks, backends and compilers that you can install with the auto-installer:

| Framework    | Extra Backends            | Compilers                                                               |
|--------------|---------------------------|-------------------------------------------------------------------------|
| PyTorch      | ONNX                      | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |
| TensorFlow   | ONNX                      | TensorRT, OpenVINO                                                      |
| ONNX         | /                         | TensorRT, OpenVINO                                                      |
| Hugging Face | PyTorch, TensorFlow, ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |
| Diffusers    | PyTorch, ONNX             | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |


!!! info
    Hugging Face models can be of two types, PyTorch-based or TensorFlow-based. For PyTorch-based models, it is necessary to include `torch` as an extra-backend. For TensorFlow-based models, you must include `tensorflow` as an extra-backend.

### Manual installation

If you want to manually install the requirements, this section collects links to the official installation guides for all frameworks and compilers supported by `Speedster`.

#### Deep Learning frameworks/backends
- PyTorch: https://pytorch.org/get-started/locally/
- TensorFlow: https://www.tensorflow.org/install
- ONNX: https://github.com/onnx/onnx#installation
- HuggingFace: https://huggingface.co/transformers/installation.html
- Diffusers: https://github.com/huggingface/diffusers#installation

#### Deep Learning compilers
- DeepSparse: https://github.com/neuralmagic/deepsparse#installation
- TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html
- Torch TensorRT: https://pytorch.org/TensorRT/getting_started/installation.html#installation
- ONNXRuntime: https://onnxruntime.ai/docs/install/#python-installs
- OpenVINO: https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#step-4-install-the-package
- Intel Neural Compressor: https://github.com/intel/neural-compressor#installation
- Apache TVM: https://tvm.apache.org/docs/install/index.html

#### Other requirements
- tf2onnx: https://github.com/onnx/tensorflow-onnx#installation (Install it if you want to convert TensorFlow models to ONNX)
- polygraphy: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy#installation (Install it if you want to use TensorRT)
- onnx-simplifier: https://github.com/daquexian/onnx-simplifier#python-version (Install it if you want to use TensorRT)
- onnx_graphsurgeon: https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon#installation (Install it if you want to use TensorRT with Stable Diffusion)
- onnxmltools: https://github.com/onnx/onnxmltools#install (Install it if you want to convert models to ONNX)

## (Optional) Download Docker images with frameworks and optimizers

Instead of installing the frameworks and compilers needed for optimization, which can be a time-consuming task, you can simply download a Docker container with all compilers preinstalled.

To pull up the Docker image, run:

    docker pull nebulydocker/nebullvm:latest

and then run and access the Docker with:

    docker run -ti --gpus=all nebulydocker/nebullvm:latest

After optimizing the model, you may decide to deploy it to production. Note that you need to have the deep learning compiler used to optimize the model and other components inside the production Docker. For this reason, we have created several versions of the Docker nebullvm container in the [Docker Hub](https://hub.docker.com/repository/docker/nebulydocker/nebullvm), each containing only one compiler. Pull the image with the compiler that has optimized your model!

## Set up Speedster on custom DL devices

From version `0.10.0`, Speedster supports optimization of PyTorch models on `Google TPUs` and `AWS Inferentia` chips. 
For these devices, the user must ensure that the required libraries are installed on the machine. 
The following sections describe how to install the required libraries for each device.

### Google TPUs

In order to use a TPU, you must request a TPU-enabled VM from Google Cloud. You can consult the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en) 
for more information about how to create a TPU VM and how to get started with PyTorch on TPUs.

To use Speedster on Google TPUs, we will use the [`torch_xla`](https://github.com/pytorch/xla) library, which is already 
preinstalled in all the Google Cloud TPU VMs, you will find it in the base Python3 environment.

After creating the VM, you can follow these steps to set up Speedster:

- Check that the `torch_xla` library is installed in the base Python3 environment. You can do this by running `python3 -c "import torch_xla; print(torch_xla.__version__)"` in the VM console;
- Set TPU runtime configuration as explained in the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#set_tpu_runtime_configuration);
- [Optional] Check that the TPU is working by running the [official example](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#perform_a_simple_calculation);
- Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on TPUs.

You are now ready to use Speedster on TPUs! Speedster will automatically detect the TPU device and will use the `torch_xla` library to optimize the model, comparing its performances with the original model running on the CPU.

### AWS Inferentia

For AWS Inferentia, you must first create an AWS EC2 instance with the `inf1` instance type. 
You can find more information about `inf1` instances in the [official documentation](https://aws.amazon.com/it/ec2/instance-types/inf1/).

!!! info
    AWS has recently released the `inf2` instance type, which is a more powerful version of `inf1`. For now `inf2`
    instances are only available in private preview; you can request them directly from AWS by filling in this [form](https://pages.awscloud.com/EC2-Inf2-Preview.html).

To use Speedster on AWS Inferentia, we will use the [`torch-neuron`](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) library, which must be manually installed on `inf1` instances (on `inf2` instances it's already preinstalled if you use the PyTorch DLAMI provided by AWS).

Here you can find the full guides to set up the EC2 instances and install the required libraries:

- `inf1`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/setup/pytorch-install.html#install-neuron-pytorch
- `inf2`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/setup/pytorch-install.html#pytorch-neuronx-install

After creating the EC2 instance and installing `torch_neuron`, you can follow these steps to set up Speedster:

- Check that the `torch_neuron` library is installed. You can do this by running `python -c "import torch_neuron; print(torch_neuron.__version__)"` in the console (if using `inf1` instances; otherwise replace `torch_neuron` with `torch_neuronx`);
- Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on AWS Inferentia.

You are now ready to use Speedster on AWS Inferentia! Speedster will automatically detect the AWS Inferentia device and will use the `torch_neuron` library to optimize the model, comparing its performances with the original model running on the CPU.


================================================
FILE: optimization/speedster/docs/en/docs/key_concepts.md
================================================
# Key concepts

In this section we are going to learn the architectural design of the 4 building blocks of `Speedster`.

- [x]  **Converter**: converts the input model from its original framework to the framework backends supported by Speedster, namely PyTorch, TensorFlow, and ONNX. This allows the Compressor and Optimizer modules to apply any optimization technique to the model.
- [x]  **Compressor**: applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training.
- [x]  **Optimizer**: converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files.
- [x]  **Inference Learner**: takes the best performing compiled model and converts it to the same interface as the original input model.

![speedster_blocks](https://user-images.githubusercontent.com/42771598/213177175-a76908a2-5eef-4e82-9d54-0fc812131463.png)

The **compressor** stage leverages the following open-source projects:

- [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance.
- [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models.

The **compiler stage** leverages the following open-source projects:

- [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators.
- [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads.
- [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs.
- [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference.
- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator
- [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.
- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models.

## Model converter
!!! Definition
    The Converter converts the input model from its original input framework to the framework backends supported by `Speedster`. This conversion enables the Compressor and the Compiler modules to apply all the optimization techniques without being constrained by the framework of your input model.

![image info](images/converter.png)

`Speedster` supports deep learning models in the following input frameworks:

- Hugging Face
- Diffusers
- ONNX
- PyTorch
- TensorFlow

`Speedster` now includes 3 backends:

- **ONNX backend**, which supports models in any input framework.
- **PyTorch backend**, which supports input models in PyTorch, ONNX and Hugging Face.
- **TensorFlow backend**, which supports input models in TensorFlow and ONNX.

As you notice, to date, not all cross-conversions from input frameworks to each `Speedster` backend are supported. 

Let's see a couple of examples to better understand the potential of the Converter block:

1. PyTorch model as input: first of all Speedster will try the compilers available in the PyTorch backend pipeline, then it will convert it to ONNX and will try also the ones available in the ONNX backend optimization pipeline. Finally, the best one among them will be chosen and returned as the optimized model in your input framework (in this case PyTorch).

2. HuggingFace model as input: Let's assume that for your specific use case, the best optimization technique is a specific type of dynamic quantization only supported by PyTorch. If you feed a Hugging Face model into Speedster, the Converter will first transform your model into a PyTorch model. Speedster will then quantize it and finally return it as a Hugging Face model.

## Compressor

The compressor applies various compression techniques to the model:

- Block-wise un/structured sparsity (🎉 launched in 0.4.0 🎉)
- Knowledge distillation (to be supported)
- Layer replacement (to be supported)
- Low-rank compression (to be supported)
- Quantization-aware training (to be supported)
- SparseML (🎉 launched in 0.4.0 🎉)

![image info](images/compressor.png)

## Compiler

The Compiler block converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The different DL compilers perform both the low-level optimizations, which mostly consist of various quantization techniques, and graph optimizations. Finally, the model is compiled into binary.

![image info](images/compiler.png)

Supported deep learning compilers:

- Apache TVM
- BladeDISC (🎉 launched in 0.4.0 🎉)
- DeepSparse (🎉 launched in 0.4.0 🎉)
- MLIR (open pull request 👩‍💻)
- ONNX Runtime
- OpenVINO
- TensorRT
- TF Lite / XLA
- TorchScript

Supported low-level optimizations:

- Static quantization
- Dynamic quantization
- Half-precision
- Low-bit quantization on TVM (to be supported)

## Inference learner

The Learner, or Inference Learner, selects the best-performing compiled model on your hardware and converts it to the same interface as the original input model.

![image info](images/learner.png)

================================================
FILE: optimization/speedster/docs/en/docs/notebooks.md
================================================
# Notebooks

In this section you can find optimization notebooks for multiple DL input models:

- HuggingFace
- Diffusers
- ONNX
- Pytorch
- Tensorflow

Please check out notebooks and tutorials on GitHub at [this](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) link.

================================================
FILE: optimization/speedster/docs/en/docs/overview.md
================================================
# Overview


`Speedster` is an open-source module designed to accelerate AI inference in just a few lines of code.
The library allows you to seamlessly modulate the inference performance of your AI models in terms of latency, throughput, model size, accuracy and cost, and automatically applies the best set of optimization techniques along the software-to-hardware stack to meet your targets.

`Speedster` makes it easy to combine optimization techniques across the whole software to hardware stack, delivering best in class speed-ups. If you like the idea, give us a star to support the project ⭐

![speedster](https://user-images.githubusercontent.com/53374883/225600620-1cd84073-d9b3-43d1-84fa-c3e6c25eb915.png)

The core `Speedster` workflow consists of 3 steps:


- [x]  **Select**: input your model in your preferred DL framework and express your preferences regarding:
    - Accuracy loss: do you want to trade off a little accuracy for much higher performance?
    - Optimization time: stellar accelerations can be time-consuming. Can you wait, or do you need an instant answer?
- [x]  **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware.
- [x]  **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just on steroids 🚀).

Now you are ready to start accelerating your models, visit the [Installation](installation.md) section to start right away!


================================================
FILE: optimization/speedster/docs/en/docs/telemetry.md
================================================
# Telemetry


`Speedster` is a young and rapidly evolving open-source project. There is plenty of room for improvement for Speedster to make your model achieve the very best performance on your hardware... and you may still find some bugs in the code 🪲

Contributions to this OSS project are warmly welcomed 🤗. We encourage you to check out the Contribution guidelines to understand how you can become an active contributor of the source code.

## Sharing feedback to improve Speedster

Open source is a unique resource for sharing knowledge and building great projects collaboratively with the OSS community. To support the continued development, upon installation of Speedster you could share the information strictly necessary to improve the performance of this open-source project and facilitate bug detection and fixing.

More specifically, you will foster project enhancement by sharing details of the optimization techniques used with Speedster and the performance achieved on your model and hardware.

**Which data do we collect?**

We make sure to collect as little data as possible to improve the open-source project:

- basic information about the environment
- basic information about the optimization

Please find below an example of telemetry collection:

```python
{
"nebullvm_version": "0.6.0",
"app_version": "0.0.1",
"model_id": "e33a1bbf-fcfd-4f5a-81c9-a9154c7e9343_-7088971112344091114",
"model_metadata": {
    "model_name": "ResNet",
    "model_size": "102.23 MB",
    "framework": "torch"
},
"hardware_setup": {
    "cpu": "Apple M1 Pro",
    "operative_system": "Darwin",
    "ram": "17.18 GB"
},
"optimizations": [
    {
        "compiler": "torch",
        "technique": "original",
        "latency": 0.03
    },
    {
        "compiler": "NUMPY_onnxruntime",
        "technique": "none",
        "latency": 0.01
    }
],
"ip_address": "1.1.1.1"
}
```

**How to opt-out?**

You can simply opt out of telemetry collection by setting the environment variable `SPEEDSTER_DISABLE_TELEMETRY` to `1`.

**Should I opt out?**

Being open-source, we have very limited visibility into the use of the tool unless someone actively contacts us or opens an issue on GitHub.

We would appreciate it if you would maintain telemetry, as it helps us improve the source code. In fact, it brings increasing value to the project and helps us to better prioritize feature development.

We understand that you may still prefer not to share telemetry data and we respect that desire. Please follow the steps above to disable data collection.

================================================
FILE: optimization/speedster/docs/en/mkdocs.yaml
================================================
site_name: Speedster

docs_dir: ./docs

nav:
  - Overview: overview.md
  - Installation: installation.md
  - Getting started:
    - PyTorch: getting_started/pytorch_getting_started.md
    - 🤗 HuggingFace: getting_started/hf_getting_started.md
    - 🧨 Stable Diffusion: getting_started/diffusers_getting_started.md
    - TensorFlow/Keras: getting_started/tf_getting_started.md
    - ONNX: getting_started/onnx_getting_started.md
  - Notebooks: notebooks.md
  - Key concepts: key_concepts.md
  - Supported hardware: hardware.md
  - Advanced options: advanced_options.md
  - Benchmarks: benchmarks.md
  - Telemetry: telemetry.md


================================================
FILE: optimization/speedster/notebooks/README.md
================================================
# **Jupyter notebooks**

This folder contains notebooks showing how to use the `Speedster` app to optimize several models. 

The following frameworks are supported:
- PyTorch
- HuggingFace
- Diffusers
- Tensorflow
- ONNX

Examples of how to use `Speedster` are shown for each of these frameworks.

In each folder we provide links to google colab where you can easily test the notebooks. 
If you want to test them on your own hardware, you can follow the guide below.

## 1. Setup
To test notebooks, we have to create an environment where all the required dependencies are installed.

First of all, clone the `nebullvm` repository:
```
git clone https://github.com/nebuly-ai/nebullvm.git
```
Next, navigate to the repo's root directory:
```
cd nebullvm
```

After cloning the repository there are two options: we can either install `Speedster` in a local environment or use a ready-to-use docker container.

### a. Using a local environment

Install `Speedster` library:
```
pip install speedster
```

Install deep learning compilers:
```
python -m nebullvm.installers.auto_installer \
    --frameworks all --compilers all
```

You can find additional options and details on the official [installation guide](https://docs.nebuly.com/modules/speedster/installation).

After everything has been installed, you can start a jupyter session with the following command:

```
jupyter notebook --allow-root --port 8888
```
And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`

Use the token listed in the output from running the jupyter command to log in, for example:

`http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`

You can finally navigate to the `notebooks/speedster` folder and then to the folder of the framework that you want to try and start a notebook.


### b. Using a Docker container

Another very easy way to test the following notebooks is by using one of the docker containers released on [dockerhub](https://hub.docker.com/r/nebulydocker/nebullvm). 


Pull the most up-to-date container image that has all compilers and their dependencies preinstalled:
```
docker pull nebulydocker/nebullvm:latest
```
Once pulled, the container can be launched with the following command:
```
docker run --rm --gpus all -ti -p 8888:8888 -v $PWD:/nebullvm nebulydocker/nebullvm:latest
```
The `-v` option in the command above allows you to persist all the changes made to the notebooks inside the container.
Please note that, in order to enable gpu inside docker, you have to ensure that nvidia docker is installed. Please follow the "Setting up NVIDIA Container Toolkit" part from the 
official [installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker).
You can then check that the gpu can be seen inside the container by running `nvidia-smi` inside it, and checking that your gpu appears in the output.

Inside the container, we can then navigate to the notebooks folder:
```
cd /nebullvm/notebooks/speedster
```
We can then run a jupyter session with the following command:
```
jupyter notebook --allow-root --ip 0.0.0.0 --port 8888
```
And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`

Use the token listed in the output from running the jupyter command to log in, for example:

`http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`

You can finally navigate to the folder of the framework that you want to try and start a notebook.

## 2. Contributions
At Nebuly we are always eager to see how our library manages to optimise more and more models. If you test nebullvm on your model and this is not already present among the notebooks, feel free to open a PR for us to add your notebook to the repository!


================================================
FILE: optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ef331be9",
   "metadata": {
    "id": "ef331be9"
   },
   "source": [
    "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f260653a",
   "metadata": {
    "id": "f260653a"
   },
   "source": [
    "# Accelerate Stable Diffusion with Speedster\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8bdf3af5",
   "metadata": {
    "id": "8bdf3af5"
   },
   "source": [
    "Hi and welcome 👋\n",
    "\n",
    "In this notebook we will discover how in just a few steps you can speed up the response time of Stable Diffusion inference using the Speedster module from the open-source library nebullvm. In the first section we will try using `Speedster` with the default configuration, then we will explore a more advanced option that involves the TensorRT plugins, that allow to accelerate Stable Diffusion further on GPU.\n",
    "\n",
    "Let's jump to the code."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cXXh1ifQ13mH",
   "metadata": {
    "id": "cXXh1ifQ13mH"
   },
   "source": [
    "# Installation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48aljCHu14-H",
   "metadata": {
    "id": "48aljCHu14-H"
   },
   "source": [
    "Install Speedster:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "QFQh3BVr1-GO",
   "metadata": {
    "id": "QFQh3BVr1-GO"
   },
   "outputs": [],
   "source": [
    "!pip install speedster"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a7a86b3",
   "metadata": {
    "id": "8a7a86b3"
   },
   "source": [
    "Install deep learning compilers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cffbfa32",
   "metadata": {
    "id": "cffbfa32"
   },
   "outputs": [],
   "source": [
    "!python -m nebullvm.installers.auto_installer --frameworks diffusers --compilers all"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "c2ab3de7",
   "metadata": {},
   "source": [
    "# Environment check (GPU only)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "61a1a445",
   "metadata": {},
   "source": [
    "**Please skip this section if you don't have a GPU**"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e2784bb8",
   "metadata": {},
   "source": [
    "If you want to optimize Stable Diffusion on a Nvidia GPU, in order to work properly, the following requirements must be installed on your machine:\n",
    "- `CUDA>=12.0`\n",
    "- `tensorrt>=8.6.0`\n",
    "- `torch<=1.13.1`"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e3bc8b4d",
   "metadata": {},
   "source": [
    "From TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then to upgarde tensorrt to 8.6.0 or above to execute this notebook.\n",
    "\n",
    "There should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way, you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. Otherwise, we highly suggest to just upgrade to CUDA 12.\n",
    "\n",
    "For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ec2267f0",
   "metadata": {},
   "source": [
    "First of all, Let's check the CUDA version installed on the machine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82b78585",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import subprocess\n",
    "\n",
    "if torch.cuda.is_available():\n",
    "    cuda_version = subprocess.check_output([\"nvidia-smi\"])\n",
    "    cuda_version = int(cuda_version.decode(\"utf-8\").split(\"\\n\")[2].split(\"|\")[-2].split(\":\")[-1].strip().split(\".\")[0])\n",
    "    assert cuda_version >= 12, (\"This notebook requires CUDA>=12.0 to be executed, please upgrade your CUDA version.\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "015cfa92",
   "metadata": {},
   "source": [
    "If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "563779e6",
   "metadata": {},
   "source": [
    "Then, let's check the tensorrt version installed on the platform. Stable Diffusion optimization is supported starting from `tensorrt==8.6.0`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e385021d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorrt\n",
    "from nebullvm.tools.utils import check_module_version\n",
    "\n",
    "if torch.cuda.is_available():\n",
    "    assert check_module_version(tensorrt, \"8.6.0\"), (\"This notebook can be run only with tensorrt>=8.6.0, if using an older version you could have issues during the optimization. Please upgrade your version.\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "61da505b",
   "metadata": {},
   "source": [
    "If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:\n",
    "```\n",
    "pip install -U tensorrt\n",
    "```"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "3876bea4",
   "metadata": {},
   "source": [
    "Finally, let's check the PyTorch version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db83853f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "from nebullvm.tools.utils import check_module_version\n",
    "\n",
    "assert check_module_version(torch, max_version=\"1.13.1+cu117\"), (\"This notebook can be run only with torch<=1.13.1, if using an older version you could have issues during the optimization. Please downgrade your version.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73072506",
   "metadata": {
    "id": "73072506"
   },
   "source": [
    "## Model and Dataset setup"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "aeb2c521",
   "metadata": {},
   "source": [
    "Once we have ensured that the the required libraries are installed, we have to choose the version of Stable Diffusion we want to optimize, speedster officially supports the most used versions:\n",
    "- `CompVis/stable-diffusion-v1-4`\n",
    "- `runwayml/stable-diffusion-v1-5`\n",
    "- `stabilityai/stable-diffusion-2-1-base`\n",
    "- `stabilityai/stable-diffusion-2-1` (only on gpus with at least 22GB of Memory, if you want to try with a GPU with a lower memory, you have to uncomment `pipe.enable_attention_slicing()` in the cell below)\n",
    "\n",
    "Other Stable Diffusion versions from the Diffusers library should work but have never been tested. If you try a version not included among these and it works, please feel free to report it to us on [Discord](https://discord.com/invite/RbeQMu886J) so we can add it to the list of supported versions. If you try a version that does not work, you can open an issue and possibly a PR on [GitHub](https://github.com/nebuly-ai/nebullvm/issues)."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e4d55115",
   "metadata": {
    "id": "e4d55115"
   },
   "source": [
    "For this notebook, we are going to select Stable Diffusion 1.4. Let's download and load it using the diffusers API:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d633cf21",
   "metadata": {
    "id": "d633cf21",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from diffusers import StableDiffusionPipeline\n",
    "\n",
    "# Select Stable Diffusion version\n",
    "model_id = \"CompVis/stable-diffusion-v1-4\"\n",
    "\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "if device == \"cuda\":\n",
    "    # On GPU we load by default the model in half precision, because it's faster and lighter.\n",
    "    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\n",
    "    # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\n",
    "else:\n",
    "    pipe = StableDiffusionPipeline.from_pretrained(model_id)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "11aa0739",
   "metadata": {
    "id": "11aa0739"
   },
   "source": [
    "Let's now create an example dataset with some random sentences, that will be used later for the optimization process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbbfeeb2",
   "metadata": {
    "id": "cbbfeeb2"
   },
   "outputs": [],
   "source": [
    "input_data = [\n",
    "    \"a photo of an astronaut riding a horse on mars\",\n",
    "    \"a monkey eating a banana in a forest\",\n",
    "    \"white car on a road surrounded by palm trees\",\n",
    "    \"a fridge full of bottles of beer\",\n",
    "    \"madara uchiha throwing asteroids against people\"\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17040431",
   "metadata": {
    "id": "17040431"
   },
   "source": [
    "## Speed up inference with Speedster"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44ddc21d",
   "metadata": {
    "id": "44ddc21d"
   },
   "source": [
    "It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9d934f6",
   "metadata": {
    "id": "f9d934f6"
   },
   "outputs": [],
   "source": [
    "from speedster import optimize_model, save_model, load_model"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "2799e3e3",
   "metadata": {},
   "source": [
    "Let's move the pipe back to CPU to save up GPU memory, `Speedster` will automatically move it back to GPU when required."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45220cf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "\n",
    "# Move the pipe back to cpu\n",
    "pipe.to(\"cpu\")\n",
    "\n",
    "# Clean memory\n",
    "torch.cuda.empty_cache()\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76248033",
   "metadata": {
    "id": "76248033"
   },
   "source": [
    "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "75b339c3",
   "metadata": {},
   "source": [
    "**Optimisation of stable diffusion requires a lot of RAM. If you are running this notebook on google colab, make sure to use the high RAM option, otherwise the kernel may crash. If the kernel crashes also when using the high RAM option, please try adding also `\"torchscript\"` to the `ignore_compilers` list. \n",
    "If running on GPU, the optimization requires at least 16GB og GPU memory to exploit the best techniques for optimizing the model, otherwise it may fail with a Memory Error**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "zPC_EDwEJIM0",
   "metadata": {
    "id": "zPC_EDwEJIM0"
   },
   "outputs": [],
   "source": [
    "optimized_model = optimize_model(\n",
    "    model=pipe,\n",
    "    input_data=input_data,\n",
    "    optimization_time=\"unconstrained\",\n",
    "    ignore_compilers=[\"torch_tensor_rt\", \"tvm\"],  # Some compilers have issues with Stable Diffusion, so it's better to skip them.\n",
    "    metric_drop_ths=0.2,\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "fdae59d2",
   "metadata": {},
   "source": [
    "If running on GPU, here you should obtain a speedup of about 124% on the UNet. We run the optimization on a **3090Ti** and here are our results:\n",
    "- **Original Model (PyTorch, fp16): 51,557 ms/batch**\n",
    "- **Optimized Model (TensorRT, fp16): 23,055 ms/batch**\n",
    "\n",
    "If the optimized model you obtained is not a TensorRT one, probably there was an error during the optimization. If running on colab, it could happen that the standard gpu is not enough to run the optimization, so we suggest to select a premium gpu with more memory.\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "af9f86ac",
   "metadata": {},
   "source": [
    "If everything worked correctly, let's check the output of the optimized model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b640885",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_prompt = \"futuristic llama with a cyberpunk city on the background\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa443637",
   "metadata": {},
   "outputs": [],
   "source": [
    "optimized_model(test_prompt).images[0]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6e5b3b21",
   "metadata": {
    "id": "6e5b3b21"
   },
   "source": [
    "Let's run the prediction 10 times to calculate the average response time of the original model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09170c78",
   "metadata": {},
   "outputs": [],
   "source": [
    "if device == \"cuda\":\n",
    "    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\n",
    "    # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\n",
    "else:\n",
    "    pipe = StableDiffusionPipeline.from_pretrained(model_id)\n",
    "\n",
    "pipe.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3bc5c98",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "d3bc5c98",
    "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681"
   },
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "times = []\n",
    "\n",
    "# Warmup for 2 iterations\n",
    "for _ in range(2):\n",
    "    with torch.no_grad():\n",
    "        final_out = pipe(test_prompt).images[0]\n",
    "\n",
    "# Benchmark\n",
    "for _ in range(8):\n",
    "    st = time.time()\n",
    "    with torch.no_grad():\n",
    "        final_out = pipe(test_prompt).images[0]\n",
    "    times.append(time.time()-st)\n",
    "original_model_time = sum(times)/len(times)\n",
    "print(f\"Average response time for original Stable Diffusion 1.4: {original_model_time} s\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "3db0a7a1",
   "metadata": {
    "id": "3db0a7a1"
   },
   "source": [
    "Let's run the prediction 10 times to calculate the average response time of the optimized model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3e83997",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "a3e83997",
    "outputId": "7a416b14-f170-4df9-d416-026f06a7d980"
   },
   "outputs": [],
   "source": [
    "times = []\n",
    "\n",
    "for _ in range(2):\n",
    "    with torch.no_grad():\n",
    "        final_out = optimized_model(test_prompt).images[0]\n",
    "\n",
    "# Benchmark\n",
    "for _ in range(8):\n",
    "    st = time.time()\n",
    "    with torch.no_grad():\n",
    "        final_out = optimized_model(test_prompt).images[0]\n",
    "    times.append(time.time()-st)\n",
    "optimized_model_time = sum(times)/len(times)\n",
    "print(f\"Average response time for optimized Stable Diffusion 1.4: {optimized_model_time} s\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ceb60d8c",
   "metadata": {
    "id": "ceb60d8c"
   },
   "source": [
    "## Save and reload the optimized model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9eda1a0",
   "metadata": {},
   "source": [
    "We can easily save to disk the optimized model with the following line:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62b6fcbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_model(optimized_model, \"model_save_path\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c968d51",
   "metadata": {},
   "source": [
    "We can then load again the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1340c49",
   "metadata": {},
   "outputs": [],
   "source": [
    "optimized_model = load_model(\"model_save_path\", pipe=pipe)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb234e5e",
   "metadata": {
    "id": "cb234e5e"
   },
   "source": [
    "Great! Was it easy? How are the results? Do you have any comments?\n",
    "Share your optimization results and thoughts with <a href=\"https://discord.gg/RbeQMu886J\" target=\"_blank\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\n",
    "\n",
    "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
    "\n",
    "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b77ff2ac",
   "metadata": {
    "id": "b77ff2ac"
   },
   "source": [
    "<center> \n",
    "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
    "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
    "</center>\n",
    "\n",
    "<center> \n",
    "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
    "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
    "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
    "</center>"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "provenance": []
  },
  "gpuClass": "premium",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "vscode": {
   "interpreter": {
    "hash": "4ca44071b2152bc556aa4c839392f76fd4b80aa39d34257f2d304fa0d1d8b7d9"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/diffusers/Readme.md
================================================
# **Diffusers Optimization**

> :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. For additional details, please look at the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).

This section contains all the available notebooks that show how to leverage Speedster to optimize Diffusers models.

## Notebooks:
| Notebook                                                                                                                                                                           | Description                                                                     |                                                                                                                                                                                                                                               |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate Diffusers Stable Diffusion](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb) | Show how to optimize with Speedster the Stable Diffusion models from Diffusers. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb) |

## Diffusers API quick view:

``` python
import torch
from speedster import optimize_model
from diffusers import StableDiffusionPipeline


# Load Stable Diffusion 1.4 as example
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # On GPU we load by default the model in half precision, because it's faster and lighter.
    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)
else:
    pipe = StableDiffusionPipeline.from_pretrained(model_id)

# Create some example input data
input_data = [
    "a photo of an astronaut riding a horse on mars",
    "a monkey eating a banana in a forest",
    "white car on a road surrounded by palm trees",
    "a fridge full of bottles of beer",
    "madara uchiha throwing asteroids against people"
]

# Run Speedster optimization
optimized_model = optimize_model(
    model=pipe,
    input_data=input_data,
    optimization_time="unconstrained",
    ignore_compilers=["torch_tensor_rt", "tvm"],
    metric_drop_ths=0.1,
)

# Try the optimized model
test_prompt = "futuristic llama with a cyberpunk city on the background"
res = optimized_model(test_prompt).images[0]
```


================================================
FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "ef331be9",
      "metadata": {
        "id": "ef331be9"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "f260653a",
      "metadata": {
        "id": "f260653a"
      },
      "source": [
        "# Accelerate Hugging Face PyTorch BERT with Speedster\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8bdf3af5",
      "metadata": {
        "id": "8bdf3af5"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
        "\n",
        "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d527d63b",
      "metadata": {
        "id": "d527d63b"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cXXh1ifQ13mH",
      "metadata": {
        "id": "cXXh1ifQ13mH"
      },
      "source": [
        "# Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "73072506",
      "metadata": {
        "id": "73072506"
      },
      "source": [
        "## Model and Dataset setup"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "cf24c4c4",
      "metadata": {},
      "source": [
        "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1cf8ff74",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\"  # Change this path according to your TensorRT location\n",
        "\n",
        "if os.path.exists(tensorrt_path):\n",
        "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
        "else:\n",
        "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e4d55115",
      "metadata": {
        "id": "e4d55115"
      },
      "source": [
        "We chose BERT as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d633cf21",
      "metadata": {
        "id": "d633cf21",
        "scrolled": true
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import BertTokenizer, BertModel\n",
        "\n",
        "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
        "model = BertModel.from_pretrained('bert-base-uncased', torchscript=True)\n",
        "\n",
        "# Move the model to gpu if available and set eval mode\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model.to(device).eval()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "11aa0739",
      "metadata": {
        "id": "11aa0739"
      },
      "source": [
        "Let's create an example dataset with some random sentences"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cbbfeeb2",
      "metadata": {
        "id": "cbbfeeb2"
      },
      "outputs": [],
      "source": [
        "import random\n",
        "\n",
        "sentences = [\n",
        "    \"Mars is the fourth planet from the Sun.\",\n",
        "    \"has a crust primarily composed of elements\",\n",
        "    \"However, it is unknown\",\n",
        "    \"can be viewed from Earth\",\n",
        "    \"It was the Romans\",\n",
        "]\n",
        "\n",
        "len_dataset = 100\n",
        "\n",
        "texts = []\n",
        "for _ in range(len_dataset):\n",
        "    n_times = random.randint(1, 30)\n",
        "    texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a09f9424",
      "metadata": {
        "id": "a09f9424"
      },
      "outputs": [],
      "source": [
        "encoded_inputs = [tokenizer(text, return_tensors=\"pt\") for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "17040431",
      "metadata": {
        "id": "17040431"
      },
      "source": [
        "## Speed up inference with Speedster: no metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "44ddc21d",
      "metadata": {
        "id": "44ddc21d"
      },
      "source": [
        "It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9d934f6",
      "metadata": {
        "id": "f9d934f6"
      },
      "outputs": [],
      "source": [
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "76248033",
      "metadata": {
        "id": "76248033"
      },
      "source": [
        "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "zPC_EDwEJIM0",
      "metadata": {
        "id": "zPC_EDwEJIM0"
      },
      "outputs": [],
      "source": [
        "dynamic_info = {\n",
        "    \"inputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "    ],\n",
        "    \"outputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch'},\n",
        "    ]\n",
        "}\n",
        "\n",
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "98c6ab09",
      "metadata": {
        "id": "98c6ab09"
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "# Move inputs to gpu if available\n",
        "encoded_inputs = [tokenizer(text, return_tensors=\"pt\").to(device) for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6e5b3b21",
      "metadata": {
        "id": "6e5b3b21"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the original model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3bc5c98",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d3bc5c98",
        "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original DistilBERT: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "12c2df98",
      "metadata": {
        "id": "12c2df98"
      },
      "source": [
        "Let's see the output of the original model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4892a905",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4892a905",
        "outputId": "68d9b65f-e2cc-4998-8047-c9091f977698"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3db0a7a1",
      "metadata": {
        "id": "3db0a7a1"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the optimized model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3e83997",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a3e83997",
        "outputId": "7a416b14-f170-4df9-d416-026f06a7d980"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0d884d61",
      "metadata": {
        "id": "0d884d61"
      },
      "source": [
        "Let's see the output of the optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "75611b2e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "75611b2e",
        "outputId": "035d5c6d-fd7a-4506-af09-befcf9dd3b2d"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Speed up inference with Speedster: metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7b1950d5",
      "metadata": {
        "id": "7b1950d5"
      },
      "source": [
        "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "de5721d8",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "de5721d8",
        "outputId": "c9efff21-f963-47ff-e83d-a44615f90a10"
      },
      "outputs": [],
      "source": [
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        "    metric_drop_ths=0.1,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0fbfe6fa",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0fbfe6fa",
        "outputId": "ada293f5-9b54-4186-8e48-74b994d4b797"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original BERT: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f89b7e6d",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f89b7e6d",
        "outputId": "51e497e1-a533-432d-d68e-b373f0ef69cb"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "10d17b5c",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "10d17b5c",
        "outputId": "d5dc0acd-77e7-4054-b455-19343ff37951"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized BERT (metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6bf3d1fb",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6bf3d1fb",
        "outputId": "6163d8ba-254f-47d2-a468-a921622a15ba"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save the optimized model to disk with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load the model again:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cb234e5e",
      "metadata": {
        "id": "cb234e5e"
      },
      "source": [
        "Great! Was it easy? How are the results? Do you have any comments?\n",
        "Share your optimization results and thoughts with <a href=\"https://discord.gg/RbeQMu886J\" target=\"_blank\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\n",
        "\n",
        "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
        "\n",
        "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "premium",
    "kernelspec": {
      "display_name": "nebullvm_new",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.15"
    },
    "vscode": {
      "interpreter": {
        "hash": "4fbc45cd27f7d363500c2e8640d9fdb717da4e1d8e4954a68e42b53d65ee27af"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "ef331be9",
      "metadata": {
        "id": "ef331be9"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "f260653a",
      "metadata": {
        "id": "f260653a"
      },
      "source": [
        "# Accelerate Hugging Face PyTorch DistilBERT with Speedster\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8bdf3af5",
      "metadata": {
        "id": "8bdf3af5"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
        "\n",
        "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d527d63b",
      "metadata": {
        "id": "d527d63b"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cXXh1ifQ13mH",
      "metadata": {
        "id": "cXXh1ifQ13mH"
      },
      "source": [
        "# Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "73072506",
      "metadata": {
        "id": "73072506"
      },
      "source": [
        "## Model and Dataset setup"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "cf24c4c4",
      "metadata": {},
      "source": [
        "Add the TensorRT installation path to the `LD_LIBRARY_PATH` environment variable in order to enable the TensorrtExecutionProvider for ONNX Runtime."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1cf8ff74",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\"  # Change this path according to your TensorRT location\n",
        "\n",
        "if os.path.exists(tensorrt_path):\n",
        "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
        "else:\n",
        "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e4d55115",
      "metadata": {
        "id": "e4d55115"
      },
      "source": [
        "We chose DistilBERT as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d633cf21",
      "metadata": {
        "id": "d633cf21",
        "scrolled": true
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import DistilBertTokenizer, DistilBertModel\n",
        "\n",
        "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
        "model = DistilBertModel.from_pretrained(\"distilbert-base-uncased\", torchscript=True)\n",
        "\n",
        "# Move the model to gpu if available and set eval mode\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model.to(device).eval()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "11aa0739",
      "metadata": {
        "id": "11aa0739"
      },
      "source": [
        "Let's create an example dataset with some random sentences"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cbbfeeb2",
      "metadata": {
        "id": "cbbfeeb2"
      },
      "outputs": [],
      "source": [
        "import random\n",
        "\n",
        "sentences = [\n",
        "    \"Mars is the fourth planet from the Sun.\",\n",
        "    \"has a crust primarily composed of elements\",\n",
        "    \"However, it is unknown\",\n",
        "    \"can be viewed from Earth\",\n",
        "    \"It was the Romans\",\n",
        "]\n",
        "\n",
        "len_dataset = 100\n",
        "\n",
        "texts = []\n",
        "for _ in range(len_dataset):\n",
        "    n_times = random.randint(1, 30)\n",
        "    texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a09f9424",
      "metadata": {
        "id": "a09f9424"
      },
      "outputs": [],
      "source": [
        "encoded_inputs = [tokenizer(text, return_tensors=\"pt\") for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "17040431",
      "metadata": {
        "id": "17040431"
      },
      "source": [
        "## Speed up inference with Speedster: no metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "44ddc21d",
      "metadata": {
        "id": "44ddc21d"
      },
      "source": [
        "It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9d934f6",
      "metadata": {
        "id": "f9d934f6"
      },
      "outputs": [],
      "source": [
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "76248033",
      "metadata": {
        "id": "76248033"
      },
      "source": [
        "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some example input data and the optimization time mode. Optionally, a `dynamic_info` dictionary can also be provided in order to support inputs with dynamic shape."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "zPC_EDwEJIM0",
      "metadata": {
        "id": "zPC_EDwEJIM0"
      },
      "outputs": [],
      "source": [
        "dynamic_info = {\n",
        "    \"inputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'}\n",
        "    ],\n",
        "    \"outputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'}\n",
        "    ]\n",
        "}\n",
        "\n",
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "98c6ab09",
      "metadata": {
        "id": "98c6ab09"
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "# Move inputs to gpu if available\n",
        "encoded_inputs = [tokenizer(text, return_tensors=\"pt\").to(device) for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6e5b3b21",
      "metadata": {
        "id": "6e5b3b21"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the original model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3bc5c98",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d3bc5c98",
        "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original DistilBERT: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "12c2df98",
      "metadata": {
        "id": "12c2df98"
      },
      "source": [
        "Let's see the output of the original model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4892a905",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4892a905",
        "outputId": "68d9b65f-e2cc-4998-8047-c9091f977698"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3db0a7a1",
      "metadata": {
        "id": "3db0a7a1"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the optimized model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3e83997",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a3e83997",
        "outputId": "7a416b14-f170-4df9-d416-026f06a7d980"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized DistilBERT (no metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0d884d61",
      "metadata": {
        "id": "0d884d61"
      },
      "source": [
        "Let's see the output of the optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "75611b2e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "75611b2e",
        "outputId": "035d5c6d-fd7a-4506-af09-befcf9dd3b2d"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Speed up inference with Speedster: metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7b1950d5",
      "metadata": {
        "id": "7b1950d5"
      },
      "source": [
        "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "de5721d8",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "de5721d8",
        "outputId": "c9efff21-f963-47ff-e83d-a44615f90a10"
      },
      "outputs": [],
      "source": [
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        "    metric_drop_ths=0.1,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0fbfe6fa",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0fbfe6fa",
        "outputId": "ada293f5-9b54-4186-8e48-74b994d4b797"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original DistilBERT: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f89b7e6d",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f89b7e6d",
        "outputId": "51e497e1-a533-432d-d68e-b373f0ef69cb"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "10d17b5c",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "10d17b5c",
        "outputId": "d5dc0acd-77e7-4054-b455-19343ff37951"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized DistilBERT (metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6bf3d1fb",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6bf3d1fb",
        "outputId": "6163d8ba-254f-47d2-a468-a921622a15ba"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save the optimized model to disk with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load the model again:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cb234e5e",
      "metadata": {
        "id": "cb234e5e"
      },
      "source": [
        "Great! Was it easy? How are the results? Do you have any comments?\n",
        "Share your optimization results and thoughts with <a href=\"https://discord.gg/RbeQMu886J\" target=\"_blank\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\n",
        "\n",
        "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
        "\n",
        "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "premium",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "ef331be9",
      "metadata": {
        "id": "ef331be9"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "f260653a",
      "metadata": {
        "id": "f260653a"
      },
      "source": [
        "# Accelerate Hugging Face PyTorch GPT2 with Speedster\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8bdf3af5",
      "metadata": {
        "id": "8bdf3af5"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
        "\n",
        "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d527d63b",
      "metadata": {
        "id": "d527d63b"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cXXh1ifQ13mH",
      "metadata": {
        "id": "cXXh1ifQ13mH"
      },
      "source": [
        "# Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "73072506",
      "metadata": {
        "id": "73072506"
      },
      "source": [
        "## Model and Dataset setup"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "cf24c4c4",
      "metadata": {},
      "source": [
        "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1cf8ff74",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\"  # Change this path according to your TensorRT location\n",
        "\n",
        "if os.path.exists(tensorrt_path):\n",
        "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
        "else:\n",
        "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e4d55115",
      "metadata": {
        "id": "e4d55115"
      },
      "source": [
        "We chose GPT2 as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d633cf21",
      "metadata": {
        "colab": {
          "background_save": true
        },
        "id": "d633cf21",
        "scrolled": true
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import GPT2Tokenizer, GPT2Model\n",
        "\n",
        "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
        "model = GPT2Model.from_pretrained('gpt2', torchscript=True)\n",
        "\n",
        "# Move the model to gpu if available and set eval mode\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model.to(device).eval()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "11aa0739",
      "metadata": {
        "id": "11aa0739"
      },
      "source": [
        "Let's create an example dataset with some random sentences"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cbbfeeb2",
      "metadata": {
        "colab": {
          "background_save": true
        },
        "id": "cbbfeeb2"
      },
      "outputs": [],
      "source": [
        "import random\n",
        "\n",
        "sentences = [\n",
        "    \"Mars is the fourth planet from the Sun.\",\n",
        "    \"has a crust primarily composed of elements\",\n",
        "    \"However, it is unknown\",\n",
        "    \"can be viewed from Earth\",\n",
        "    \"It was the Romans\",\n",
        "]\n",
        "\n",
        "len_dataset = 100\n",
        "\n",
        "texts = []\n",
        "for _ in range(len_dataset):\n",
        "    n_times = random.randint(1, 30)\n",
        "    texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a09f9424",
      "metadata": {
        "colab": {
          "background_save": true
        },
        "id": "a09f9424"
      },
      "outputs": [],
      "source": [
        "encoded_inputs = [tokenizer(text, return_tensors=\"pt\") for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "17040431",
      "metadata": {
        "id": "17040431"
      },
      "source": [
        "## Speed up inference with Speedster: no metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "44ddc21d",
      "metadata": {
        "id": "44ddc21d"
      },
      "source": [
        "It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9d934f6",
      "metadata": {
        "id": "f9d934f6"
      },
      "outputs": [],
      "source": [
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "76248033",
      "metadata": {
        "id": "76248033"
      },
      "source": [
        "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "zPC_EDwEJIM0",
      "metadata": {
        "id": "zPC_EDwEJIM0"
      },
      "outputs": [],
      "source": [
        "dynamic_info = {\n",
        "    \"inputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'}\n",
        "    ],\n",
        "    \"outputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "    ] + [{0: 'batch', 2: 'num_tokens'} for i in range(24)]\n",
        "}\n",
        "\n",
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "98c6ab09",
      "metadata": {
        "id": "98c6ab09"
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "# Move inputs to gpu if available\n",
        "encoded_inputs = [tokenizer(text, return_tensors=\"pt\").to(device) for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6e5b3b21",
      "metadata": {
        "id": "6e5b3b21"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the original model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3bc5c98",
      "metadata": {
        "id": "d3bc5c98"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original GPT2: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "12c2df98",
      "metadata": {
        "id": "12c2df98"
      },
      "source": [
        "Let's see the output of the original model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4892a905",
      "metadata": {
        "id": "4892a905"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3db0a7a1",
      "metadata": {
        "id": "3db0a7a1"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the optimized model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3e83997",
      "metadata": {
        "id": "a3e83997"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized GPT2 (no metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0d884d61",
      "metadata": {
        "id": "0d884d61"
      },
      "source": [
        "Let's see the output of the optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "75611b2e",
      "metadata": {
        "id": "75611b2e"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Speed up inference with Speedster: metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7b1950d5",
      "metadata": {
        "id": "7b1950d5"
      },
      "source": [
        "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain an higher speedup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "de5721d8",
      "metadata": {
        "id": "de5721d8"
      },
      "outputs": [],
      "source": [
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        "    metric_drop_ths=0.1,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0fbfe6fa",
      "metadata": {
        "id": "0fbfe6fa"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original GPT2: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f89b7e6d",
      "metadata": {
        "id": "f89b7e6d"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "10d17b5c",
      "metadata": {
        "id": "10d17b5c"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized GPT2 (metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6bf3d1fb",
      "metadata": {
        "id": "6bf3d1fb"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load again the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cb234e5e",
      "metadata": {
        "id": "cb234e5e"
      },
      "source": [
        "Great! Was it easy? How are the results? Do you have any comments?\n",
        "Share your optimization results and thoughts with <a href=\"https://discord.gg/RbeQMu886J\" target=\"_blank\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\n",
        "\n",
        "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
        "\n",
        "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "premium",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "ef331be9",
      "metadata": {
        "id": "ef331be9"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "f260653a",
      "metadata": {
        "id": "f260653a"
      },
      "source": [
        "# Accelerate Hugging Face T5 with Speedster\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8bdf3af5",
      "metadata": {
        "id": "8bdf3af5"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
        "\n",
        "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d527d63b",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d527d63b",
        "outputId": "57626bac-e458-487f-f4fa-a459627af296"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cXXh1ifQ13mH",
      "metadata": {
        "id": "cXXh1ifQ13mH"
      },
      "source": [
        "# Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "73072506",
      "metadata": {
        "id": "73072506"
      },
      "source": [
        "## Model and Dataset setup"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "cf24c4c4",
      "metadata": {},
      "source": [
        "Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1cf8ff74",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\"  # Change this path according to your TensorRT location\n",
        "\n",
        "if os.path.exists(tensorrt_path):\n",
        "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
        "else:\n",
        "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e4d55115",
      "metadata": {
        "id": "e4d55115"
      },
      "source": [
        "We chose T5-efficient-base as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "NOgOmfdY_dav",
      "metadata": {
        "id": "NOgOmfdY_dav"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
        "import torch\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "model_name = \"google/t5-efficient-base\"\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torchscript=True).to(device)\n",
        "\n",
        "# set the model to eval mode\n",
        "_ = model.eval()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "11aa0739",
      "metadata": {
        "id": "11aa0739"
      },
      "source": [
        "Let's create an example dataset with some random sentences"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ghGcDNFtKt3X",
      "metadata": {
        "id": "ghGcDNFtKt3X"
      },
      "outputs": [],
      "source": [
        "texts = [\n",
        "    \"\"\"BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.\"\"\",\n",
        "    \"\"\"GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was trained to guess the next word in sentences.\"\"\",\n",
        "    \"\"\"With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.\"\"\",\n",
        "    \"\"\"LayoutLMv3 is a pre-trained multimodal Transformer for Document AI with unified text and image masking. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model. For example, LayoutLMv3 can be fine-tuned for both text-centric tasks, including form understanding, receipt understanding, and document visual question answering, and image-centric tasks such as document image classification and document layout analysis.\"\"\",\n",
        "    \"\"\"XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking.\"\"\"\n",
        "]\n",
        "texts = texts*20"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a09f9424",
      "metadata": {
        "id": "a09f9424"
      },
      "outputs": [],
      "source": [
        "encoded_inputs = [tokenizer(text, padding=\"longest\", return_tensors=\"pt\") for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "17040431",
      "metadata": {
        "id": "17040431"
      },
      "source": [
        "## Speed up inference with Speedster: no metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "44ddc21d",
      "metadata": {
        "id": "44ddc21d"
      },
      "source": [
        "It's now time of improving a bit the performance in terms of speed. Let's use `Speedster`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9d934f6",
      "metadata": {
        "id": "f9d934f6"
      },
      "outputs": [],
      "source": [
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "76248033",
      "metadata": {
        "id": "76248033"
      },
      "source": [
        "Usually Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. But for this type of models, we need to do some extra steps because current version of speedster don't have direct support for Encoder-Decoder Models. These type of models has both Encoder and Decoder. For Example, BERT models are Encoder models and GPT models are Decoder models, but T5 has both."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "i7sgUWjePN9i",
      "metadata": {
        "id": "i7sgUWjePN9i"
      },
      "outputs": [],
      "source": [
        "# First, we get the encoder and decoder from the model\n",
        "encoder = model.get_encoder()\n",
        "decoder = model.get_decoder()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "O7xaI1drQOQ0",
      "metadata": {
        "id": "O7xaI1drQOQ0"
      },
      "source": [
        "Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "nTUPdDchQLc1",
      "metadata": {
        "id": "nTUPdDchQLc1"
      },
      "outputs": [],
      "source": [
        "dynamic_info = {\n",
        "    \"inputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'}\n",
        "    ],\n",
        "    \"outputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "    ]\n",
        "}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "zPC_EDwEJIM0",
      "metadata": {
        "id": "zPC_EDwEJIM0"
      },
      "outputs": [],
      "source": [
        "# Create the optimized encoder model seperately\n",
        "optimized_encoder_model = optimize_model(\n",
        "    model=encoder,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7Oa68a87Qjre",
      "metadata": {
        "id": "7Oa68a87Qjre"
      },
      "outputs": [],
      "source": [
        "# Create the optimized decoder model seperately\n",
        "optimized_decoder_model = optimize_model(\n",
        "    model=decoder,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "98c6ab09",
      "metadata": {
        "id": "98c6ab09"
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "# Move inputs to gpu if available\n",
        "encoded_inputs = [tokenizer(text, padding=\"longest\", return_tensors=\"pt\").to(device) for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6e5b3b21",
      "metadata": {
        "id": "6e5b3b21"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the original model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3bc5c98",
      "metadata": {
        "id": "d3bc5c98"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        encoder_out = encoder(**encoded_input)\n",
        "        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        encoder_out = encoder(**encoded_input)\n",
        "        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original T5: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "GU0SwykMTVAj",
      "metadata": {
        "id": "GU0SwykMTVAj"
      },
      "source": [
        "In real-world use cases, we pass the decoder output to `model.lm_head` to get the actual prediction, but here we are testing the performance improvements, so I am skipping that step."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "12c2df98",
      "metadata": {
        "id": "12c2df98"
      },
      "source": [
        "Let's see the output of the original model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4892a905",
      "metadata": {
        "id": "4892a905"
      },
      "outputs": [],
      "source": [
        "encoder(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "gx0naPVuSVrm",
      "metadata": {
        "id": "gx0naPVuSVrm"
      },
      "outputs": [],
      "source": [
        "decoder(**encoded_input,encoder_hidden_states=encoder_out[0])"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3db0a7a1",
      "metadata": {
        "id": "3db0a7a1"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the optimized model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3e83997",
      "metadata": {
        "id": "a3e83997"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        encoder_out = optimized_encoder_model(**encoded_input)\n",
        "        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        encoder_out = optimized_encoder_model(**encoded_input)\n",
        "        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized T5 (no metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0d884d61",
      "metadata": {
        "id": "0d884d61"
      },
      "source": [
        "Let's see the output of the optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "75611b2e",
      "metadata": {
        "id": "75611b2e"
      },
      "outputs": [],
      "source": [
        "optimized_encoder_model(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cpieoDfwS-V7",
      "metadata": {
        "id": "cpieoDfwS-V7"
      },
      "outputs": [],
      "source": [
        "optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Speed up inference with Speedster: metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7b1950d5",
      "metadata": {
        "id": "7b1950d5"
      },
      "source": [
        "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "VwOLWZSZUM89",
      "metadata": {
        "id": "VwOLWZSZUM89"
      },
      "outputs": [],
      "source": [
        "optimized_encoder_model = optimize_model(\n",
        "    model=encoder,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        "    metric_drop_ths=0.1,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "FIKn4V3dUIZB",
      "metadata": {
        "id": "FIKn4V3dUIZB"
      },
      "outputs": [],
      "source": [
        "optimized_decoder_model = optimize_model(\n",
        "    model=decoder,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tensor_rt\", \"tvm\"],  # TensorRT does not work for this model\n",
        "    dynamic_info=dynamic_info,\n",
        "    metric_drop_ths=0.1,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0fbfe6fa",
      "metadata": {
        "id": "0fbfe6fa"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        encoder_out = encoder(**encoded_input)\n",
        "        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        encoder_out = encoder(**encoded_input)\n",
        "        decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original T5: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f89b7e6d",
      "metadata": {
        "id": "f89b7e6d"
      },
      "outputs": [],
      "source": [
        "encoder(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "oI1zjIBSUoIU",
      "metadata": {
        "id": "oI1zjIBSUoIU"
      },
      "outputs": [],
      "source": [
        "decoder(**encoded_input,encoder_hidden_states=encoder_out[0])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "10d17b5c",
      "metadata": {
        "id": "10d17b5c"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    with torch.no_grad():\n",
        "        encoder_out = optimized_encoder_model(**encoded_input)\n",
        "        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    with torch.no_grad():\n",
        "        encoder_out = optimized_encoder_model(**encoded_input)\n",
        "        decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized T5 (metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "4XFMC1S6zXTU",
      "metadata": {
        "id": "4XFMC1S6zXTU"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "OXHVr3EAzbT5",
      "metadata": {
        "id": "OXHVr3EAzbT5"
      },
      "source": [
        "We can easily save the optimized models to disk with the following lines:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3M565P-zzaFB",
      "metadata": {
        "id": "3M565P-zzaFB"
      },
      "outputs": [],
      "source": [
        "save_model(optimized_encoder_model, \"encoder_model_save_path\")\n",
        "save_model(optimized_decoder_model, \"decoder_model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ee8CS_Evzg1j",
      "metadata": {
        "id": "ee8CS_Evzg1j"
      },
      "source": [
        "We can then load the models again:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "zOQ88SY_zg-A",
      "metadata": {
        "id": "zOQ88SY_zg-A"
      },
      "outputs": [],
      "source": [
        "optimized_encoder_model = load_model(\"encoder_model_save_path\")\n",
        "optimized_decoder_model = load_model(\"decoder_model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cb234e5e",
      "metadata": {
        "id": "cb234e5e"
      },
      "source": [
        "Great! Was it easy? How are the results? Do you have any comments?\n",
        "Share your optimization results and thoughts with <a href=\"https://discord.gg/RbeQMu886J\" target=\"_blank\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\n",
        "\n",
        "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
        "\n",
        "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "provenance": []
    },
    "gpuClass": "premium",
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ef331be9",
      "metadata": {
        "id": "ef331be9"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "f260653a",
      "metadata": {
        "id": "f260653a"
      },
      "source": [
        "# Accelerate Hugging Face TensorFlow BERT with Speedster\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8bdf3af5",
      "metadata": {
        "id": "8bdf3af5"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
        "\n",
        "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d527d63b",
      "metadata": {
        "id": "d527d63b"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cXXh1ifQ13mH",
      "metadata": {
        "id": "cXXh1ifQ13mH"
      },
      "source": [
        "# Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "73072506",
      "metadata": {
        "id": "73072506"
      },
      "source": [
        "## Model and Dataset setup"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "cf24c4c4",
      "metadata": {},
      "source": [
        "Add the TensorRT installation path to the `LD_LIBRARY_PATH` environment variable, in order to activate the TensorrtExecutionProvider for ONNX Runtime."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1cf8ff74",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\"  # Change this path according to your TensorRT location\n",
        "\n",
        "if os.path.exists(tensorrt_path):\n",
        "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
        "else:\n",
        "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e4d55115",
      "metadata": {
        "id": "e4d55115"
      },
      "source": [
        "We chose BERT as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d633cf21",
      "metadata": {
        "id": "d633cf21",
        "scrolled": true
      },
      "outputs": [],
      "source": [
        "from transformers import BertTokenizer, TFBertModel\n",
        "\n",
        "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
        "model = TFBertModel.from_pretrained('bert-base-uncased')"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "11aa0739",
      "metadata": {
        "id": "11aa0739"
      },
      "source": [
        "Let's create an example dataset with some random sentences"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cbbfeeb2",
      "metadata": {
        "id": "cbbfeeb2"
      },
      "outputs": [],
      "source": [
        "import random\n",
        "\n",
        "sentences = [\n",
        "    \"Mars is the fourth planet from the Sun.\",\n",
        "    \"has a crust primarily composed of elements\",\n",
        "    \"However, it is unknown\",\n",
        "    \"can be viewed from Earth\",\n",
        "    \"It was the Romans\",\n",
        "]\n",
        "\n",
        "len_dataset = 100\n",
        "\n",
        "texts = []\n",
        "for _ in range(len_dataset):\n",
        "    n_times = random.randint(1, 30)\n",
        "    texts.append(\" \".join(random.choice(sentences) for _ in range(n_times)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a09f9424",
      "metadata": {
        "id": "a09f9424"
      },
      "outputs": [],
      "source": [
        "encoded_inputs = [tokenizer(text, return_tensors=\"tf\") for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "17040431",
      "metadata": {
        "id": "17040431"
      },
      "source": [
        "## Speed up inference with Speedster: no metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "44ddc21d",
      "metadata": {
        "id": "44ddc21d"
      },
      "source": [
        "It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9d934f6",
      "metadata": {
        "id": "f9d934f6"
      },
      "outputs": [],
      "source": [
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "76248033",
      "metadata": {
        "id": "76248033"
      },
      "source": [
        "Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some example input data and the optimization time mode. Optionally, a `dynamic_info` dictionary can also be provided, in order to support inputs with dynamic shapes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "zPC_EDwEJIM0",
      "metadata": {
        "id": "zPC_EDwEJIM0"
      },
      "outputs": [],
      "source": [
        "dynamic_info = {\n",
        "    \"inputs\": [\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "        {0: 'batch', 1: 'num_tokens'},\n",
        "    ],\n",
        "    \"outputs\": [\n",
        "        {0: \"batch\", 1: \"num_tokens\"},\n",
        "        {0: \"batch\"}\n",
        "    ]\n",
        "}\n",
        "\n",
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    ignore_compilers=[\"tvm\"],\n",
        "    dynamic_info=dynamic_info,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "98c6ab09",
      "metadata": {
        "id": "98c6ab09"
      },
      "outputs": [],
      "source": [
        "import time\n",
        "\n",
        "encoded_inputs = [tokenizer(text, return_tensors=\"tf\") for text in texts]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6e5b3b21",
      "metadata": {
        "id": "6e5b3b21"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the original model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3bc5c98",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d3bc5c98",
        "outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original BERT: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "12c2df98",
      "metadata": {
        "id": "12c2df98"
      },
      "source": [
        "Let's see the output of the original model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4892a905",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4892a905",
        "outputId": "68d9b65f-e2cc-4998-8047-c9091f977698"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3db0a7a1",
      "metadata": {
        "id": "3db0a7a1"
      },
      "source": [
        "Let's run the prediction 100 times to calculate the average response time of the optimized model."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3e83997",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a3e83997",
        "outputId": "7a416b14-f170-4df9-d416-026f06a7d980"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized BERT (no metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0d884d61",
      "metadata": {
        "id": "0d884d61"
      },
      "source": [
        "Let's see the output of the optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "75611b2e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "75611b2e",
        "outputId": "035d5c6d-fd7a-4506-af09-befcf9dd3b2d"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Speed up inference with Speedster: metric drop"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7b1950d5",
      "metadata": {
        "id": "7b1950d5"
      },
      "source": [
        "This time we will use the `metric_drop_ths` argument to accept a little drop in terms of precision, in order to enable quantization and obtain a higher speedup."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "de5721d8",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "de5721d8",
        "outputId": "c9efff21-f963-47ff-e83d-a44615f90a10"
      },
      "outputs": [],
      "source": [
        "optimized_model = optimize_model(\n",
        "    model=model,\n",
        "    input_data=encoded_inputs,\n",
        "    optimization_time=\"constrained\",\n",
        "    dynamic_info=dynamic_info,\n",
        "    ignore_compilers=[\"tvm\"],\n",
        "    metric_drop_ths=0.1,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0fbfe6fa",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0fbfe6fa",
        "outputId": "ada293f5-9b54-4186-8e48-74b994d4b797"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    final_out = model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    final_out = model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "original_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for original BERT: {original_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f89b7e6d",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f89b7e6d",
        "outputId": "51e497e1-a533-432d-d68e-b373f0ef69cb"
      },
      "outputs": [],
      "source": [
        "model(**encoded_input)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "10d17b5c",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "10d17b5c",
        "outputId": "d5dc0acd-77e7-4054-b455-19343ff37951"
      },
      "outputs": [],
      "source": [
        "times = []\n",
        "\n",
        "# Warmup for 30 iterations\n",
        "for encoded_input in encoded_inputs[:30]:\n",
        "    final_out = optimized_model(**encoded_input)\n",
        "\n",
        "# Benchmark\n",
        "for encoded_input in encoded_inputs:\n",
        "    st = time.time()\n",
        "    final_out = optimized_model(**encoded_input)\n",
        "    times.append(time.time()-st)\n",
        "optimized_model_time = sum(times)/len(times)*1000\n",
        "print(f\"Average response time for optimized BERT (metric drop): {optimized_model_time} ms\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6bf3d1fb",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6bf3d1fb",
        "outputId": "6163d8ba-254f-47d2-a468-a921622a15ba"
      },
      "outputs": [],
      "source": [
        "optimized_model(**encoded_input)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save the optimized model to disk with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load the model again:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cb234e5e",
      "metadata": {
        "id": "cb234e5e"
      },
      "source": [
        "Great! Was it easy? How are the results? Do you have any comments?\n",
        "Share your optimization results and thoughts with <a href=\"https://discord.gg/RbeQMu886J\" target=\"_blank\"> our community on Discord</a>, where we chat about Speedster and AI acceleration.\n",
        "\n",
        "Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
        "\n",
        "If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "premium",
    "kernelspec": {
      "display_name": "Python 3.9.15 ('nebullvm_new')",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:52:10) \n[Clang 14.0.6 ]"
    },
    "vscode": {
      "interpreter": {
        "hash": "4fbc45cd27f7d363500c2e8640d9fdb717da4e1d8e4954a68e42b53d65ee27af"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/huggingface/Readme.md
================================================
# **Hugging Face Optimization**

This section contains all the available notebooks that show how to leverage Speedster to optimize Hugging Face models.

Hugging Face hosts models that can use either PyTorch or TensorFlow as a backend. Both backends are supported by Speedster.

## Notebooks:
| Notebook                                                                                                                                                                                                 | Description                                                                                      |                                                                                                                                                                                                                                                                |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate Hugging Face PyTorch GPT2](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb)             | Show how to optimize with Speedster the GPT2 model from Hugging Face with PyTorch backend.       | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb)       |
| [Accelerate Hugging Face PyTorch BERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb)             | Show how to optimize with Speedster the BERT model from Hugging Face with PyTorch backend.       | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb)       |
| [Accelerate Hugging Face PyTorch DistilBERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb) | Show how to optimize with Speedster the DistilBERT model from Hugging Face with PyTorch backend. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb) |
| [Accelerate Hugging Face TensorFlow BERT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb)       | Show how to optimize with Speedster the BERT model from Hugging Face with TensorFlow backend.    | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb)    |
| [Accelerate Hugging Face PyTorch T5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb)                 | Show how to optimize with Speedster the T5 model from Hugging Face with PyTorch backend.         | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb)         |

## Hugging Face API quick view:

``` python
from speedster import optimize_model
from transformers import AlbertModel, AlbertTokenizer

# Load Albert as example
model = AlbertModel.from_pretrained("albert-base-v1")
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")

# Case 1: dictionary input format
text = "This is an example text for the huggingface model."
input_dict = tokenizer(text, return_tensors="pt")  # set return_tensors="tf" or "np" for tensorflow models

# Run Speedster optimization
optimized_model = optimize_model(
  model, input_data=[input_dict]
)

## Warmup the model
## This step is necessary before the latency computation of the 
## optimized model in order to get reliable results.
# for _ in range(10):
#   optimized_model(**input_dict)

# Try the optimized model
res = optimized_model(**input_dict)

# # Case 2: strings input format
# input_data = [
#     "This is a test.",
#     "Hi my name is John.",
#     "The cat is on the table.",
# ]
# tokenizer_args = dict(
#     return_tensors="pt",  # set return_tensors="tf" or "np" for tensorflow models
#     padding="longest",
#     truncation=True,
# )
# 
# # Run Speedster optimization
# optimized_model = optimize_model(
#   model, input_data=input_data, tokenizer=tokenizer, tokenizer_args=tokenizer_args
# )
```


================================================
FILE: optimization/speedster/notebooks/huggingface/faster_transformer_bert.py
================================================
# %%
import logging
import random
import time

import speedster
import torch
from speedster import optimize_model

# %%
from nebullvm.operations.optimizations.compilers.faster_transformer.bert import (  # noqa: E501
    detect_and_swap_bert_model,
)

# %%
from nebullvm.operations.optimizations.compilers.utils import (
    get_faster_transformer_repo_path,
)
from transformers import BertTokenizer
from transformers.models.bert.modeling_bert import (
    BertForSequenceClassification as HFBertForSequenceClassification,
)

# %%
print(speedster.__file__)

# Path to `libth_transformer.so` under `build/lib` of the FasterTransformer
# repository path reported by nebullvm.
_ft_repo_path = get_faster_transformer_repo_path()
lib_path = str(_ft_repo_path / "build" / "lib" / "libth_transformer.so")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# %%
# https://huggingface.co/bert-base-cased-finetuned-mrpc


# %%
def prepare_examples(tokenizer, len_dataset=1000):
    """Build random benchmark inputs plus fixed-shape dummy tensors.

    Each example text is made by joining between 1 and 30 sentences picked
    at random from a small fixed pool. Every text is tokenized on its own
    (one example per call, truncation enabled) and moved to the module-level
    ``device``.

    The dummy tensors are sized from the module-level
    ``per_gpu_eval_batch_size`` and ``max_seq_length``, and the mask dtype
    follows the module-level ``data_type``; those globals must be defined
    before this function is called.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True)``; the
            returned object must support ``.to(device)``.
        len_dataset: Number of random texts to generate. ``0`` yields an
            empty ``encoded_inputs`` list.

    Returns:
        Tuple ``(encoded_inputs, fake_input_id, fake_mask, fake_type_id)``:
        the list of tokenized examples, an all-ones integer id tensor, an
        all-ones mask (float32, or half/bfloat16 when ``data_type`` is
        ``"fp16"``/``"bf16"``) and a copy of the id tensor used as token
        type ids.
    """
    sentences = [
        "Mars is the fourth planet from the Sun.",
        "has a crust primarily composed of elements",
        "However, it is unknown",
        "can be viewed from Earth",
        "It was the Romans",
    ]
    texts = []
    for _ in range(len_dataset):
        n_times = random.randint(1, 30)
        texts.append(
            " ".join(random.choice(sentences) for _ in range(n_times))
        )
    encoded_inputs = [
        tokenizer(text, return_tensors="pt", truncation=True).to(device)
        for text in texts
    ]

    # Dummy inputs with a fixed (batch, sequence) shape. Allocate them
    # already filled with ones instead of filling uninitialized memory.
    dummy_shape = (per_gpu_eval_batch_size, max_seq_length)
    fake_input_id = torch.ones(dummy_shape, dtype=torch.long).to(device)
    fake_mask = torch.ones(dummy_shape).to(device)
    fake_type_id = fake_input_id.clone().detach()
    if data_type == "fp16":
        fake_mask = fake_mask.half()
    elif data_type == "bf16":
        fake_mask = fake_mask.bfloat16()
    return encoded_inputs, fake_input_id, fake_mask, fake_type_id


# %%
logger = logging.getLogger(__name__)

# Experiment flags; they are not referenced anywhere else in this script.
use_ths = False
use_torchscript = False
remove_padding = False

# Precision used for the swapped model and the dummy attention mask.
data_type = "fp16"  # "fp32", "fp16", "bf16"

# Shape of the dummy tensors: (batch size, sequence length).
per_gpu_eval_batch_size = 1
max_seq_length = 128

# Checkpoint name used for both the model and the tokenizer.
model_name_or_path = "bert-base-cased-finetuned-mrpc"

model = HFBertForSequenceClassification.from_pretrained(
    model_name_or_path, torchscript=True
)
model.eval()
model.to(device)
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
(
    encoded_inputs,
    fake_input_id,
    fake_mask,
    fake_type_id,
) = prepare_examples(tokenizer)


def optimize_no_trace(model, data_type="fp16"):
    """Swap the BERT model for its FasterTransformer version, without tracing.

    Args:
        model: Model handed to ``detect_and_swap_bert_model``. A comment
            later in this script states that the operation modifies the
            model in place.
        data_type: Target precision, one of ``"fp32"``, ``"fp16"`` or
            ``"bf16"``. It is forwarded to the swap and also decides the
            cast applied to the returned model.

    Returns:
        The swapped model, cast to half/bfloat16 when requested and moved
        to the module-level ``device``.
    """
    # Forward the requested precision instead of a hard-coded "fp16", so the
    # swapped layers and the cast below always agree.
    model = detect_and_swap_bert_model(
        model, data_type=data_type, lib_path=lib_path, remove_padding=False
    )
    if data_type == "fp16":
        logger.info("Use fp16")
        model.half()
    elif data_type == "bf16":
        logger.info("Use bf16")
        model.bfloat16()
    return model.to(device)


def optimize_with_trace(
    model, data_type, per_gpu_eval_batch_size, max_seq_length
):
    """Swap the model via ``optimize_no_trace`` and trace it with TorchScript.

    Args:
        model: Model passed on to ``optimize_no_trace``.
        data_type: ``"fp32"``, ``"fp16"`` or ``"bf16"``; also selects the
            dtype of the dummy attention mask used for tracing.
        per_gpu_eval_batch_size: First dimension of the dummy inputs.
        max_seq_length: Second dimension of the dummy inputs.

    Returns:
        The result of ``torch.jit.trace`` on the swapped model.
    """
    swapped = optimize_no_trace(model, data_type)
    logger.info("Use TorchScript mode")

    # All-ones dummy inputs, passed to the tracer as (ids, mask, type ids).
    shape = (per_gpu_eval_batch_size, max_seq_length)
    dummy_ids = torch.ones(shape, dtype=torch.long).to(device)
    dummy_type_ids = dummy_ids.clone().detach()
    dummy_mask = torch.ones(shape).to(device)
    if data_type == "fp16":
        dummy_mask = dummy_mask.half()
    elif data_type == "bf16":
        dummy_mask = dummy_mask.bfloat16()

    swapped.eval()
    with torch.no_grad():
        traced = torch.jit.trace(
            swapped, (dummy_ids, dummy_mask, dummy_type_ids)
        )
    return traced


def benchmark(model, model_desc="original BERT", inputs=None):
    """Print the average per-example latency of ``model`` in milliseconds.

    The model is first called on up to 30 examples as a warmup, then timed
    on every example, one call per example, under ``torch.no_grad()``.

    Args:
        model: Callable invoked as ``model(**example)`` for each example.
        model_desc: Label used in the printed report.
        inputs: Sequence of keyword-argument mappings to feed the model.
            Defaults to the module-level ``encoded_inputs``.

    Raises:
        ValueError: If there are no input examples to time.
    """
    if inputs is None:
        inputs = encoded_inputs
    if not inputs:
        raise ValueError("benchmark() needs at least one input example")

    # CUDA kernels run asynchronously with respect to the host, so without
    # an explicit synchronization the timer can stop before the work ends.
    use_cuda = torch.cuda.is_available()

    with torch.no_grad():
        # Warmup on the first 30 examples (or fewer if not available).
        for encoded_input in inputs[:30]:
            _ = model(**encoded_input)
        if use_cuda:
            torch.cuda.synchronize()

        # Benchmark
        times = []
        for encoded_input in inputs:
            st = time.perf_counter()
            _ = model(**encoded_input)
            if use_cuda:
                torch.cuda.synchronize()
            times.append(time.perf_counter() - st)

    avg_time_ms = sum(times) / len(times) * 1000
    print(f"Average response time for {model_desc}: {avg_time_ms} ms")


# Show which keyword arguments each tokenized example carries.
print(f"{encoded_inputs[0].keys()}")


# Baseline: the unmodified Hugging Face model, benchmarked twice.
benchmark(model, "BERT")
benchmark(model, "BERT")
# Recorded output of the two baseline runs:
# Average response time for BERT: 4.741025467636064 ms
# Average response time for BERT: 4.686204055091366 ms
data_type = "fp16"  # "fp32", "fp16", "bf16"
per_gpu_eval_batch_size = 1
max_seq_length = 128
# FasterTransformer swap without TorchScript tracing.
faster_model = optimize_no_trace(model, data_type)
benchmark(faster_model, "faster BERT (no metric drop)")
# Recorded output:
# Average response time for faster BERT (no metric drop): 1.5583459960762411 ms # noqa: E501

# FasterTransformer swap followed by TorchScript tracing.
fastest_model = optimize_with_trace(
    model, data_type, per_gpu_eval_batch_size, max_seq_length
)

benchmark(fastest_model, "fastest BERT (no metric drop)")
# Recorded output:
# Average response time for fastest BERT (no metric drop): 1.4657320715487003 ms # noqa: E501


# The operations above modify `model` in place,
# so we need to reload a fresh one to test speedster.
model = HFBertForSequenceClassification.from_pretrained(
    model_name_or_path, torchscript=True
)

model.eval().to(device)
# Axis names handed to speedster for the three inputs and the output:
# axis 0 is named "batch" and axis 1 "num_tokens".
dynamic_info = {
    "inputs": [
        {0: "batch", 1: "num_tokens"},
        {0: "batch", 1: "num_tokens"},
        {0: "batch", 1: "num_tokens"},
    ],
    "outputs": [{0: "batch", 1: "num_tokens"}],
}
# Speedster run with the default metric-drop threshold.
speedster_optimized_model = optimize_model(
    model=model,
    input_data=encoded_inputs,
    optimization_time="constrained",
    # force it to use fastertransformer
    ignore_compilers=["tensor_rt", "tvm", "onnxruntime", "torchscript"],
    dynamic_info=dynamic_info,
)


benchmark(
    speedster_optimized_model, "speedster optimized BERT (no metric drop)"
)
benchmark(
    speedster_optimized_model, "speedster optimized BERT (no metric drop)"
)
# Recorded output of the two runs above:
# Average response time for speedster optimized BERT (no metric drop): 14.040142675396055 ms # noqa: E501
# Average response time for speedster optimized BERT (no metric drop): 3.4986357542220503 ms # noqa: E501
# Speedster run that tolerates a metric drop of up to 0.1.
speedster_optimized_model_fp16 = optimize_model(
    model=model,
    input_data=encoded_inputs,
    optimization_time="constrained",
    # force it to use fastertransformer
    ignore_compilers=["tensor_rt", "tvm", "onnxruntime", "torchscript"],
    dynamic_info=dynamic_info,
    metric_drop_ths=0.1,
)


benchmark(
    speedster_optimized_model_fp16, "speedster optimized BERT (metric drop)"
)
benchmark(
    speedster_optimized_model_fp16, "speedster optimized BERT (metric drop)"
)
# NOTE(review): the two figures below carry the "(no metric drop)" label and
# are identical to the ones recorded for the previous run, so they look
# copy-pasted rather than measured for the metric-drop model. Re-run to
# record the real numbers.
# Average response time for speedster optimized BERT (no metric drop): 14.040142675396055 ms # noqa: E501
# Average response time for speedster optimized BERT (no metric drop): 3.4986357542220503 ms # noqa: E501


================================================
FILE: optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p5b0PzpW1xJq"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Accelerate ONNX ResNet50 with Speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T9xuwZEHzN2K"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n",
        "\n",
        "We will\n",
        "1. Install Speedster and the deep learning compilers used by the library.\n",
        "2. Speed up an ONNX ResNet50 without any loss of accuracy.\n",
        "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5Yc5KYo_YzE8"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HbFy2Aykz2Qo"
      },
      "source": [
        "# Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks onnx --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N5RXHoZl0p3p"
      },
      "source": [
        "# Optimization example with ONNX"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-Ju-VcRH01Mw"
      },
      "source": [
        "In the following example we will try to optimize a standard ONNX resnet50.\n",
        "\n",
        "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://nebuly.gitbook.io/nebuly/nebullvm/get-started).\n",
        "\n",
        "Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "skxEuemn171G"
      },
      "source": [
        "## Scenario 1 - No accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wVRLXrDi2VaG"
      },
      "source": [
        "First of all we download the pretrained ONNX resnet50 model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6I5GDvWbZ-LJ",
        "outputId": "6ac09b39-9c6e-4d38-dfb6-35069938f9c1"
      },
      "outputs": [],
      "source": [
        "!wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vrkOvGfkaXk7"
      },
      "source": [
        "Then we optimize it with Speedster simple API"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2RbgGruAeQcf"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from speedster import optimize_model, save_model, load_model\n",
        "\n",
        "# Load a resnet as example\n",
        "model = \"resnet50-v1-12.onnx\"\n",
        "\n",
        "# Provide an input data for the model    \n",
        "input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\"\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = np.random.randn(1, 3, 224, 224).astype(np.float32)\n",
        "res_optimized = optimized_model(x)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i2IKNc2jbax8"
      },
      "source": [
        "We can print the type of the optimized model to see which compiler was faster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dFhqAhr0bcbZ",
        "outputId": "aa0b2fe9-2fa0-405b-8e44-3ebbf70f0e69"
      },
      "outputs": [],
      "source": [
        "optimized_model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_UuiqkEfcPy4"
      },
      "source": [
        "In our case, the optimized model type was NumpyONNXInferenceLearner, so this means that onnxruntime was the faster compiler.\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "E4759DQJcc15"
      },
      "source": [
        "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ktQaNfGqceOD"
      },
      "source": [
        "First of all, let's compute and print the original model result\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gUMlNAZrcj5-",
        "outputId": "3670f41f-b2db-4b55-dbf7-c9b0a0146c9d"
      },
      "outputs": [],
      "source": [
        "import onnx\n",
        "import onnxruntime as ort\n",
        "from typing import Dict, List\n",
        "\n",
        "\n",
        "def get_input_names(onnx_model: str):\n",
        "    model = onnx.load(onnx_model)\n",
        "    input_all = [node.name for node in model.graph.input]\n",
        "    return input_all\n",
        "\n",
        "\n",
        "def get_output_names(onnx_model: str):\n",
        "    model = onnx.load(onnx_model)\n",
        "    output_all = [node.name for node in model.graph.output]\n",
        "    return output_all\n",
        "\n",
        "\n",
        "def run_onnx_model(\n",
        "    onnx_model: str, session: ort.InferenceSession, input_tensors: List[np.ndarray], inputs: Dict, output_names: str\n",
        ") -> List[np.ndarray]:\n",
        "    \n",
        "    res = session.run(\n",
        "        output_names=output_names, input_feed=inputs\n",
        "    )\n",
        "    return list(res)\n",
        "\n",
        "\n",
        "session = ort.InferenceSession(\n",
        "    model,\n",
        "    providers=[\"CUDAExecutionProvider\", \"CPUExecutionProvider\"] # Change to [\"CPUExecutionProvider\"] if run on cpu\n",
        ")\n",
        "\n",
        "inputs = {\n",
        "    name: array\n",
        "    for name, array in zip(get_input_names(model), [x])\n",
        "}\n",
        "\n",
        "res_original = run_onnx_model(model, session, [x], inputs, get_output_names(model))\n",
        "res_original"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iU3dPwSTfWr_"
      },
      "source": [
        "Then, let's print the optimized model result that we computed before"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S1EKoJ75fVAh",
        "outputId": "73e7b127-e7d3-44a9-bd78-65961bd051df"
      },
      "outputs": [],
      "source": [
        "res_optimized"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Lj4crPMmf_LX"
      },
      "source": [
        "Then, let's compute the average latency of the baseline model:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rGNKr_ShgBbu"
      },
      "outputs": [],
      "source": [
        "import time"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "I2G4OzhCgG_D",
        "outputId": "a23eb4ea-fa0f-4221-a177-20876e452b53"
      },
      "outputs": [],
      "source": [
        "num_iters = 100\n",
        "\n",
        "# Warmup\n",
        "for i in range(10):\n",
        "  run_onnx_model(model, session, [x], inputs, get_output_names(model))\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  run_onnx_model(model, session, [x], inputs, get_output_names(model))\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "f-jmRjJvgW5V"
      },
      "source": [
        "Finally we compute the average latency for the optimized model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "51c3uaMcgaR-",
        "outputId": "1319a7bc-df1d-4f19-9426-3940ab4a7c5e"
      },
      "outputs": [],
      "source": [
        "# Warmup\n",
        "for i in range(10):\n",
        "  optimized_model(x)\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  optimized_model(x)\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tBeRKNTI3iyK"
      },
      "source": [
        "## Scenario 2 - Accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "w3wutIzfAMe_"
      },
      "source": [
        "In this scenario, we set a max threshold for the accuracy drop to 2%"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fO1nGqpj3p7z"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from speedster import optimize_model\n",
        "\n",
        "# Load a resnet as example\n",
        "model = \"resnet50-v1-12.onnx\"\n",
        "\n",
        "# Provide an input data for the model\n",
        "# Note that in this case we should provide the model at least 100 data samples\n",
        "input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for i in range(100)]\n",
        "\n",
        "# Run nebullvm optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\", metric = \"accuracy\", metric_drop_ths = 0.02\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = np.random.randn(1, 3, 224, 224).astype(np.float32)\n",
        "res_optimized = optimized_model(x)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4UFtwZbEiLv3"
      },
      "source": [
        "Here we compute the average latency for the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qFKHaHM6-GKm",
        "outputId": "73b95996-4d1f-4aa7-a96d-a40070bf36bd"
      },
      "outputs": [],
      "source": [
        "num_iters = 100\n",
        "\n",
        "# Warmup\n",
        "for i in range(10):\n",
        "  run_onnx_model(model, session, [x], inputs, get_output_names(model))\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  run_onnx_model(model, session, [x], inputs, get_output_names(model))\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "J8g0aJRJiXA5"
      },
      "source": [
        "Here we compute the average latency for the optimized model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_IbAW0KA4Fm5",
        "outputId": "67f44401-9568-4f38-802a-d81e3139af5a"
      },
      "outputs": [],
      "source": [
        "# Warmup\n",
        "for i in range(10):\n",
        "  optimized_model(x)\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  optimized_model(x)\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load again the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/onnx/Readme.md
================================================
# **ONNX Optimization**

This section contains all the available notebooks that show how to leverage Speedster to optimize ONNX models.

## Notebooks:
| Notebook                                                                                                                                                      | Description                                                          |                                                                                                                                                                                                                                       |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate ONNX Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model in ONNX format. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) |


## ONNX API quick view:

```python
import numpy as np
from speedster import optimize_model

# Load a resnet as example
# Model was downloaded from here: 
# https://github.com/onnx/models/tree/main/vision/classification/resnet
model = "resnet50-v1-12.onnx"

# Provide an input data for the model    
input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))]

# Run Speedster optimization
optimized_model = optimize_model(
  model, input_data=input_data, optimization_time="unconstrained"
)

# Try the optimized model
x = np.random.randn(1, 3, 224, 224).astype(np.float32)

## Warmup the model
## This step is necessary before the latency computation of the 
## optimized model in order to get reliable results.
# for _ in range(10):
#   optimized_model(x)

res_optimized = optimized_model(x)
```


================================================
FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p5b0PzpW1xJq"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Accelerate PyTorch ResNet50 with Speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T9xuwZEHzN2K"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n",
        "\n",
        "We will\n",
        "1. Install Speedster and the deep learning compilers used by the library.\n",
        "2. Speed up a PyTorch ResNet50 without any loss of accuracy.\n",
        "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_0ZRCXCR9693",
        "outputId": "19096862-5c5c-4f9f-b2ad-3ce084ccf213"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HbFy2Aykz2Qo"
      },
      "source": [
        "### Installation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZPJHVZ74d8r2"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b0CLgQqxyrQi"
      },
      "source": [
        "Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\n",
        "\n",
        "The installation of the compilers may take a few minutes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GvK9mZSjeLU5"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N5RXHoZl0p3p"
      },
      "source": [
        "## Optimization example with Pytorch"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-Ju-VcRH01Mw"
      },
      "source": [
        "In the following example we will try to optimize a standard resnet50 loaded directly from torchvision.\n",
        "\n",
        "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n",
        "\n",
        "Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then further accelerate it under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "skxEuemn171G"
      },
      "source": [
        "### Scenario 1 - No accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wVRLXrDi2VaG"
      },
      "source": [
        "First we load the model and optimize it using the Speedster API:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2RbgGruAeQcf"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import torchvision.models as models\n",
        "from speedster import optimize_model, save_model, load_model\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "# Load a resnet as example\n",
        "model = models.resnet50().to(device)\n",
        "\n",
        "# Provide an input data for the model    \n",
        "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\"\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = torch.randn(1, 3, 256, 256).to(device)\n",
        "model.eval()\n",
        "res_optimized = optimized_model(x)\n",
        "res_original = model(x)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JMiuufyu2gD3"
      },
      "source": [
        "We can print the type of the optimized model to see which compiler was faster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ifuLyQsM9697",
        "outputId": "c1534e0d-e5bb-4d44-91e9-652593751d52"
      },
      "outputs": [],
      "source": [
        "optimized_model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4WxcxrUC9698"
      },
      "source": [
        "In our case, the optimized model type was PytorchTensorRTInferenceLearner, so this means that Pytorch-TensorRT was the faster compiler."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iwHKfT349698"
      },
      "source": [
        "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-IMJpfcb9698"
      },
      "source": [
        "First of all, let's print the results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uI8Kd1Z49698",
        "outputId": "832d3053-d6c8-4cc2-9b48-a59dfaa45d33"
      },
      "outputs": [],
      "source": [
        "res_original"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0I_zSpv29698",
        "outputId": "a0ba566d-6730-4954-8dd0-eb47b549cbf1"
      },
      "outputs": [],
      "source": [
        "res_optimized"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hBEtrYOd9699"
      },
      "source": [
        "Then, let's compare the performances:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GqxiCAbpfcwV"
      },
      "outputs": [],
      "source": [
        "from nebullvm.tools.benchmark import benchmark"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_0b0Bzwq-czD"
      },
      "outputs": [],
      "source": [
        "# Set the model to eval mode and move it to the available device\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "model.eval()\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UqxzStjD2v0r"
      },
      "source": [
        "Here we compute the average throughput for the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dkt67_Orwlv4",
        "outputId": "fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7"
      },
      "outputs": [],
      "source": [
        "benchmark(model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AgOv-GqQ3KIC"
      },
      "source": [
        "Here we compute the average throughput for the optimized model:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4PodpaDVfwzT",
        "outputId": "27a42560-93a2-4c19-e68d-360093fe914c"
      },
      "outputs": [],
      "source": [
        "benchmark(optimized_model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tBeRKNTI3iyK"
      },
      "source": [
        "## Scenario 2 - Accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "w3wutIzfAMe_"
      },
      "source": [
        "In this scenario, we set a max threshold for the accuracy drop to 2%"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fO1nGqpj3p7z"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import torchvision.models as models\n",
        "from speedster import optimize_model\n",
        "\n",
        "# Load a resnet as example\n",
        "model = models.resnet50().to(device)\n",
        "\n",
        "# Provide 100 random input data for the model  \n",
        "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\", metric=\"accuracy\", metric_drop_ths=0.02\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = torch.randn(1, 3, 256, 256).to(device)\n",
        "res = optimized_model(x)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qFKHaHM6-GKm"
      },
      "outputs": [],
      "source": [
        "# Set the model to eval mode and move it to the available device\n",
        "\n",
        "model.eval()\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yfW9kmHX-pGi"
      },
      "source": [
        "Here we compute the average throughput for the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0MMrL3959hli",
        "outputId": "2e8d27ec-a9f3-4f70-8c75-a0df974f2653"
      },
      "outputs": [],
      "source": [
        "benchmark(model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i3GqasOM-u8f"
      },
      "source": [
        "Here we compute the average throughput for the optimized model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_IbAW0KA4Fm5",
        "outputId": "48d83c89-5687-42aa-a3b8-6989bcb66aa6"
      },
      "outputs": [],
      "source": [
        "benchmark(optimized_model, input_data)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load again the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p5b0PzpW1xJq"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Accelerate PyTorch VisionTransformer with Speedster"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "T9xuwZEHzN2K"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n",
        "\n",
        "We will\n",
        "1. Install Speedster and the deep learning compilers used by the library.\n",
        "2. Speed up a PyTorch ViT without any loss of accuracy.\n",
        "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_0ZRCXCR9693",
        "outputId": "19096862-5c5c-4f9f-b2ad-3ce084ccf213"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HbFy2Aykz2Qo"
      },
      "source": [
        "### Installation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZPJHVZ74d8r2"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b0CLgQqxyrQi"
      },
      "source": [
        "Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\n",
        "\n",
        "The installation of the compilers may take a few minutes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GvK9mZSjeLU5"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N5RXHoZl0p3p"
      },
      "source": [
        "## Optimization example with Pytorch"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "-Ju-VcRH01Mw"
      },
      "source": [
        "In the following example we will try to optimize a ViT model loaded directly from the vit_pytorch library.\n",
        "\n",
        "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n",
        "\n",
        "Let's first test the optimization without any loss in accuracy (metric_drop_ths=0, which is the default value), and then attempt to further accelerate it while constraining the loss of accuracy to a maximum of 2% (metric = 'accuracy', metric_drop_ths = 0.02)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "skxEuemn171G"
      },
      "source": [
        "### Scenario 1 - No accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wVRLXrDi2VaG"
      },
      "source": [
        "First we load the model and optimize it using the Speedster API:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2RbgGruAeQcf"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from vit_pytorch import ViT\n",
        "from speedster import optimize_model, save_model, load_model\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "# Load a ViT model\n",
        "model = ViT(\n",
        "    image_size = 256,\n",
        "    patch_size = 32,\n",
        "    num_classes = 1000,\n",
        "    dim = 1024,\n",
        "    depth = 6,\n",
        "    heads = 16,\n",
        "    mlp_dim = 2048,\n",
        "    dropout = 0.1,\n",
        "    emb_dropout = 0.1\n",
        ").to(device)\n",
        "\n",
        "# Provide an input data for the model    \n",
        "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\"\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = torch.randn(1, 3, 256, 256).to(device)\n",
        "model.to(device).eval()\n",
        "res_optimized = optimized_model(x)\n",
        "res_original = model(x)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JMiuufyu2gD3"
      },
      "source": [
        "We can print the type of the optimized model to see which compiler was faster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ifuLyQsM9697",
        "outputId": "c1534e0d-e5bb-4d44-91e9-652593751d52"
      },
      "outputs": [],
      "source": [
        "optimized_model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "4WxcxrUC9698"
      },
      "source": [
        "In our case, the optimized model type was TorchScriptInferenceLearner, so this means that TorchScriptCompiler was the faster compiler."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iwHKfT349698"
      },
      "source": [
        "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-IMJpfcb9698"
      },
      "source": [
        "First of all, let's print the results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uI8Kd1Z49698",
        "outputId": "832d3053-d6c8-4cc2-9b48-a59dfaa45d33"
      },
      "outputs": [],
      "source": [
        "res_original"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0I_zSpv29698",
        "outputId": "a0ba566d-6730-4954-8dd0-eb47b549cbf1"
      },
      "outputs": [],
      "source": [
        "res_optimized"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hBEtrYOd9699"
      },
      "source": [
        "Then, let's compare the performances:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "GqxiCAbpfcwV"
      },
      "outputs": [],
      "source": [
        "from nebullvm.tools.benchmark import benchmark"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_0b0Bzwq-czD"
      },
      "outputs": [],
      "source": [
        "# Set the model to eval mode and move it to the available device\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "model.eval()\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UqxzStjD2v0r"
      },
      "source": [
        "Here we compute the average throughput for the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dkt67_Orwlv4",
        "outputId": "fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7"
      },
      "outputs": [],
      "source": [
        "benchmark(model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AgOv-GqQ3KIC"
      },
      "source": [
        "Here we compute the average throughput for the optimized model:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4PodpaDVfwzT",
        "outputId": "27a42560-93a2-4c19-e68d-360093fe914c"
      },
      "outputs": [],
      "source": [
        "benchmark(optimized_model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tBeRKNTI3iyK"
      },
      "source": [
        "## Scenario 2 - Accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "w3wutIzfAMe_"
      },
      "source": [
        "In this scenario, we set a max threshold for the accuracy drop to 2%"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fO1nGqpj3p7z"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import torchvision.models as models\n",
        "from speedster import optimize_model\n",
        "\n",
        "# Load a ViT model\n",
        "model = ViT(\n",
        "    image_size = 256,\n",
        "    patch_size = 32,\n",
        "    num_classes = 1000,\n",
        "    dim = 1024,\n",
        "    depth = 6,\n",
        "    heads = 16,\n",
        "    mlp_dim = 2048,\n",
        "    dropout = 0.1,\n",
        "    emb_dropout = 0.1\n",
        ").to(device)\n",
        "\n",
        "# Provide 100 random input data for the model  \n",
        "input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\", metric=\"accuracy\", metric_drop_ths=0.02\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = torch.randn(1, 3, 256, 256).to(device)\n",
        "res = optimized_model(x)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qFKHaHM6-GKm"
      },
      "outputs": [],
      "source": [
        "# Set the model to eval mode and move it to the available device\n",
        "\n",
        "model.eval()\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yfW9kmHX-pGi"
      },
      "source": [
        "Here we compute the average throughput for the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0MMrL3959hli",
        "outputId": "2e8d27ec-a9f3-4f70-8c75-a0df974f2653"
      },
      "outputs": [],
      "source": [
        "benchmark(model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i3GqasOM-u8f"
      },
      "source": [
        "Here we compute the average throughput for the optimized model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_IbAW0KA4Fm5",
        "outputId": "48d83c89-5687-42aa-a3b8-6989bcb66aa6"
      },
      "outputs": [],
      "source": [
        "benchmark(optimized_model, input_data)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load again the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.16"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "3c977e4a",
      "metadata": {
        "id": "3c977e4a"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "6240f0ea",
      "metadata": {
        "id": "6240f0ea"
      },
      "source": [
        "# Accelerate PyTorch YOLOv5 with Speedster\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6cfcd562",
      "metadata": {
        "id": "6cfcd562"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
        "\n",
        "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "38171e92",
      "metadata": {},
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "okgu97ThVwnH",
      "metadata": {
        "id": "okgu97ThVwnH"
      },
      "source": [
        "### Install Speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e62f5afa",
      "metadata": {
        "id": "e62f5afa"
      },
      "source": [
        "### Install and test YOLO"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "b38d727d",
      "metadata": {
        "id": "b38d727d"
      },
      "source": [
        "Let's install YOLO."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f48f6a35",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f48f6a35",
        "outputId": "5b06307a-9196-4e5e-a542-1254d6c94ce2",
        "scrolled": true
      },
      "outputs": [],
      "source": [
        "! pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "92f49833",
      "metadata": {
        "id": "92f49833"
      },
      "source": [
        "We start by downloading the model from the Torch hub."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2dc46f67",
      "metadata": {
        "id": "2dc46f67"
      },
      "outputs": [],
      "source": [
        "import copy\n",
        "import time\n",
        "import types\n",
        "\n",
        "import torch"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ead6637d",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 248,
          "referenced_widgets": [
            "7f41159d22fe4ce7b8e7789a92478242",
            "2ecf6a6cfad64af698a88479ba95005b",
            "e7a2646ac0cd4afba67823799147ce13",
            "fd77306783b84b489b90d072a44a27d8",
            "94a4bc5454074b5c900186a60a950d19",
            "682cafb37aa34c75961d61d2665a50b7",
            "5e71284dc02f4346b217732643c90b86",
            "881f619ee75547a49c6d48fd3140721c",
            "56a1b99b282a4a63a64f48347963a5ab",
            "a59557bb103e4a3b96062c60d539db35",
            "65786546f69b420b9ec8451c97338f30"
          ]
        },
        "id": "ead6637d",
        "outputId": "8d44d380-535d-446c-fcb0-bb55ba9e9f84"
      },
      "outputs": [],
      "source": [
        "# Load Model\n",
        "model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "KcteQ5tsWy1v",
      "metadata": {
        "id": "KcteQ5tsWy1v"
      },
      "outputs": [],
      "source": [
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "37d07ab0",
      "metadata": {
        "id": "37d07ab0"
      },
      "source": [
        "## Optimization with Speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "332cbc38",
      "metadata": {
        "id": "332cbc38"
      },
      "source": [
        "Now we are ready for optimizing the body of YOLOv5 using the `Speedster` function `optimize_model`."
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d1fc4d01",
      "metadata": {
        "id": "d1fc4d01"
      },
      "source": [
        "Speedster was built to be very easy to use. To optimize a model, you only need to specify the model, the batch size and input size for each input tensor, and a directory in which to save the optimized model. In the example, we chose the same directory in which this notebook runs.\n",
        "\n",
        "With the latest API, there are two ways to use Speedster:\n",
        "\n",
        "- Option A: Accelerate the model up to ~10 times without losing in performances (accuracy/precision/etc.)\n",
        "- Option B: Accelerate the model up to ~30 times with a pre-defined maximum loss in performances\n",
        "    \n",
        "To learn more about how to use Speedster, check out the <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#-speedster\" target=\"_blank\" style=\"text-decoration: none;\"> readme on GitHub </a>."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ceb07403",
      "metadata": {
        "id": "ceb07403"
      },
      "source": [
        "In this example, we provide the code to run option B."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "74f9f650",
      "metadata": {
        "id": "74f9f650"
      },
      "outputs": [],
      "source": [
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b729ccce",
      "metadata": {},
      "source": [
        "Let's load some example data to feed the optimize_model function"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "20c15b09",
      "metadata": {
        "id": "20c15b09"
      },
      "outputs": [],
      "source": [
        "from PIL import Image\n",
        "import requests\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8fcf6332",
      "metadata": {
        "id": "8fcf6332"
      },
      "outputs": [],
      "source": [
        "img_name = \"zidane.png\"\n",
        "imgs = ['https://ultralytics.com/images/zidane.jpg']  # batch of images\n",
        "Image.open(requests.get(imgs[0], stream=True).raw).save(img_name)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "178a31f1",
      "metadata": {
        "id": "178a31f1"
      },
      "outputs": [],
      "source": [
        "def read_and_crop(im, original_model, img_size):\n",
        "    p  =  next(original_model.parameters())\n",
        "    im = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im)\n",
        "    max_y, max_x = im.size\n",
        "    ptr_x = np.random.choice(max_x-img_size[0])\n",
        "    ptr_y = np.random.choice(max_y-img_size[1])\n",
        "    im = np.array(im.crop((ptr_y, ptr_x, ptr_y + img_size[1], ptr_x + img_size[0])))\n",
        "    x = np.expand_dims(im, axis=0)\n",
        "    x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2)))  # stack and BHWC to BCHW\n",
        "    x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32\n",
        "    return x"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "51757959",
      "metadata": {
        "id": "51757959"
      },
      "outputs": [],
      "source": [
        "input_data = [((read_and_crop(img_name, model, (640, 640)),), None) for _ in range(100)]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c01adfeb",
      "metadata": {
        "id": "c01adfeb"
      },
      "outputs": [],
      "source": [
        "model_optimized = optimize_model(\n",
        "    model=model,\n",
        "    input_data=input_data,\n",
        "    optimization_time=\"unconstrained\",\n",
        "    metric_drop_ths=0.05\n",
        ")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "495c1642",
      "metadata": {},
      "source": [
        "Let's compare the original model performance with the optimized one:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "82e39d5b",
      "metadata": {
        "id": "82e39d5b"
      },
      "outputs": [],
      "source": [
        "from nebullvm.tools.benchmark import benchmark\n",
        "\n",
        "original_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)\n",
        "print(\"Benchmark original model\")\n",
        "benchmark(original_model, input_data)\n",
        "\n",
        "print(\"Benchmark optimized model\")\n",
        "benchmark(model_optimized, input_data)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "f0d6d006",
      "metadata": {},
      "source": [
        "Let's ensure that the output of the original model is the same as the optimized model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "66c0dbab",
      "metadata": {},
      "outputs": [],
      "source": [
        "input_tensor = torch.randn(1, 3, 640, 640).to(device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bfe573fd",
      "metadata": {},
      "outputs": [],
      "source": [
        "model(input_tensor)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "89654058",
      "metadata": {},
      "outputs": [],
      "source": [
        "model_optimized(input_tensor)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b72bdf54",
      "metadata": {},
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ada71f91",
      "metadata": {},
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "99b3a9d0",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(model_optimized, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "6308ddd7",
      "metadata": {},
      "source": [
        "We can then load again the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9946f6b",
      "metadata": {},
      "outputs": [],
      "source": [
        "model_optimized = load_model(\"model_save_path\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "d50807de",
      "metadata": {
        "id": "d50807de"
      },
      "source": [
        "What an amazing result, right?!? Stay tuned for more cool content from the Nebuly team :) "
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
      }
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "2ecf6a6cfad64af698a88479ba95005b": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_682cafb37aa34c75961d61d2665a50b7",
            "placeholder": "​",
            "style": "IPY_MODEL_5e71284dc02f4346b217732643c90b86",
            "value": "100%"
          }
        },
        "56a1b99b282a4a63a64f48347963a5ab": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "5e71284dc02f4346b217732643c90b86": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "65786546f69b420b9ec8451c97338f30": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "682cafb37aa34c75961d61d2665a50b7": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "7f41159d22fe4ce7b8e7789a92478242": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_2ecf6a6cfad64af698a88479ba95005b",
              "IPY_MODEL_e7a2646ac0cd4afba67823799147ce13",
              "IPY_MODEL_fd77306783b84b489b90d072a44a27d8"
            ],
            "layout": "IPY_MODEL_94a4bc5454074b5c900186a60a950d19"
          }
        },
        "881f619ee75547a49c6d48fd3140721c": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "94a4bc5454074b5c900186a60a950d19": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a59557bb103e4a3b96062c60d539db35": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "e7a2646ac0cd4afba67823799147ce13": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_881f619ee75547a49c6d48fd3140721c",
            "max": 14808437,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_56a1b99b282a4a63a64f48347963a5ab",
            "value": 14808437
          }
        },
        "fd77306783b84b489b90d072a44a27d8": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_a59557bb103e4a3b96062c60d539db35",
            "placeholder": "​",
            "style": "IPY_MODEL_65786546f69b420b9ec8451c97338f30",
            "value": " 14.1M/14.1M [00:00&lt;00:00, 24.5MB/s]"
          }
        }
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb
================================================
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![New Release: Accelerate YOLOv8](assets/yolov8.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Accelerate Ultralytics YOLOv8 with Speedster"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6cfcd562",
   "metadata": {
    "id": "6cfcd562"
   },
   "source": [
    "Hi and welcome 👋\n",
    "\n",
    "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster module from the open-source library nebullvm.\n",
    "\n",
    "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n",
    "\n",
    "Let's jump to the code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%env CUDA_VISIBLE_DEVICES=0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Install Speedster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install speedster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Install Ultralytics YOLOv8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install ultralytics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load YOLOv8s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from ultralytics import YOLO\n",
    "\n",
    "yolo = YOLO('yolov8s.pt')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's load a test dummy data and see the original output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = torch.randn(1, 3, 640, 640)\n",
    "yolo.model(test_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The original YOLOv8 model return as output a tuple where the first element is a tensor and the second is a list of tensors. Speedster currently supports only models that return only tensors, so we need to create a wrapper to overcome this issue:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class YOLOWrapper(torch.nn.Module):\n",
    "    def __init__(self, yolo_model):\n",
    "        super().__init__()\n",
    "        self.model = yolo_model.model\n",
    "    \n",
    "    def forward(self, x, *args, **kwargs):\n",
    "        res = self.model(x)\n",
    "        return res[0], *tuple(res[1])\n",
    "        \n",
    "model_wrapper = YOLOWrapper(yolo)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## YOLOv8s Optimization with GPU"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can now optimize the model using speedster:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from speedster import optimize_model\n",
    "\n",
    "# Provide some input data for the model    \n",
    "input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\n",
    "\n",
    "# Run Speedster optimization\n",
    "optimized_model = optimize_model(\n",
    "  model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can finally restore the original output format by wrapping the optimized model in a new class:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class OptimizedYOLO(torch.nn.Module):\n",
    "    def __init__(self, optimized_model):\n",
    "        super().__init__()\n",
    "        self.model = optimized_model\n",
    "    \n",
    "    def forward(self, x, *args, **kwargs):\n",
    "        res = self.model(x)\n",
    "        return res[0], list(res[1:])\n",
    "    \n",
    "optimized_wrapper = OptimizedYOLO(optimized_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimized_wrapper(test_data.cuda())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## YOLOv8s Optimization with CPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from speedster import optimize_model, save_model, load_model\n",
    "from ultralytics import YOLO\n",
    "\n",
    "yolo = YOLO('yolov8s.pt')\n",
    "model_wrapper = YOLOWrapper(yolo)\n",
    "\n",
    "# Provide some input data for the model    \n",
    "input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\n",
    "\n",
    "# Run Speedster optimization\n",
    "optimized_model = optimize_model(\n",
    "  model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True, device=\"cpu\"\n",
    ")\n",
    "\n",
    "optimized_wrapper = OptimizedYOLO(optimized_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimized_wrapper(test_data)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b72bdf54",
   "metadata": {},
   "source": [
    "## Save and reload the optimized model"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ada71f91",
   "metadata": {},
   "source": [
    "We can easily save to disk the optimized model with the following line:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99b3a9d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_model(optimized_model, \"model_save_path\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6308ddd7",
   "metadata": {},
   "source": [
    "We can then load again the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9946f6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "optimized_model = load_model(\"model_save_path\")\n",
    "optimized_wrapper = OptimizedYOLO(optimized_model)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d50807de",
   "metadata": {
    "id": "d50807de"
   },
   "source": [
    "What an amazing result, right?!? Stay tuned for more cool content from the Nebuly team :) "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b77ff2ac",
   "metadata": {
    "id": "b77ff2ac"
   },
   "source": [
    "<center> \n",
    "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
    "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
    "</center>\n",
    "\n",
    "<center> \n",
    "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
    "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
    "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
    "</center>"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]"
  },
  "vscode": {
   "interpreter": {
    "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wQS9kNoyjsKe"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Accelerate Fast AI ResNet34 with Speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hBObeC3SmRwl"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using Speedster, an app built on the open-source library nebullvm."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "87jOeOOtktQy"
      },
      "source": [
        "### Fine-tune a fast.ai model\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XlVUVGOAlS6O"
      },
      "source": [
        "For the tutorial, we will use a fast.ai notebook for beginners in which we will classify whether the input image contains a cat (True label) or a dog (False label). Let's jump to the code.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9cFt-FEvlNkG"
      },
      "outputs": [],
      "source": [
        "from fastai.vision.all import *"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GqdMEBPZlmpu",
        "outputId": "18d8a166-9b5d-4c91-cbc7-c8591bd5c0d2"
      },
      "outputs": [],
      "source": [
        "path = untar_data(URLs.PETS)\n",
        "files = get_image_files(path/\"images\")\n",
        "\n",
        "def label_func(f): return f[0].isupper()\n",
        "\n",
        "dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(224), num_workers=0)\n",
        "dls.show_batch()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VrmI4VeZlhJG"
      },
      "source": [
        "After downloading a sample of images of dogs and cats, we fine-tune the fast.ai model.\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MJ8q9xxBlv1x",
        "outputId": "8169f902-3dd0-449c-c293-91fb7ab94003"
      },
      "outputs": [],
      "source": [
        "learn = cnn_learner(dls, resnet34, metrics=error_rate)\n",
        "learn.fine_tune(1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RBzr8_47lxsW",
        "outputId": "b87781d6-2826-4cc6-9fd3-57da5cdcbbd4"
      },
      "outputs": [],
      "source": [
        "valid_loss, error = learn.validate()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WSWq0il6l0eC"
      },
      "source": [
        "Now that we have fine-tuned the model, let's calculate the time required to run a prediction as an average over 100 tests.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "o_iMOqI_l6-Y"
      },
      "outputs": [],
      "source": [
        "import time"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "JNZXAgIYl883"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "times = []\n",
        "for _ in range(100):\n",
        "    st = time.time()\n",
        "    preds = learn.predict(files[0])\n",
        "    times.append((time.time()-st)*1000)\n",
        "fastai_vanilla_time = sum(times)/len(times)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "N9IDkfyDmADn",
        "outputId": "0113620d-4c77-4a9f-ae1e-e64b0cb32293"
      },
      "outputs": [],
      "source": [
        "print(f\"Average prediction time: {fastai_vanilla_time} ms,\\nPrediction: {preds}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hlwl87jRmBy2"
      },
      "outputs": [],
      "source": [
        "#learn.save(\".\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bes-NoZnmhyy"
      },
      "source": [
        "### Install nebullvm"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48aljCHu14-H",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QFQh3BVr1-GO",
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8a7a86b3",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cffbfa32",
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Data preparation"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zVfy0VBooG_J"
      },
      "source": [
        "Now we prepare the dataset so that it can be processed by Speedster."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RuUavpyooIBT"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "\n",
        "xs, ys = [], []\n",
        "for i, (x, y) in enumerate(dls.train):\n",
        "    if i >=100:\n",
        "        break\n",
        "    xs.append(x)\n",
        "    ys.append(y)\n",
        "xs = torch.cat(xs, dim=0)\n",
        "ys = torch.cat(ys, dim=0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kkVzQVmgoMQh"
      },
      "outputs": [],
      "source": [
        "dl_nebullvm = [((x.unsqueeze(dim=0),), y.unsqueeze(0)) for x, y in zip(xs, ys)]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_Eb_AAeqoOUS"
      },
      "outputs": [],
      "source": [
        "original_model = learn.model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0siBvWcsnv49"
      },
      "source": [
        "### Unconstrained without accuracy loss (thus constrained)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ToxCH47qstn9"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import torchvision.models as models\n",
        "from speedster import optimize_model, save_model, load_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "njoWqCSzvzpr"
      },
      "outputs": [],
      "source": [
        "# Load a resnet as example\n",
        "model = original_model\n",
        "\n",
        "# Provide an input data for the model    \n",
        "input_data = dl_nebullvm\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\",\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "# x = torch.randn(1, 3, 224, 224)\n",
        "# res = optimized_model(x)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GGRbJL6Xq6Ns"
      },
      "outputs": [],
      "source": [
        "optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "h75V23FSs2MZ"
      },
      "outputs": [],
      "source": [
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Set the model to eval mode and move it to the available device\n",
        "model.eval()\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "R_QrrT0oq1i_"
      },
      "outputs": [],
      "source": [
        "res_optimized = optimized_model(x)\n",
        "res_optimized"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xtjV8pDYxIIl"
      },
      "outputs": [],
      "source": [
        "from nebullvm.tools.benchmark import benchmark\n",
        "\n",
        "benchmark(model, input_data)\n",
        "benchmark(optimized_model, input_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lWJCMGGJxaG5"
      },
      "source": [
        "### Unconstrained with 2% accuracy loss"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "g9Huil4-xeX5"
      },
      "outputs": [],
      "source": [
        "# Load a resnet as example\n",
        "model = original_model\n",
        "\n",
        "# Provide an input data for the model    \n",
        "input_data = dl_nebullvm\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\", metric_drop_ths=0.02, metric=\"accuracy\"\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "# x = torch.randn(1, 3, 224, 224)\n",
        "# res = optimized_model(x)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cLxoOzxe4clI"
      },
      "outputs": [],
      "source": [
        "# Set the model to eval mode and move it to the available device\n",
        "model.eval()\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "c3QvxwUD4clI"
      },
      "outputs": [],
      "source": [
        "optimized_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dRLd4QMJ4clI"
      },
      "outputs": [],
      "source": [
        "benchmark(model, input_data)\n",
        "benchmark(optimized_model, input_data)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "ceb60d8c",
      "metadata": {
        "id": "ceb60d8c"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "d9eda1a0",
      "metadata": {},
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "62b6fcbf",
      "metadata": {},
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "3c968d51",
      "metadata": {},
      "source": [
        "We can then load again the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1340c49",
      "metadata": {},
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/pytorch/Readme.md
================================================
# **PyTorch Optimization**

This section contains all the available notebooks that show how to leverage Speedster to optimize PyTorch models.

## Notebooks:
| Notebook                                                                                                                                                                   | Description                                                                   |                                                                                                                                                                                                                                             |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate Torchvision Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from Torchvision. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) |
| [Accelerate Fast AI Resnet34](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb)     | Show how to optimize with Speedster a Resnet34 model loaded from Fast AI.     | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb) |
| [Accelerate PyTorch ViT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb)               | Show how to optimize with Speedster a PyTorch ViT model.                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb)      |
| [Accelerate Ultralytics YOLOv5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb)     | Show how to optimize with Speedster a YOLOv5 model from Ultralytics.          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb)   |
| [Accelerate Ultralytics YOLOv8](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb)     | Show how to optimize with Speedster a YOLOv8 model from Ultralytics.          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb)   |


## PyTorch API quick view:

``` python
import torch
import torchvision.models as models
from speedster import optimize_model

# Load a resnet as example
model = models.resnet50()

# Provide an input data for the model    
input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]

# Run Speedster optimization
optimized_model = optimize_model(
  model, input_data=input_data, optimization_time="unconstrained"
)

# Try the optimized model
x = torch.randn(1, 3, 256, 256)

## Warmup the model
## This step is necessary before the latency computation of the 
## optimized model in order to get reliable results.
# for _ in range(10):
#   optimized_model(x)

res = optimized_model(x)
```


================================================
FILE: optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p5b0PzpW1xJq"
      },
      "source": [
        "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-KdJPm7M05Jc"
      },
      "source": [
        "# Accelerate Tensorflow ResNet50 with Speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T9xuwZEHzN2K"
      },
      "source": [
        "Hi and welcome 👋\n",
        "\n",
        "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n",
        "\n",
        "We will\n",
        "1. Install Speedster and the deep learning compilers used by the library.\n",
        "2. Speed up a Tensorflow ResNet50 without any loss of accuracy.\n",
        "3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n",
        "\n",
        "Let's jump to the code."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KIeIvBPVLQuq"
      },
      "outputs": [],
      "source": [
        "%env CUDA_VISIBLE_DEVICES=0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HbFy2Aykz2Qo"
      },
      "source": [
        "### Installation"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "48aljCHu14-H"
      },
      "source": [
        "Install Speedster:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QFQh3BVr1-GO"
      },
      "outputs": [],
      "source": [
        "!pip install speedster"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8a7a86b3"
      },
      "source": [
        "Install deep learning compilers:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cffbfa32"
      },
      "outputs": [],
      "source": [
        "!python -m nebullvm.installers.auto_installer --frameworks tensorflow --compilers all"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N5RXHoZl0p3p"
      },
      "source": [
        "## Optimization example with Tensorflow"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-Ju-VcRH01Mw"
      },
      "source": [
        "In the following example we will try to optimize a standard resnet50 loaded directly from keras.\n",
        "\n",
        "Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n",
        "\n",
        "Let's first test the optimization without accuracy loss (metric_drop_ths=0, the default value), and then further accelerate the model under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "skxEuemn171G"
      },
      "source": [
        "### Scenario 1 - No accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wVRLXrDi2VaG"
      },
      "source": [
        "First we load the model and optimize it using the Speedster API:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2RbgGruAeQcf"
      },
      "outputs": [],
      "source": [
        "# If you encountered any error, run the cell again\n",
        "import tensorflow as tf\n",
        "from tensorflow.keras.applications.resnet50 import ResNet50\n",
        "from speedster import optimize_model, save_model, load_model\n",
        "\n",
        "# Load a resnet as example\n",
        "model = ResNet50()\n",
        "\n",
        "# Provide an input data for the model    \n",
        "input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\"\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = tf.random.normal([1, 224, 224, 3])\n",
        "res_original = model.predict(x)\n",
        "res_optimized = optimized_model.predict(x)[0]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NGrk6_jwRubP"
      },
      "source": [
        "We can print the type of the optimized model to see which compiler was the fastest:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cVMn6erJLQuu"
      },
      "outputs": [],
      "source": [
        "optimized_model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aT0BhdIKR7gY"
      },
      "source": [
        "In our case, the optimized model type was TensorflowNvidiaInferenceLearner, which means that TensorRT was the fastest compiler."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JMiuufyu2gD3"
      },
      "source": [
        "After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Swpr-Wi5Si9a"
      },
      "source": [
        "First of all, let's print the results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MjGtKkeZSOc7"
      },
      "outputs": [],
      "source": [
        "res_original"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dhe94Tk3SSfn"
      },
      "outputs": [],
      "source": [
        "res_optimized"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UqxzStjD2v0r"
      },
      "source": [
        "Then, let's compute the average latency of the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ELyTjg6_S4Us"
      },
      "outputs": [],
      "source": [
        "import time"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dkt67_Orwlv4"
      },
      "outputs": [],
      "source": [
        "num_iters = 100\n",
        "\n",
        "# Warmup\n",
        "for i in range(10):\n",
        "  model.predict(x)\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  model.predict(x)\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AgOv-GqQ3KIC"
      },
      "source": [
        "Finally we compute the average latency for the optimized model:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4PodpaDVfwzT"
      },
      "outputs": [],
      "source": [
        "# Warmup\n",
        "for i in range(10):\n",
        "  optimized_model.predict(x)\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  optimized_model.predict(x)\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tBeRKNTI3iyK"
      },
      "source": [
        "### Scenario 2 - Accuracy drop"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "w3wutIzfAMe_"
      },
      "source": [
        "In this scenario, we set a max threshold for the accuracy drop to 2%"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fO1nGqpj3p7z"
      },
      "outputs": [],
      "source": [
        "import tensorflow as tf\n",
        "from tensorflow.keras.applications.resnet50 import ResNet50\n",
        "from speedster import optimize_model\n",
        "\n",
        "# Load a resnet as example\n",
        "model = ResNet50()\n",
        "\n",
        "# Provide an input data for the model   \n",
        "# Note that in this case we should provide the model at least 100 data samples\n",
        "input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for i in range(100)]\n",
        "\n",
        "# Run Speedster optimization\n",
        "optimized_model = optimize_model(\n",
        "  model, input_data=input_data, optimization_time=\"unconstrained\", metric = \"accuracy\", metric_drop_ths = 0.02\n",
        ")\n",
        "\n",
        "# Try the optimized model\n",
        "x = tf.random.normal([1, 224, 224, 3])\n",
        "res_original = model.predict(x)\n",
        "res_optimized = optimized_model.predict(x)[0]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yfW9kmHX-pGi"
      },
      "source": [
        "Here we compute the average latency for the baseline model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0MMrL3959hli"
      },
      "outputs": [],
      "source": [
        "num_iters = 100\n",
        "\n",
        "# Warmup\n",
        "for i in range(10):\n",
        "  model.predict(x)\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  model.predict(x)\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i3GqasOM-u8f"
      },
      "source": [
        "Here we compute the average latency for the optimized model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_IbAW0KA4Fm5"
      },
      "outputs": [],
      "source": [
        "# Warmup\n",
        "for i in range(10):\n",
        "  optimized_model.predict(x)\n",
        "\n",
        "start = time.time()\n",
        "for i in range(num_iters):\n",
        "  optimized_model.predict(x)\n",
        "stop = time.time()\n",
        "\n",
        "print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4XFMC1S6zXTU"
      },
      "source": [
        "## Save and reload the optimized model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OXHVr3EAzbT5"
      },
      "source": [
        "We can easily save to disk the optimized model with the following line:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3M565P-zzaFB"
      },
      "outputs": [],
      "source": [
        "save_model(optimized_model, \"model_save_path\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ee8CS_Evzg1j"
      },
      "source": [
        "We can then load again the model:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zOQ88SY_zg-A"
      },
      "outputs": [],
      "source": [
        "optimized_model = load_model(\"model_save_path\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "id": "b77ff2ac",
      "metadata": {
        "id": "b77ff2ac"
      },
      "source": [
        "<center> \n",
        "    <a href=\"https://discord.com/invite/RbeQMu886J\" target=\"_blank\" style=\"text-decoration: none;\"> Join the community </a> |\n",
        "    <a href=\"https://nebuly.gitbook.io/nebuly/welcome/questions-and-contributions\" target=\"_blank\" style=\"text-decoration: none;\"> Contribute to the library </a>\n",
        "</center>\n",
        "\n",
        "<center> \n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#key-concepts\" target=\"_blank\" style=\"text-decoration: none;\"> How speedster works </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#documentation\" target=\"_blank\" style=\"text-decoration: none;\"> Documentation </a> •\n",
        "    <a href=\"https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster#quick-start\" target=\"_blank\" style=\"text-decoration: none;\"> Quick start </a> \n",
        "</center>"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3.8.10 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.10"
    },
    "vscode": {
      "interpreter": {
        "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}


================================================
FILE: optimization/speedster/notebooks/tensorflow/Readme.md
================================================
# **Tensorflow Optimization**

This section contains all the available notebooks that show how to leverage Speedster to optimize Tensorflow models.

## Notebooks:
| Notebook                                                                                                                                                                   | Description                                                             |                                                                                                                                                                                                                                                   |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate Keras Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from keras. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) |


## Tensorflow API quick view:

``` python
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from speedster import optimize_model

# Load a resnet as example
model = ResNet50()

# Provide input data for the model
input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]

# Run Speedster optimization
optimized_model = optimize_model(
  model, input_data=input_data, optimization_time="unconstrained"
)

# Try the optimized model
x = tf.random.normal([1, 224, 224, 3])
res_original = model.predict(x)

## Warmup the model
## This step is necessary before the latency computation of the 
## optimized model in order to get reliable results.
# for _ in range(10):
#   optimized_model.predict(x)

res_optimized = optimized_model.predict(x)[0]
```


================================================
FILE: optimization/speedster/requirements.txt
================================================
nebullvm>=0.10.0
tabulate>=0.8.0


================================================
FILE: optimization/speedster/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages


# Runtime dependencies of the package. The minimum versions mirror the ones
# pinned in requirements.txt so that both installation paths resolve the
# same dependency set.
REQUIREMENTS = [
    "nebullvm>=0.10.0",
    "tabulate>=0.8.0",
]

# The README that sits next to this file is used as the long description.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf8")

setup(
    name="speedster",
    version="0.4.0",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    long_description=long_description,
    include_package_data=True,
    long_description_content_type="text/markdown",
)


================================================
FILE: optimization/speedster/speedster/__init__.py
================================================
from speedster.api.functions import optimize_model  # noqa: F401
from nebullvm.operations.inference_learners.utils import (  # noqa: F401
    load_model,
    save_model,
)


================================================
FILE: optimization/speedster/speedster/api/__init__.py
================================================


================================================
FILE: optimization/speedster/speedster/api/functions.py
================================================
import logging
from typing import (
    Union,
    Iterable,
    Sequence,
    Callable,
    Dict,
    List,
    Optional,
)

from nebullvm.config import DEFAULT_METRIC_DROP_THS
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.logger import debug_mode_enabled, LoggingContext

from speedster.root_op import SpeedsterRootOp

from nebullvm.tools.utils import check_device


def optimize_model(
    model: Union[torch.nn.Module, tf.Module, str],
    input_data: Union[Iterable, Sequence],
    metric_drop_ths: float = DEFAULT_METRIC_DROP_THS,
    metric: Optional[Union[str, Callable]] = None,
    optimization_time: str = "constrained",
    dynamic_info: Optional[Dict] = None,
    config_file: Optional[str] = None,
    ignore_compilers: Optional[List[str]] = None,
    ignore_compressors: Optional[List[str]] = None,
    store_latencies: bool = False,
    device: Optional[str] = None,
    **kwargs,
):
    """Optimize the input model regardless of the framework it was used for
    implementing it. The optimized model given as output will share with the
    input one the same API, i.e. the optimized model will have the same
    interface as the original one.

    Args:
        model (Union[torch.Module, tf.Module, str]): The input model. It can be
            a torch or tensorflow model or a path to an onnx saved model.
        input_data (Iterable or Sequence): Input data to be used for
            optimizing the model. Note that if 'unconstrained' is selected as
            `optimization_time`, it would be beneficial to provide at least 100
            data samples in order to use all the techniques supported by
            Nebullvm. The data can be given either as a sequence (data can be
            accessed by "element", e.g. `data[i]`) or as an iterable (data
            needs to be accessed with a loop, e.g. `for x in data`). PyTorch,
            TensorFlow and Onnx respectively accept input tensors in
            `torch.Tensor`, `tf.Tensor` and `np.ndarray` formats. Note that
            each input sample must be a tuple containing a tuple as first
            element, the `inputs`, and the `label` as second element. The
            `inputs` need to be passed as a tuple even if a single input is
            needed by the model (in this case the `inputs` tuple will contain
            just one element).
            HuggingFace models can take as data samples both dictionaries or
            strings. Strings will then be converted into data samples using
            the HuggingFace tokenizer, which must be given as input when just
            a list of strings is provided as input_data (tokenizers can be
            passed as extra arguments of this function using the keyword
            `tokenizer`).
        metric_drop_ths (float, optional): Maximum reduction in the
            selected metric accepted. No model with a higher error will be
            accepted, i.e. all optimized models having a larger error with
            respect to the original one will be discarded, without even
            considering their possible speed-up.
            Default: `nebullvm.config.DEFAULT_METRIC_DROP_THS`.
        metric (Union[Callable, str], optional): The metric to
            be used for accepting or refusing a precision-reduction
            optimization proposal. If none is given but a `metric_drop_ths` is
            received, the `nebullvm.measure.compute_relative_difference`
            metric will be used as default one. A user-defined metric can
            be passed as a function accepting as inputs two tuples of tensors
            (produced by the baseline and the optimized model) and the related
            original labels.
            For more information see
            `nebullvm.measure.compute_relative_difference` and
            `nebullvm.measure.compute_accuracy_drop`. `metric`
            accepts as value also a string containing the metric name. At the
            current stage the supported metrics are `"numeric_precision"` and
            `"accuracy"`. Default: None.
        optimization_time (OptimizationTime, optional): The optimization time
            mode. It can be either 'constrained' or 'unconstrained'. For
            'constrained' mode just compilers and precision reduction
            techniques are used (no compression). 'Unconstrained' optimization
            allows the usage of more time-consuming techniques such as pruning
            and distillation. Note that for using many of the sophisticated
            techniques in the 'unconstrained' optimization, a small fine-tuning
            of the model will be needed. Thus we highly recommend giving as
            input_data at least 100 samples when selecting 'unconstrained'
            optimization. Default: 'constrained'.
        dynamic_info (Dict, optional): Dictionary containing info about the
            dynamic axis. It should contain as keys both "inputs" and "outputs"
            and as values two lists of dictionaries where each dictionary
            represents the dynamic axis information for an input/output tensor.
            The inner dictionary should have as key an integer, i.e. the
            dynamic axis (considering also the batch size) and as value a
            string giving a "tag" to it, e.g. "batch_size". Default: None
        config_file (str, optional): Configuration file containing the
            parameters needed for defining the CompressionStep in the pipeline.
            Default: None.
        ignore_compilers (List, optional): List containing the compilers to be
            ignored during the OptimizerStep. The compiler name should be one
            among tvm, tensor RT, openvino, onnxruntime, deepsparse, tflite,
            bladedisc, torchscript, intel_neural_compressor. Default: None.
        ignore_compressors (List, optional): List containing the compressors
            to be ignored during the CompressionStep. The compressor names
            should be taken from `nebullvm.config.COMPRESSOR_LIST`.
            Default: None.
        store_latencies (bool, optional): Parameter that allows to save the
            latency for each compiler used by nebullvm. Default: False.
        device (str, optional): Device used, can be 'cpu' or 'gpu'. If not
            set, gpu will be used if available, otherwise cpu. Default: None
        **kwargs: Extra keyword arguments (e.g. `tokenizer`) forwarded
            unchanged to `SpeedsterRootOp.execute`.

    Returns:
        InferenceLearner: Optimized version of the input model having the same
            interface, imported by its original framework. For instance a
            Pytorch model, when optimized, will return an InferenceLearner
            object that can be called exactly as a PyTorch model (either
            with `model.forward(input)` and `model(input)`), i.e. it will
            take as input and it will return `torch.Tensor`s.
    """
    root_op = SpeedsterRootOp()
    device = check_device(device)

    # The root logger is handed to LoggingContext as disabled unless the
    # debug mode is enabled.
    disable_log = not debug_mode_enabled()

    with LoggingContext(logging.getLogger(), disabled=disable_log):
        return root_op.to(device).execute(
            model=model,
            input_data=input_data,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            optimization_time=optimization_time,
            dynamic_info=dynamic_info,
            config_file=config_file,
            ignore_compilers=ignore_compilers,
            ignore_compressors=ignore_compressors,
            store_latencies=store_latencies,
            **kwargs,
        )


================================================
FILE: optimization/speedster/speedster/api/tests/__init__.py
================================================


================================================
FILE: optimization/speedster/speedster/api/tests/test_huggingface.py
================================================
from tempfile import TemporaryDirectory

from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.huggingface import (
    HuggingFaceInferenceLearner,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from transformers import AlbertModel, TFAlbertModel, AlbertTokenizer

from speedster import optimize_model, load_model


def test_torch_huggingface_ort_input_text():
    """Optimize a PyTorch ALBERT model from raw text with onnxruntime only.

    The model is optimized from a list of strings plus a tokenizer, every
    compiler except "onnxruntime" and every compressor being ignored. The
    optimized model is saved and reloaded, then its outputs are compared
    with the original ones (mean absolute difference below 1e-2).
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1")

    # Move the model to gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)
    tokenizer_options = {
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_tensors": "pt",
        "return_token_type_ids": None,  # Sets to model default
        "padding": "longest",
        "truncation": True,
    }

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        tokenizer_args=tokenizer_options,
    )

    # Round trip through disk: save the optimized model and load it back
    with TemporaryDirectory() as save_dir:
        optimized_model.save(save_dir)
        reloaded = load_model(save_dir)
        assert isinstance(reloaded, HuggingFaceInferenceLearner)

        assert isinstance(reloaded.get_size(), int)

    probe = ["this is a test input to see if the optimized model works."]
    encoded = tokenizer(probe, return_tensors="pt").to(device)
    model.to(device)
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for output_name in ("last_hidden_state", "pooler_output"):
        mean_abs_diff = torch.mean(
            abs(expected[output_name] - actual[output_name])
        )
        assert mean_abs_diff < 1e-2


def test_torch_huggingface_ort_input_tensors():
    """Optimize a PyTorch ALBERT model from tokenized inputs (onnxruntime).

    Ten copies of one tokenized sentence are used as input data together
    with dynamic axis info; every compiler except "onnxruntime" and every
    compressor is ignored. The optimized outputs must stay within a mean
    absolute difference of 1e-2 from the original ones.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1")

    # Move the model to gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sample = tokenizer("hi my name is Valerio", return_tensors="pt").to(
        device
    )

    # Three inputs sharing the same dynamic axes, two outputs
    dynamic_axes = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        dynamic_info=dynamic_axes,
    )

    probe = ["this is a test input to see if the optimized model works."]
    encoded = tokenizer(probe, return_tensors="pt").to(device)
    model.to(device)
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for output_name in ("last_hidden_state", "pooler_output"):
        mean_abs_diff = torch.mean(
            abs(expected[output_name] - actual[output_name])
        )
        assert mean_abs_diff < 1e-2


def test_torch_huggingface_torchscript_input_tensors():
    """Optimize a PyTorch ALBERT model from tokenized inputs (torchscript).

    The model is loaded with `torchscript=True` and optimized with every
    compiler except "torchscript" and every compressor ignored. The first
    two outputs, accessed by position, must stay within a mean absolute
    difference of 1e-2 from the original ones.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1", torchscript=True)

    # Move the model to gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sample = tokenizer("hi my name is Valerio", return_tensors="pt").to(
        device
    )

    # Three inputs sharing the same dynamic axes, two outputs
    dynamic_axes = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "torchscript"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        dynamic_info=dynamic_axes,
    )

    probe = ["this is a test input to see if the optimized model works."]
    encoded = tokenizer(probe, return_tensors="pt").to(device)
    model.to(device)
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for position in (0, 1):
        mean_abs_diff = torch.mean(abs(expected[position] - actual[position]))
        assert mean_abs_diff < 1e-2


def test_tensorflow_huggingface_ort_input_text_np():
    """Optimize a TensorFlow ALBERT model from raw text, numpy tokenization.

    The model is optimized from a list of strings plus a tokenizer asked to
    return "np" tensors; every compiler except "onnxruntime" and every
    compressor is ignored. The optimized outputs must stay within a maximum
    absolute difference of 1e-2 from the original ones.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]

    # Three inputs sharing the same dynamic axes, two outputs
    dynamic_axes = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)
    tokenizer_options = {
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_tensors": "np",
        "return_token_type_ids": None,  # Sets to model default
        "padding": "longest",
        "truncation": True,
    }

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        tokenizer_args=tokenizer_options,
        dynamic_info=dynamic_axes,
    )

    probe = ["this is a test input to see if the optimized model works."]
    encoded = tokenizer(probe, return_tensors="np")
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for output_name in ("last_hidden_state", "pooler_output"):
        max_abs_diff = tf.math.reduce_max(
            abs(expected[output_name] - actual[output_name])
        )
        assert max_abs_diff < 1e-2


def test_tensorflow_huggingface_ort_input_tensors_np():
    """Optimize a TensorFlow ALBERT model from numpy tokenized inputs.

    Ten copies of one sentence tokenized with `return_tensors="np"` are used
    as input data together with dynamic axis info; every compiler except
    "onnxruntime" and every compressor is ignored. The optimized outputs
    must stay within a maximum absolute difference of 1e-2 from the
    original ones.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sample = tokenizer("hi my name is Valerio", return_tensors="np")

    # Three inputs sharing the same dynamic axes, two outputs
    dynamic_axes = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        dynamic_info=dynamic_axes,
    )

    probe = ["Test to see if it works with a different output"]
    encoded = tokenizer(probe, return_tensors="np")
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for output_name in ("last_hidden_state", "pooler_output"):
        max_abs_diff = tf.math.reduce_max(
            abs(expected[output_name] - actual[output_name])
        )
        assert max_abs_diff < 1e-2


def test_tensorflow_huggingface_ort_input_text_tf():
    """Optimize a TensorFlow ALBERT model from raw text, tf tokenization.

    The model is optimized from a list of strings plus a tokenizer asked to
    return "tf" tensors; every compiler except "onnxruntime" and every
    compressor is ignored. The optimized outputs must stay within a maximum
    absolute difference of 1e-2 from the original ones.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]

    # Three inputs sharing the same dynamic axes, two outputs
    dynamic_axes = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)
    tokenizer_options = {
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_tensors": "tf",
        "return_token_type_ids": None,  # Sets to model default
        "padding": "longest",
        "truncation": True,
    }

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        tokenizer_args=tokenizer_options,
        dynamic_info=dynamic_axes,
    )

    probe = ["this is a test input to see if the optimized model works."]
    encoded = tokenizer(probe, return_tensors="tf")
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for output_name in ("last_hidden_state", "pooler_output"):
        max_abs_diff = tf.math.reduce_max(
            abs(expected[output_name] - actual[output_name])
        )
        assert max_abs_diff < 1e-2


def test_tensorflow_huggingface_ort_input_tensors_tf():
    """Optimize a TensorFlow ALBERT model from tf tokenized inputs.

    Ten copies of one sentence tokenized with `return_tensors="tf"` are used
    as input data together with dynamic axis info; every compiler except
    "onnxruntime" and every compressor is ignored. The optimized outputs
    must stay within a maximum absolute difference of 1e-2 from the
    original ones.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sample = tokenizer("hi my name is Valerio", return_tensors="tf")

    # Three inputs sharing the same dynamic axes, two outputs
    dynamic_axes = {
        "inputs": [{0: "batch", 1: "num_tokens"} for _ in range(3)],
        "outputs": [{0: "batch", 1: "num_tokens"}, {0: "batch"}],
    }
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)

    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        dynamic_info=dynamic_axes,
    )

    probe = ["Test to see if it works with a different output"]
    encoded = tokenizer(probe, return_tensors="tf")
    expected = model(**encoded)
    actual = optimized_model(**encoded)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)

    for output_name in ("last_hidden_state", "pooler_output"):
        max_abs_diff = tf.math.reduce_max(
            abs(expected[output_name] - actual[output_name])
        )
        assert max_abs_diff < 1e-2


================================================
FILE: optimization/speedster/speedster/api/tests/test_onnx.py
================================================
import cpuinfo
from tempfile import TemporaryDirectory

import numpy as np
import pytest
import torch
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.onnx import (
    NumpyONNXInferenceLearner,
)
from nebullvm.operations.inference_learners.openvino import (
    NumpyOpenVinoInferenceLearner,
)
from nebullvm.operations.inference_learners.tensor_rt import (
    NumpyONNXTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
    NumpyApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import tvm_is_available
from torchvision import models

from speedster import optimize_model, load_model
from speedster.api.tests.utils import torch_to_onnx


def test_onnx_ort():
    """Optimize an ONNX ResNet18 with ONNX Runtime, then save and reload it.

    Every compiler except ``onnxruntime`` and every compressor is ignored.
    """
    with TemporaryDirectory() as export_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, export_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped = [c for c in COMPILER_LIST if c != "onnxruntime"]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        with TemporaryDirectory() as save_dir:
            # Round-trip the optimized model through disk
            optimized_model.save(save_dir)
            loaded_model = load_model(save_dir)
            assert isinstance(loaded_model, NumpyONNXInferenceLearner)
            assert isinstance(loaded_model.get_size(), int)

            # Compare the optimized model with the original PyTorch one
            use_cuda = torch.cuda.is_available()
            device = torch.device("cuda" if use_cuda else "cpu")
            x = torch.randn(1, 3, 256, 256, requires_grad=False)
            model.to(device).eval()
            with torch.inference_mode():
                res_original = model(x.to(device))
            res_optimized = optimized_model(x.numpy())[0]

            reference = res_original.detach().cpu().numpy()
            assert abs(reference - res_optimized).max() < 1e-2


def test_onnx_ort_quant():
    """Optimize an ONNX ResNet18 with ONNX Runtime and ``metric_drop_ths=2``.

    The output tolerance is looser (``< 1``) than in the test without a
    metric drop threshold.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped = [c for c in COMPILER_LIST if c != "onnxruntime"]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped,
            ignore_compressors=list(COMPRESSOR_LIST),
            metric_drop_ths=2,
        )

        # Compare the optimized model with the original PyTorch one
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        model.to(device).eval()
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyONNXInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1


@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
def test_onnx_tensorrt():
    """Optimize an ONNX ResNet18 with only the ``tensor_rt`` compiler enabled.

    Skipped when CUDA is not available.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped = [c for c in COMPILER_LIST if c != "tensor_rt"]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # Compare the optimized model with the original PyTorch one
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()
        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyONNXTensorRTInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2


@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_onnx_openvino():
    """Optimize an ONNX ResNet18 on CPU with only ``openvino`` enabled.

    Skipped when the CPU brand string does not contain "intel".
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped = [c for c in COMPILER_LIST if c != "openvino"]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped,
            ignore_compressors=list(COMPRESSOR_LIST),
            device="cpu",
        )

        # Compare the optimized model with the original PyTorch one
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()
        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyOpenVinoInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2


@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_onnx_tvm():
    """Optimize an ONNX ResNet18 with only the ``tvm`` compiler enabled.

    Skipped when ``tvm_is_available()`` is false.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]
        skipped = [c for c in COMPILER_LIST if c != "tvm"]

        # Run nebullvm optimization in one line of code
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # Compare the optimized model with the original PyTorch one
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()
        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyApacheTVMInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2


================================================
FILE: optimization/speedster/speedster/api/tests/test_pytorch.py
================================================
import cpuinfo
from tempfile import TemporaryDirectory

import pytest
import torch
import torchvision.models as models
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.blade_disc import (
    BladeDISCInferenceLearner,
)
from nebullvm.operations.inference_learners.onnx import (
    PytorchONNXInferenceLearner,
)
from nebullvm.operations.inference_learners.openvino import (
    PytorchOpenVinoInferenceLearner,
)
from nebullvm.operations.inference_learners.tensor_rt import (
    PytorchTensorRTInferenceLearner,
    PytorchONNXTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_dynamo import (
    TorchDynamoInferenceLearner,
)
from nebullvm.operations.inference_learners.torchscript import (
    TorchScriptInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
    PytorchApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import (
    tvm_is_available,
    bladedisc_is_available,
)

from speedster import optimize_model, load_model

from nebullvm.tools.utils import check_module_version


def test_torch_ort():
    """Optimize ResNet18 with ONNX Runtime, check save/load and the outputs.

    Every compiler except ``onnxruntime`` and every compressor is ignored.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "onnxruntime"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Round-trip the optimized model through disk
    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, PytorchONNXInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

    # Compare the optimized model with the original one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchONNXInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2


def test_torch_ort_quant():
    """Optimize ResNet18 with ONNX Runtime and ``metric_drop_ths=2``.

    The output tolerance is looser (``< 2``) than in the test without a
    metric drop threshold.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "onnxruntime"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
        metric_drop_ths=2,
    )

    # Compare the optimized model with the original one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchONNXInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 2


def test_torch_torchscript():
    """Optimize ResNet18 with only the ``torchscript`` compiler enabled."""
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "torchscript"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, TorchScriptInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2


# NOTE: the trailing ``or True`` makes the skip condition always true, so this
# test is currently skipped regardless of the installed torch version.
@pytest.mark.skipif(
    not check_module_version(torch, min_version="2.0.0") or True,
    reason="Torch version is not supported",
)
def test_torch_torch_dynamo():
    """Optimize ResNet18 with only the ``torch_dynamo`` compiler enabled."""
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "torch_dynamo"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, TorchDynamoInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2


@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
@pytest.mark.skipif(
    not check_module_version(torch, max_version="1.13.1+cu117"),
    reason="Skip because torch version is not supported.",
)
def test_torch_tensorrt():
    """Optimize ResNet18 with only the ``tensor_rt`` compiler enabled.

    Skipped without CUDA or when the torch version check fails. Either of
    the two TensorRT learner types is accepted as the result.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "tensor_rt"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one on the GPU
    x = torch.randn(1, 3, 256, 256).cuda()
    model.cuda().eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    accepted_learners = (
        PytorchTensorRTInferenceLearner,
        PytorchONNXTensorRTInferenceLearner,
    )
    assert isinstance(optimized_model, accepted_learners)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2


@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_torch_openvino():
    """Optimize ResNet18 on CPU with only the ``openvino`` compiler enabled.

    Skipped when the CPU brand string does not contain "intel".
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "openvino"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
        device="cpu",
    )

    # Compare the optimized model with the original one
    x = torch.randn(1, 3, 256, 256)
    model.eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchOpenVinoInferenceLearner)
    gap = res_original.cpu() - res_optimized
    assert torch.max(abs(gap)) < 1e-2


@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_torch_tvm():
    """Optimize ResNet18 with only the ``tvm`` compiler enabled.

    Skipped when ``tvm_is_available()`` is false.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "tvm"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2


@pytest.mark.skipif(
    not bladedisc_is_available(),
    reason="Can't test bladedisc if it's not installed.",
)
def test_torch_bladedisc():
    """Optimize ResNet18 with only the ``bladedisc`` compiler enabled.

    Skipped when ``bladedisc_is_available()`` is false.
    """
    model = models.resnet18()
    input_data = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]
    skipped = [c for c in COMPILER_LIST if c != "bladedisc"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, BladeDISCInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2


================================================
FILE: optimization/speedster/speedster/api/tests/test_tensorflow.py
================================================
from tempfile import TemporaryDirectory

import cpuinfo
import pytest
import tensorflow as tf
from keras.applications import ResNet50
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.onnx import (
    TensorflowONNXInferenceLearner,
)
from nebullvm.operations.inference_learners.openvino import (
    TensorflowOpenVinoInferenceLearner,
)
from nebullvm.operations.inference_learners.tensor_rt import (
    TensorflowONNXTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.tensorflow import (
    TensorflowBackendInferenceLearner,
    TFLiteBackendInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
    TensorflowApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import tvm_is_available
from nebullvm.tools.utils import gpu_is_available

from speedster import optimize_model, load_model

# Limit tensorflow gpu memory usage
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Expose only the first GPU to TensorFlow. This does not depend on
        # the loop variable, so it is done once up front.
        tf.config.set_visible_devices(gpus[0], "GPU")
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Listing logical devices initializes the runtime, after which
        # physical devices can no longer be modified. It therefore has to
        # run after *all* GPUs are configured, not inside the loop.
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


def test_tensorflow_ort():
    """Optimize ResNet50 with ONNX Runtime, check save/load and the outputs.

    Every compiler except ``onnxruntime`` and every compressor is ignored.
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped = [c for c in COMPILER_LIST if c != "onnxruntime"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Round-trip the optimized model through disk
    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, TensorflowONNXInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

    # Compare the optimized model with the original one
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TensorflowONNXInferenceLearner)
    gap = res_original - res_optimized
    assert abs(gap).max() < 1e-2


def test_tensorflow_tf_backend():
    """Optimize ResNet50 with only the ``xla`` compiler enabled."""
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped = [c for c in COMPILER_LIST if c != "xla"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TensorflowBackendInferenceLearner)
    gap = res_original - res_optimized
    assert abs(gap).max() < 1e-2


@pytest.mark.skipif(
    gpu_is_available(),
    reason="TFLite does not support Nvidia GPUs",
)
def test_tensorflow_tflite():
    """Optimize ResNet50 with only the ``tflite`` compiler enabled.

    Runs with ``metric_drop_ths=0.1`` and is skipped when
    ``gpu_is_available()`` is true.
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped = [c for c in COMPILER_LIST if c != "tflite"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
        metric_drop_ths=0.1,
    )

    # Compare the optimized model with the original one
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TFLiteBackendInferenceLearner)
    gap = res_original - res_optimized
    assert abs(gap).max() < 1e-2


@pytest.mark.skipif(
    not gpu_is_available(),
    reason="Skip because cuda is not available.",
)
def test_tensorflow_tensorrt():
    """Optimize ResNet50 with only the ``tensor_rt`` compiler enabled.

    Skipped when ``gpu_is_available()`` is false.
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped = [c for c in COMPILER_LIST if c != "tensor_rt"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TensorflowONNXTensorRTInferenceLearner)
    gap = res_original - res_optimized
    assert abs(gap).max() < 1e-2


@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_tensorflow_openvino():
    """Optimize ResNet50 on CPU with only the ``openvino`` compiler enabled.

    Skipped when the CPU brand string does not contain "intel".
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped = [c for c in COMPILER_LIST if c != "openvino"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
        device="cpu",
    )

    # Compare the optimized model with the original one
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TensorflowOpenVinoInferenceLearner)
    gap = res_original - res_optimized
    assert abs(gap).max() < 1e-2


@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_tensorflow_tvm():
    """Optimize ResNet50 with only the ``tvm`` compiler enabled.

    Skipped when ``tvm_is_available()`` is false.
    """
    model = ResNet50()
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]
    skipped = [c for c in COMPILER_LIST if c != "tvm"]

    # Run nebullvm optimization in one line of code
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Compare the optimized model with the original one
    x = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(x)
    res_optimized = optimized_model.predict(x)[0]

    assert isinstance(optimized_model, TensorflowApacheTVMInferenceLearner)
    gap = res_original - res_optimized
    assert abs(gap).max() < 1e-2


================================================
FILE: optimization/speedster/speedster/api/tests/utils.py
================================================
import os
from pathlib import Path

from nebullvm.core.models import ModelParams, Device, DeviceType
from nebullvm.operations.conversions.pytorch import convert_torch_to_onnx
from nebullvm.tools.data import DataManager
from nebullvm.tools.utils import gpu_is_available


def torch_to_onnx(model, input_data, output_path):
    """Convert a PyTorch model to ``model.onnx`` inside ``output_path``.

    The conversion targets the GPU when ``gpu_is_available()`` is true and
    the CPU otherwise.

    Returns:
        The path of the ONNX file (``output_path`` joined with
        ``"model.onnx"``).
    """
    model_params = ModelParams(1, [], [], [])
    onnx_file = os.path.join(output_path, "model.onnx")
    if gpu_is_available():
        device = Device(DeviceType.GPU)
    else:
        device = Device(DeviceType.CPU)

    convert_torch_to_onnx(
        model,
        DataManager(input_data),
        model_params,
        Path(onnx_file),
        device,
    )
    return onnx_file


================================================
FILE: optimization/speedster/speedster/root_op.py
================================================
import json
import pickle
import sys
from typing import (
    Any,
    Union,
    Iterable,
    Sequence,
    Dict,
    Callable,
    List,
)

from loguru import logger
from nebullvm import setup_logger
from nebullvm.config import MIN_NUMBER
from nebullvm.core.models import OptimizeInferenceResult, DeviceType
from nebullvm.operations.base import Operation
from nebullvm.operations.optimizations.optimize_inference import (
    OptimizeInferenceOp,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.feedback_collector import FeedbackCollector
from tabulate import tabulate

from nebullvm.tools.hardware_utils import get_hw_setup
from nebullvm.tools.utils import (
    get_model_size_mb,
    get_model_name,
    generate_model_id,
)

# Module-level feedback collector installed on every SpeedsterRootOp; it is
# configured with the results endpoint and the application version below.
# NOTE(review): judging by the argument name, telemetry can presumably be
# turned off through the SPEEDSTER_DISABLE_TELEMETRY environment variable --
# confirm against FeedbackCollector.
SPEEDSTER_FEEDBACK_COLLECTOR = FeedbackCollector(
    url="https://nebuly.cloud/v1/store_speedster_results",
    disable_telemetry_environ_var="SPEEDSTER_DISABLE_TELEMETRY",
    app_version="0.4.0",
)


def _convert_technique(technique: str):
    if technique.lower() == "none":  # use fp32 instead of none
        technique = "fp32"
    elif technique == "HALF":
        technique = "fp16"
    elif technique == "STATIC":
        technique = "int8"
    else:
        technique = "int8_dynamic"
    return technique


def _get_model_len(model: Any):
    try:
        return len(pickle.dumps(model, -1))
    except Exception:
        logger.warning(
            "Cannot pickle input model. Unable to "
            "extract original model size"
        )
        # Model is not pickable
        return -1


class SpeedsterRootOp(Operation):
    """Root operation of Speedster: optimize a model and report the outcome.

    The optimization is delegated to ``OptimizeInferenceOp``. This class
    sends the collected feedback and logs a table comparing the original
    and the optimized model.
    """

    def __init__(self):
        super().__init__()
        self.optimize_inference_op = OptimizeInferenceOp()
        self.set_feedback_collector(SPEEDSTER_FEEDBACK_COLLECTOR)

    def _send_feedback(
        self,
        optimization_result: OptimizeInferenceResult,
        store_latencies: bool = False,
    ):
        """Store model and hardware info in the feedback collector and send it.

        Args:
            optimization_result: Result of the optimization. Its original
                model provides the name, size, framework and latency that
                are reported.
            store_latencies: When True, the collected optimizations are
                also dumped to
                ``{model_name}_latencies_{model_id[:10]}.json`` using a
                relative path.
        """
        model_orig = optimization_result.original_model.model
        model_name = get_model_name(model_orig)
        model_info = {
            "model_name": model_name,
            "model_size": f"{get_model_size_mb(model_orig)} MB",
            "framework": optimization_result.original_model.framework.value,
        }
        self.feedback_collector.store_info(
            key="model_id", value=generate_model_id(model_orig)
        )
        self.feedback_collector.store_info(
            key="model_metadata", value=model_info
        )
        self.feedback_collector.store_info(
            key="hardware_setup", value=get_hw_setup(self.device).__dict__
        )
        optimizations = self.feedback_collector.get("optimizations")
        # The original model is reported as the first entry of the list
        original_model_dict = {
            "compiler": optimization_result.original_model.framework.value,
            "technique": "original",
            "latency": optimization_result.original_model.latency_seconds,
        }
        optimizations.insert(0, original_model_dict)
        self.feedback_collector.send_feedback()

        if store_latencies:
            model_id = self.feedback_collector.get("model_id", "")
            with open(
                f"{model_name}_latencies_{model_id[:10]}.json", "w"
            ) as f:
                json.dump(
                    {
                        "optimizations": optimizations,
                    },
                    f,
                )
        # Clear the per-model entries once they have been sent
        self.feedback_collector.reset("optimizations")
        self.feedback_collector.reset("model_id")
        self.feedback_collector.reset("model_metadata")

    def execute(
        self,
        model: Any,
        input_data: Union[Iterable, Sequence, DataManager],
        metric_drop_ths: float = None,
        metric: Union[str, Callable] = None,
        optimization_time: str = "constrained",
        dynamic_info: Dict = None,
        config_file: str = None,
        ignore_compilers: List[str] = None,
        ignore_compressors: List[str] = None,
        store_latencies: bool = False,
        **kwargs,
    ):
        """Optimize ``model`` and log a summary of the results.

        All arguments are forwarded unchanged to
        ``OptimizeInferenceOp.execute``; ``store_latencies`` is also passed
        on to the feedback step.

        Returns:
            The inference learner of the optimized model, or ``None`` when
            the optimization produced no optimized model.
        """
        self.logger.info(
            "Running Speedster on {}{}".format(
                self.device.type.name,
                f":{self.device.idx}"
                if self.device.type is not DeviceType.CPU
                else "",
            )
        )

        result = self.optimize_inference_op.to(self.device).execute(
            model=model,
            input_data=input_data,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            optimization_time=optimization_time,
            dynamic_info=dynamic_info,
            config_file=config_file,
            ignore_compilers=ignore_compilers,
            ignore_compressors=ignore_compressors,
            store_latencies=store_latencies,
            **kwargs,
        )

        if result.optimized_model is None:
            return None

        opt_metric_drop = (
            f"{result.metric_drop:.4f}"
            if result.metric_drop > MIN_NUMBER
            else "0"
        )

        self._send_feedback(result, store_latencies=store_latencies)

        original = result.original_model
        optimized = result.optimized_model

        # Computed once; used by the table and by the low speed-up warning
        speedup = original.latency_seconds / optimized.latency_seconds
        throughput_gain = optimized.throughput / original.throughput

        # Size change in percent, clamped so that a larger optimized model
        # is reported as 0% rather than as a positive number.
        if original.size_mb > 0:
            size_change = int(
                (optimized.size_mb - original.size_mb)
                / original.size_mb
                * 100
            )
            size_improvement = f"{min(size_change, 0)}%"
        else:
            size_improvement = "NA"

        table = [
            [
                "backend",
                original.framework.name,
                optimized.inference_learner.name,
                "",
            ],
            [
                "latency",
                f"{original.latency_seconds:.4f} sec/batch",
                f"{optimized.latency_seconds:.4f} sec/batch",
                f"{speedup:.2f}x",
            ],
            [
                "throughput",
                f"{original.throughput:.2f} data/sec",
                f"{optimized.throughput:.2f} data/sec",
                f"{throughput_gain:.2f}x",
            ],
            [
                "model size",
                f"{original.size_mb:.2f} MB",
                f"{optimized.size_mb:.2f} MB",
                size_improvement,
            ],
            ["metric drop", "", opt_metric_drop, ""],
            [
                "techniques",
                "",
                f"{_convert_technique(optimized.technique)}",
                "",
            ],
        ]
        headers = [
            "Metric",
            "Original Model",
            "Optimized Model",
            "Improvement",
        ]

        # change format to the logger, avoiding printing verbose info
        # to the console (as date, time, etc.)
        self.logger.remove()
        handler_id = self.logger.add(
            sys.stdout, format="<level>{message}</level>"
        )
        try:
            hw_info = get_hw_setup(self.device)
            hw_name = (
                hw_info.cpu
                if self.device.type is DeviceType.CPU
                else hw_info.accelerator
            )
            self.logger.info(
                (
                    f"\n[Speedster results on {hw_name}]\n"
                    f"{tabulate(table, headers, tablefmt='heavy_outline')}"
                )
            )

            if speedup < 2:
                self.logger.warning(
                    f"\nMax speed-up with your input parameters is "
                    f"{speedup:.2f}x. "
                    f"If you want to get a faster optimized model, "
                    f"see the following link for some suggestions: "
                    f"https://docs.nebuly.com/Speedster/advanced_"
                    f"options/#acceleration-suggestions\n"
                )
        finally:
            # Always restore the default logger configuration, even if
            # building or printing the summary fails; otherwise the
            # temporary handler would stay installed.
            self.logger.remove(handler_id)
            setup_logger()

        return optimized.inference_learner


================================================
FILE: optimization/speedster/speedster/speedster.py
================================================
from nebullvm.apps.base import App

from speedster.root_op import SpeedsterRootOp


class SpeedsterApp(App):
    """Application entry point that delegates work to a ``SpeedsterRootOp``.

    The app owns a single root operation instance, created at construction
    time, and forwards every ``execute`` call to it unchanged.
    """

    def __init__(self):
        """Initialise the base ``App`` and create the root operation."""
        super().__init__()
        self.root_op = SpeedsterRootOp()

    def execute(self, *args, **kwargs):
        """Run the root operation with the given arguments.

        All positional and keyword arguments are passed through as-is, and
        whatever the root operation returns is handed back to the caller.
        """
        outcome = self.root_op.execute(*args, **kwargs)
        return outcome


================================================
FILE: optimization/speedster/speedster/tests/__init__.py
================================================


================================================
FILE: optimization/speedster/speedster/tests/test_root_op.py
================================================
from nebullvm.core.models import OptimizeInferenceResult

from speedster.root_op import SpeedsterRootOp


def test_root_op_no_optim_model(mocker):
    """``execute`` returns ``None`` when no optimized model was produced."""
    root_op = SpeedsterRootOp()

    # Stub the inner optimization so it reports a result without any
    # optimized model.
    stubbed_result = OptimizeInferenceResult(
        original_model=mocker.MagicMock(),
        optimized_model=None,
        hardware_setup=mocker.MagicMock(),
    )
    mocker.patch.object(
        root_op.optimize_inference_op,
        "execute",
        return_value=stubbed_result,
    )

    execute_kwargs = {
        "model": None,
        "input_data": mocker.MagicMock(),
        "metric_drop_ths": None,
        "metric": "latency",
        "optimization_time": mocker.MagicMock(),
        "dynamic_info": None,
        "config_file": None,
        "ignore_compilers": None,
        "ignore_compressors": None,
        "store_latencies": False,
    }
    res = root_op.execute(**execute_kwargs)

    assert res is None


def test_root_op_optim_model(mocker):
    """``execute`` returns a non-``None`` value when a model was optimized."""
    root_op = SpeedsterRootOp()

    # Both models expose plain numeric metrics so the result summary can be
    # computed from them.
    baseline_model = mocker.MagicMock(
        latency_seconds=1, throughput=1, size_mb=1
    )
    tuned_model = mocker.MagicMock(
        metric_drop=0.1, latency_seconds=1, size_mb=1, throughput=1
    )
    stubbed_result = OptimizeInferenceResult(
        original_model=baseline_model,
        optimized_model=tuned_model,
        hardware_setup=mocker.MagicMock(),
    )
    mocker.patch.object(
        root_op.optimize_inference_op,
        "execute",
        return_value=stubbed_result,
    )

    # Replace the feedback hook with a mock for the duration of the test.
    mocker.patch.object(root_op, "_send_feedback")

    execute_kwargs = {
        "model": None,
        "input_data": mocker.MagicMock(),
        "metric_drop_ths": None,
        "metric": "latency",
        "optimization_time": mocker.MagicMock(),
        "dynamic_info": None,
        "config_file": None,
        "ignore_compilers": None,
        "ignore_compressors": None,
        "store_latencies": False,
    }
    res = root_op.execute(**execute_kwargs)

    assert res is not None


================================================
FILE: optimization/speedster/speedster/utils.py
================================================


================================================
FILE: optimization/speedster/speedster.toml
================================================
[build-system]
requires = [
    "setuptools>=42",
    "wheel"
]
build-backend = "setuptools.build_meta"