Repository: nebuly-ai/optimate
Branch: main
Commit: a6d302f912b4
Files: 306
Total size: 1.6 MB
Directory structure:
gitextract_7q29s3ew/
├── .gitignore
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── README.md
├── monitoring/
│ └── nebuly/
│ └── __init__.py
└── optimization/
├── .github/
│ └── workflows/
│ └── tests.yml
├── chatllama/
│ ├── LICENSE
│ ├── README.md
│ ├── artifacts/
│ │ ├── config/
│ │ │ ├── config.yaml
│ │ │ ├── ds_config.json
│ │ │ └── peft_config.yaml
│ │ ├── datasets/
│ │ │ ├── actor_dataset.json
│ │ │ ├── reward_dataset.json
│ │ │ └── rlhf_dataset.json
│ │ ├── download_dataset.py
│ │ ├── extend_rlhf_dataset.py
│ │ ├── generate_actor_dataset.py
│ │ ├── generate_rewards.py
│ │ ├── main.py
│ │ └── templates.json
│ ├── chatllama/
│ │ ├── __init__.py
│ │ ├── langchain_modules/
│ │ │ ├── __init__.py
│ │ │ └── prompt_templates.py
│ │ ├── llama_model.py
│ │ └── rlhf/
│ │ ├── __init__.py
│ │ ├── actor.py
│ │ ├── config.py
│ │ ├── dataset.py
│ │ ├── model_list.py
│ │ ├── model_loader.py
│ │ ├── reward.py
│ │ ├── trainer.py
│ │ └── utils.py
│ └── setup.py
├── cloud_surfer/
│ └── README.md
├── forward_forward/
│ ├── README.md
│ ├── forward_forward/
│ │ ├── __init__.py
│ │ ├── api/
│ │ │ ├── __init__.py
│ │ │ └── functions.py
│ │ ├── app.py
│ │ ├── operations/
│ │ │ ├── __init__.py
│ │ │ ├── build_models.py
│ │ │ ├── data.py
│ │ │ ├── fetch_operations.py
│ │ │ └── trainers.py
│ │ ├── root_op.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── labels.py
│ │ ├── modules.py
│ │ └── utils.py
│ ├── requirements.txt
│ └── setup.py
├── large_speedster/
│ └── README.md
├── nebullvm/
│ ├── .pre-commit-config.yaml
│ ├── CONTRIBUTING.md
│ ├── Dockerfile
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── azure-pipelines.yml
│ ├── docker_build.sh
│ ├── docs/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── conf.py
│ │ ├── index.rst
│ │ ├── modules/
│ │ │ ├── api.rst
│ │ │ ├── converters.rst
│ │ │ ├── index.rst
│ │ │ ├── inference_learners.rst
│ │ │ ├── installers.rst
│ │ │ └── optimizers.rst
│ │ └── requirements-docs.txt
│ ├── nebullvm/
│ │ ├── __init__.py
│ │ ├── api/
│ │ │ └── __init__.py
│ │ ├── apps/
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ ├── config.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── models.py
│ │ │ ├── tests/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_models.py
│ │ │ └── types.py
│ │ ├── installers/
│ │ │ ├── __init__.py
│ │ │ ├── auto_installer.py
│ │ │ ├── install_bladedisc.sh
│ │ │ ├── install_fastertransformer.sh
│ │ │ ├── install_tensor_rt.sh
│ │ │ ├── install_tvm.sh
│ │ │ ├── install_tvm_prerequisites.sh
│ │ │ ├── installers.py
│ │ │ ├── tests/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_install_frameworks.py
│ │ │ └── tvm_installers/
│ │ │ ├── arm/
│ │ │ │ └── config.cmake
│ │ │ ├── arm_cuda/
│ │ │ │ └── config.cmake
│ │ │ ├── x86/
│ │ │ │ └── config.cmake
│ │ │ └── x86_cuda/
│ │ │ └── config.cmake
│ │ ├── operations/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── conversions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── converters.py
│ │ │ │ ├── huggingface.py
│ │ │ │ ├── pytorch.py
│ │ │ │ ├── tensorflow.py
│ │ │ │ └── utils.py
│ │ │ ├── fetch_operations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── local.py
│ │ │ ├── inference_learners/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── blade_disc.py
│ │ │ │ ├── builders.py
│ │ │ │ ├── deepsparse.py
│ │ │ │ ├── faster_transformer.py
│ │ │ │ ├── huggingface.py
│ │ │ │ ├── neural_compressor.py
│ │ │ │ ├── onnx.py
│ │ │ │ ├── openvino.py
│ │ │ │ ├── tensor_rt.py
│ │ │ │ ├── tensorflow.py
│ │ │ │ ├── torch_dynamo.py
│ │ │ │ ├── torch_neuron.py
│ │ │ │ ├── torch_xla.py
│ │ │ │ ├── torchscript.py
│ │ │ │ ├── tvm.py
│ │ │ │ └── utils.py
│ │ │ ├── measures/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── measures.py
│ │ │ │ └── utils.py
│ │ │ └── optimizations/
│ │ │ ├── __init__.py
│ │ │ ├── compilers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── deepsparse.py
│ │ │ │ ├── faster_transformer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── bert/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── checkpoint_quantization.py
│ │ │ │ │ │ └── modeling_bert.py
│ │ │ │ │ └── gpt/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── gpt_decoder.py
│ │ │ │ │ └── huggingface_gpt_convert.py
│ │ │ │ ├── intel_neural_compressor.py
│ │ │ │ ├── onnxruntime.py
│ │ │ │ ├── openvino.py
│ │ │ │ ├── quantizations/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── intel_neural_compressor.py
│ │ │ │ │ ├── onnx.py
│ │ │ │ │ ├── openvino.py
│ │ │ │ │ ├── pytorch.py
│ │ │ │ │ ├── tensor_rt.py
│ │ │ │ │ ├── tensorflow.py
│ │ │ │ │ ├── tvm.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── tensor_rt.py
│ │ │ │ ├── tensorflow.py
│ │ │ │ ├── torch_dynamo.py
│ │ │ │ ├── torch_neuron.py
│ │ │ │ ├── torch_xla.py
│ │ │ │ ├── torchscript.py
│ │ │ │ ├── tvm.py
│ │ │ │ └── utils.py
│ │ │ ├── compressors/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── intel.py
│ │ │ │ ├── scripts/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── neural_magic_training.py
│ │ │ │ └── sparseml.py
│ │ │ ├── optimize_inference.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── optimizers.py
│ │ │ ├── tests/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_deepsparse.py
│ │ │ │ ├── test_intel_neural_compressor.py
│ │ │ │ ├── test_onnxruntime.py
│ │ │ │ ├── test_openvino.py
│ │ │ │ ├── test_tensor_rt.py
│ │ │ │ ├── test_tensorflow.py
│ │ │ │ ├── test_torch_dynamo.py
│ │ │ │ ├── test_torchscript.py
│ │ │ │ ├── test_tvm.py
│ │ │ │ └── utils.py
│ │ │ └── utils.py
│ │ ├── optional_modules/
│ │ │ ├── __init__.py
│ │ │ ├── blade_disc.py
│ │ │ ├── deepsparse.py
│ │ │ ├── diffusers.py
│ │ │ ├── dummy.py
│ │ │ ├── huggingface.py
│ │ │ ├── neural_compressor.py
│ │ │ ├── onnx.py
│ │ │ ├── onnxruntime.py
│ │ │ ├── onnxsim.py
│ │ │ ├── openvino.py
│ │ │ ├── tensor_rt.py
│ │ │ ├── tensorflow.py
│ │ │ ├── torch.py
│ │ │ ├── torch_neuron.py
│ │ │ ├── torch_tensorrt.py
│ │ │ ├── torch_xla.py
│ │ │ ├── tvm.py
│ │ │ └── utils.py
│ │ └── tools/
│ │ ├── __init__.py
│ │ ├── adapters.py
│ │ ├── benchmark.py
│ │ ├── data.py
│ │ ├── diffusers.py
│ │ ├── feedback_collector.py
│ │ ├── hardware_utils.py
│ │ ├── huggingface.py
│ │ ├── logger.py
│ │ ├── onnx.py
│ │ ├── pytorch.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_data.py
│ │ │ ├── test_hardware_utils.py
│ │ │ └── test_utils.py
│ │ ├── tf.py
│ │ ├── transformations.py
│ │ ├── utils.py
│ │ └── venv.py
│ ├── nebullvm.toml
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ └── setup.py
├── open_alpha_tensor/
│ ├── README.md
│ ├── config.json
│ ├── main.py
│ ├── open_alpha_tensor/
│ │ ├── __init__.py
│ │ ├── api/
│ │ │ ├── __init__.py
│ │ │ └── functions.py
│ │ ├── config.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── actors/
│ │ │ │ ├── __init__.py
│ │ │ │ └── stage.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── basis_change.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── generation.py
│ │ │ │ └── utils.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alpha_tensor.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── extras.py
│ │ │ │ ├── heads.py
│ │ │ │ └── torso.py
│ │ │ └── training.py
│ │ ├── operations/
│ │ │ ├── __init__.py
│ │ │ ├── checkpoint_op.py
│ │ │ ├── model_op.py
│ │ │ └── training_op.py
│ │ └── root_op.py
│ ├── resources/
│ │ └── open_alpha_tensor.md
│ └── setup.py
├── optimate/
│ └── README.md
└── speedster/
├── README.md
├── docs/
│ └── en/
│ ├── docs/
│ │ ├── advanced_options.md
│ │ ├── benchmarks.md
│ │ ├── getting_started/
│ │ │ ├── diffusers_getting_started.md
│ │ │ ├── hf_getting_started.md
│ │ │ ├── onnx_getting_started.md
│ │ │ ├── pytorch_getting_started.md
│ │ │ └── tf_getting_started.md
│ │ ├── hardware.md
│ │ ├── installation.md
│ │ ├── key_concepts.md
│ │ ├── notebooks.md
│ │ ├── overview.md
│ │ └── telemetry.md
│ └── mkdocs.yaml
├── notebooks/
│ ├── README.md
│ ├── diffusers/
│ │ ├── Accelerate_Stable_Diffusion_with_Speedster.ipynb
│ │ └── Readme.md
│ ├── huggingface/
│ │ ├── Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb
│ │ ├── Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb
│ │ ├── Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb
│ │ ├── Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
│ │ ├── Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb
│ │ ├── Readme.md
│ │ └── faster_transformer_bert.py
│ ├── onnx/
│ │ ├── Accelerate_ONNX_ResNet50_with_Speedster.ipynb
│ │ └── Readme.md
│ ├── pytorch/
│ │ ├── Accelerate_PyTorch_ResNet50_with_Speedster.ipynb
│ │ ├── Accelerate_PyTorch_ViT_with_Speedster.ipynb
│ │ ├── Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb
│ │ ├── Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb
│ │ ├── Accelerate_fast_ai_Resnet34_with_Speedster.ipynb
│ │ └── Readme.md
│ └── tensorflow/
│ ├── Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb
│ └── Readme.md
├── requirements.txt
├── setup.py
├── speedster/
│ ├── __init__.py
│ ├── api/
│ │ ├── __init__.py
│ │ ├── functions.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ ├── test_huggingface.py
│ │ ├── test_onnx.py
│ │ ├── test_pytorch.py
│ │ ├── test_tensorflow.py
│ │ └── utils.py
│ ├── root_op.py
│ ├── speedster.py
│ ├── tests/
│ │ ├── __init__.py
│ │ └── test_root_op.py
│ └── utils.py
└── speedster.toml
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
optimization/nebullvm/docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
.idea
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# MacOS DS_Store
.DS_Store
# Pickle folder
.pkl_memoize_py3
# Folder where optimized models are stored
optimized_model
# Config file for tests coverage
.coveragerc
================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Nebuly"
given-names: "S.r.l"
- family-names: "Fiori"
given-names: "Diego"
orcid: "https://orcid.org/0000-0003-1910-0565"
- family-names: "Sofi"
given-names: "Valerio"
orcid: "https://orcid.org/0000-0001-5978-897X"
title: "nebullvm"
version: 0.4.3
date-released: 2022-10-10
url: "https://github.com/nebuly-ai/nebullvm"
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
social@nebuly.ai.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.
================================================
FILE: README.md
================================================
# OptiMate
**[Legacy]**
This repository is now in a legacy phase and is no longer actively maintained. Although the source code is still available in the Git history, there will be no additional updates or official support.
**[About Nebuly]**
Our team is fully committed to creating the best user-experience platform for LLMs so that companies can understand user behavior at scale when interacting with their LLM-based products.
- To learn more about how to get started, visit our [official documentation](https://docs.nebuly.com/welcome/overview)
- If you need enterprise support, please contact us [here](https://www.nebuly.com/nebuly-book-a-demo)
**[About optimate]**
We have open-sourced a couple of internal projects to the community, but we are not currently maintaining them. Optimate is a collection of libraries designed to help you optimize your AI models. It is an open-source project developed by Nebuly AI but is **not actively maintained**.
The tools available to assist you in your optimization are:
✅ [Speedster](https://github.com/nebuly-ai/optimate/tree/main/optimization/speedster): reduce inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs)
✅ [Nos](https://github.com/nebuly-ai/nos): reduce infrastructure costs by leveraging real-time dynamic partitioning and elastic quotas to maximize the utilization of your Kubernetes GPU cluster
✅ [ChatLLaMA](https://github.com/nebuly-ai/optimate/tree/main/optimization/chatllama): reduce hardware and data costs by leveraging fine-tuning optimization techniques and RLHF alignment
================================================
FILE: monitoring/nebuly/__init__.py
================================================
================================================
FILE: optimization/.github/workflows/tests.yml
================================================
name: Run tests
on:
push:
branches:
- "main"
paths-ignore:
- ".github/**"
- "*.md"
- "docs/**"
- "notebooks/**"
pull_request:
branches:
- "main"
paths-ignore:
- ".github/**"
- "*.md"
- "docs/**"
- "notebooks/**"
jobs:
test_on_ubuntu_cpu:
runs-on: ubuntu-20.04
strategy:
matrix:
# Run in all these versions of Python
python-version: [ 3.8, 3.9, "3.10" ]
steps:
# Checkout the latest code from the repo
- name: Checkout repo
uses: actions/checkout@v2
# Setup which version of Python to use
- name: Set Up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
# Display the Python version being used
- name: Display Python version
run: python -c "import sys; print(sys.version)"
# Install nebullvm
- name: Install nebullvm
run: |
python -m pip install --upgrade pip
pip install .
# Install Speedster
- name: Install Speedster
run: |
cd apps/accelerate/speedster
pip install .
cd ../../..
# Install PyTorch
- name: Install PyTorch
run: python -m pip install torch==2.0.0
# Install compilers except tvm
- name: Install deep learning compilers
run: python -m nebullvm.installers.auto_installer --compilers all
# Install requirements for testing
- name: Install requirements for testing
run: pip install -r "requirements-dev.txt"
# Run api tests
- name: Run api tests
run: |
export SPEEDSTER_DISABLE_TELEMETRY=1
cd apps/accelerate/speedster
pytest
cd ../../..
# Run components tests
- name: Run components tests
run: |
cd nebullvm
pytest
cd ../
# test_on_windows_cpu:
# runs-on: windows-latest
#
# strategy:
# matrix:
# # Run in all these versions of Python
# python-version: [ 3.8, 3.9, "3.10" ]
#
# steps:
# # Checkout the latest code from the repo
# - name: Checkout repo
# uses: actions/checkout@v2
# # Setup which version of Python to use
# - name: Set Up Python ${{ matrix.python-version }}
# uses: actions/setup-python@v2
# with:
# python-version: ${{ matrix.python-version }}
# # Display the Python version being used
# - name: Display Python version
# run: python -c "import sys; print(sys.version)"
# # Install nebullvm
# - name: Install nebullvm
# run: |
# python -m pip install --upgrade pip
# pip install .
# # Install Speedster
# - name: Install Speedster
# run: |
# cd apps/accelerate/speedster
# pip install .
# cd ../../..
# - name: Install PyTorch
# run: python -m pip install torch==2.0.0
# # Install compilers except tvm
# - name: Install deep learning compilers
# run: python -m nebullvm.installers.auto_installer --compilers all
# # Install requirements for testing
# - name: Install requirements for testing
# run: pip install -r "requirements-dev.txt"
# # Run api tests
# - name: Run api tests
# run: |
# $env:SPEEDSTER_DISABLE_TELEMETRY=1
# cd apps/accelerate/speedster
# pytest
# cd ../../..
# # Run components tests
# - name: Run components tests
# run: |
# cd nebullvm
# pytest
# cd ../
#
================================================
FILE: optimization/chatllama/LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
================================================
FILE: optimization/chatllama/README.md
================================================
# **🦙 ChatLLaMA**
> :warning: Please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply through Meta's form.
`ChatLLaMA` 🦙 is a library that allows you to efficiently leverage LLMs' fine-tuning capabilities using your own data and the least amount of compute possible.
Its purpose is to give developers peace of mind, by abstracting the efforts required for computational optimization and for the collection of large amounts of data.
If you like the project, please show your support by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers).
## Quick install
You can install the package with pip:
```bash
pip install chatllama-py
```
Then you need to install the Llama models cloned from [Meta's repository](https://github.com/facebookresearch/llama):
```bash
git clone https://github.com/facebookresearch/llama.git
cd llama
pip install -r requirements.txt
pip install -e .
```
Follow the instructions in the Llama repository to download the model weights and tokenizer.
## What can ChatLLaMA help with?
`ChatLLaMA` 🦙 has been designed to help developers with various use cases, all related to RLHF training and optimized inference. These are some of the use cases that best resonate with our community wishlist:
- I want to train an efficient ChatGPT-like assistant on my local hardware infrastructure using a limited amount of data;
- I want to create my own personalized version of a ChatGPT-like assistant without costs getting out of control;
- I want to understand which model architecture (LLaMA, OPT, GPTJ, etc.) best fits my requirements in terms of hardware, compute budget, and performance.
## Getting started
In this Getting Started we will set up a local RLHF training that will allow you to create your own ChatGPT-like assistant. In this example, we used OPT-1.3B, relied on open-source datasets wherever possible, and ran the training on an NVIDIA A100. If you want to use other models or hardware, we recommend reading the [supported models](#supported-models), [hardware requirements](#hardware-requirements) and [dataset preparation](#dataset-preparation) sections. In this example, we ran a few epochs of the training; this took a few hours. Any feedback on total training time, on any hardware, would be greatly appreciated. Please share your experience with our community on our Discord channel.
To quickly get you started, we will focus on 3 key steps:
1. Download YAML files to customize your training process. Please note that all the parameters of the library can be managed in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml);
2. Prepare the 3 datasets needed to train the actor model, the reward model and perform RLHF;
3. Train the models on your local infrastructure.
1 - YAML download
First, let’s get the artifacts for running ChatLLaMA. The artifacts contain:
- [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml): config file for model and dataset. This allows you to 1) select the model you prefer (LLaMA, OPT, BLOOM, etc.) and 2) change all the hyperparameters of the training process;
- [`ds_config.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/ds_config.json): config file to define DeepSpeed training parameters;
- [`peft_config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/peft_config.yaml): config file to define PEFT parameters; PEFT is used for efficient training with Hugging Face models. It can be used to set LoRA parameters such as rank and precision.
- [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json): synthetic data generation templates that can be used to personalize the creation of the dataset. The templates are used for feeding LLMs during the data generation. Note that the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file contains a dictionary having as *keys* the training steps (`actor`, `reward`, `rlhf`) and as *values* a string containing the personalization requests of the user. For more details see the [dataset preparation](#dataset-preparation) section;
- [`main.py`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/main.py): file to train the model.
```bash
wget -O artifacts.zip https://nbllabartifacts.blob.core.windows.net/chatllama/artifacts.zip\?sp\=r\&st\=2023-03-08T14:53:24Z\&se\=2100-03-08T22:53:24Z\&spr\=https\&sv\=2021-06-08\&sr\=b\&sig\=jqr%2B2ZkR0SW9RjV0pDOdQ%2BDulLXLjbZ36vmNd4XxxyQ%3D
unzip artifacts.zip
```
Once you have run the command above, you will find all the artifacts in the [`artifacts/`](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama/artifacts) directory. Now you can move on to the next section regarding the dataset preparation.
2 - Dataset preparation
Before training the model, we need to prepare 3 datasets:
- `actor_training_data`: this is the JSON dataset used in the supervised fine-tuning. It consists of examples of unlabelled conversations, e.g. a collection of prompts and responses;
- `rlhf_training_data`: this is the JSON dataset used for RLHF training. It consists of a collection of possible input user prompts;
- `reward_training_data`: this is the JSON dataset used to train the reward model. It consists of responses with associated scores.
In this example, we are using only publicly available datasets and synthetic data generation; if you want to use your own data instead, please see the [Dataset preparation](#dataset-preparation) section.
First, let’s download the `actor_training_data` and the `rlhf_training_data`:
```bash
python artifacts/download_dataset.py ARLHF --path ./datasets --number_of_samples 200
```
Finally, let’s create the `reward_training_data` using `davinci-003` for synthetic data generation.
```bash
export OPENAI_API_KEY=YOUR_API_KEY
python artifacts/generate_rewards.py ./datasets/reward_training_data.json
```
> :warning: Creating the `reward_training_data` with `davinci-003` is not free, i.e. it costs a few $$. If you prefer avoiding external paid APIs, we suggest using HuggingFace’s models (e.g. flan_t5_xl) as described in more detail in the [Supported models](#supported-models) section.
>
> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI".
At this point, we have successfully created the 3 datasets. We can therefore move on to the final section and start the training.
3 - Training
You can train the 3 models in separate steps:
- Train the Reward Model
```bash
python artifacts/main.py artifacts/config/config.yaml --type REWARD
```
- Pre-Train the Actor Model
```bash
python artifacts/main.py artifacts/config/config.yaml --type ACTOR
```
- Training the Actor with reinforcement learning.
```bash
python artifacts/main.py artifacts/config/config.yaml --type RL
```
or, equivalently, the 3 trainings can also be pipelined using the flag ALL.
```bash
python artifacts/main.py artifacts/config/config.yaml --type ALL
```
Note that the path to the datasets and the training hyper-parameters of the training process are specified in the [`config.yaml`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/config/config.yaml) file.
## Contributing and Roadmap
As an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see our [Roadmap page](https://github.com/users/nebuly-ai/projects/1/views/1) for more information on how to get involved.
You can participate in the following ways:
1. Submit an issue or PR on GitHub
2. Join our [Discord group](https://discord.gg/77d5kGSa8e) to chat
## Supported models
Actor models
We support models that can be run efficiently with a limited amount of compute, such as LLaMA and 🤗 transformers. These are the models with less than 20B parameters currently supported :
- LLaMA: 7B and 13B, please note this library does NOT contain LLaMA’s weights; to access the weights, you need to apply to Meta's [form](https://forms.gle/jk851eBVbX1m5TAv5).
- GPTJ: 6B
- GPTNeoX: 1.3B, 20B
- **(⚠️WIP)** Flan-T5: 80M, 259M, 780M, 3B, 11B
- OPT: 125M, 359M, 1.3B, 2.7B, 6.7B, 13B
- BLOOM: 560M, 1.1B, 1.7B, 3B, 7.1B
- BLOOMZ: 560M, 1.1B, 1.7B, 3B, 7.1B
- Galactica: 125M, 1.3B, 6.7B
Reward models
We suggest using models under 6B from 🤗 transformers:
- GPT2: 124M, 355M, 774M, 1.5B
- OPT: 125M, 359M, 1.3B, 2.7B
- GPTJ: 6B
- BLOOMZ: 560M, 1.1B, 1.7B, 3B
- **(⚠️WIP)** OpenAssistant [pre-trained reward models](https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2)
Synthetic data generation models
We support both APIs from OpenAI and 🤗 transformers:
- OpenAI: text-davinci-003, gpt-3.5-turbo **(⚠️WIP)**
- HuggingFace: Flan-T5 (3B and 11B)
> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI".
If you need support for different models, please open an issue and we will get to work.
## Hardware requirements
Training
Larger actor models require more powerful hardware. Here is a rough hardware recommendation table, suggesting the right type of hardware for different actor model sizes:
- 125M to 1.3B → 1x Nvidia 3090/4090
- 1.3B to 3B → 1x Nvidia A100 (80Gb)
- 3B with DeepSpeed CPU off-loading → 1x Nvidia 3090/4090
- 3B to 7B with DeepSpeed ZeRO → 4x Nvidia T4
- 3B to 13B → 4x Nvidia A100 (80Gb)
- 13B to 20B with DeepSpeed ZeRO → 4x Nvidia A100 (80Gb)
- 13B to 20B → 8x Nvidia A100 (80Gb)
Inference
**(⚠️WIP)** When it comes to inference optimization, ChatLLaMA will support the following optimization techniques:
- [ ] DeepSpeed ZeRO
- [ ] FlexGen
- [ ] HF Accelerate
- [ ] PyTorch Vanilla
Please note that inference optimization has yet to be implemented. If you would like to contribute, please see the **issue roadmap**, community contributions are always welcome 😊.
## Dataset preparation
To successfully train a ChatLLaMA assistant, you need 3 different datasets: `actor_training_data`, `rlhf_training_data` and `reward_training_data`.
Dataset for supervised fine-tuning of the actor model
The `actor_training_data` is a collection of prompts with the associated responses as highlighted below:
```json
[
{
"user_input": "here the input of the user",
"completion": "here the model completion"
}
]
```
ChatLLaMA supports 4 different options to prepare the `actor_training_data`:
* Use 100% synthetic data
The dataset can be synthetically generated by running the following command:
```bash
python artifacts/generate_actor_dataset.py
```
> :warning: Note that this command will require a subscription to OpenAI. Generating the full dataset with `davinci-003` could cost approximately ~200$.
>
> :warning: if using OpenAI's API, please be aware of OpenAI's terms of use stating that it is forbidden to "use the Services to develop foundation models or other large scale models that compete with OpenAI".
Alternatively, you can generate the dataset for free using 🤗 transformers as described in the section [Supported models](#supported-models).
* Use one of the open source datasets with assistant interactions
Currently, we support:
- [Anthropic HH RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf): this dataset consists of structured question/answer pairs with an LLM chatbot that includes selected and rejected answers;
- [Stanford Human Preferences Dataset (SHP)](https://huggingface.co/datasets/stanfordnlp/SHP): this dataset is curated from selected "ask" subreddits, and includes questions that span a wide range of question/answer pairs based on the most upvoted responses. Please note that, unlike HH RLHF, this dataset is not intended to reduce harassment by selecting the ideal chatbot response, but instead weights the most helpful human responses.
The datasets can be downloaded running the following command:
```bash
python artifacts/download_dataset.py <dataset_name> --path <path_to_folder_for_download> --number_of_samples <N>
```
Where:
- `<dataset_name>` could be "SHP" for the StanfordNLP/SHP dataset or "ARLHF" for the Anthropic/hh-rlhf dataset;
- `<path_to_folder_for_download>` is the folder path to where the datasets are going to be created;
- `<N>` is the number of samples of which the reward_dataset.json is composed.
* Use 100% personalized dataset
The user provides his own personalized full dataset. Datasets must be JSON files with the following format:
```
[
{
"user_input": "here the input of the user",
"completion": "here the model completion"
}
]
```
Where the list contains multiple dictionaries, and each dictionary corresponds to a data sample. We suggest using more than 1000 data samples to run the actor training.
* (⚠️WIP) Create the full dataset augmenting few custom data samples
The dataset can be generated synthetically from a few prompt+response examples provided by the user (few =>10).
Dataset for RLHF
The dataset for RLHF consists just of prompt examples:
```json
[
{
"user_input": "here the example of user input"
}
]
```
It can be provided in 2 different ways:
* Few examples provided by the user and dataset synthetically expanded using LLM
You need to add the key `rlhf` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file with the information about the task you want to perform and extra context needed by the LLM for the generation. Here is an example of template:
```json
{
"rlhf": "Here is the template for generating RLHF prompts. The task we want to perform is ..."
}
```
*Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json)*
* The user provides the full dataset with possible interactions with the model
The dataset needs to contain more than 1000 prompt examples:
```json
[
{
"user_input": "here the example of user input"
}
]
```
The file must be named `rlhf_training_data.json`.
Dataset to train the reward model
The `reward_training_data` is a collection of i) prompts, ii) completions and iii) scores of the completions, assigned according to the user feedback (the Human Feedback in RLHF).
```json
[{
"user_input": "...",
"completion": "...",
"score": 1
},
...
]
```
We support 3 different options to prepare the `reward_training_data`:
- Fully Synthetic Score Generation
In this case the reward dataset can be synthetically scored using a LLM as Human Feedback. We recommend the `reward_training_data` having at least 100 data samples.
```json
[{
"user_input": "...",
"completion": "...",
"score": null
},
...
]
```
A LLM model is used to assign the score to each entry.
The LLM needs a prompt template containing all the instructions to evaluate the generated text. To do this, you should add the key `reward` to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file. Here is an example:
```json
{
"reward": "Here is the template for the reward model. The rules are:\n\n1.Rule 1\n\n2. Rule 2"
}
```
If no template is provided the default one is used. You can find the default template in `artifacts/generate_rewards.py`. Note that all templates must be saved in a single JSON file named [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json).
Once you have the unlabelled dataset, you can generate the scores by running the following command:
```bash
python artifacts/generate_rewards.py <dataset_path> --model <model_to_use> --temperature <t> --max_tokens <n> --reward_template <path_to_file.json>
```
Where:
- `<dataset_path>` path to the reward dataset to be scored;
- `<model_to_use>` model to use for the reward. Default and suggested text-davinci-003 (More to come);
- `<t>` temperature used to score the model; temperature=0.1;
- `<n>` max_tokens of the generation;
- `<path_to_file.json>` is the path to the [`templates.json`](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/artifacts/templates.json) file containing the template to be used for generating the reward. If no path is provided, the default template will be used.
- The user provides their personalized full dataset
Datasets must be JSON files in the following format:
```json
[
{
"user_input": "here type the user input",
"completion": "here type the completion",
"score": 4.0
},
{
"user_input": "here type the user input",
"completion": "random garbage",
"score": 0.0
}
]
```
Note that at least 100 data samples are required in this case. The file must be named `reward_training_data.json`
- **(⚠️WIP)** Few examples provided by the user and dataset synthetically expanded using LLM
# License
See the [LICENSE](https://github.com/nebuly-ai/nebullvm/blob/main/apps/accelerate/chatllama/LICENSE) file.
================================================
FILE: optimization/chatllama/artifacts/config/config.yaml
================================================
---
trainer_config:
# learning rates
actor_lr: 0.000005
critic_lr: 0.000009
# PPO Hyperparameters
actor_eps_clip: 0.2
critic_eps_clip: 0.2
beta_s: 0.02
# coefficient for the discounted rewards
gamma_discounted: 1
# path to examples to be sampled (training dataset) see rlhf_dataset.json
examples_path: "./datasets/rlhf_training_data.json"
# number of episodes and generation performed for each episode
# in the train() method
num_episodes: 100
max_timesteps: 32
# number of timesteps after which the learn() method is called
# (to update the weights)
update_timesteps: 32
# number of example sampled at each timestep
num_examples: 1
# batch and epochs for the training
batch_size: 1
epochs: 1
# number of episodes after which update the checkpoints in RL training
checkpoint_steps: 1000
# here specify the name of the actor_rl checkpoint from which resume
# during actor RL training. If null load the last one.
checkpoint_name: null
actor_config:
model: "facebook/opt-1.3b"
model_folder: "./models"
tokenizer_path: "path-to-tokenizer"
train_dataset_path: "./datasets/actor_training_data.json"
validation_dataset_path: null
# freeze the model embeddings during training
froze_embeddings: True
# use fairscale layers to build the model instead of vanilla pytorch
# only for llama
use_fairscale: False
# max sequence length for the actor (i.e. prompt + completion) it depends on
# the model used.
max_sequence_length: 2048
# max tokens generated by the actor (completion only)
max_tokens: 2048
# minimum number of tokens generated by the actor
min_tokens: 100
# additional prompt tokens to be used for template or as safety
additonal_prompt_tokens: 20
# temperature for the actor
temperature: 0.1
batch_size: 2
# number of iterations between two consecutive prints
iteration_per_print: 1
lr: 0.000009
epochs: 1
# number of backpropagation steps after which a checkpoint is saved
checkpoint_steps: 5000
# number of checkpoints to keep while removing the older
# (keep memory consumption of checkpoints reasonable)
n_checkpoints_to_keep: 5
# here specify the name of the actor checkpoint from which resume
# during actor training. If null load the last one.
checkpoint_name: null
# deepspeed settings
deepspeed_enable: False
deepspeed_config_path: "./artifacts/config/ds_config.json"
# accelerate settings
accelerate_enable: False
# use_peft - the parameters of PEFT can be modified in the peft_config.yaml
peft_enable: False
peft_config_path: "./artifacts/config/peft_config.yaml"
reward_config:
# models to choose from are gpt2-large, bart-base, longformer-base-4096
# more can be simply added in the reward.py __init__()
model: "facebook/opt-125m"
model_folder: "./models"
# hidden size of the additional ffw head to produce the scores
model_head_hidden_size: 2048
max_sequence_length: 2048
train_dataset_path: "./datasets/reward_training_data.json"
validation_dataset_path: null
batch_size: 8
epochs: 1
iteration_per_print: 1
# steps after which the checkpoint are saved
checkpoint_steps: 10000
# here specify the name of the reward checkpoint from which resume
# during reward training. If null load the last one.
checkpoint_name: null
lr: 0.000009
# deepspeed settings
deepspeed_enable: False
deepspeed_config_path: "./artifacts/config/ds_config.json"
# accelerate settings
accelerate_enable: False
critic_config:
# models to choose from are gpt2-large, bart-base, longformer-base-4096
# more can be simply added in the reward.py __init__()
model: "facebook/opt-125m"
# hidden size of the additional ffw head to produce the scores
model_head_hidden_size: 2048
max_sequence_length: 2048
model_folder: "./models"
# here specify the name of the critic checkpoint from which resume
# during critic training. If null load the last one.
checkpoint_name: null
================================================
FILE: optimization/chatllama/artifacts/config/ds_config.json
================================================
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"fp16": {
"enabled": false,
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients" : true,
"offload_param": {
"device": "cpu",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 5,
"buffer_size": 1e8,
"max_in_cpu": 1e9
},
"offload_optimizer": {
"device": "cpu",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 4,
"fast_init": false
},
"stage3_max_live_parameters" : 1e9,
"stage3_max_reuse_distance" : 1e9,
"stage3_prefetch_bucket_size" : 5e8,
"stage3_param_persistence_threshold" : 1e6,
"sub_group_size" : 1e12,
"elastic_checkpoint" : true,
"stage3_gather_16bit_weights_on_model_save": true,
"ignore_unused_parameters": true,
"round_robin_gradients": true
}
}
================================================
FILE: optimization/chatllama/artifacts/config/peft_config.yaml
================================================
---
inference_mode: False
r: 8
lora_alpha: 32
lora_dropout: 0.1
================================================
FILE: optimization/chatllama/artifacts/datasets/actor_dataset.json
================================================
[
{
"user_input": "here the input of the user",
"completion": "here the model completion"
}
]
================================================
FILE: optimization/chatllama/artifacts/datasets/reward_dataset.json
================================================
[
{
"user_input": "here type the user input",
"completion": "here type the completion",
"score": 4.0
},
{
"user_input": "here type the user input",
"completion": "if score is null, it can be evaluated by davinci using reward_trainer.distill()",
"score": null
}
]
================================================
FILE: optimization/chatllama/artifacts/datasets/rlhf_dataset.json
================================================
[
{
"user_input": "here the example of user input"
}
]
================================================
FILE: optimization/chatllama/artifacts/download_dataset.py
================================================
import argparse
import os
from chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset
if __name__ == "__main__":
    # Command-line entry point: pick one of the supported open-source
    # datasets and ask its dataset class to save it under ``--path``.
    parser = argparse.ArgumentParser(
        prog="download_dataset.py",
        description="Download an open-source dataset for ChatLLaMA training",
    )

    parser.add_argument(
        "dataset_name",
        help=(
            "Name of the dataset to download. "
            "SHP: stanfordnlp/SHP, ARLHF: Anthropic/hh-rlhf"
        ),
        choices=["SHP", "ARLHF"],
    )
    parser.add_argument(
        "-p",
        "--path",
        help="Specify the path for the dataset",
        default="./datasets",
    )
    parser.add_argument(
        "-n",
        "--number_of_samples",
        help="Specify the number of samples for the reward dataset",
        default=200,
    )

    args = parser.parse_args()

    # makedirs (unlike mkdir) also creates missing parent folders, and
    # exist_ok avoids the separate existence check.
    os.makedirs(args.path, exist_ok=True)

    # The option has no argparse ``type``, so a value typed on the command
    # line arrives as a string and is converted here.
    try:
        n_samples = int(args.number_of_samples)
    except ValueError:
        raise ValueError("Number of samples should be an integer")

    if args.dataset_name == "SHP":
        dataset = StanfordNLPSHPDataset()
        dataset.save_dataset(args.path, n_samples)

    elif args.dataset_name == "ARLHF":
        dataset = AnthropicRLHF()
        dataset.save_dataset(
            args.path,
            n_samples,
        )
================================================
FILE: optimization/chatllama/artifacts/extend_rlhf_dataset.py
================================================
import os.path
import numpy as np
from langchain import OpenAI, LLMChain, PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def _get_template_and_variables(prompt: str, with_examples: bool):
    """Return the prompt template text and the names of its input variables.

    Args:
        prompt: Base prompt text.
        with_examples: When true, an ``Example: {example}`` section is
            appended to the prompt and ``example`` becomes an input variable.

    Returns:
        A ``(template, variables)`` tuple, where ``variables`` is a list of
        placeholder names used by ``template``.
    """
    if not with_examples:
        return prompt, []
    return prompt + "\n\nExample: {example}", ["example"]
def use_langchain_model(
    user_prompt: str,
    model_name: str,
    temperature: float = 0.7,
    max_tokens: int = 2048,
    with_examples: bool = False,
) -> LLMChain:
    """Build a LangChain ``LLMChain`` backed by an OpenAI model.

    Args:
        user_prompt: Base prompt text used to build the template.
        model_name: Model name passed to ``OpenAI``.
        temperature: Temperature passed to ``OpenAI``.
        max_tokens: ``max_tokens`` passed to ``OpenAI``.
        with_examples: Whether the template gets an ``{example}`` section.

    Returns:
        The chain combining the OpenAI LLM with the prompt template.
    """
    openai_llm = OpenAI(
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    template_text, variables = _get_template_and_variables(
        user_prompt, with_examples=with_examples
    )
    return LLMChain(
        llm=openai_llm,
        prompt=PromptTemplate(
            template=template_text,
            input_variables=variables,
        ),
    )
class HuggingFaceChain:
    """Small chain-like wrapper around a Hugging Face seq2seq model.

    It offers a ``run(**kwargs)`` method so that it can be used in place of
    a LangChain chain by the dataset extension script.
    """

    def __init__(
        self, model_name: str, user_prompt: str, with_examples: bool = False
    ):
        """Load model and tokenizer and prepare the prompt template.

        Args:
            model_name: Name passed to ``from_pretrained`` for both the
                model and the tokenizer.
            user_prompt: Base prompt text used to build the template.
            with_examples: Whether the template gets an ``{example}``
                section.
        """
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        template_and_variables = _get_template_and_variables(
            user_prompt, with_examples=with_examples
        )
        self.prompt, self.input_variables = template_and_variables

    def run(self, **kwargs):
        """Fill the prompt with ``kwargs``, generate and decode the output.

        Returns:
            The decoded text of the first generated sequence.
        """
        filled_prompt = self.prompt.format(**kwargs)
        encoded_prompt = self.tokenizer.encode(
            filled_prompt, return_tensors="pt"
        )
        generated = self.model.generate(
            encoded_prompt, max_length=100, num_beams=5, early_stopping=True
        )
        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)
def use_huggingface_model(
    user_prompt: str,
    model_name: str,
    with_examples: bool = False,
) -> HuggingFaceChain:
    """Build a ``HuggingFaceChain`` for the given model and prompt.

    Args:
        user_prompt: Base prompt text used to build the template.
        model_name: Name of the Hugging Face model to load.
        with_examples: Whether the template gets an ``{example}`` section.

    Returns:
        The newly created ``HuggingFaceChain``.
    """
    chain = HuggingFaceChain(
        model_name=model_name,
        user_prompt=user_prompt,
        with_examples=with_examples,
    )
    return chain
def main():
    """Extend ``rlhf_training_data.json`` with synthetically generated prompts.

    The script loads the existing prompts from ``<data_dir>/
    rlhf_training_data.json`` and the ``rlhf`` entry of the templates file,
    then repeatedly samples one prompt, asks the selected model to produce a
    new one from it and appends the result to the list. Generated prompts are
    appended to the same list that is sampled from, so they can be picked as
    examples in later iterations. The extended list is written back to the
    same file.

    Raises:
        ValueError: If the dataset is empty, if the templates file has no
            ``rlhf`` entry, or if a non-davinci model name does not contain
            ``t5``.
    """
    import json
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="Model name.",
        default="google/flan-t5-xl",
    )
    # Both paths are mandatory: without them the script would only fail
    # later with an unhelpful TypeError on ``None``.
    parser.add_argument(
        "--templates", type=str, required=True, help="Path to templates."
    )
    parser.add_argument("--num_prompts", type=int, default=1000)
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Path where data are stored",
    )
    args = parser.parse_args()

    model_name = args.model
    dataset_path = os.path.join(args.data_dir, "rlhf_training_data.json")

    with open(dataset_path, "r") as f:
        examples = json.load(f)
    if not examples:
        # np.random.choice cannot sample from an empty sequence
        raise ValueError(
            "The RLHF dataset is empty: at least one example is needed."
        )

    with open(args.templates, "r") as f:
        templates = json.load(f)
    user_prompt = templates.get("rlhf")
    if user_prompt is None:
        raise ValueError("No rlhf template found.")

    if "davinci" in model_name:
        chain = use_langchain_model(
            user_prompt, model_name, with_examples=True
        )
    else:
        if "t5" not in model_name:
            raise ValueError("Only Flan-t5 models are supported for HF.")
        chain = use_huggingface_model(
            user_prompt, model_name, with_examples=True
        )

    for _ in range(args.num_prompts):
        example = np.random.choice(examples)
        new_example = chain.run(example=example["user_input"])
        examples.append({"user_input": new_example})

    with open(dataset_path, "w") as f:
        json.dump(examples, f)


if __name__ == "__main__":
    main()
================================================
FILE: optimization/chatllama/artifacts/generate_actor_dataset.py
================================================
from langchain import OpenAI, LLMChain, PromptTemplate
from langchain.chains.conversation.memory import (
ConversationBufferWindowMemory,
)
from chatllama.langchain_modules.prompt_templates import (
PERSON_CHATBOT_TEMPLATE,
AI_CHATBOT_TEMPLATE,
)
# Number of human/chatbot exchanges generated for each conversation.
CONVERSATION_LENGTH = 20


def create_conversation(human_agent: LLMChain, bot_agent: LLMChain):
    """Let the two agents talk to each other and return the transcript.

    The human agent speaks first, receiving the last chatbot output (an
    empty string on the first turn); the bot agent then answers it.

    Args:
        human_agent: Chain playing the human side.
        bot_agent: Chain playing the chatbot side.

    Returns:
        The conversation as a single string, one ``Human: ...`` or
        ``AI: ...`` entry per line.
    """
    transcript = []
    last_bot_reply = ""
    for _ in range(CONVERSATION_LENGTH):
        # Human agent goes first
        human_turn = human_agent.run(chatbot_input=last_bot_reply)
        last_bot_reply = bot_agent.run(human_input=human_turn)
        transcript.extend(
            (f"Human: {human_turn}", f"AI: {last_bot_reply}")
        )
    return "\n".join(transcript)
def build_agents():
    """Create the human and chatbot agents used to simulate a conversation.

    Both agents share the same OpenAI LLM but each one has its own prompt
    template and its own windowed conversation memory.

    Returns:
        A ``(human_agent, bot_agent)`` tuple of ``LLMChain`` objects.
    """
    # be aware that too long completions will not fit the sequence length
    # of possible critic or reward models ...
    shared_llm = OpenAI(max_tokens=2048, temperature=0.7)

    def _make_agent(template_spec):
        # every agent gets a fresh memory object
        return LLMChain(
            llm=shared_llm,
            prompt=PromptTemplate(**template_spec),
            memory=ConversationBufferWindowMemory(k=4),
        )

    human_agent = _make_agent(PERSON_CHATBOT_TEMPLATE)
    bot_agent = _make_agent(AI_CHATBOT_TEMPLATE)
    return human_agent, bot_agent
def get_sub_conversations(conversation: str, system_prompt: str):
    """Split a conversation into one training sample per chatbot reply.

    The text is cut at every ``AI:`` marker. For each reply, the sample's
    ``user_input`` is ``system_prompt`` followed by everything that came
    before that reply, and ``completion`` is the reply itself, i.e. the text
    up to the next ``Human:`` marker, stripped of surrounding whitespace.

    Args:
        conversation: Full conversation text.
        system_prompt: Text prepended to every ``user_input``.

    Returns:
        A list of ``{"user_input": ..., "completion": ...}`` dictionaries.
    """
    turns = conversation.split("AI:")
    samples = []
    for n_previous, reply_chunk in enumerate(turns[1:], start=1):
        context = system_prompt + "AI:".join(turns[:n_previous])
        reply, _, _ = reply_chunk.partition("Human:")
        samples.append({"user_input": context, "completion": reply.strip()})
    return samples
def main():
    """Generate synthetic conversations and build the actor training dataset.

    For each requested conversation a new pair of agents is built, the
    conversation is generated and saved as ``conversation_<n>.txt`` in the
    output directory. All the saved conversations are then read back, split
    into sub-conversations and dumped to ``actor_training_data.json`` in the
    same directory.

    When a templates file is given, its ``actor`` entry is prepended to
    every ``user_input``; if the file has no such entry, or no file is
    given, an empty prefix is used.
    """
    import json
    import os
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--num_conversations", type=int, default=1000)
    parser.add_argument("--output_dir", type=str, default="conversations")
    parser.add_argument("--templates", type=str, default=None)
    args = parser.parse_args()

    template = ""
    if args.templates is not None:
        with open(args.templates, "r") as f:
            templates = json.load(f)
        # A templates file is not required to define every key (it may
        # only contain e.g. "rlhf"), so fall back to the empty prefix
        # instead of failing with a KeyError.
        template = templates.get("actor") or ""

    os.makedirs(args.output_dir, exist_ok=True)

    for conv in range(args.num_conversations):
        human_agent, bot_agent = build_agents()
        conversation = create_conversation(human_agent, bot_agent)
        with open(
            os.path.join(args.output_dir, f"conversation_{conv}.txt"), "w"
        ) as f:
            f.write(conversation)

    # convert the conversations to a single json file
    data = []
    for conv in range(args.num_conversations):
        with open(
            os.path.join(args.output_dir, f"conversation_{conv}.txt"), "r"
        ) as f:
            conversation = f.read()
        sub_conversations = get_sub_conversations(conversation, template)
        data.extend(sub_conversations)

    with open(
        os.path.join(args.output_dir, "actor_training_data.json"), "w"
    ) as f:
        json.dump(data, f)


if __name__ == "__main__":
    main()
================================================
FILE: optimization/chatllama/artifacts/generate_rewards.py
================================================
import argparse
import json
from langchain import OpenAI, LLMChain, PromptTemplate
class ScoreGenerator:
    """Score the entries of a reward dataset with an OpenAI LLM via LangChain."""

    def __init__(
        self,
        llm_model: str,
        llm_temperature: float,
        llm_max_tokens: int,
        reward_template: dict,
    ) -> None:
        """Create the LLM chain used to produce the scores.

        Args:
            llm_model: Model name passed to ``OpenAI``.
            llm_temperature: Temperature passed to ``OpenAI``.
            llm_max_tokens: ``max_tokens`` passed to ``OpenAI``; it is also
                used by ``distill`` to skip prompts estimated as too long.
            reward_template: Keyword arguments expanded into
                ``PromptTemplate``.
        """
        # Values coming from the command line may still be strings; coerce
        # them so the arithmetic in ``distill`` cannot fail on str * float.
        self.llm_max_tokens = int(llm_max_tokens)
        self.llm_temperature = float(llm_temperature)
        self.llm_model = llm_model

        # initialize LLM and LangChain
        openai_llm = OpenAI(
            model_name=llm_model,
            temperature=self.llm_temperature,
            max_tokens=self.llm_max_tokens,
        )

        # Customize your own Reward template by changing the
        # prompt_template
        prompt_template = PromptTemplate(**reward_template)
        print(prompt_template)
        self.llm = LLMChain(llm=openai_llm, prompt=prompt_template)

    def distill(
        self,
        dataset_path: str,
    ) -> None:
        """Parse the dataset and assign scores using LLMs
        then save back the dataset with the updated scores.

        Entries that already have a score (including ``0``) are kept as
        they are. Entries whose prompt is estimated to be too long, or whose
        LLM answer cannot be converted to a float, remain without a score
        and are removed from the saved dataset.

        Args:
            dataset_path: Path of the JSON dataset, read and then
                overwritten in place.
        """
        print("Assigning scores to the reward dataset...")

        # load the dataset
        with open(dataset_path, "r") as f:
            train_data = json.load(f)

        # 80% of the max length as safety margin
        token_budget = self.llm_max_tokens * 0.8

        # for each element of the dataset, assign a score.
        for i, data in enumerate(train_data):
            if data.get("score", None) is not None:
                continue

            user_input = data["user_input"]
            completion = data["completion"]
            print(
                f"#### Data {i}:\n"
                f"#### User_input:\n {user_input}\n"
                f"#### Completion:\n {completion}\n"
            )

            # rough token estimate: number of space-separated words / 0.75
            prompt_text = user_input + completion + self.llm.prompt.template
            prompt_len = int(len(prompt_text.split(" ")) / 0.75)
            if prompt_len > token_budget:
                print(
                    f"The prompt of the data {i} is too long\n"
                    f"tokens: {prompt_len}\n"
                    f"max_tokens: {token_budget}"
                )
                continue

            score = self.llm.run(
                user_input=user_input,
                completion=completion,
            ).strip()
            # TODO: extract from score the float value with a regex
            try:
                score = float(score)
            except (TypeError, ValueError):
                print(
                    f"The score returned by the LLM for the "
                    f"data {i} is not a float:\n{score}"
                )
                continue
            data["score"] = score
            print(f"### Score: {score} \n\n")

        # remove all the data that have no score; compare against None so
        # that a legitimate score of 0 is not discarded
        train_data = [
            data for data in train_data if data.get("score", None) is not None
        ]

        # save the dataset back
        print("Writing the updated dataset back to disk ... ")
        with open(dataset_path, "w") as f:
            json.dump(train_data, f)

        print("Score Assignment Completed")
if __name__ == "__main__":
    # Command-line entry point: score the dataset given as first argument
    # using the default reward template or a custom one.

    # Default prompt. Each fragment ends with a space or a newline so that
    # the implicit string concatenation does not glue words together.
    REWARD_TEMPLATE = dict(
        template=(
            "You have to evaluate the following chat with a score "
            "between 0 and 5.\n"
            "You MUST evaluate: text quality, content quality and "
            "coherence.\n"
            "You MUST return only the number that represents your "
            "judgment.\n"
            "The input of the user is: {user_input}\n"
            "The output of the chatbot is: {completion}\n"
            "The score is:\n"
        ),
        input_variables=["user_input", "completion"],
    )

    # Setup argument parser
    parser = argparse.ArgumentParser(
        prog="generate_rewards.py",
        description="Generate rewards using LangChain and LLMs",
    )

    parser.add_argument("dataset_path", help="Path to the dataset")
    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to be used",
        default="text-davinci-003",
    )
    # Explicit types: otherwise values typed on the command line reach
    # ScoreGenerator as strings.
    parser.add_argument(
        "-t",
        "--temperature",
        help="Specify the temperature of the score assignment",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "-k",
        "--max_tokens",
        help="Specify the max tokens of the score assignment",
        type=int,
        default=2048,
    )
    parser.add_argument(
        "-r",
        "--reward_template",
        help=(
            "Path to the JSON file containing the reward template "
            "(a raw JSON string is also accepted)"
        ),
        default=None,
    )

    # parse arguments
    args = parser.parse_args()

    rw_template = REWARD_TEMPLATE
    if args.reward_template:
        # The option is meant to be a path to the templates file; a value
        # that cannot be opened as a file is parsed as a JSON string, which
        # is how the option was interpreted before.
        try:
            with open(args.reward_template, "r") as f:
                templates = json.load(f)
        except OSError:
            templates = json.loads(args.reward_template)

        # NOTE(review): ScoreGenerator expands the template with
        # ``PromptTemplate(**reward_template)``, so the "reward" entry
        # presumably has to be a mapping with ``template`` and
        # ``input_variables`` keys rather than a plain string - confirm.
        custom_template = templates.get("reward", None)
        if custom_template is not None:
            rw_template = custom_template

    score_generator = ScoreGenerator(
        args.model, args.temperature, args.max_tokens, rw_template
    )

    score_generator.distill(args.dataset_path)
================================================
FILE: optimization/chatllama/artifacts/main.py
================================================
import argparse
from chatllama.rlhf.actor import ActorTrainer
from chatllama.rlhf.config import Config
from chatllama.rlhf.dataset import BaseDataset
from chatllama.rlhf.reward import RewardTrainer
from chatllama.rlhf.trainer import RLTrainer
# Setup argument parser
parser = argparse.ArgumentParser(
    prog="main.py", description="RLHF Training of ChatBots"
)

parser.add_argument("configfile", help="Path to config.yaml file")
parser.add_argument(
    "-t",
    "--type",
    help=(
        "Specify the training type. "
        "RL: Training of the model using RL. "
        "ACTOR: Training of the actor model. "
        "REWARD: Training of the reward model. "
        "ALL: The whole pipeline with the three training steps."
    ),
    default="ALL",
    choices=["ALL", "RL", "ACTOR", "REWARD"],
)
parser.add_argument(
    "-a", "--actor", help="Specify actor model by name", default=None
)
parser.add_argument(
    "-r", "--reward", help="Specify reward model by name", default=None
)
parser.add_argument(
    "--local_rank",
    help="Local rank parameter for deepspeed",
    default=None,
)

# parse arguments
args = parser.parse_args()

# load config.yaml with all the project informations
config = Config(args.configfile)

# overwrite config if specified differently
if args.actor is not None:
    config.actor.model = args.actor
if args.reward is not None:
    config.reward.model = args.reward

# perform the desired training
if args.type == "RL":
    # the actor sequence length is capped to the shortest sequence length
    # among actor, reward and critic
    max_seq = min(
        config.actor.max_sequence_length,
        config.reward.max_sequence_length,
        config.critic.max_sequence_length,
    )
    config.actor.max_sequence_length = max_seq
    BaseDataset.clean_dataset(config)
    rlhf_trainer = RLTrainer(config)
    rlhf_trainer.train()
elif args.type == "ACTOR":
    BaseDataset.clean_dataset(config.actor)
    actor_trainer = ActorTrainer(config.actor)
    actor_trainer.train()
elif args.type == "REWARD":
    BaseDataset.clean_dataset(config.reward)
    reward_trainer = RewardTrainer(config.reward)
    reward_trainer.train()
elif args.type == "ALL":
    # NOTE(review): unlike the single-step branches, the full pipeline
    # neither cleans the datasets nor caps the actor sequence length -
    # confirm this is intended.
    reward_trainer = RewardTrainer(config.reward)
    reward_trainer.train()
    actor_trainer = ActorTrainer(config.actor)
    actor_trainer.train()
    rlhf_trainer = RLTrainer(config)
    rlhf_trainer.train()
================================================
FILE: optimization/chatllama/artifacts/templates.json
================================================
{
"rlhf": "You are an AI assistant used to generate possible prompts instructions for a chatbot, here is an example of conversation."
}
================================================
FILE: optimization/chatllama/chatllama/__init__.py
================================================
================================================
FILE: optimization/chatllama/chatllama/langchain_modules/__init__.py
================================================
================================================
FILE: optimization/chatllama/chatllama/langchain_modules/prompt_templates.py
================================================
# Prompt asking an LLM to rate a completion with a number between 0 and 5.
# Each fragment ends with a space or a newline so that the implicit string
# concatenation does not glue words together.
REWARD_TEMPLATE = dict(
    template=(
        "You have to evaluate the following chat with a score "
        "between 0 and 5.\n"
        "You MUST evaluate: text quality, content quality and "
        "coherence.\n"
        "You MUST return only the number that represents your "
        "judgment.\n"
        "The assignment is:\n{user_input}\n"
        "The completion is:\n{completion}\n"
    ),
    input_variables=["user_input", "completion"],
)
# Prompt for the chatbot side of a simulated conversation. The text is
# assembled paragraph by paragraph; ``{history}`` and ``{human_input}`` are
# the two placeholders filled at run time.
AI_CHATBOT_TEMPLATE = {
    "template": "".join(
        (
            # what the assistant is
            "Assistant is a large language model trained by Meta and Nebuly.ai\n"
            "Assistant is designed to be able to assist with a wide range of "
            "tasks, from answering simple questions to providing in-depth "
            "explanations and discussions on a wide range of topics. As a "
            "language model, Assistant is able to generate human-like text "
            "based on the input it receives, allowing it to engage in "
            "natural-sounding conversations and provide responses that are "
            "coherent and relevant to the topic at hand.\n\n",
            # how it evolves
            "Assistant is constantly learning and improving, and its capabilities "
            "are constantly evolving. It is able to process and understand large "
            "amounts of text, and can use this knowledge to provide accurate and "
            "informative responses to a wide range of questions. Additionally, "
            "Assistant is able to generate its own text based on the input it "
            "receives, allowing it to engage in discussions and provide "
            "explanations and descriptions on a wide range of topics.\n\n",
            # summary followed by the conversation history
            "Overall, Assistant is a powerful tool that can help with a wide "
            "range of tasks and provide valuable insights and information on a "
            "wide range of topics. Whether you need help with a specific "
            "question or just want to have a conversation about a particular "
            "topic, Assistant is here to assist.\n\n{history}\n\n",
            # current turn
            "Human: {human_input}\n",
            "Assistant:",
        )
    ),
    "input_variables": ["history", "human_input"],
}
# Prompt used when the model plays the human side of the conversation: a
# description of the role, followed by the conversation history and the
# latest chatbot turn. Every fragment that continues a sentence ends with an
# explicit space (the previous version produced "questions andsee").
PERSON_CHATBOT_TEMPLATE = dict(
    template=(
        "You are a human chatting with a chatbot. The chatbot is a large "
        "language model trained by Meta and Nebuly-ai\n"
        "The chatbot is designed to be able to assist you with a wide range "
        "of tasks, from answering simple questions to providing in-depth "
        "explanations and discussions on a wide range of topics. You are a "
        "human and you are testing the chatbot. Ask the chatbot questions and "
        "see how it responds. You can also ask the chatbot to tell you a "
        "story."
        "\n\n{history}\n\n"
        "Chatbot: {chatbot_input}\n"
        "Human:"
    ),
    # placeholders that must be supplied when the template is formatted
    input_variables=["history", "chatbot_input"],
)
================================================
FILE: optimization/chatllama/chatllama/llama_model.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms
# of the GNU General Public License version 3.
import json
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple, List, Union, Optional
import deepspeed
import torch
import torch.distributed
import torch.nn.functional as F
import fairscale.nn.model_parallel.initialize as fs_init
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from fairscale.nn.model_parallel.layers import (
ParallelEmbedding,
RowParallelLinear,
ColumnParallelLinear,
)
from torch import nn
from transformers import AutoTokenizer
from llama import Tokenizer
from llama.generation import sample_top_p
class MyTokenizer:
    """Hugging Face tokenizer wrapped to expose the same interface as the
    Meta tokenizer (`n_words`, `bos_id`, `eos_id`, `pad_id`, `encode`,
    `decode`). Just used for testing purposes.
    """

    def __init__(self, model_path: Optional[str] = None):
        # fall back to the GPT-2 tokenizer when no path is provided
        source = "gpt2" if model_path is None else model_path
        self.sp_model = AutoTokenizer.from_pretrained(source)
        hf_tokenizer = self.sp_model
        self.n_words = hf_tokenizer.vocab_size
        self.bos_id = hf_tokenizer.bos_token_id
        self.eos_id = hf_tokenizer.eos_token_id
        # the EOS token id is reused as padding id
        self.pad_id = hf_tokenizer.eos_token_id

    def encode(
        self,
        s: str,
        bos: bool = True,
        eos: bool = True,
        truncation: bool = True,
    ) -> List[int]:
        """Tokenize `s`, optionally wrapping the ids with BOS / EOS ids."""
        token_ids = list(self.sp_model.encode(s, truncation=truncation))
        prefix = [self.bos_id] if bos else []
        suffix = [self.eos_id] if eos else []
        return prefix + token_ids + suffix

    def decode(self, t: List[int]) -> str:
        """Convert a list of token ids back to text."""
        return self.sp_model.decode(torch.as_tensor(t))
class HFLikeTokenizer:
    """Adapter exposing a Meta-style tokenizer through a Hugging Face-like
    interface: calling the object returns a dict with `input_ids` and
    `attention_mask` tensors.
    """

    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer
        # special ids copied from the wrapped tokenizer
        self.pad_id = tokenizer.pad_id
        self.eos_id = tokenizer.eos_id
        self.bos_id = tokenizer.bos_id
        # same ids under the attribute names used by hugging face
        self.eos_token_id = tokenizer.eos_id
        self.pad_token_id = self.pad_id

    def create_sequence_mask(self, tokens: torch.Tensor) -> torch.Tensor:
        """Return a tensor shaped like `tokens` holding 0 where the token is
        the pad, bos or eos id and 1 everywhere else.
        """
        zeros = torch.zeros_like(tokens)
        mask = torch.ones_like(tokens)
        special_ids = (
            self.tokenizer.pad_id,
            self.tokenizer.bos_id,
            self.tokenizer.eos_id,
        )
        for special_id in special_ids:
            mask = torch.where(tokens == special_id, zeros, mask)
        return mask

    def __call__(self, texts: Union[List[str], str], *args, **kwargs):
        """Tokenize a string or a list of strings.

        A single string gives 1-D tensors with an all-ones mask. A list is
        left-padded to the longest sequence and gets a mask that is zero on
        pad, bos and eos positions. Extra positional and keyword arguments
        are accepted and ignored.
        """
        pad_id = self.tokenizer.pad_id
        if isinstance(texts, str):
            encoded = self.tokenizer.encode(texts, bos=True, eos=True)
            tokens = torch.tensor(encoded).long()
            mask = torch.ones_like(tokens)
        else:
            encoded_batch = [
                self.tokenizer.encode(text, bos=True, eos=True)
                for text in texts
            ]
            longest = max(len(ids) for ids in encoded_batch)
            tokens = torch.full((len(encoded_batch), longest), pad_id).long()
            # left padding: every sequence is aligned to the right border
            for row, ids in enumerate(encoded_batch):
                tokens[row, -len(ids) :] = torch.tensor(  # noqa E203
                    ids
                ).long()
            # TODO: decide how eos and bos should be handled - i need to mask
            # them? or not?
            mask = self.create_sequence_mask(tokens)
            # re-pack the unmasked tokens right before the last position
            for row in range(tokens.shape[0]):
                content = tokens[row, mask[row] == 1]
                tokens[
                    row, -len(content) - 1 : -1  # noqa E203
                ] = content
            mask = self.create_sequence_mask(tokens)
        # convert `pad_id` from -1 to 0, otherwise embedding will cause out
        # of bounds.
        tokens = torch.where(
            tokens == pad_id, torch.zeros_like(tokens), tokens
        )
        return {"input_ids": tokens, "attention_mask": mask}

    def decode(self, tokens):
        """Delegate decoding to the wrapped tokenizer."""
        return self.tokenizer.decode(tokens)
@dataclass
class ModelArgs:
    """Hyper-parameters of the transformer.

    Modified version of the ModelArgs class implemented in the LLaMA repo:
    the original one only supports inference, this one carries two extra
    switches (`froze_embeddings`, `use_fairscale`) used for training.
    """

    # width of the model and number of blocks / attention heads
    dim: int = 512
    n_layers: int = 8
    n_heads: int = 8
    # defined later by tokenizer
    vocab_size: int = -1
    # make SwiGLU hidden layer size multiple of large power of 2
    multiple_of: int = 256
    # epsilon used by the RMSNorm layers
    norm_eps: float = 1e-5
    # upper bounds used to size the key/value caches
    max_batch_size: int = 32
    max_seq_len: int = 1024
    # added attributes
    # when True the token embeddings are excluded from gradient updates
    froze_embeddings: bool = True
    # when True fairscale model-parallel layers replace the torch.nn ones
    use_fairscale: bool = True
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization over the last dimension with a
    learnable per-feature scale.

    Modified version of the RMSNorm class implemented in the LLaMA repo,
    adapted for training (the original one just supports inference).
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # scale initialised to ones
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # eps keeps rsqrt finite when the mean of squares is zero
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        # normalise in float32, then cast back to the input dtype
        normalised = self._norm(x.float()).type_as(x)
        return normalised * self.weight
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the complex rotary factors for positions 0..end-1.

    Returns a complex64 tensor of shape (end, dim // 2) whose entry
    [p, j] is exp(i * p / theta ** (2j / dim)).
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    # unit-modulus complex numbers with the computed angles (complex64)
    return torch.polar(torch.ones_like(angles), angles)
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """View `freqs_cis` so that it broadcasts against `x`.

    `freqs_cis` must have shape (x.shape[1], x.shape[-1]); the returned view
    keeps those two sizes on axes 1 and -1 and uses size 1 on every other
    axis.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    last_axis = ndim - 1
    target_shape = [
        size if axis in (1, last_axis) else 1
        for axis, size in enumerate(x.shape)
    ]
    return freqs_cis.view(*target_shape)
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate queries and keys with the precomputed complex factors.

    Consecutive pairs on the last axis are read as complex numbers,
    multiplied by `freqs_cis` and flattened back; the results keep the dtype
    of the corresponding input.
    """

    def _as_complex(t: torch.Tensor) -> torch.Tensor:
        # (..., d) -> (..., d / 2) complex, computed in float32
        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))

    q_complex = _as_complex(xq)
    k_complex = _as_complex(xk)
    rotation = reshape_for_broadcast(freqs_cis, q_complex)
    q_rotated = torch.view_as_real(q_complex * rotation).flatten(3)
    k_rotated = torch.view_as_real(k_complex * rotation).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)
class Attention(nn.Module):
    """This class is a modification of the Attention class implemented in
    the LLaMA repo. The class has been modified for training, since the
    original one just supports inference.

    The key/value cache is external: when `cache_k` / `cache_v` are passed to
    `forward`, the new keys/values are written into them, attention is
    computed on them and the (possibly moved) caches are returned so that the
    caller can store them.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        # with fairscale the heads are split across model-parallel ranks
        if args.use_fairscale:
            self.n_local_heads = (
                args.n_heads // fs_init.get_model_parallel_world_size()
            )
        else:
            self.n_local_heads = args.n_heads
        self.head_dim = args.dim // args.n_heads
        if args.use_fairscale:
            self.wq = ColumnParallelLinear(
                args.dim,
                args.n_heads * self.head_dim,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )
            self.wk = ColumnParallelLinear(
                args.dim,
                args.n_heads * self.head_dim,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )
            self.wv = ColumnParallelLinear(
                args.dim,
                args.n_heads * self.head_dim,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )
            self.wo = RowParallelLinear(
                args.n_heads * self.head_dim,
                args.dim,
                bias=False,
                input_is_parallel=True,
                init_method=lambda x: x,
            )
        else:
            self.wq = nn.Linear(
                args.dim, args.n_heads * self.head_dim, bias=False
            )
            self.wk = nn.Linear(
                args.dim, args.n_heads * self.head_dim, bias=False
            )
            self.wv = nn.Linear(
                args.dim, args.n_heads * self.head_dim, bias=False
            )
            self.wo = nn.Linear(
                args.n_heads * self.head_dim, args.dim, bias=False
            )
        self.dim_cache = (
            args.max_batch_size,
            args.max_seq_len,
            self.n_local_heads,
            self.head_dim,
        )
        # Internal buffers kept only so that existing attribute accesses keep
        # working; `forward` exclusively uses the caches it receives.
        self.cache_k = torch.zeros(self.dim_cache).cuda()
        self.cache_v = torch.zeros(self.dim_cache).cuda()

    def forward(
        self,
        x: torch.Tensor,
        kv_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        cache_k: Optional[torch.Tensor] = None,
        cache_v: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute multi-head attention over `x`.

        Args:
            x (torch.Tensor): Input of shape (bsz, seqlen, dim)
            kv_mask (torch.Tensor): Additive mask summed to the attention
                scores; skipped when None
            freqs_cis (torch.Tensor): Rotary factors for the positions in `x`
            cache_k (Optional[torch.Tensor]): External key cache, or None to
                disable caching (training)
            cache_v (Optional[torch.Tensor]): External value cache, or None

        Returns:
            The attention output followed by the key and value caches that
            were used (both None when caching is disabled).

        Raises:
            ValueError: If only one of `cache_k` / `cache_v` is provided.
        """
        start_pos = 0  # Temporary
        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
        # Modified code to allow training, caching is not good for training
        if (cache_k is None) != (cache_v is None):
            raise ValueError(
                "cache_k and cache_v must be either both provided or both "
                "None"
            )
        if cache_k is None:
            keys = xk
            values = xv
        else:
            # Tensor.to is not in-place: rebind, otherwise the caches stay on
            # their original device/dtype
            cache_k = cache_k.to(device=xk.device, dtype=xk.dtype)
            cache_v = cache_v.to(device=xv.device, dtype=xv.dtype)
            cache_k[:bsz, start_pos : start_pos + seqlen] = xk  # noqa E203
            cache_v[:bsz, start_pos : start_pos + seqlen] = xv  # noqa E203
            # read back from the caches that were just written (not from the
            # module-level buffers, which are never updated)
            keys = cache_k[:bsz, : start_pos + seqlen]  # noqa E203
            values = cache_v[:bsz, : start_pos + seqlen]  # noqa E203
        # (bsz, seqlen, heads, head_dim) -> (bsz, heads, seqlen, head_dim)
        xq = xq.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(
            self.head_dim
        )
        if kv_mask is not None:
            scores = scores + kv_mask
        # softmax in float32, then back to the query dtype
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)
        output = torch.matmul(scores, values)
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
        # both caches are None when caching is disabled
        return self.wo(output), cache_k, cache_v
class FeedForward(nn.Module):
    """Gated feed-forward layer computing w2(silu(w1(x)) * w3(x)).

    Modified version of the FeedForward class implemented in the LLaMA
    repo, adapted for training (the original one just supports inference).
    """

    def __init__(
        self, dim: int, hidden_dim: int, multiple_of: int, use_fairscale: bool
    ):
        super().__init__()
        # take 2/3 of the requested width, then round it up to the next
        # multiple of `multiple_of`
        reduced = int(2 * hidden_dim / 3)
        n_chunks = (reduced + multiple_of - 1) // multiple_of
        inner_dim = multiple_of * n_chunks
        if not use_fairscale:
            self.w1 = nn.Linear(dim, inner_dim, bias=False)
            self.w2 = nn.Linear(inner_dim, dim, bias=False)
            self.w3 = nn.Linear(dim, inner_dim, bias=False)
            return

        def _keep_init(weight):
            # the fairscale layers are created without re-initialisation
            return weight

        self.w1 = ColumnParallelLinear(
            dim,
            inner_dim,
            bias=False,
            gather_output=False,
            init_method=_keep_init,
        )
        self.w2 = RowParallelLinear(
            inner_dim,
            dim,
            bias=False,
            input_is_parallel=True,
            init_method=_keep_init,
        )
        self.w3 = ColumnParallelLinear(
            dim,
            inner_dim,
            bias=False,
            gather_output=False,
            init_method=_keep_init,
        )

    def forward(self, x):
        gate = F.silu(self.w1(x))
        projected = self.w3(x)
        return self.w2(gate * projected)
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers,
    each wrapped in a residual connection.

    Modified version of the TransformerBlock class implemented in the LLaMA
    repo, adapted for training (the original one just supports inference).
    """

    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.layer_id = layer_id
        self.use_fairscale = args.use_fairscale
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        # sub-modules are registered in this order: attention, feed_forward,
        # attention_norm, ffn_norm
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            use_fairscale=args.use_fairscale,
        )
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        cache_k: Optional[torch.Tensor] = None,
        cache_v: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Run the block and return its output together with the key/value
        caches handed back by the attention layer.
        """
        # modified from orignal code to enable external cache
        # add a head axis to the mask and replicate it for every local head
        if self.use_fairscale:
            local_heads = (
                self.n_heads // fs_init.get_model_parallel_world_size()
            )
        else:
            local_heads = self.n_heads
        per_head_mask = attention_mask[:, None, :, :].expand(
            -1, local_heads, -1, -1
        )
        normed_input = self.attention_norm(x)
        attn, cache_k, cache_v = self.attention.forward(
            normed_input, per_head_mask, freqs_cis, cache_k, cache_v
        )
        # residual connections around both sub-layers
        h = x + attn
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out, cache_k, cache_v
class Transformer(nn.Module):
    """This class is a modification of the Transformer class implemented in
    the LLaMA repo. The class has been modified for training, since the
    original one just supports inference. The generate method was inspired by
    the generate function you can find in `llama.generation`.
    """

    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers
        # with fairscale the heads are split across model-parallel ranks
        if params.use_fairscale:
            self.n_local_heads = (
                params.n_heads // fs_init.get_model_parallel_world_size()
            )
        else:
            self.n_local_heads = params.n_heads
        self.head_dim = params.dim // params.n_heads
        dim = (
            params.max_batch_size,
            params.max_seq_len,
            self.n_local_heads,
            self.head_dim,
        )
        # one external key/value cache per layer, handed to the blocks when
        # the model is not in training mode
        self.cache_k = [torch.zeros(dim) for _ in range(self.n_layers)]
        self.cache_v = [torch.zeros(dim) for _ in range(self.n_layers)]
        if params.use_fairscale:
            self.tok_embeddings = ParallelEmbedding(
                params.vocab_size, params.dim, init_method=lambda x: x
            )
        else:
            self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        if params.froze_embeddings:
            # embeddings are excluded from the gradient computation
            for param in self.tok_embeddings.parameters():
                param.requires_grad = False
        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        if params.use_fairscale:
            self.output = ColumnParallelLinear(
                params.dim,
                params.vocab_size,
                bias=False,
                init_method=lambda x: x,
            )
        else:
            self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
        # TODO: How too modify this for training?
        self.freqs_cis = precompute_freqs_cis(
            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
        )

    def forward(
        self, tokens: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Return the logits for every position of `tokens`.

        Args:
            tokens (torch.Tensor): Token ids of shape (bsz, seqlen)
            attention_mask (torch.Tensor): Mask of shape (bsz, seqlen)
        """
        attention_mask = attention_mask.detach()
        logits = self._forward(tokens, attention_mask)
        return logits

    def _forward(
        self, tokens: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        # TEMPORARY FIX, need to understand how to manage the positioning
        # embedding and the batch size with the current padding and masking.
        start_pos = 1
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]  # noqa E203
        # mask has size (bsz, seqlen). It should be transformed in
        # (bsz, seqlen, seqlen)
        # if the mask is a boolean tensor, convert it to int
        if attention_mask.dtype == torch.bool:
            attention_mask = attention_mask.long()
        kv_mask = attention_mask[:, None, :].expand(_bsz, seqlen, seqlen)
        # keep only the lower triangle (causal masking), then flip it so
        # that 1 marks the positions to be masked out
        kv_mask = torch.tril(kv_mask, diagonal=0)
        kv_mask = 1 - kv_mask
        # masked positions get the smallest int64 value, summed to the scores
        kv_mask = (
            torch.where(
                kv_mask == 1, kv_mask.new_tensor(-9223372036854775808), kv_mask
            )
            .detach()
            .long()
        )
        for i, layer in enumerate(self.layers):
            if self.training:
                # no caching during training
                h, _, _ = layer(h, kv_mask, freqs_cis)
                continue
            h, cache_k, cache_v = layer(
                h, kv_mask, freqs_cis, self.cache_k[i], self.cache_v[i]
            )
            self.cache_k[i] = cache_k.detach()
            self.cache_v[i] = cache_v.detach()
        h = self.norm(h)
        output = self.output(h)
        return output

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        max_new_tokens: int,
        temperature: float,
        top_p: float = 1.0,
        no_repeat_ngram_size=None,
    ):
        """Generate `max_new_tokens` tokens after the prompt.

        Args:
            input_ids (torch.Tensor): Prompt token ids of shape (bsz, seqlen)
            attention_mask (torch.Tensor): Mask of shape (bsz, seqlen)
            max_new_tokens (int): Number of tokens to generate
            temperature (float): Sampling temperature; with a value <= 0 the
                most likely token is taken
            top_p (float): Nucleus sampling threshold
            no_repeat_ngram_size: Accepted but not used by this method

        Returns:
            torch.Tensor: Tensor of shape (bsz, seqlen + max_new_tokens)
                holding the prompt followed by the generated tokens, each
                generated token appearing exactly once.
        """
        for _ in range(max_new_tokens):
            # the whole sequence is re-processed at every step
            logits = self._forward(input_ids, attention_mask)[:, -1, :]
            if temperature > 0:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits, dim=-1)
            next_token = next_token.reshape(-1)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
            attention_mask = torch.cat(
                [attention_mask, torch.ones_like(next_token).unsqueeze(1)],
                dim=1,
            )
        # `input_ids` already contains every generated token: concatenating
        # them again would duplicate the completion
        return input_ids
def setup_model_parallel() -> Tuple[int, int]:
    """Initialise torch.distributed (NCCL) and fairscale model parallelism.

    LOCAL_RANK and WORLD_SIZE are read from the environment (-1 when unset).

    Returns:
        Tuple[int, int]: local rank and world size
    """
    env = os.environ
    local_rank = int(env.get("LOCAL_RANK", -1))
    world_size = int(env.get("WORLD_SIZE", -1))
    print("local_rank:", local_rank, "world_size:", world_size)

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size
def setup_model_deepspeed() -> Tuple[int, int]:
    """Initialise the distributed backend through DeepSpeed.

    LOCAL_RANK and WORLD_SIZE are read from the environment (-1 when unset).

    Returns:
        Tuple[int, int]: local rank and world size
    """
    env = os.environ
    local_rank = int(env.get("LOCAL_RANK", -1))
    world_size = int(env.get("WORLD_SIZE", -1))

    deepspeed.init_distributed()
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size
def load_checkpoints(
    ckpt_dir: str, local_rank: int, world_size: int
) -> Tuple[dict, dict]:
    """Load the checkpoint shard of this rank and the model parameters.

    Args:
        ckpt_dir (str): Folder holding the `*.pth` shards and `params.json`
        local_rank (int): Index of the shard to load (shards are sorted by
            path)
        world_size (int): Must be equal to the number of shards

    Returns:
        Tuple[dict, dict]: The checkpoint loaded on CPU and the content of
            `params.json`
    """
    folder = Path(ckpt_dir)
    checkpoints = sorted(folder.glob("*.pth"))
    assert world_size == len(checkpoints), (
        f"Loading a checkpoint for MP={len(checkpoints)} but world "
        f"size is {world_size}"
    )
    print("Loading")
    checkpoint = torch.load(checkpoints[local_rank], map_location="cpu")
    with open(folder / "params.json", "r") as f:
        params = json.load(f)
    return checkpoint, params
def load_model(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    froze_embeddings: bool,
    use_fairscale: bool,
    max_batch_size: int = 32,
) -> Tuple[Transformer, HFLikeTokenizer]:
    """Build the LLaMA model from a checkpoint folder.

    Args:
        ckpt_dir (str): Folder with the checkpoint shards and `params.json`
        tokenizer_path (str): Path of the tokenizer model
        local_rank (int): Rank of the current process
        world_size (int): Number of processes / checkpoint shards
        froze_embeddings (bool): Stored in the model args
        use_fairscale (bool): Stored in the model args
        max_batch_size (int): Maximum batch size stored in the model args

    Returns:
        Tuple[Transformer, HFLikeTokenizer]: The model with the checkpoint
            weights loaded and the tokenizer wrapped in the hugging face-like
            interface
    """
    checkpoint, params = load_checkpoints(ckpt_dir, local_rank, world_size)
    model_args: ModelArgs = ModelArgs(
        max_seq_len=1024, max_batch_size=max_batch_size, **params
    )
    model_args.froze_embeddings = froze_embeddings
    model_args.use_fairscale = use_fairscale
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    # the model is built with CUDA half tensors as default type; the default
    # is a process-wide setting, so it is restored even when the
    # construction fails
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    try:
        model = Transformer(model_args)
    finally:
        torch.set_default_tensor_type(torch.FloatTensor)
    # strict=False: keys that do not match between checkpoint and model are
    # ignored instead of raising
    model.load_state_dict(checkpoint, strict=False)
    tokenizer = HFLikeTokenizer(tokenizer)
    return model, tokenizer
def load_tokenizer(tokenizer_path: str):
    """Return a `Tokenizer` built from the model file at `tokenizer_path`."""
    return Tokenizer(model_path=tokenizer_path)
def load_tokenizer_test(tokenizer_path: Optional[str] = None):
    """Return the `MyTokenizer` test tokenizer for `tokenizer_path`."""
    return MyTokenizer(model_path=tokenizer_path)
def load_model_test(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    froze_embeddings: bool,
    use_fairscale: bool,
    max_batch_size: int = 32,
) -> Tuple[Transformer, HFLikeTokenizer]:
    """Build a model with default `ModelArgs` and the `MyTokenizer` test
    tokenizer.

    The signature mirrors `load_model`, but `ckpt_dir`, `local_rank`,
    `world_size` and `max_batch_size` are not used: no checkpoint is loaded.
    """
    # test the model with hf tokenizer
    test_args = ModelArgs()
    test_args.froze_embeddings = froze_embeddings
    test_args.use_fairscale = use_fairscale
    raw_tokenizer = MyTokenizer(model_path=tokenizer_path)
    test_args.vocab_size = raw_tokenizer.n_words
    model = Transformer(test_args).cuda()
    return model, HFLikeTokenizer(raw_tokenizer)
================================================
FILE: optimization/chatllama/chatllama/rlhf/__init__.py
================================================
"""RLHF implementation inspired by Lucidrains' implementation."""
================================================
FILE: optimization/chatllama/chatllama/rlhf/actor.py
================================================
import json
import yaml
import os
import shutil
import deepspeed
import torch
from accelerate import Accelerator
from beartype import beartype
from beartype.typing import Tuple
from einops import rearrange
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader, Dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
)
from chatllama.rlhf.config import ConfigActor
from chatllama.rlhf.model_list import (
hf_models_causal_lm,
llama_models,
hf_models,
)
from chatllama.rlhf.model_loader import ModelLoader
from chatllama.rlhf.utils import TrainingStats
class ActorModel(torch.nn.Module):
    """Actor model that generates the augmented prompt from the initial
    user_input. The aim is to train this model to generate better prompts.

    Attributes:
        model: The underlying language model (LLaMA or a Hugging Face causal
            LM, optionally wrapped with PEFT)
        tokenizer: The tokenizer matching the model
        config (ConfigActor): Configuration for the actor model

    Methods:
        load: Load the model from a path
        save: Save the model to a path
        forward: Compute the action logits for a given sequence.
        generate: Generate a sequence from a given prompt
    """

    def __init__(self, config: ConfigActor) -> None:
        super().__init__()
        # save config
        self.config = config
        # initialize the self.model
        if config.model in llama_models:
            # llama module might not be present when HF models are used
            from chatllama.llama_model import (
                load_model,
                setup_model_parallel,
            )  # noqa

            local_rank, world_size = setup_model_parallel()
            # use load_model_test for testing
            self.model, self.tokenizer = load_model(
                ckpt_dir=config.model_folder,
                tokenizer_path=config.tokenizer_path,
                local_rank=local_rank,
                world_size=world_size,
                froze_embeddings=config.froze_embeddings,
                use_fairscale=config.use_fairscale,
                max_batch_size=config.batch_size,
            )
        elif config.model in hf_models_causal_lm:
            self.tokenizer = self.load_tokenizer(config)
            self.model = AutoModelForCausalLM.from_pretrained(
                config.model,
            )
            # Setup PEFT model
            if config.peft_enable:
                # check that the peft config exist
                if not os.path.exists(config.peft_config_path):
                    raise ValueError(
                        f"PEFT config {config.peft_config_path} not found"
                    )
                # Read the peft config from yaml
                with open(config.peft_config_path, "r") as c:
                    config_peft = yaml.safe_load(c)
                print(config_peft)
                # define lora config for peft
                peft_config = LoraConfig(
                    task_type=TaskType.CAUSAL_LM, **config_peft
                )
                # create peft model
                self.model = get_peft_model(
                    model=self.model,
                    peft_config=peft_config,
                )
            self.model.to(config.device)
        else:
            raise ValueError(f"Model {config.model} not supported")
        # load the model from model_folder
        self.load()

    @beartype
    def load(self) -> None:
        """Load the model weights from the path resolved by `ModelLoader`.

        Nothing is loaded when no saved model is found. Weights are read
        from the "state_dict" entry of the file, falling back to "model".

        Raises:
            ValueError: If the file holds neither of the two entries.
        """
        # check if there is a model to load
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )
        # nothing to load
        if path is None:
            return
        # load the model
        print("Loading ...")
        model_dict = torch.load(path)
        # explicit None checks: an empty state dict is falsy and must not be
        # confused with a missing entry
        state_dict = model_dict.get("state_dict")
        if state_dict is None:
            state_dict = model_dict.get("model")
        if state_dict is None:
            raise ValueError(
                f"No 'state_dict' or 'model' entry found in {path}"
            )
        self.model.load_state_dict(state_dict)

    @beartype
    def save(self) -> None:
        """Save the model weights under the "state_dict" key at the path
        resolved by `ModelLoader`.
        """
        # get the path to save the model
        model_folder, model_name, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )
        # save the model
        print(f"Saving model to {path} ...")
        torch.save(
            {"state_dict": self.model.state_dict()},
            path,
        )

    @staticmethod
    def load_tokenizer(config: ConfigActor):
        """Load the tokenizer from the model name

        Args:
            config (ConfigActor): Configuration for the actor model

        Returns:
            The tokenizer matching `config.model`

        Raises:
            ValueError: If `config.model` is neither a supported HF model
                nor a LLaMA model.
        """
        if config.model in hf_models:
            # load the tokenizer from HF
            tokenizer = AutoTokenizer.from_pretrained(
                config.model,
                padding_side="left",
                padding=True,
                truncation=True,
                model_max_length=config.max_sequence_length,
            )
            # add eos token if not present
            if tokenizer.eos_token is None:
                # "</s>" is the OPT end-of-sequence token matching id 2
                tokenizer.eos_token = "</s>"
                tokenizer.eos_token_id = 2  # OPT eos-token-id
            # add pad token if not present
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
                tokenizer.pad_token_id = tokenizer.eos_token_id
        elif config.model in llama_models:
            # llama module might not be present when HF models are used
            from chatllama.llama_model import (
                load_tokenizer,
            )  # noqa

            tokenizer = load_tokenizer(config.tokenizer_path)
        else:
            # previously fell through to an unbound local variable
            raise ValueError(f"Model {config.model} not supported")
        return tokenizer

    def parameters(self, recurse: bool = True):
        """Return the parameters of the wrapped model"""
        return self.model.parameters(recurse=recurse)

    @beartype
    def forward(
        self, sequences: torch.Tensor, sequences_mask: torch.Tensor
    ) -> torch.Tensor:
        """Generate logits to have probability distribution over the vocabulary
        of the actions

        Args:
            sequences (torch.Tensor): Sequences of states and actions used to
                compute token logits for the whole list of sequences
            sequences_mask (torch.Tensor): Mask for the sequences attention

        Returns:
            logits (torch.Tensor): Logits for the actions taken
        """
        model_output = self.model.forward(
            sequences, attention_mask=sequences_mask
        )
        # need to return logits for the actions
        # HF causal LMs return an output object, the logits are a field of it
        if self.config.model in hf_models_causal_lm:
            model_output = model_output.logits
        if self.config.debug:
            print("ActorModel.forward")
            print("model_output_logits shape", model_output.shape)
            print("model_output logits", model_output)
        return model_output

    @beartype
    @torch.no_grad()
    def generate(
        self, states: torch.Tensor, state_mask: torch.Tensor
    ) -> Tuple:
        """Generate actions and sequences=[states, actions] from state
        (i.e. input of the prompt generator model)

        Args:
            states (torch.Tensor): the input of the user
            state_mask (torch.Tensor): Mask for the state input (for padding)

        Returns:
            actions (torch.Tensor): Actions generated from the state
            sequences (torch.Tensor): Sequences generated from the
                state as [states, actions]

        Raises:
            ValueError: If the room left by the prompt within
                `max_sequence_length` is smaller than `min_tokens`.
        """
        # temperature for the actor
        temperature = self.config.temperature
        # max sequence length for the actor (i.e. prompt + completion)
        max_sequence_length = self.config.max_sequence_length
        # max and min number of tokens to generate
        max_tokens = self.config.max_tokens
        min_tokens = self.config.min_tokens
        # max generation possible given the state and the max sequence length
        max_generation_possible = max_sequence_length - states.shape[1]
        if max_generation_possible < min_tokens:
            raise ValueError(
                f"The prompt is too long w.r.t the "
                f"model sequence length \n"
                f"max_sequence_length={max_sequence_length}\n"
                f"state_length={states.shape[1]}\n"
                f"min_tokens={min_tokens}\n"
                f"max_tokens={max_tokens}\n"
                f"max_generation_possible={max_generation_possible}\n"
            )
        # take the minimum the max_tokens and the max_generation_possible
        max_completion = min(max_tokens, max_generation_possible)
        sequences = self.model.generate(
            input_ids=states,
            attention_mask=state_mask,
            temperature=temperature,
            max_new_tokens=max_completion,
            no_repeat_ngram_size=3,
        )
        # the actions are whatever follows the prompt in the sequences
        actions = sequences[:, states.shape[1] :]  # noqa E203
        if self.config.debug:
            print(
                f"input length {states.shape[1]} \n"
                f"max sequence length {max_sequence_length} \n"
                f"max completion {max_completion} \n"
                f"generated sequence {sequences.shape[1]} \n"
            )
            print("ActorModel.generate")
            print("state", states)
            print("state shape", states.shape)
            print("sequence shape", sequences.shape)
            print("sequence", sequences)
            print("actions shape", actions.shape)
            print("actions", actions)
        return actions, sequences
class ActorDataset(Dataset):
    """Dataset for the pretraining of the actor model

    Reads a json file with the following format:
    [
        {
            "user_input": "..."
            "completion": "..."
        },
        ...
    ]
    Where:
        user_input: the input of the user
        completion: the text expected after the user input

    Each item of the dataset is the string `user_input + completion`.
    """

    def __init__(
        self,
        path: str,
    ) -> None:
        self.path = path
        with open(path, "r") as f:
            records = json.load(f)
        # every sample is the prompt immediately followed by its completion
        samples = []
        for record in records:
            samples.append(record["user_input"] + record["completion"])
        self.data = samples

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(
        self,
    ):
        return len(self.data)
class ActorTrainer:
"""Used to pre-train the actor model to generate better prompts.
Args:
config (ConfigActor): Configuration for the actor model
Attributes:
config (ConfigActor): Configuration for the actor model
model (ActorModel): Actor model
loss_function (torch.nn.CrossEntropyLoss): Loss function
optimizer (torch.optim.Adam): Optimizer
validation_flag (bool): Flag to indicate if the validation dataset
is provided
train_dataset (ActorDataset): Training dataset
train_dataloader (DataLoader): Training dataloader
validation_dataset (ActorDataset): Validation dataset
validation_dataloader (DataLoader): Validation dataloader
scheduler (torch.optim.lr_scheduler): Learning rate scheduler
training_stats (TrainingStats): Training statistics
model_engine (ModelEngine): Model engine for deepspeed training
accelerator (Accelerator): Accelerator for accelerate training
Methods:
train: Train the actor model
load_checkpoint: Load a checkpoint
save_checkpoint: Save a checkpoint
"""
def __init__(self, config: ConfigActor) -> None:
    """Build model, optimizer, data pipeline and training backend.

    Args:
        config (ConfigActor): Configuration for the actor model

    Raises:
        ValueError: If both DeepSpeed and Accelerate are enabled, or if
            DeepSpeed is enabled without an existing config file.
    """
    # store config
    self.config = config
    # load the model
    self.actor = ActorModel(config)
    # define loss function
    self.loss_function = torch.nn.CrossEntropyLoss()
    # define optimizer
    self.optimizer = torch.optim.AdamW(
        self.actor.parameters(), lr=config.lr, weight_decay=1e-5
    )
    # check if validation dataset is provided
    self.validation_flag = config.validation_dataset_path is not None
    # create dataset and dataloaders
    self.train_dataset = ActorDataset(config.train_dataset_path)
    self.train_dataloader = DataLoader(
        self.train_dataset, batch_size=config.batch_size
    )
    if self.validation_flag:
        self.eval_dataset = ActorDataset(config.validation_dataset_path)
        self.validation_dataloader = DataLoader(
            self.eval_dataset, batch_size=config.batch_size
        )
    # define scheduler for the learning rate
    # learning rate is decreased until 10% of the initial value
    # T_0 must be a positive integer: clamp it to 1 when the dataset holds
    # less than one full batch
    steps_per_epoch = max(1, len(self.train_dataset) // config.batch_size)
    self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        self.optimizer,
        T_0=steps_per_epoch,
        T_mult=1,
        eta_min=config.lr * 0.1,
    )
    # define training statistics
    stat_path = ModelLoader.get_training_stats_path(config)
    self.training_stats = TrainingStats(stat_path)
    # consistency check between accelerate and deepspeed
    if config.accelerate_enable and config.deepspeed_enable:
        raise ValueError(
            "Both DeepSpeed and Accelerate are enabled for the Actor. "
            "Please choose one of them."
        )
    # initialize deepspeed
    self.model_engine = None
    if config.deepspeed_enable is True:
        if config.deepspeed_config_path is None:
            raise ValueError(
                "DeepSpeed config path is None, but deepspeed is enabled"
            )
        if os.path.exists(config.deepspeed_config_path) is False:
            raise ValueError(
                f"DeepSpeed config path {config.deepspeed_config_path} "
                f"does not exist"
            )
        # deepspeed returns its own optimizer and dataloader, which replace
        # the ones created above; the fourth returned value is discarded
        (
            self.model_engine,
            self.optimizer,
            self.train_dataloader,
            _,
        ) = deepspeed.initialize(
            args=None,
            model=self.actor,
            model_parameters=self.actor.parameters(),
            training_data=self.train_dataset,
            config=self.config.deepspeed_config_path,
        )
        print("Training with DeepSpeed")
    # initialize accelerate
    self.accelerator = None
    if config.accelerate_enable is True:
        self.accelerator = Accelerator()
        (
            self.actor,
            self.optimizer,
            self.train_dataloader,
            self.scheduler,
        ) = self.accelerator.prepare(
            self.actor,
            self.optimizer,
            self.train_dataloader,
            self.scheduler,
        )
        print("Training with Accelerate")
@beartype
def save_checkpoint(
    self,
    current_epoch: int,
    current_step: int,
    max_epochs: int,
    max_steps: int,
) -> None:
    """Write a checkpoint for the current training position and prune the
    old ones.

    Args:
        current_epoch (int): Current epoch
        current_step (int): Current step
        max_epochs (int): Maximum number of epochs
        max_steps (int): Maximum number of steps
    """
    print(
        f"Saving checkpoint for epoch {current_epoch + 1}, "
        f"step {current_step + 1} ..."
    )
    # resolve where the checkpoint has to be written
    model_folder, model_name, path = ModelLoader.get_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=current_epoch,
        current_step=current_step,
        max_epochs=max_epochs,
        max_steps=max_steps,
    )
    use_deepspeed = self.config.deepspeed_enable
    # drop a pre-existing checkpoint with the same path: a directory tree is
    # removed with deepspeed, a single file otherwise
    if os.path.exists(path):
        remove_existing = shutil.rmtree if use_deepspeed else os.remove
        remove_existing(path)
    if use_deepspeed:
        # the engine writes the checkpoint, the training position is passed
        # as client state
        self.model_engine.save_checkpoint(
            path,
            client_state={"epoch": current_epoch, "step": current_step},
        )
    else:
        checkpoint = {
            "state_dict": self.actor.model.state_dict(),
            "optim_state_dict": self.optimizer.state_dict(),
            "training_stats": self.training_stats,
            "epoch": current_epoch,
            "step": current_step,
        }
        torch.save(checkpoint, path)
    # remove old checkpoints
    ModelLoader.delete_old_checkpoints(
        model_folder, model_name, self.config.n_checkpoints_to_keep
    )
@beartype
def load_checkpoint(
    self,
) -> Tuple[int, int]:
    """Load a checkpoint from the model folder

    When a checkpoint is found, the model (and, without DeepSpeed, the
    optimizer and the training statistics) are restored from it.

    Returns:
        Tuple[int, int]: Epoch and step from which training has to resume.
            ``(0, 0)`` is returned when no checkpoint is found or when the
            checkpoint cannot be loaded.
    """
    print("Looking for checkpoints...")

    # look for a checkpoint
    path = ModelLoader.check_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=None,
    )
    if path is None:
        return 0, 0

    print("Loading ...")
    corrupted_msg = (
        "Checkpoint corrupted! "
        "Try to remove the last checkpoint. "
        "Now Starting from epoch 0, step 0"
    )

    if self.config.deepspeed_enable:
        # loading is best effort: any failure restarts the training
        try:
            _, client_state = self.model_engine.load_checkpoint(path)
        except Exception:
            print(corrupted_msg)
            return 0, 0
        # epoch and step were stored in the client state when saving
        epoch = client_state["epoch"]
        step = client_state["step"]
    else:
        # NOTE: torch.load unpickles the file; only load checkpoints
        # written by this trainer.
        try:
            checkpoint = torch.load(path)
        except Exception:
            print(corrupted_msg)
            return 0, 0
        # assign the checkpoint content to the trainer
        epoch = checkpoint["epoch"]
        self.actor.model.load_state_dict(checkpoint["state_dict"])
        self.optimizer.load_state_dict(checkpoint["optim_state_dict"])
        # restore the statistics on the attribute used by the trainer
        # (it was previously stored under a misspelled attribute name)
        self.training_stats = checkpoint["training_stats"]
        step = checkpoint["step"]

    # the saved step is already done: resume from the next one
    return epoch, step + 1
def add_eos_token(
    self, tokens: torch.Tensor, mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Append one EOS column to ``tokens`` and a column of ones to ``mask``.

    Nothing is appended when the sequences already reach
    ``config.max_sequence_length``.

    Args:
        tokens (torch.Tensor): 2-D tensor of token ids (batch, sequence)
        mask (torch.Tensor): mask concatenated along the same dimension

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: the (possibly extended) tokens
            and mask
    """
    n_rows, current_len = tokens.shape
    eos_id = self.actor.tokenizer.eos_token_id

    # at most one token is added, and only if there is still room for it
    if self.config.max_sequence_length - current_len < 1:
        return tokens, mask

    eos_column = torch.full(
        (n_rows, 1), eos_id, dtype=torch.long, device=tokens.device
    )
    ones_column = torch.ones((n_rows, 1), dtype=torch.long, device=mask.device)
    tokens = torch.cat([tokens, eos_column], dim=1)
    mask = torch.cat([mask, ones_column], dim=1)
    return tokens, mask
def train(
    self,
) -> None:
    """Train the model

    Resumes from the last checkpoint when one is available, then for each
    epoch runs next-token prediction over the training dataloader, saving
    a checkpoint every ``config.checkpoint_steps`` steps. When a
    validation dataset is configured, a validation pass follows every
    epoch. The trained actor is saved at the end.
    """
    print("Start Actor Model Pretraining")

    # get config parameters
    if self.config.deepspeed_enable:
        # with DeepSpeed the dataloader was rebuilt by deepspeed.initialize
        batch_size = self.train_dataloader.batch_size
    else:
        batch_size = self.config.batch_size
    epochs = self.config.epochs
    device = self.config.device
    checkpoint_steps = self.config.checkpoint_steps

    # compute the number of iterations
    n_iter = int(len(self.train_dataset) / batch_size)

    # load model_checkpoint
    start_epoch, start_step = self.load_checkpoint()
    if start_epoch == 0 and start_step == 0:
        # fresh run: discard statistics left from previous trainings
        self.training_stats.clear()

    # counter for the checkpoint
    cnt_checkpoint = 1

    # training loop
    for epoch in range(start_epoch, epochs):
        self.actor.train()
        for i, input_text in enumerate(self.train_dataloader):

            # skip the first steps if we are resuming training
            if i < start_step:
                continue

            # tokenize input
            with torch.no_grad():
                input_tokenized = self.actor.tokenizer(
                    input_text,
                    return_tensors="pt",
                    truncation=True,
                    padding=True,
                )
                # split tokens and mask
                input_tokenized_id = input_tokenized["input_ids"]
                input_tokenized_mask = input_tokenized["attention_mask"]

                # add eos token
                (
                    input_tokenized_id,
                    input_tokenized_mask,
                ) = self.add_eos_token(
                    input_tokenized_id,
                    input_tokenized_mask,
                )

            # split into input and output: the target is the input
            # shifted by one token
            training_output = input_tokenized_id[:, 1:]
            training_input = input_tokenized_id[:, :-1]
            attention_mask = input_tokenized_mask[:, :-1]

            # move to device
            training_output = training_output.to(device)
            training_input = training_input.to(device)
            attention_mask = attention_mask.to(device)

            # forward pass
            if self.config.deepspeed_enable:
                est_output = self.model_engine(
                    training_input, attention_mask
                )
            else:
                est_output = self.actor(training_input, attention_mask)

            # compute loss
            est_output = rearrange(est_output, "b s v -> (b s) v")
            training_output = rearrange(training_output, "b s -> (b s)")
            loss = self.loss_function(est_output, training_output)
            self.training_stats.training_loss.append(loss.item())

            # backward pass
            if self.config.deepspeed_enable:
                self.model_engine.backward(loss)
                self.model_engine.step()
            elif self.config.accelerate_enable:
                self.optimizer.zero_grad()
                self.accelerator.backward(loss)
                self.optimizer.step()
                self.scheduler.step()
            else:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()

            # print progress
            if i % self.config.iteration_per_print == 0:
                print(
                    f"Epoch: {epoch+1}/{epochs}, "
                    f"Iteration: {i+1}/{n_iter}, "
                    f"Training Loss: {loss}"
                )

            # save checkpoint periodically
            if cnt_checkpoint % checkpoint_steps == 0:
                self.save_checkpoint(epoch, i, epochs, n_iter)
                self.training_stats.save()
                cnt_checkpoint = 1
            else:
                cnt_checkpoint += 1

        # Validation
        if self.validation_flag:
            self.actor.eval()
            # number of validation batches, used only by the progress print
            n_val_iter = len(self.validation_dataloader)
            with torch.no_grad():
                for i, input_text in enumerate(self.validation_dataloader):

                    # tokenize input
                    input_tokenized = self.actor.tokenizer(
                        input_text, return_tensors="pt", padding=True
                    )
                    validation_output = input_tokenized["input_ids"][:, 1:]
                    validation_input = input_tokenized["input_ids"][:, :-1]
                    attention_mask = input_tokenized["attention_mask"][
                        :, :-1
                    ]

                    # move to device, as done for the training tensors, so
                    # that logits and targets live on the same device
                    validation_output = validation_output.to(device)
                    validation_input = validation_input.to(device)
                    attention_mask = attention_mask.to(device)

                    # forward pass
                    est_output = self.actor.forward(
                        validation_input, attention_mask
                    )
                    validation_output = rearrange(
                        validation_output, "b s -> (b s)"
                    )

                    # compute loss
                    est_output = rearrange(est_output, "b s v -> (b s) v")
                    loss = self.loss_function(
                        est_output, validation_output
                    )
                    self.training_stats.validation_loss.append(loss.item())

                    # print progress
                    if i % self.config.iteration_per_print == 0:
                        print(
                            f"Epoch: {epoch+1}/{epochs}, "
                            f"Iteration: {i+1}/{n_val_iter}, "
                            f"Validation Loss: {loss}"
                        )

        # reset start_step after training is resumed
        start_step = 0

    # save the model
    self.actor.save()
    print("Training Finished ")
================================================
FILE: optimization/chatllama/chatllama/rlhf/config.py
================================================
import yaml
import os
from dataclasses import dataclass
import torch
from beartype import beartype
from beartype.typing import Optional
@dataclass
class ConfigReward:
    """Config parameters for the reward model

    The same dataclass is reused for the critic through the
    ``ConfigCritic`` alias; ``is_reward`` tells the two apart.

    Attributes:
        device (torch.device): Device to be used for the reward model
        model (str): Model to be used for the reward model
        model_folder (str): Path to the folder where models are stored (used
            to load / store finetuned models or checkpoints)
        model_head_hidden_size (int): Hidden size of the reward model head
        max_sequence_length (int): Max sequence length of the reward model
        train_dataset_path (Optional[str]): Path to the training dataset.
            Default to None. To be specified only for the reward model
            training.
        validation_dataset_path (Optional[str]): Path to the validation
            dataset. Default to None. To be specified only for the reward
            model training.
        batch_size (Optional[int]): Batch size to train the reward model.
            Default to None. To be specified only for the reward model
            training.
        epochs (Optional[int]): Number of epochs to train the reward model.
            Default to None. To be specified only for the reward model
            training.
        iteration_per_print (Optional[int]): Number of iterations to print
            the training loss. Default to None. To be specified only for the
            reward model training.
        checkpoint_steps (Optional[int]): Number of steps (backProp) to
            interleave checkpoints. Default to None. To be specified only for
            the reward model training.
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        lr (Optional[float]): Learning rate for the reward model. Default to
            None. To be specified only for the reward model distillation.
        llm_enable (Optional[bool]): Enable reward model distillation.
            Default to False. Keep it disabled if you don't have an API key.
        llm_model (Optional[str]): Model to be used for the reward model
            distillation. Default to "text-davinci-003".
        llm_temperature (Optional[float]): Temperature for the reward model
            distillation. Default to 0.9.
        llm_max_tokens (Optional[int]): Max tokens for the reward model
            distillation. Default to 64.
        deepspeed_enable (bool): Enable deepspeed for the reward model
            training. Default to False.
        deepspeed_config_path (Optional[str]): Path to the deepspeed config
            file. Default to None.
        is_reward (bool): True if the model is a reward model, False if it
            is a critic. Default to True.
        accelerate_enable (bool): Enable accelerate for the reward model.
            Default to False.
        debug (bool): Enable prints for debugging. Default to False.
    """

    device: torch.device
    model: str
    model_folder: str
    model_head_hidden_size: int
    max_sequence_length: int
    train_dataset_path: Optional[str] = None
    validation_dataset_path: Optional[str] = None
    batch_size: Optional[int] = None
    epochs: Optional[int] = None
    iteration_per_print: Optional[int] = None
    checkpoint_steps: Optional[int] = None
    checkpoint_name: Optional[str] = None
    lr: Optional[float] = None
    llm_enable: Optional[bool] = False
    llm_model: Optional[str] = "text-davinci-003"
    llm_temperature: Optional[float] = 0.9
    llm_max_tokens: Optional[int] = 64
    deepspeed_enable: bool = False
    deepspeed_config_path: Optional[str] = None
    # critic specific parameter: set to False on the critic's config
    is_reward: bool = True
    accelerate_enable: bool = False
    debug: bool = False
# The critic uses exactly the same config fields as the reward model; the
# alias exists just for naming consistency.
ConfigCritic = ConfigReward
@dataclass
class ConfigActor:
    """Config parameters for the actor model

    Attributes:
        model (str): Model to be used for the actor
        model_folder (str): Path to the folder where models are stored (used
            to load / store finetuned models or checkpoints)
        tokenizer_path (str): Path to the folder where tokenizers are stored
        train_dataset_path (str): Path to the training dataset
        validation_dataset_path (Optional[str]): Path to the validation
            dataset
        froze_embeddings (bool): Freeze the embeddings of the actor
        use_fairscale (bool): Use fairscale modules for the actor instead of
            pytorch native modules.
        max_sequence_length (int): Max sequence length for the actor
        max_tokens (int): Max tokens for actor generation
        min_tokens (int): Min tokens for actor generation
        additonal_prompt_tokens (int): Number of tokens to be used as safety
            to avoid too large sequences and to add a template to the
            dataset
        temperature (float): Temperature for the actor
        batch_size (int): Batch size to train the actor
        iteration_per_print (int): Number of iterations to print the
            training loss
        lr (float): Learning rate for the actor
        epochs (int): Number of epochs to train the actor
        checkpoint_steps (int): Number of steps (backProp) to interleave
            checkpoints.
        n_checkpoints_to_keep (int): Number of checkpoints to keep
            for the actor.
        deepspeed_enable (bool): Enable deepspeed for the actor.
        deepspeed_config_path (Optional[str]): Path to the deepspeed config
            file.
        accelerate_enable (bool): Enable accelerate for the actor
        device (torch.device): Device to be used for the actor
        peft_enable (bool): Enable peft for the actor
        peft_config_path (str): Path to the peft config file.
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        debug (bool): Enable prints for debugging. Default to False.
    """

    model: str
    model_folder: str
    tokenizer_path: str
    train_dataset_path: str
    validation_dataset_path: Optional[str]
    froze_embeddings: bool
    use_fairscale: bool
    max_sequence_length: int
    max_tokens: int
    min_tokens: int
    additonal_prompt_tokens: int
    temperature: float
    batch_size: int
    iteration_per_print: int
    lr: float
    epochs: int
    checkpoint_steps: int
    n_checkpoints_to_keep: int
    deepspeed_enable: bool
    deepspeed_config_path: Optional[str]
    accelerate_enable: bool
    device: torch.device
    peft_enable: bool
    peft_config_path: str
    checkpoint_name: Optional[str] = None
    debug: bool = False
@dataclass
class ConfigTrainer:
    """Config parameters for the trainer, used to configure the reinforcement
    learning training loop

    Attributes:
        actor_lr (float): Learning rate for the actor when training with
            reinforcement learning
        critic_lr (float): Learning rate for the critic when training with
            reinforcement learning
        actor_eps_clip (float): Epsilon clip for the actor
        critic_eps_clip (float): Epsilon clip for the critic
        beta_s (float): Beta for the actor and critic
        gamma_discounted (float): Coefficient for the discounted rewards.
        examples_path (str): Path to the examples dataset
        num_episodes (int): Number of episodes, each episode consists of
            a number of timesteps that are used to generate examples
            stored in the memory buffer.
        max_timesteps (int): Max timesteps for the actor and critic.
            For each timestep a set of examples are sampled and used to
            generate a completion and a reward.
        update_timesteps (int): Number of timesteps to update the actor and
            critic
        num_examples (int): Number of examples to generate for the actor
            and critic. For each iteration of timestep, num_examples are
            sampled from the prompt dataset, processed and stored in the
            memory buffer.
        batch_size (int): Batch size to train the actor and critic.
            This batch is used to aggregate the memory from the memory buffer
            for the actual training of the actor and critic models.
        epochs (int): Number of epochs to train the actor and critic.
        checkpoint_steps (int): Number of episodes to interleave checkpoints.
        device (torch.device): Device to be used for the actor and critic
        checkpoint_name (Optional[str]): Name of the checkpoint. Default to
            None.
        debug (bool): Enable prints for debugging. Default to False.
    """

    # learning rates are fractional values, hence float and not int
    actor_lr: float
    critic_lr: float
    actor_eps_clip: float
    critic_eps_clip: float
    beta_s: float
    gamma_discounted: float
    examples_path: str
    num_episodes: int
    max_timesteps: int
    update_timesteps: int
    num_examples: int
    batch_size: int
    epochs: int
    checkpoint_steps: int
    device: torch.device
    checkpoint_name: Optional[str] = None
    debug: bool = False
class Config:
    """Store the config parameters for the whole pipeline

    The parameters are read from a yaml file that must contain the
    sections ``trainer_config``, ``actor_config``, ``critic_config`` and
    ``reward_config``. The ``device`` and ``debug`` values are injected in
    every section before the corresponding config object is built.

    Args:
        path (str): Path to the config.yaml file.
        device (Optional[torch.device]): Device to be used by all the
            models. Default to None. If None, CUDA is used when available.
        debug (Optional[bool]): Enable prints for debugging. Default to
            False.

    Attributes:
        trainer (ConfigTrainer): Config parameters for the trainer
        actor (ConfigActor): Config parameters for the actor
        critic (ConfigCritic): Config parameters for the critic
        reward (ConfigReward): Config parameters for the reward

    Raises:
        ValueError: If no device is given and CUDA is not available, or if
            the path to the yaml file is not valid.
    """

    @beartype
    def __init__(
        self,
        path: str,
        device: Optional[torch.device] = None,
        debug: Optional[bool] = False,
    ) -> None:

        # fall back to CUDA when the caller did not choose a device
        if device is None:
            if not torch.cuda.is_available():
                raise ValueError("No GPU available")
            device = torch.device("cuda")
        print(f"Current device used :{str(device)}")

        if path is None or not os.path.exists(path):
            raise ValueError("Path to the config.yaml is not valid")

        # read the four sections from the yaml file
        with open(path, "r") as config_file:
            sections = yaml.safe_load(config_file)
        trainer_section = sections["trainer_config"]
        actor_section = sections["actor_config"]
        critic_section = sections["critic_config"]
        reward_section = sections["reward_config"]

        def with_runtime_values(section):
            # device and debug come from the caller, not from the yaml
            section["device"] = device
            section["debug"] = debug
            return section

        self.trainer = ConfigTrainer(**with_runtime_values(trainer_section))
        self.actor = ConfigActor(**with_runtime_values(actor_section))

        # the critic shares the reward dataclass: flag it as not-a-reward
        self.critic = ConfigCritic(**with_runtime_values(critic_section))
        self.critic.is_reward = False

        self.reward = ConfigReward(**with_runtime_values(reward_section))
================================================
FILE: optimization/chatllama/chatllama/rlhf/dataset.py
================================================
import json
import os
import numpy as np
from beartype.typing import Dict, List, Union
from datasets import load_dataset
from chatllama.rlhf.config import Config, ConfigActor, ConfigReward
from chatllama.rlhf.reward import RewardModel, CriticModel
from chatllama.rlhf.actor import ActorModel
# Any of the config objects accepted by the dataset helpers below.
ConfigType = Union[Config, ConfigActor, ConfigReward]
class BaseDataset:
    """Helpers shared by the dataset classes: sorting, sampling and cleaning
    of lists of conversations.

    A conversation is a dict with at least the keys ``user_input`` and
    ``completion``.
    """

    def __init__(
        self,
    ) -> None:
        pass

    @staticmethod
    def sort_conversation(
        conversations: List[Dict],
        only_input: bool = False,
        reverse: bool = True,
        shuffle: bool = True,
    ) -> List[Dict]:
        """Sort the conversations by length of user_input + completion
        or by length of user_input only

        Args:
            conversations (List[Dict]): list of conversations
            only_input (bool, optional): sort by length of user_input only.
                Defaults to False.
            reverse (bool, optional): sort in descending order.
                Defaults to True.
            shuffle (bool, optional): shuffle the dataset leaving only the
                first 10 samples sorted. Defaults to True.

        Returns:
            List[Dict]: sorted list of conversations
        """

        # define the sorting function
        if only_input is True:

            def sort_fun(x):
                return len(x["user_input"])

        else:

            def sort_fun(x):
                return len(x["user_input"]) + len(x["completion"])

        # sort
        conversations = sorted(
            conversations,
            key=sort_fun,
            reverse=reverse,
        )

        # shuffle everything but the first 10 elements
        if shuffle is True:
            conversations = (
                conversations[:10]
                + np.random.choice(
                    conversations[10:],
                    size=len(conversations[10:]),
                    replace=False,
                ).tolist()
            )

        return conversations

    @staticmethod
    def take_n_samples(
        conversations: List[Dict],
        n: int,
    ) -> List[Dict]:
        """Take N samples from the dataset

        Args:
            conversations (List[Dict]): list of conversations
            n (int): number of samples to take randomly, without
                replacement

        Returns:
            List[Dict]: list of N samples
        """
        # sample N number of index from 0 to len(conversations)
        indexes = np.random.choice(len(conversations), size=n, replace=False)
        # take the samples
        conversations = [conversations[i] for i in indexes]
        return conversations

    @staticmethod
    def clean_dataset(config: ConfigType):
        """Clean the datasets by removing too long examples

        The Reward Dataset constraints are:
            - user_input + completion < Reward model max sequence length
        The Actor Dataset constraints are:
            - user_input + completion < Actor model max sequence length
        The RLHF Training Dataset constraints are:
            - user_input + min_completion < Actor model max sequence length
            - user_input + min_completion < Critic model max sequence length
            - user_input + min_completion < Reward model max sequence length

        Every example is checked: the list returned by
        ``sort_conversation`` is shuffled after its first elements, so
        stopping at the first example that fits would leave too long
        examples in the dataset. The dataset file is rewritten only when
        at least one example has been removed.

        Args:
            config (ConfigType): config object

        Raises:
            ValueError: if the type of ``config`` is not supported
        """
        # For each kind of config collect:
        # - the dataset to clean and the text to measure (only_input)
        # - the tokens to reserve on top of the tokenized text
        # - the (tokenizer, max sequence length) pairs to satisfy
        if isinstance(config, Config):
            print("Start cleaning the dataset for RLHF")
            dataset_path = config.trainer.examples_path
            only_input = True
            # room for the shortest completion plus the safety tokens
            reserved_tokens = (
                config.actor.min_tokens + config.actor.additonal_prompt_tokens
            )
            r_tokenizer = RewardModel.load_tokenizer(config.reward)
            a_tokenizer = ActorModel.load_tokenizer(config.actor)
            c_tokenizer = CriticModel.load_tokenizer(config.critic)
            limits = [
                (a_tokenizer, config.actor.max_sequence_length),
                (r_tokenizer, config.reward.max_sequence_length),
                (c_tokenizer, config.critic.max_sequence_length),
            ]
        elif isinstance(config, ConfigActor):
            print("Start cleaning the dataset for Actor")
            dataset_path = config.train_dataset_path
            only_input = False
            reserved_tokens = config.additonal_prompt_tokens
            limits = [
                (ActorModel.load_tokenizer(config), config.max_sequence_length)
            ]
        elif isinstance(config, ConfigReward):
            print("Start cleaning the dataset for Reward")
            dataset_path = config.train_dataset_path
            only_input = False
            reserved_tokens = 0
            limits = [
                (RewardModel.load_tokenizer(config), config.max_sequence_length)
            ]
        else:
            raise ValueError(
                "Config type not recognized during dataset cleaning"
            )

        # nothing to clean if the dataset is not there (the path of the
        # reward dataset is optional and can be None)
        if dataset_path is None or not os.path.exists(dataset_path):
            print(
                f"Dataset not found at {dataset_path}"
                f" Skipping cleaning of the dataset"
            )
            return

        # load the dataset
        with open(dataset_path, "r") as f:
            conversations = json.load(f)

        # longest examples first, the rest shuffled
        conversations = BaseDataset.sort_conversation(
            conversations,
            only_input=only_input,
            reverse=True,
        )
        old_len = len(conversations)

        def fits(conversation: Dict) -> bool:
            # get the text to be tokenized
            text = conversation["user_input"]
            if not only_input:
                text = text + conversation["completion"]
            # the example is kept only if it fits in every model
            return all(
                len(tokenizer.encode(text, truncation=False))
                + reserved_tokens
                <= max_seq_len
                for tokenizer, max_seq_len in limits
            )

        # remove too long examples
        conversations = [c for c in conversations if fits(c)]

        # if the number of examples has changed
        if len(conversations) != old_len:
            print("Number of examples before cleaning: ", old_len)
            print("Number of examples after cleaning: ", len(conversations))
            # remove the old dataset
            os.remove(dataset_path)
            # save the new dataset
            with open(dataset_path, "w") as f:
                json.dump(conversations, f, indent=4)
        else:
            print("Dataset is already clean")
class StanfordNLPSHPDataset(BaseDataset):
    """Class for Stanford NLP SHP dataset from HuggingFace"""

    def __init__(
        self,
    ) -> None:
        print("Download the dataset")
        self.dataset = load_dataset("stanfordnlp/SHP")
        print("Download Completed")

    def reformat_dataset(self, data: List) -> List[Dict]:
        """Reformat the dataset to the format required by RLHF

        For every sample the reference answer with the highest score is
        used as completion (answer B on ties).

        Args:
            data (List): dataset from HuggingFace

        Returns:
            List[Dict]: reformatted dataset
        """

        # initialize conversations
        conversations = []

        # loop over the dataset
        for d in data:
            if d["score_A"] > d["score_B"]:
                response = d["human_ref_A"]
            else:
                response = d["human_ref_B"]

            # compose user_input template; trailing newlines of the history
            # are dropped so that the separator is always the same
            history = d["history"].rstrip("\n")
            user_input = "Human: " + history + "\n\n##\n\n"

            # compose completion template
            completion = "Assistant: " + response

            conv = {
                "user_input": user_input,
                "completion": completion,
                "score": None,
            }
            conversations.append(conv)

        return conversations

    def save_dataset(
        self, dataset_folder: str, number_of_samples: int, reverse: bool = True
    ) -> None:
        """Save the dataset in the format required by RLHF

        Three files are written in ``dataset_folder``:
        ``actor_training_data.json``, ``reward_training_data.json`` and
        ``rlhf_training_data.json``.

        Args:
            dataset_folder (str): path to the folder where the dataset
                will be saved
            number_of_samples (int): number of samples to take from the
                dataset
            reverse (bool, optional): sort the dataset in descending order.
                Defaults to True.
        """

        print("Generate datasets for RLHF")

        # take the train and test dataset to create the finetuning dataset
        conversations = self.reformat_dataset(self.dataset["train"])
        conversations.extend(self.reformat_dataset(self.dataset["test"]))

        # sort conversations by length of user_input + completion
        conversations = self.sort_conversation(conversations, reverse=reverse)

        # save actor training data
        with open(f"{dataset_folder}/actor_training_data.json", "w") as f:
            json.dump(conversations, f, indent=4)

        # take N samples and sort them
        conversations = self.take_n_samples(conversations, number_of_samples)
        conversations = self.sort_conversation(conversations, reverse=reverse)

        # save reward training data
        with open(f"{dataset_folder}/reward_training_data.json", "w") as f:
            json.dump(conversations, f, indent=4)

        # take the validation dataset for rlhf
        conversations = self.reformat_dataset(self.dataset["validation"])

        # sort the validation dataset
        conversations = self.sort_conversation(
            conversations,
            only_input=True,
            reverse=reverse,
        )

        # save rlhf training data
        with open(f"{dataset_folder}/rlhf_training_data.json", "w") as f:
            json.dump(conversations, f, indent=4)

        print("Generation Completed")
class AnthropicRLHF(BaseDataset):
    """Class for the Anthropic hh-rlhf dataset from HuggingFace"""

    def __init__(
        self,
    ) -> None:
        print("Download the dataset")
        self.dataset = load_dataset("Anthropic/hh-rlhf")
        print("Download Completed")

    def reformat_dataset(self, data: List) -> List[Dict]:
        """Reformat the dataset to the format required by RLHF

        The ``chosen`` text of every sample is split on its last
        "Assistant:" marker: what precedes it becomes the user input, what
        follows becomes the completion.

        Args:
            data (List): dataset from HuggingFace

        Returns:
            List[Dict]: reformatted dataset
        """
        marker = "Assistant:"
        reformatted = []

        for sample in data:
            pieces = sample["chosen"].split(marker)

            # re-join every piece but the last one; when the marker is
            # missing the only piece is used as previous conversation
            if len(pieces) > 1:
                history = marker.join(pieces[:-1])
            else:
                history = pieces[0]

            reformatted.append(
                {
                    # trailing newlines are replaced by the separator
                    "user_input": history.rstrip("\n") + "\n\n##\n\n",
                    "completion": "Assistant: " + pieces[-1],
                    "score": None,
                }
            )

        return reformatted

    def save_dataset(
        self, dataset_folder: str, number_of_samples: int, reverse: bool = True
    ) -> None:
        """Save the dataset in the format required by RLHF

        Three files are written in ``dataset_folder``:
        ``actor_training_data.json``, ``reward_training_data.json`` and
        ``rlhf_training_data.json``.

        Args:
            dataset_folder (str): path to the folder where the dataset
                will be saved
            number_of_samples (int): number of samples to take from the
                dataset
            reverse (bool, optional): sort the dataset in descending order.
                Defaults to True.
        """

        def dump(file_name, samples):
            # write one of the generated datasets in the output folder
            with open(f"{dataset_folder}/{file_name}", "w") as out_file:
                json.dump(samples, out_file, indent=4)

        print("Generate datasets for RLHF")

        # actor dataset: the whole train split
        actor_samples = self.sort_conversation(
            self.reformat_dataset(self.dataset["train"]), reverse=reverse
        )
        dump("actor_training_data.json", actor_samples)

        # reward dataset: a random subset of the actor dataset
        reward_samples = self.take_n_samples(actor_samples, number_of_samples)
        reward_samples = self.sort_conversation(
            reward_samples, reverse=reverse
        )
        dump("reward_training_data.json", reward_samples)

        # rlhf dataset: the test split sorted by length of user_input
        rlhf_samples = self.sort_conversation(
            self.reformat_dataset(self.dataset["test"]),
            only_input=True,
            reverse=reverse,
        )
        dump("rlhf_training_data.json", rlhf_samples)

        print("Generation Completed")
================================================
FILE: optimization/chatllama/chatllama/rlhf/model_list.py
================================================
# Names of the models known to the library; HuggingFace names are given
# as "<organization>/<model>" (or just "<model>").

# llama models
llama_models = ["llama-7B", "llama-13B", "llama-33B", "llama-65B"]

# HF Models

# encoder-decoder models TODO: still not supported
hf_models_seq_2_seq = [
    "google/flan-t5-xxl",
    "google/flan-t5-xl",
    "google/flan-t5-large",
    "google/flan-t5-base",
    "google/flan-t5-small",
]

# decoder only TODO: codegen is still broken
# NOTE(review): "facebook/opt-11b" does not look like a published OPT size
# (13b is) - confirm against the HuggingFace Hub.
hf_models_causal_lm = [
    "facebook/opt-125m",
    "facebook/opt-1.3b",
    "facebook/opt-2.7b",
    "facebook/opt-6.7b",
    "facebook/opt-11b",
    "facebook/galactica-125m",
    "facebook/galactica-1.3b",
    "facebook/galactica-6.7b",
    "bigscience/bloom-560m",
    "bigscience/bloomz-560m",
    "bigscience/bloom-1b1",
    "bigscience/bloomz-1b1",
    "bigscience/bloom-1b7",
    "bigscience/bloomz-1b7",
    "bigscience/bloom-3b",
    "bigscience/bloomz-3b",
    "bigscience/bloom-7b1",
    "bigscience/bloomz-7b1",
    # listed once: the entry used to be duplicated
    "EleutherAI/gpt-neo-1.3B",
    "EleutherAI/gpt-neox-20b",
    "EleutherAI/gpt-j-6B",
    "gpt2",
    "gpt2-large",
    "gpt2-xl",
    "benjamin/gerpt2",
    "benjamin/gerpt2-large",
    "Salesforce/codegen-350M-mono",
    "Salesforce/codegen-2B-mono",
    "Salesforce/codegen-6B-mono",
    "Salesforce/codegen-16B-mono",
]

# create a list of all the models from hf
hf_models = hf_models_seq_2_seq + hf_models_causal_lm
================================================
FILE: optimization/chatllama/chatllama/rlhf/model_loader.py
================================================
import os
import shutil
from beartype.typing import Union, Optional, Tuple
from chatllama.rlhf.config import (
Config,
ConfigActor,
ConfigCritic,
ConfigReward,
)
from chatllama.rlhf.model_list import hf_models
# Any of the config objects accepted by the ModelLoader helpers.
ConfigType = Union[Config, ConfigActor, ConfigCritic, ConfigReward]
class ModelLoader:
"""Class to load and save models and their checkpoints during training."""
def __init__(
    self,
) -> None:
    """No state to initialise."""
    pass
@staticmethod
def get_training_stats_path(config: ConfigType) -> str:
    """Return the path of the ``training_stats.json`` file, located in the
    checkpoint folder resolved by ``get_model_path`` for this config.

    Args:
        config (ConfigType): the config object
    """
    # only the folder is needed; model name and model path are ignored
    checkpoint_folder, _, _ = ModelLoader.get_model_path(
        config, is_checkpoint=True
    )
    return os.path.join(checkpoint_folder, "training_stats.json")
@staticmethod
def look_for_last_checkpoint(
model_folder: str,
model_name: str,
) -> Optional[str]:
"""Method to look for the last checkpoint in the model folder
checkpoint are saved as {model_name}_epoch_{current_epoch}.pt
Args:
model_folder (str): the folder where the checkpoints are saved
model_name (str): the name of the model
"""
# remove .pt to model name
model_name = model_name.split(".")[0]
checkpoints = [
f for f in os.listdir(model_folder) if f.startswith(model_name)
]
if len(checkpoints) == 0:
return None
else:
checkpoints = sorted(checkpoints)
# get last checkpoint
last_checkpoint = checkpoints[-1]
return last_checkpoint
@staticmethod
def look_for_checkpoint_by_name(
model_folder: str,
checkpoint_name: str,
) -> Optional[str]:
"""Method to look for a particular checkpoint in the model folder
checkpoint are saved as
{model_name}_epoch_{current_epoch}_steps_{current_steps}.pt
Args:
model_folder (str): the folder where the checkpoints are saved
checkpoint_name (str): the name of the checkpoint
"""
# look for a file named checkpoint_name in the model folder
path = os.path.join(model_folder, checkpoint_name)
if os.path.exists(path):
return checkpoint_name
else:
return None
@staticmethod
def get_checkpoint_name(config: ConfigType) -> str:
    """Return the checkpoint name stored in the config.

    For a whole-pipeline ``Config`` the name is read from its trainer
    section, otherwise from the config itself.

    Args:
        config (ConfigType): the config object
    """
    holder = config.trainer if isinstance(config, Config) else config
    return holder.checkpoint_name
@staticmethod
def get_base_model_folder_from_config(config: ConfigType) -> str:
    """Return the base folder where the models of this config are stored.

    Args:
        config (ConfigType): the config object

    Raises:
        ValueError: if the type of the config is not recognized
    """
    if isinstance(config, (ConfigActor, ConfigReward)):
        return config.model_folder
    if isinstance(config, Config):
        # the whole pipeline stores its models in the actor folder
        return config.actor.model_folder
    raise ValueError("Config type not recognized during saving or loading")
@staticmethod
def get_model_type_from_config(config: ConfigType) -> str:
    """Return the model type ("reward", "critic", "actor" or "actor_rl").

    The type is used as the name of the sub-folder in which the model is
    stored.

    Args:
        config (ConfigType): the config object

    Returns:
        str: the model type associated with the config

    Raises:
        ValueError: if the config is none of the supported classes
    """
    if isinstance(config, ConfigReward):
        # reward and critic share the same config class: an ad-hoc flag
        # in the config tells them apart
        return "reward" if config.is_reward else "critic"
    if isinstance(config, ConfigActor):
        return "actor"
    if isinstance(config, Config):
        return "actor_rl"
    # Fail here instead of silently returning None, which would only show
    # up later as an obscure TypeError when the folder path is joined.
    raise ValueError("Config type not recognized during saving or loading")
@staticmethod
def get_model_name_from_config(config: ConfigType) -> str:
    """Return the name used to build the file name of the model.

    Names listed in ``hf_models`` are reduced to their last path component
    (e.g. the part after the organisation prefix); any other name is
    returned unchanged.

    Args:
        config (ConfigType): the config object

    Returns:
        str: the model name

    Raises:
        ValueError: if no model name can be read from the config
    """
    if isinstance(config, Config):
        name = config.actor.model
    elif isinstance(config, (ConfigReward, ConfigActor)):
        name = config.model
    else:
        name = None
    # the hf_models membership test comes first, as in the lookup order
    # the rest of the loader relies on
    if name in hf_models:
        return os.path.split(name)[-1]
    if name is None:
        raise ValueError("Model name not found")
    return name
@staticmethod
def delete_old_checkpoints(
    model_folder: str, model_name: str, n_ckp_to_keep: int = 5
):
    """Discard old checkpoints, keeping only the last ``n_ckp_to_keep``.

    Checkpoints are expected to be named ``{model_name}_epoch_...`` with
    zero-padded counters, so that sorting the names sorts them by age.

    Args:
        model_folder (str): the folder where the checkpoints are saved
        model_name (str): the name of the model, with or without the
            trailing ``.pt`` extension
        n_ckp_to_keep (int): the number of checkpoints to keep; 0 removes
            every checkpoint of the model

    Raises:
        ValueError: if ``n_ckp_to_keep`` is negative
    """
    if n_ckp_to_keep < 0:
        raise ValueError("n_ckp_to_keep must be a non-negative integer")
    # Strip only a trailing ".pt": splitting on the first dot would turn
    # "opt-1.3b.pt" into "opt-1" and delete checkpoints of other models.
    base_name = model_name[:-3] if model_name.endswith(".pt") else model_name
    # Anchor on "_epoch_" so that "gpt2" does not match "gpt2-large_...".
    prefix = f"{base_name}_epoch_"
    checkpoints = sorted(
        f for f in os.listdir(model_folder) if f.startswith(prefix)
    )
    # An explicit count avoids the `[:-0]` pitfall (empty slice) when no
    # checkpoint has to be kept.
    n_stale = max(len(checkpoints) - n_ckp_to_keep, 0)
    for name in checkpoints[:n_stale]:
        checkpoint_path = os.path.join(model_folder, name)
        if os.path.isdir(checkpoint_path):
            # checkpoints written by DeepSpeed are directories
            shutil.rmtree(checkpoint_path)
        else:
            os.remove(checkpoint_path)
@staticmethod
def get_model_path(
    config: ConfigType,
    is_checkpoint: bool = False,
    current_epoch: Optional[int] = None,
    current_step: Optional[int] = None,
    max_epochs: int = 1_000_000_000,
    max_steps: int = 1_000_000_000,
) -> Tuple[str, str, Optional[str]]:
    """Build folder, file name and full path for a model or a checkpoint.

    Layout of the storage:
    -- model_folder: final models, one sub-folder per model type
        -- actor
        -- critic
        -- reward
        -- actor_rl
        -- checkpoints: training checkpoints, one sub-folder per type
            -- actor
            -- critic
            -- reward
            -- actor_rl

    The target folder is created when it does not exist yet.

    Args:
        config (ConfigType): the config object, contains info of the model
        is_checkpoint (bool): if True, the path refers to a checkpoint
        current_epoch (Optional[int]): epoch embedded in the checkpoint
            name. When is_checkpoint is True and current_epoch is None
            only the folder and the plain model name can be determined.
        current_step (Optional[int]): step embedded in the checkpoint
            name (only used together with current_epoch).
        max_epochs (int): maximum number of epochs, used to choose the
            zero-padding width of the counters.
        max_steps (int): maximum number of steps, used to choose the
            zero-padding width of the counters.

    Returns:
        model_folder (str): the folder where the model is saved
        model_name (str): the file name of the model
        path (Optional[str]): the path to the model, or None when
            is_checkpoint is True and current_epoch is None
    """
    # root folder, optionally extended with the checkpoints sub-folder
    root = ModelLoader.get_base_model_folder_from_config(config)
    if is_checkpoint:
        root = os.path.join(root, "checkpoints")

    # one sub-folder per model type (actor, critic, reward, actor_rl)
    model_folder = os.path.join(
        root, ModelLoader.get_model_type_from_config(config)
    )
    if not os.path.exists(model_folder):
        os.makedirs(model_folder, exist_ok=True)
        print(f"Model folder does not exist. Creating it: {model_folder}")

    stem = ModelLoader.get_model_name_from_config(config)
    epoch_known = is_checkpoint and current_epoch is not None

    if epoch_known:
        # counters are left-padded with zeros to a common width so that
        # sorting the file names also sorts the checkpoints in time
        width = max(len(str(max_epochs)), len(str(max_steps)))
        suffix = "_epoch_" + str(current_epoch).rjust(width, "0")
        if current_step is not None:
            suffix += "_step_" + str(current_step).rjust(width, "0")
        model_name = f"{stem}{suffix}.pt"
    else:
        model_name = f"{stem}.pt"

    # a checkpoint without epoch cannot be located: only folder and name
    # are meaningful in that case
    if is_checkpoint and current_epoch is None:
        path = None
    else:
        path = os.path.join(model_folder, model_name)
    return model_folder, model_name, path
@staticmethod
def check_model_path(
    config: ConfigType,
    is_checkpoint: bool = False,
    current_epoch: Optional[int] = None,
    current_step: Optional[int] = None,
) -> Optional[str]:
    """Check that a model or checkpoint exists on disk and return its path.

    Args:
        config (ConfigType): the config object, contains info of the model
        is_checkpoint (bool): if True, the path is for a checkpoint
        current_epoch (Optional[int]): the current epoch.
            If is_checkpoint is True and current_epoch is None, the
            checkpoint named in the config is searched for; when no name
            is configured the last checkpoint of the model is used.
        current_step (Optional[int]): the current step, forwarded to
            ``get_model_path`` together with current_epoch.

    Returns:
        Optional[str]: the path to the model or checkpoint, or None if
            nothing is found at the expected location.
    """
    # NOTE(review): the zero-padding width of the counters depends on
    # max_epochs/max_steps, which are left at their defaults here --
    # confirm this matches the values used when the checkpoint was saved.
    model_folder, model_name, path = ModelLoader.get_model_path(
        config,
        is_checkpoint,
        current_epoch,
        current_step,
    )

    # Looking for a checkpoint without knowing its epoch
    if is_checkpoint and current_epoch is None:
        # If the checkpoint is specified by name use it
        checkpoint_name = ModelLoader.get_checkpoint_name(config)
        if checkpoint_name is not None:
            checkpoint = ModelLoader.look_for_checkpoint_by_name(
                model_folder, checkpoint_name
            )
        else:
            checkpoint = ModelLoader.look_for_last_checkpoint(
                model_folder, model_name
            )
        if checkpoint is not None:
            path = os.path.join(model_folder, checkpoint)

    if path is not None and not os.path.exists(path):
        path = None

    if path is None:
        if is_checkpoint:
            checkpoint_name = ModelLoader.get_checkpoint_name(config)
            if checkpoint_name is not None:
                # use the resolved name: a global Config stores it under
                # config.trainer, so config.checkpoint_name would fail
                print(
                    f"No checkpoint found at {model_folder} "
                    f"with name {checkpoint_name}"
                )
            else:
                print(
                    f"No previous checkpoint found at "
                    f"{model_folder} for {model_name}"
                )
        else:
            print(
                f"No previous model found at "
                f"{model_folder} for model {model_name}"
            )
        return path

    if not is_checkpoint:
        print(f"Found model at {path}")
        return path

    # the name is modelname_epoch_00000001_step_00000001.pt
    # or modelname_epoch_00000001.pt; parse the file name only, so that
    # folder names cannot interfere
    file_name = os.path.basename(path)
    try:
        if "_step_" in file_name:
            epoch = int(file_name.split("_epoch_")[-1].split("_")[0])
            step = int(file_name.split("_step_")[-1].split(".")[0])
        else:
            epoch = int(file_name.split("_epoch_")[-1].split(".")[0])
            step = None
    except ValueError:
        # a checkpoint selected by name may not follow the naming scheme
        print(f"Found checkpoint at {path}")
        return path

    if step is not None:
        print(
            f"Found checkpoint for epoch {epoch + 1},"
            f" step {step + 1}..."
        )
    else:
        print(f"Found checkpoint for epoch {epoch + 1} ...")
    return path
def init_critic_from_reward(config: ConfigCritic) -> None:
    """Initialize the critic from the reward model.

    If no critic model is found on disk, the reward model file (when it
    exists) is copied to the location where the critic is expected.

    Args:
        config (ConfigCritic): the config of the critic; ``is_reward``
            must be False. The flag is flipped temporarily to resolve the
            reward paths and is always restored before returning.

    Raises:
        ValueError: if the config is flagged as a reward config
    """
    if config.is_reward is True:
        raise ValueError(
            "The config should work for the Critic model, "
            "but the config seems to be for the Reward model"
        )
    # nothing to do when a critic model is already available
    if ModelLoader.check_model_path(config) is not None:
        return
    _, _, critic_path = ModelLoader.get_model_path(config)

    print("Initializing Critic from Reward model...")
    # the same config class serves reward and critic: switch the flag to
    # resolve the reward paths, and restore it even if something fails
    config.is_reward = True
    try:
        if ModelLoader.check_model_path(config) is not None:
            _, _, reward_path = ModelLoader.get_model_path(config)
            # copy the file in reward_path to critic_path
            shutil.copy(reward_path, critic_path)
        else:
            print("Critic Model remains uninitialized")
    finally:
        config.is_reward = False
================================================
FILE: optimization/chatllama/chatllama/rlhf/reward.py
================================================
import json
import shutil
import os
import deepspeed
import torch
from accelerate import Accelerator
from beartype import beartype
from beartype.typing import Iterable, Tuple
from einops.layers.torch import Rearrange
from torch.utils.data import Dataset, DataLoader
from transformers import (
AutoModel,
AutoTokenizer,
)
from chatllama.rlhf.config import ConfigReward
from chatllama.rlhf.model_list import hf_models
from chatllama.rlhf.model_loader import ModelLoader
from chatllama.rlhf.utils import TrainingStats
class RewardModel(torch.nn.Module):
    """Model to be trained to predict the reward for RL,
    or to be used as Critic in RL. It is a Language Model with a head
    that predicts the reward (a scalar) for a given sequence of tokens.

    Attributes:
        model (torch.nn.Module): Model to be used for the reward model
        tokenizer: Tokenizer to be used for the reward model
        head (torch.nn.Module): Head to be used for the reward model
        config (ConfigReward): Config parameters for the reward model

    Methods:
        load_tokenizer: Load the tokenizer for the reward model
        forward: Forward pass of the model (used by the critic)
        save: Save the model
        load: Load the model
        get_reward: Get the reward for a given input (used by the reward
            model)
        parameters: Return the parameters of the reward model
    """

    def __init__(self, config: ConfigReward) -> None:
        """Build backbone, tokenizer and head, then load saved weights.

        Args:
            config (ConfigReward): Config parameters for the reward model

        Raises:
            ValueError: if ``config.model`` is not listed in ``hf_models``
        """
        super().__init__()

        # store config
        self.config = config

        # only HuggingFace models are supported
        if config.model not in hf_models:
            raise ValueError(f"Model {config.model} not supported")

        head_hidden_size = config.model_head_hidden_size
        self.tokenizer = self.load_tokenizer(config)
        self.model = AutoModel.from_pretrained(config.model)
        head_dim = self.model.config.hidden_size
        if config.model.startswith("gpt2"):
            head_dim = self.model.config.n_embd
        # maps every hidden state to a scalar and drops the last axis
        self.head = torch.nn.Sequential(
            torch.nn.Linear(head_dim, head_hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(head_hidden_size, 1),
            Rearrange("... 1 -> ..."),
        )

        # load previously saved weights, if any
        self.load()

        # NOTE: the backbone is not frozen, both backbone and head are
        # returned by parameters() and therefore trained

        # move model to device
        self.model.to(config.device)
        self.head.to(config.device)

    @staticmethod
    def load_tokenizer(config: ConfigReward):
        """Load the HuggingFace tokenizer for ``config.model``.

        Missing eos/pad tokens are filled in so that batches can be
        padded (on the left).

        Args:
            config (ConfigReward): Config parameters for the reward model

        Returns:
            the configured tokenizer
        """
        # load tokenizer from HF
        tokenizer = AutoTokenizer.from_pretrained(
            config.model,
            padding_side="left",
            padding=True,
            truncation=True,
            model_max_length=config.max_sequence_length,
        )
        # add eos token if not present
        if tokenizer.eos_token is None:
            # NOTE(review): an empty eos token string looks suspicious
            # (id 2 is documented below as the OPT eos id) -- confirm the
            # intended literal before relying on this fallback.
            tokenizer.eos_token = ""
            tokenizer.eos_token_id = 2  # OPT eos token id
        # add pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    @beartype
    def load(self) -> None:
        """Load the model weights from the model path, if a file exists.

        Raises:
            ValueError: if the file holds neither a "state_dict" nor a
                "model" entry for the backbone
        """
        # look for a pretrained model
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )
        if path is None:
            return

        print("Loading ...")
        # Deserialize on CPU: the file may have been written from a GPU
        # that is not available here, and load_state_dict copies the
        # tensors onto the device of the existing parameters anyway.
        model_dict = torch.load(path, map_location="cpu")
        backbone_state = model_dict.get("state_dict") or model_dict.get(
            "model"
        )
        if backbone_state is None:
            raise ValueError(
                f"No model weights found in {path}: expected a "
                f"'state_dict' or 'model' entry"
            )
        self.model.load_state_dict(backbone_state)
        self.head.load_state_dict(model_dict["head"])

    @beartype
    def save(self) -> None:
        """Save backbone and head weights to the model path"""
        # get the path to save the model
        _, _, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=False,
            current_epoch=None,
        )

        # save the model
        print(f"Saving model to {path} ...")
        torch.save(
            {"model": self.model.state_dict(), "head": self.head.state_dict()},
            path,
        )

    @beartype
    def parameters(
        self,
    ) -> Iterable[torch.nn.Parameter]:
        """Return the parameters of the reward model (backbone, then head)"""
        yield from self.model.parameters()
        yield from self.head.parameters()

    @beartype
    def forward(
        self, output_sequence: torch.Tensor, output_sequence_mask: torch.Tensor
    ) -> torch.Tensor:
        """Generate the sequence of rewards for the given output sequence,
        i.e. what is the quality of the output sequence tokens?

        Args:
            output_sequence (torch.Tensor): The sequence of tokens to be
                evaluated
            output_sequence_mask (torch.Tensor): Mask for the attention

        Returns:
            torch.Tensor: Rewards for the given output sequence, one value
                per hidden state returned by the backbone
        """
        output = self.model(
            output_sequence, attention_mask=output_sequence_mask
        )
        # What if the output_sequence is longer than the max context of
        # the model?
        rewards = self.head(output.last_hidden_state)
        if self.config.debug:
            print("RewardModel.forward")
            print("output_sequence.shape", output_sequence.shape)
            print("output_sequence", output_sequence)
            print("reward.shape", rewards.shape)
            print("reward", rewards)
        return rewards

    @beartype
    def get_reward(
        self, output_sequence: torch.Tensor, output_sequence_mask: torch.Tensor
    ) -> torch.Tensor:
        """Get the reward for the given output sequence

        Args:
            output_sequence (torch.Tensor): The concatenation of initial
                input and actor output as tokens
            output_sequence_mask (torch.Tensor): Mask for the attention

        Returns:
            torch.Tensor: the reward predicted at the last position of
                each sequence

        Raises:
            ValueError: if the sequence is longer than
                ``config.max_sequence_length``
        """
        if output_sequence.shape[1] > self.config.max_sequence_length:
            raise ValueError(
                f"Output sequence is too long: {output_sequence.shape[1]}"
                f" > {self.config.max_sequence_length}"
            )
        rewards = self.forward(output_sequence, output_sequence_mask)
        # the tokenizer pads on the left, so the last position is taken
        return rewards[:, -1]
# Alias kept only for naming consistency: the critic is the very same class
# as the reward model.
CriticModel = RewardModel
class RewardDataset(Dataset):
    """Dataset class for the reward model

    Reads a json file with the following format:
    [
        {
            "user_input": "...",
            "completion": "...",
            "score": ...
        },
        ...
    ]
    Where:
        user_input: the initial input of the user
        completion: the completion generated by the model
        score: the score given by the user to the completion (or by the
            LLM)
    """

    # score assigned to samples whose score is missing (null or "")
    DEFAULT_SCORE = 2.5

    def __init__(self, path: str) -> None:
        """Load all the samples from the json file at ``path``."""
        print(f"Loading dataset from {path}")
        # JSON files are UTF-8 encoded: do not depend on the locale default
        with open(path, "r", encoding="utf-8") as f:
            self.data = list(json.load(f))
        print(f"Loaded {len(self.data)} samples")

    def __getitem__(self, idx: int):
        """Return the pair (user_input + completion, score) of a sample."""
        sample = self.data[idx]
        text = sample["user_input"] + sample["completion"]
        raw_score = sample["score"]
        # Only null / empty scores count as missing: a plain truthiness
        # test would also replace a legitimate score of 0 with the default.
        if raw_score is None or raw_score == "":
            score = self.DEFAULT_SCORE
        else:
            score = float(raw_score)
        return (text, score)

    def __len__(
        self,
    ):
        """Return the number of samples."""
        return len(self.data)
class RewardTrainer:
    """Class to train the reward model

    Args:
        config (ConfigModel): Config parameters for the model

    Attributes:
        model (RewardModel): Reward model
        config (ConfigModel): Config parameters for the model
        optimizer (torch.optim): Optimizer for the model
        loss_function (torch.nn): Loss function for the model
        validation_flag (bool): Flag to indicate if the validation dataset
            is available
        train_dataset (RewardDataset): Dataset for training
        validation_dataset (RewardDataset): Dataset for validation
        train_dataloader (DataLoader): Dataloader for training
        validation_dataloader (DataLoader): Dataloader for validation
        scheduler (torch.optim.lr_scheduler): Scheduler for the optimizer
        training_stats (List[Dict]): List of dictionaries with the training
            statistics
        model_engine (ModelEngine): Model engine to train the model
            using deepspeed
        accelerator (Accelerator): Accelerator to train the model using
            accelerate by HF.

    Methods:
        train: Train the reward model
        save_checkpoints: Save the checkpoints of the model
        load_checkpoints: Load the checkpoints of the model
    """

    def __init__(self, config: ConfigReward) -> None:
        """Create model, optimizer, data loaders and scheduler.

        Raises:
            ValueError: if both DeepSpeed and Accelerate are enabled, or
                if DeepSpeed is enabled without a valid config path
        """
        # save the config
        self.config = config

        # load the model
        self.reward = RewardModel(config)

        # optimizer
        self.optimizer = torch.optim.AdamW(
            self.reward.parameters(), lr=config.lr
        )

        # loss function
        self.loss_function = torch.nn.MSELoss()

        # check validation dataset
        self.validation_flag = config.validation_dataset_path is not None

        # create dataset and dataloaders
        self.train_dataset = RewardDataset(config.train_dataset_path)
        self.train_dataloader = DataLoader(
            self.train_dataset, batch_size=config.batch_size
        )
        if self.validation_flag:
            self.eval_dataset = RewardDataset(config.validation_dataset_path)
            self.validation_dataloader = DataLoader(
                self.eval_dataset, batch_size=config.batch_size
            )

        # initialize scheduler - learning rate will drop to 10% of the
        # initial value. T_0 must be a positive integer: guard against
        # datasets smaller than one batch.
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer,
            T_0=max(1, len(self.train_dataset) // config.batch_size),
            T_mult=1,
            eta_min=config.lr * 0.1,
            last_epoch=-1,
        )

        # initialize training stats
        stats_path = ModelLoader.get_training_stats_path(config)
        self.training_stats = TrainingStats(stats_path)

        # consistency check between accelerate and deepspeed
        if config.accelerate_enable and config.deepspeed_enable:
            raise ValueError(
                "Both DeepSpeed and Accelerate are enabled for the Reward. "
                "Please choose one of them."
            )

        # initialize deepspeed
        self.model_engine = None
        if config.deepspeed_enable is True:
            if config.deepspeed_config_path is None:
                raise ValueError(
                    "DeepSpeed config path is None, but deepspeed is enabled"
                )
            if os.path.exists(config.deepspeed_config_path) is False:
                raise ValueError(
                    f"DeepSpeed config path {config.deepspeed_config_path} "
                    f"does not exist"
                )
            (
                self.model_engine,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            ) = deepspeed.initialize(
                args=None,
                model=self.reward,
                model_parameters=self.reward.parameters(),
                training_data=self.train_dataset,
                config=self.config.deepspeed_config_path,
            )
            print("Training with DeepSpeed")

        # initialize accelerate
        self.accelerator = None
        if config.accelerate_enable is True:
            self.accelerator = Accelerator()
            (
                self.reward,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            ) = self.accelerator.prepare(
                self.reward,
                self.optimizer,
                self.train_dataloader,
                self.scheduler,
            )
            print("Training with Accelerate")

    @beartype
    def save_checkpoint(
        self,
        current_epoch: int,
        current_step: int,
        max_epochs: int,
        max_steps: int,
    ) -> None:
        """Save the checkpoints of the model

        Args:
            current_epoch (int): Current epoch
            current_step (int): Current step
            max_epochs (int): Maximum number of epochs
            max_steps (int): Maximum number of steps
        """
        print(
            f"Saving checkpoint for epoch {current_epoch + 1}, "
            f" step {current_step} ..."
        )

        # get the path to save the checkpoint
        _, _, path = ModelLoader.get_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=current_epoch,
            current_step=current_step,
            max_epochs=max_epochs,
            max_steps=max_steps,
        )

        # remove the checkpoint if it already exists
        if os.path.exists(path):
            if self.config.deepspeed_enable:
                shutil.rmtree(path)
            else:
                os.remove(path)

        # save the checkpoint
        if self.config.deepspeed_enable:
            client_state = {
                "epoch": current_epoch,
                "step": current_step,
            }
            self.model_engine.save_checkpoint(path, client_state=client_state)
        else:
            torch.save(
                {
                    "state_dict": self.reward.model.state_dict(),
                    # the head is trained as well: without it a resumed
                    # run would restart from a randomly initialized head
                    "head": self.reward.head.state_dict(),
                    "optim_state_dict": self.optimizer.state_dict(),
                    "scheduler_state_dict": self.scheduler.state_dict(),
                    "training_stats": self.training_stats,
                    "epoch": current_epoch,
                    "step": current_step,
                },
                path,
            )

    @beartype
    def load_checkpoint(
        self,
    ) -> Tuple[int, int]:
        """Load the checkpoints of the model

        Returns:
            Tuple[int, int]: The current epoch and step
                from which you should resume the training
        """
        print("Looking for checkpoints...")

        # look for the checkpoints
        path = ModelLoader.check_model_path(
            config=self.config,
            is_checkpoint=True,
            current_epoch=None,
        )

        # no checkpoint: start from scratch
        if path is None:
            return 0, 0

        print("Loading ...")
        if self.config.deepspeed_enable:
            # try to load the checkpoint
            try:
                _, client_state = self.model_engine.load_checkpoint(path)
            except Exception as err:
                # best effort: an unreadable checkpoint restarts training
                print(
                    f"Checkpoint corrupted! ({err}) "
                    "Try to remove the last checkpoint. "
                    "Now Starting from epoch 0, step 0"
                )
                return 0, 0
            # load epoch and step to resume loops
            epoch = client_state["epoch"]
            step = client_state["step"]
        else:
            # try to load the checkpoint
            try:
                checkpoint = torch.load(path)
            except Exception as err:
                # best effort: an unreadable checkpoint restarts training
                print(
                    f"Checkpoint corrupted! ({err}) "
                    "Try to remove the last checkpoint. "
                    "Now Starting from epoch 0, step 0"
                )
                return 0, 0
            # load the model parameters and optimizer parameters
            # from the checkpoint
            epoch = checkpoint["epoch"]
            self.reward.model.load_state_dict(checkpoint["state_dict"])
            # checkpoints written before the head was stored lack this key
            if "head" in checkpoint:
                self.reward.head.load_state_dict(checkpoint["head"])
            self.optimizer.load_state_dict(checkpoint["optim_state_dict"])
            self.scheduler.load_state_dict(
                checkpoint["scheduler_state_dict"]
            )
            self.training_stats = checkpoint["training_stats"]
            step = checkpoint["step"]
        return epoch, step + 1  # return the next step to train

    def train(
        self,
    ) -> None:
        """Train the reward model"""
        print("Start Training the Reward Model")

        # get config parameters
        if self.config.deepspeed_enable:
            batch_size = self.train_dataloader.batch_size
        else:
            batch_size = self.config.batch_size
        epochs = self.config.epochs
        device = self.config.device
        iteration_per_print = self.config.iteration_per_print
        checkpoint_steps = self.config.checkpoint_steps

        # compute the number of iterations
        n_iter = int(len(self.train_dataset) / batch_size)

        # load checkpoint
        start_epoch, start_step = self.load_checkpoint()

        # counter for the checkpoint
        cnt_checkpoints = 1

        # training loop
        for epoch in range(start_epoch, epochs):
            self.reward.train()
            for i, inputs in enumerate(self.train_dataloader):

                # skip the steps if resuming from a checkpoint
                if i < start_step:
                    continue

                # get the inputs
                input_text = inputs[0]
                score = inputs[1]

                # tokenize the input
                with torch.no_grad():
                    input_tokens = self.reward.tokenizer(
                        input_text,
                        return_tensors="pt",
                        truncation=True,
                        padding=True,
                    )
                    output = torch.as_tensor(
                        score, dtype=torch.float32, device=device
                    )

                # forward pass
                if self.config.deepspeed_enable:
                    est_output = self.model_engine(
                        input_tokens["input_ids"].to(device),
                        input_tokens["attention_mask"].to(device),
                    )[:, -1]
                else:
                    est_output = self.reward.get_reward(
                        input_tokens["input_ids"].to(device),
                        input_tokens["attention_mask"].to(device),
                    )

                # compute the loss
                loss = self.loss_function(est_output, output)
                self.training_stats.training_loss.append(loss.item())

                # backward pass
                if self.config.deepspeed_enable:
                    self.model_engine.backward(loss)
                    self.model_engine.step()
                elif self.config.accelerate_enable:
                    self.optimizer.zero_grad()
                    self.accelerator.backward(loss)
                    self.optimizer.step()
                    self.scheduler.step()
                else:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    self.scheduler.step()

                # print progress
                if i % iteration_per_print == 0:
                    print(
                        f"Epoch: {epoch+1}/{epochs}, "
                        f"Iteration: {i+1}/{n_iter}, "
                        f"Training Loss: {loss.item()}"
                    )
                    printed_est_output = [
                        round(float(x), 1) for x in est_output.cpu().tolist()
                    ]
                    print(
                        "prediction",
                        printed_est_output,
                        "target",
                        score.cpu().tolist(),
                    )

                # checkpoints saving
                if cnt_checkpoints % checkpoint_steps == 0:
                    self.save_checkpoint(epoch, i, epochs, n_iter)
                    cnt_checkpoints = 1
                else:
                    cnt_checkpoints += 1

            # Validation
            if self.validation_flag:
                self.reward.eval()
                n_val_iter = len(self.validation_dataloader)
                with torch.no_grad():
                    for i, (text, score) in enumerate(
                        self.validation_dataloader
                    ):
                        # tokenize inputs; truncate as in training so that
                        # long samples do not exceed max_sequence_length
                        input_tokens = self.reward.tokenizer(
                            text,
                            return_tensors="pt",
                            truncation=True,
                            padding=True,
                        )
                        input_tokens = input_tokens.to(device)
                        # score is already collated into a tensor
                        output = torch.as_tensor(
                            score, dtype=torch.float32, device=device
                        )

                        # forward pass
                        est_output = self.reward.get_reward(
                            input_tokens["input_ids"],
                            input_tokens["attention_mask"],
                        )

                        # compute loss
                        loss = self.loss_function(est_output, output)
                        self.training_stats.validation_loss.append(
                            loss.item()
                        )

                        # print progress
                        if i % iteration_per_print == 0:
                            print(
                                f"Epoch: {epoch+1}/{epochs}, "
                                f"Iteration: {i+1}/{n_val_iter}, "
                                f"Validation Loss: {loss.item()}"
                            )

            # reset start_step after training is resumed
            start_step = 0

        # save the model at the end of the training
        self.reward.save()
================================================
FILE: optimization/chatllama/chatllama/rlhf/trainer.py
================================================
import json
import os
import random
from collections import deque, namedtuple
import deepspeed
import torch
import torch.distributed as dist
from accelerate import Accelerator
from beartype import beartype
from beartype.typing import Deque, List, Tuple, Union
from deepspeed.runtime.engine import DeepSpeedEngine
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from chatllama.rlhf.actor import ActorModel
from chatllama.rlhf.config import (
Config,
ConfigActor,
ConfigCritic,
ConfigReward,
)
from chatllama.rlhf.model_list import hf_models
from chatllama.rlhf.model_loader import ModelLoader
from chatllama.rlhf.reward import RewardModel, CriticModel
from chatllama.rlhf.utils import TrainingStats, ConversationLog
"""
train()
┌─────────────────────────────┐
│ │◄─────────────────────────┐
│ │ │
│ ┌─────────────┐ │ │
│ │ user input │ │ │ learn()
│ └─────┬───────┘ │ ┌────────────┴─────────────┐
│ │ │ │ │
│ │ │ │ ┌────────┐ │
│ │ │ │ ┌───│ Update │──┐ │
│ │ │ │ │ └────▲───┘ │ │
│ ┌────────▼────────────┐ │ │ │ │ │ │
│ │ Actor (LLM Model) │ │ │ │ ┌──┴───┐ │ │
│ └────────┬────────────┘ │ │ │ │ PPO │ │ │
│ │ │ │ │ └▲────▲┘ │ │
│ │ │ │ │ │ │ │ │
│ │ │ │ │ │ │ │ │
│ ┌───────▼──────┐ │ │ ┌─▼──────┴┐ ┌─┴───▼──┐ │
│ │ Reward Model │ │ │ │ Actor │ │ Critic │ │
│ └──────────────┘ │ │ └─────────┘ └────────┘ │
│ │ │ │
│ │ x Episodes └─────────────▲────────────┘
└───────────────┬─────────────┘ │ x Epochs
│ store N Examples per Timestep │
┌──────▼──────┐ │
│ │ │
│ Memories ├──────────────────────────────────┘
│ │ (update timesteps x N Examples)
└─────────────┘
""" # noqa W291
def change_tokenization(tokens, tokenizer1, tokenizer2):
    """Change the tokenizer of the tokens

    Every sequence is decoded with ``tokenizer1``, cleaned of its pad and
    eos markers and encoded again, as a padded batch, with ``tokenizer2``.

    Args:
        tokens (torch.Tensor): Tokens to be changed
        tokenizer1 (transformers.PreTrainedTokenizer): Tokenizer to be
            changed
        tokenizer2 (transformers.PreTrainedTokenizer): Tokenizer to be
            changed to

    Returns:
        encoded_tokens: Encoded tokens, as returned by ``tokenizer2``
    """
    # Markers to strip, pad first and then eos. Unset (None) or empty
    # markers are skipped: str.replace does not accept None.
    special_tokens = [
        marker
        for marker in (tokenizer1.pad_token, tokenizer1.eos_token)
        if marker
    ]

    # decode tokens
    with torch.no_grad():
        decoded_tokens = [tokenizer1.decode(token) for token in tokens]

    # remove all the pad and eos tokens
    cleaned_tokens = []
    for text in decoded_tokens:
        for marker in special_tokens:
            text = text.replace(marker, "")
        cleaned_tokens.append(text)

    # encode the text with the target tokenizer
    encoded_tokens = tokenizer2(
        cleaned_tokens,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return encoded_tokens
# Union of the per-model config classes (actor, reward, critic) used in the
# annotations of this module.
ConfigType = Union[ConfigActor, ConfigReward, ConfigCritic]
@beartype
def check_model_family(config1: ConfigType, config2: ConfigType) -> bool:
    """Tell whether two configs refer to models of the same family.

    The family is derived from ``config.model``: for models listed in
    ``hf_models`` it is the part of the name (after the ``/`` prefix, if
    any) that precedes the first ``-``.

    Args:
        config1 (ConfigType): First config
        config2 (ConfigType): Second config

    Returns:
        bool: True if the model family is the same, False otherwise
    """
    first_is_hf = config1.model in hf_models
    second_is_hf = config2.model in hf_models

    # one hugging face model and one that is not: different families
    if first_is_hf != second_is_hf:
        return False

    # neither is a hugging face model: for now they could be only LLaMA
    # models
    if not first_is_hf:
        return True

    # both are hugging face models: compare the family prefixes
    families = []
    for name in (config1.model, config2.model):
        # if there is a "/" keep only what follows it
        if "/" in name:
            name = name.split("/")[1]
        families.append(name.split("-")[0])
    return families[0] == families[1]
class ActorCritic(torch.nn.Module):
"""Actor Critic class stores both the actor and the critic models
and it generates values and action for given sequences during the training
of the actor.
Attributes:
actor (ActorModel): Actor model
critic (CriticModel): Critic model
debug (bool): enable prints for Debugging
use_same_tokenizer (bool): if True the actor and critic use the same
tokenizer
Methods:
forward: given a sequence returns action logits and values (used
to evaluate the actor during training)
generate: given a sequence returns action, action logits, values
sequences and sequences masks (used to generate new sequences
during acting phase)
"""
def __init__(self, config: Config) -> None:
    """Create the actor and the critic from the global config.

    Args:
        config (Config): global config holding the actor and the critic
            configs
    """
    super().__init__()
    self.config = config

    # plain flags: debug prints, and whether actor and critic share the
    # same tokenizer (False at construction time)
    self.debug = config.actor.debug
    self.use_same_tokenizer = False

    self.actor = ActorModel(config.actor)

    # the critic may have to be initialized from the reward model: this
    # must happen before the critic is built
    ModelLoader.init_critic_from_reward(config.critic)
    self.critic = CriticModel(config.critic)
@beartype
def load(self) -> None:
    """Load the model from the path.

    Intentionally a no-op: the actor and the critic load their own
    weights from their respective paths inside their ``__init__``.
    """
    return None
@beartype
def save(self) -> None:
    """Save the actor and the critic obtained from RLHF.

    The actor path is resolved from the global config instead of the
    actor config, so the RLHF result lands in the actor_rl folder rather
    than in the folder used by ``actor.save()``. The critic is saved at
    the path resolved from the critic config.
    """
    # actor: resolved through the global config
    _, _, actor_path = ModelLoader.get_model_path(
        config=self.config,
        is_checkpoint=False,
    )
    print(f"Saving model to {actor_path} ...")
    actor_weights = {"state_dict": self.actor.model.state_dict()}
    torch.save(actor_weights, actor_path)

    # critic: backbone and head are stored together
    _, _, critic_path = ModelLoader.get_model_path(
        config=self.config.critic,
        is_checkpoint=False,
    )
    print(f"Saving model to {critic_path} ...")
    critic_weights = {
        "model": self.critic.model.state_dict(),
        "head": self.critic.head.state_dict(),
    }
    torch.save(critic_weights, critic_path)
def save_deepspeed(
    self,
    model_engine: DeepSpeedEngine,
    config: ConfigType,
    client_state: dict = None,
):
    """Save a DeepSpeed model engine at the model path of ``config``.

    Args:
        model_engine (DeepSpeedEngine): the engine to be saved
        config (ConfigType): config used to resolve the destination
        client_state (dict): extra state stored with the checkpoint; an
            empty dict is used when nothing (or an empty value) is given
    """
    # destination resolved from the config (final model, not checkpoint)
    _, _, destination = ModelLoader.get_model_path(
        config=config,
        is_checkpoint=False,
    )
    print(f"Saving model to {destination} ...")
    extra_state = client_state or {}
    model_engine.save_checkpoint(
        save_dir=destination, client_state=extra_state
    )
@beartype
def forward(
    self,
    sequences_actor: torch.Tensor,
    sequences_mask_actor: torch.Tensor,
    sequences_critic: torch.Tensor,
    sequences_mask_critic: torch.Tensor,
    action_len_actor: int,
    action_len_critic: int,
) -> Tuple:
    """Given the whole sequences, use the actor forward to get the logits
    for each token in the sequence and the critic forward to get the
    values for each generation step.

    Args:
        sequences_actor (torch.Tensor): Sequences composed of
            [states, actions] for the actor
        sequences_mask_actor (torch.Tensor): Mask for the sequences
            of the actor
        sequences_critic (torch.Tensor): Sequences composed of
            [states, actions] for the critic
        sequences_mask_critic (torch.Tensor): Mask for the sequences
            of the critic
        action_len_actor (int): Length of the actions in the sequences
            for the actor
        action_len_critic (int): Length of the actions in the sequences
            for the critic

    Returns:
        action_logits (torch.Tensor): Logits for the actions in the
            sequences
        values (torch.Tensor): Values for the actions in the sequences
    """
    # use a single forward on the whole sequence
    # to get pi(y | x) and ignore predicted output
    actions_logits = self.actor.forward(
        sequences_actor, sequences_mask_actor
    )

    # use the critic forward to get the values for the actions
    values = self.critic.forward(sequences_critic, sequences_mask_critic)

    # return only logits and values for the actions taken, i.e. the last
    # action_len positions. Explicit start indices are used because a
    # negative-zero slice (`-0:`) would return the whole sequence when the
    # action length is 0.
    actor_start = max(actions_logits.shape[1] - action_len_actor, 0)
    critic_start = max(values.shape[1] - action_len_critic, 0)
    real_actions_logits = actions_logits[:, actor_start:, :]
    real_values = values[:, critic_start:]

    if self.debug:
        print("ActorCritic.forward")
        print("action_len_actor", action_len_actor)
        print("action_len_critic", action_len_critic)
        print("sequences_actor.shape", sequences_actor.shape)
        print("sequences_actor", sequences_actor)
        print("sequences_critic.shape", sequences_critic.shape)
        print("sequences_critic", sequences_critic)
        # show the sliced tensors that are actually returned
        print("real_action_logits.shape", real_actions_logits.shape)
        print("real_action_logits", real_actions_logits)
        print("real_values.shape", real_values.shape)
        print("real_values", real_values)

    return (
        real_actions_logits,
        real_values,
    )
@torch.no_grad()
@beartype
def generate(
    self,
    states_actor: torch.Tensor,
    states_mask_actor: torch.Tensor,
    states_critic: torch.Tensor,
) -> Tuple:
    """Generate actions with the actor and evaluate them with the critic.

    Args:
        states_actor (torch.Tensor): States tokenized for the actor
        states_mask_actor (torch.Tensor): Mask of the states for the actor
        states_critic (torch.Tensor): States tokenized for the critic
    Returns:
        Tuple made of, in this order:
            actions: Actions returned by the actor generation
            actions_logits: Logits returned by `forward` for the actor
                sequences
            values: Values returned by `forward` for the critic sequences
            sequences_actor: Sequences returned by the actor generation
            sequences_mask_actor: Mask that is 0 where the actor sequence
                holds the actor pad token and 1 elsewhere
            sequences_critic: Sequences given to the critic
            sequences_mask_critic: Mask of the critic sequences
            action_len_actor (int): Second dimension of `actions`
            action_len_critic (int): Action length used for the critic
    """
    # let the actor produce the actions and the complete sequences
    actions, sequences_actor = self.actor.generate(
        states_actor, states_mask_actor
    )
    target_device = sequences_actor.device

    # every position that is not a pad token is kept by the mask
    is_not_pad = sequences_actor != self.actor.tokenizer.pad_token_id
    sequences_mask_actor = is_not_pad.to(target_device).long().detach()

    # number of generated tokens
    action_len_actor = actions.shape[1]

    if not self.use_same_tokenizer:
        # the critic has its own tokenizer: re-encode the actor sequences
        encoded_critic = change_tokenization(
            sequences_actor,
            self.actor.tokenizer,
            self.critic.tokenizer,
        )
        sequences_critic = encoded_critic["input_ids"].to(target_device)
        critic_attention = encoded_critic["attention_mask"]
        sequences_mask_critic = (
            critic_attention.to(target_device).long().detach()
        )
        # NOTE(review): this is the length of the critic *states*, while
        # the name suggests the length of the actions — confirm that this
        # is the intended value.
        action_len_critic = states_critic.shape[1]
    else:
        # same tokenizer: the critic reuses the actor tensors as they are
        sequences_critic = sequences_actor
        sequences_mask_critic = sequences_mask_actor
        action_len_critic = action_len_actor

    # single forward pass to get the logits and the values
    actions_logits, values = self.forward(
        sequences_actor,
        sequences_mask_actor,
        sequences_critic,
        sequences_mask_critic,
        action_len_actor,
        action_len_critic,
    )

    if self.debug:
        print("ActorCritic.generate")
        debug_items = (
            ("actions shape", actions.shape),
            ("actions", actions),
            ("sequence shape", sequences_actor.shape),
            ("sequence", sequences_actor),
            ("actions_logits shape", actions_logits.shape),
            ("actions_logits", actions_logits),
            ("values shape", values.shape),
            ("values", values),
        )
        for label, value in debug_items:
            print(label, value)

    return (
        actions,
        actions_logits,
        values,
        sequences_actor,
        sequences_mask_actor,
        sequences_critic,
        sequences_mask_critic,
        action_len_actor,
        action_len_critic,
    )
# Record type holding everything that is stored for a single experience;
# the field order is the order used when the memories are built and read.
Memory = namedtuple(
    "Memory",
    (
        "states_actor actions values rewards actions_log_probs "
        "sequences_actor sequences_mask_actor "
        "sequences_critic sequences_mask_critic "
        "action_len_actor action_len_critic"
    ),
)
class ExperienceDataset(Dataset):
    """Dataset wrapping the collected memories to train the actor-critic"""

    def __init__(
        self,
        memories: Deque[Memory],
        device: torch.device,
    ) -> None:
        super().__init__()
        # copy the deque into a list; `device` is accepted but not used here
        self.data = [*memories]

    def __len__(
        self,
    ) -> int:
        return len(self.data)

    def __getitem__(self, idx) -> Tuple:
        # expose the idx-th memory as a plain tuple, fields in Memory order,
        # with the two action lengths converted to int
        memory = self.data[idx]
        return (
            memory.states_actor,
            memory.actions,
            memory.values,
            memory.rewards,
            memory.actions_log_probs,
            memory.sequences_actor,
            memory.sequences_mask_actor,
            memory.sequences_critic,
            memory.sequences_mask_critic,
            int(memory.action_len_actor),
            int(memory.action_len_critic),
        )
class ExamplesSampler:
    """Store the prompts to be sampled to generate the examples.

    The prompts are read from a json file with the following format:
    [
        {
            "user_input" : "",
        } ,
        ...
    ]
    Where:
        user_input: is the input of the user or directly the input of the user
            with the memory preappended (i.e. user_input + memory)

    Attributes:
        path (str): Path of the json file the prompts were read from
        data (List): The "user_input" value of every entry of the file
    """

    def __init__(
        self,
        path: str,
    ) -> None:
        """Read the prompts stored in the json file at `path`.

        Args:
            path (str): Path of the json file with the examples
        """
        self.path = path
        # read the file as UTF-8 explicitly instead of relying on the
        # locale-dependent default encoding of open()
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.data = [d["user_input"] for d in data]

    def sample(self, n: int) -> List:
        """Sample n examples from the data, without replacement.

        Args:
            n (int): Number of examples to sample
        Returns:
            List: n distinct entries of the stored prompts
        Raises:
            ValueError: If n is negative or larger than the number of
                stored prompts (raised by `random.sample`)
        """
        return random.sample(self.data, n)
class RLTrainer:
"""Train the actor-critic model using RL
Attributes:
config (Config): Configuration of the trainer
debug (bool): Debug mode
actorcritic (ActorCritic): Actor-critic model
actor_optimizer (torch.optim): Optimizer for the actor
critic_optimizer (torch.optim): Optimizer for the critic
actor_scheduler (torch.optim.lr_scheduler): Scheduler for the actor
critic_scheduler (torch.optim.lr_scheduler): Scheduler for the critic
reward (RewardModel): Reward model
training_stats (TrainingStats): Class to store training stats
conversation_log (ConversationLog): Class to store the conversation
example_sampler (ExamplesSampler): Class to sample examples
eps (float): small epsilon to avoid division by zero
Methods:
train: the training loop that calls the learn function after generating
the experiences.
learn: Learn from a batch of experiences and update the actor and the
critic model.
load_checkpoint: Load the checkpoint of the actor-critic model
save_checkpoint: Save the checkpoint of the actor-critic model
"""
def __init__(
    self,
    config: Config,
) -> None:
    """Create the models, optimizers and helpers used by the RL training.

    Args:
        config (Config): Configuration of the training; the `trainer`,
            `actor`, `critic` and `reward` sections are read here.
    Raises:
        ValueError: Raised by `initialize_deepspeed_model` when deepspeed
            is enabled for a model whose deepspeed config path is missing.
    """
    # save config
    self.config = config

    # set debug mode
    self.debug = config.trainer.debug

    # initialize agent-critic
    self.actorcritic = ActorCritic(config)

    # initialize actor optimizer
    self.actor_optimizer = torch.optim.Adam(
        self.actorcritic.actor.parameters(), lr=config.trainer.actor_lr
    )

    # initialize critic optimizer
    self.critic_optimizer = torch.optim.Adam(
        self.actorcritic.critic.parameters(), lr=config.trainer.critic_lr
    )

    # schedulers are created in learn(), where the dataset size is known
    self.actor_scheduler = None
    self.critic_scheduler = None

    # initialize reward model
    self.reward = RewardModel(config.reward)

    # initialize class to store training stats
    path = ModelLoader.get_training_stats_path(config)
    self.training_stats = TrainingStats(path)

    # the conversation log is stored in the folder of the checkpoints
    model_folder, _, _ = ModelLoader.get_model_path(
        config,
        is_checkpoint=True,
    )
    path = os.path.join(model_folder, "conversations_log.json")
    self.conversation_log = ConversationLog(path)

    # initialize examples sampler
    self.example_sampler = ExamplesSampler(config.trainer.examples_path)

    # check if actor and critic use the same tokenizer
    self.actorcritic.use_same_tokenizer = check_model_family(
        config.actor, config.critic
    )

    # check if actor and reward use the same tokenizer
    self.use_same_tokenizer = check_model_family(
        config.actor, config.reward
    )

    # eps
    self.eps = 1e-8

    # deepspeed initialization
    self.actor_model_engine = None
    self.critic_model_engine = None
    self.is_deepspeed_init = None
    # the distributed backend is needed as soon as ANY of the three models
    # (actor, critic or reward) is wrapped by deepspeed below
    if (
        self.config.actor.deepspeed_enable
        or self.config.critic.deepspeed_enable
        or self.config.reward.deepspeed_enable
    ):
        deepspeed.init_distributed("nccl")
        self.is_deepspeed_init = True
        os.environ["TOKENIZERS_PARALLELISM"] = "False"
    else:
        self.is_deepspeed_init = False

    if self.config.actor.deepspeed_enable:
        (
            self.actor_model_engine,
            self.actorcritic.actor,
            self.actor_optimizer,
        ) = self.initialize_deepspeed_model(
            config=self.config.actor, model=self.actorcritic.actor
        )

    if self.config.critic.deepspeed_enable:
        (
            self.critic_model_engine,
            self.actorcritic.critic,
            self.critic_optimizer,
        ) = self.initialize_deepspeed_model(
            config=self.config.critic, model=self.actorcritic.critic
        )

    if self.config.reward.deepspeed_enable:
        # only the wrapped module is kept for the reward model: its engine
        # and optimizer are discarded
        (
            _,
            self.reward,
            _,
        ) = self.initialize_deepspeed_model(
            config=self.config.reward, model=self.reward
        )
@staticmethod
def initialize_deepspeed_model(
    config: Union[ConfigActor, ConfigCritic, ConfigReward],
    model: torch.nn.Module,
):
    """Wrap a model with a deepspeed engine.

    Args:
        config: Configuration of the model; `deepspeed_config_path` is
            the only field read here.
        model (torch.nn.Module): Model to be handled by deepspeed
    Returns:
        Tuple of (model_engine, model_engine.module, ds_optimizer) as
        produced by `deepspeed.initialize`.
    Raises:
        ValueError: If the deepspeed config path is None or does not
            point to an existing path.
    """
    if config.deepspeed_config_path is None:
        raise ValueError(
            "DeepSpeed config path is None, but deepspeed is enabled"
        )
    if not os.path.exists(config.deepspeed_config_path):
        # keep the path separated from the surrounding words so that the
        # message stays readable
        raise ValueError(
            f"DeepSpeed config path "
            f"{config.deepspeed_config_path} "
            f"does not exist"
        )
    model_engine, ds_optimizer, _, _ = deepspeed.initialize(
        args=None,
        model=model,
        model_parameters=model.parameters(),
        config=config.deepspeed_config_path,
    )
    # model_engine.module has to be returned to make custom methods
    # of Module accessible
    return model_engine, model_engine.module, ds_optimizer
@beartype
def save_checkpoint(
    self,
    current_episode: int,
    max_episode: int,
) -> None:
    """Save a checkpoint of the critic and a checkpoint of the actor.

    Each model is saved through its own deepspeed engine when deepspeed
    is enabled for that model, with `torch.save` otherwise.

    Args:
        current_episode (int): Episode stored in the checkpoints
        max_episode (int): Total number of episodes, forwarded to
            `ModelLoader.get_model_path` to get the checkpoint path
    """
    print(f"Saving checkpoint for episode {current_episode+1}..")

    # ---------------------------- critic ----------------------------
    # get the path to save the checkpoint for the critic
    _, _, path = ModelLoader.get_model_path(
        config=self.config.critic,
        is_checkpoint=True,
        current_epoch=current_episode,
        max_epochs=max_episode,
        max_steps=0,
    )

    # if the checkpoint already exists remove it.
    # Deepspeed checkpoints are already directories and will be overwritten
    if os.path.exists(path) and not self.is_deepspeed_init:
        os.remove(path)

    critic_checkpoint_dict = {
        "episode": current_episode,
        "critic_state_dict": self.actorcritic.critic.state_dict(),
        "critic_optim_state_dict": self.critic_optimizer.state_dict(),
    }
    if self.config.critic.deepspeed_enable:
        # The model and optimizer state dicts are actually already saved
        # in the deepspeed model engine. But to make sure no depending
        # methods fail, the states are included in the client state.
        # ATTENTION: If you use deepspeed zero optimization, the
        # client_state will not be saved
        self.critic_model_engine.save_checkpoint(
            save_dir=path, client_state=critic_checkpoint_dict
        )
    else:
        torch.save(critic_checkpoint_dict, path)

    # ----------------------------- actor ----------------------------
    # get the path to save the checkpoint for the actor
    _, _, path = ModelLoader.get_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=current_episode,
        max_epochs=max_episode,
        max_steps=0,
    )

    # if the checkpoint already exists remove it.
    # Deepspeed checkpoints are already directories and will be overwritten
    if os.path.exists(path) and not self.is_deepspeed_init:
        os.remove(path)

    actor_checkpoint_dict = {
        "episode": current_episode,
        "actor_state_dict": self.actorcritic.actor.state_dict(),
        "actor_optim_state_dict": self.actor_optimizer.state_dict(),
        "training_stats": self.training_stats,
    }
    if self.config.actor.deepspeed_enable:
        # same remarks as for the critic: the client state is a copy kept
        # for the methods that read the checkpoint dictionary and it is
        # not saved with deepspeed zero optimization
        self.actor_model_engine.save_checkpoint(
            save_dir=path, client_state=actor_checkpoint_dict
        )
    else:
        torch.save(actor_checkpoint_dict, path)
@beartype
def load_checkpoint(
    self,
) -> int:
    """Load the latest checkpoints of the critic and of the actor, if any.

    Returns:
        int: Episode from which the training has to start. It is 0 when a
            checkpoint cannot be read; otherwise it is the smallest
            episode found in the two checkpoints plus one, where a missing
            checkpoint counts as episode -1.
    """
    episode_of_critic = -1
    episode_of_actor = -1

    # ---------------------------- critic ----------------------------
    print("Looking for checkpoints...")
    critic_ckpt_path = ModelLoader.check_model_path(
        config=self.config.critic,
        is_checkpoint=True,
        current_epoch=None,
    )
    if critic_ckpt_path is not None:
        print("Loading ...")
        try:
            critic_ckpt = torch.load(critic_ckpt_path)
        except Exception:
            print(
                "Checkpoint of critic corrupted!"
                "Try to remove the last checkpoint."
                "Now Starting from episode 0"
            )
            return 0
        # restore the critic and its optimizer
        episode_of_critic = critic_ckpt["episode"]
        self.actorcritic.critic.load_state_dict(
            critic_ckpt["critic_state_dict"]
        )
        self.critic_optimizer.load_state_dict(
            critic_ckpt["critic_optim_state_dict"]
        )

    # ----------------------------- actor ----------------------------
    print("Looking for checkpoints...")
    actor_ckpt_path = ModelLoader.check_model_path(
        config=self.config,
        is_checkpoint=True,
        current_epoch=None,
    )
    if actor_ckpt_path is not None:
        print("Loading ...")
        try:
            actor_ckpt = torch.load(actor_ckpt_path)
        except Exception:
            print(
                "Checkpoint of actor corrupted!"
                "Try to remove the last checkpoint."
                "Now Starting from episode 0"
            )
            return 0
        # restore the actor, its optimizer and the training stats
        episode_of_actor = actor_ckpt["episode"]
        self.actorcritic.actor.load_state_dict(
            actor_ckpt["actor_state_dict"]
        )
        self.actor_optimizer.load_state_dict(
            actor_ckpt["actor_optim_state_dict"]
        )
        self.training_stats = actor_ckpt["training_stats"]

    # the two checkpoints are expected to refer to the same episode
    if episode_of_critic != episode_of_actor:
        print(
            f"There are some discrepancies between the checkpoints"
            f"of actor and critic \nactor episode: {episode_of_actor}"
            f"\n critic episode: {episode_of_critic}\n"
        )
        return min(episode_of_critic, episode_of_actor) + 1

    # all ok start from next episode
    return episode_of_critic + 1
@beartype
def learn(self, memories: Deque[Memory]) -> None:
    """Train the actor-critic model on the collected memories using PPO.

    For each batch of memories the logits of the actor and the values of
    the critic are computed again and compared with the ones stored in
    the memories: the actor is updated with the clipped surrogate loss
    (plus the entropy and KL terms) and the critic with the clipped value
    loss. Both losses are appended to the training stats.

    Args:
        memories (Deque[Memory]): Experiences to learn from
    Raises:
        ValueError: If one of the losses is NaN, or if the memories of a
            batch do not share the same action length.
    """
    print("Start to Learn...")

    def _batch_action_len(lengths: torch.Tensor) -> int:
        # Collapse the per-memory action lengths of a batch into one int;
        # a plain `.item()` only works for batches of a single memory.
        flat = lengths.reshape(-1)
        if not bool((flat == flat[0]).all()):
            raise ValueError(
                "All the memories of a batch must share the same "
                "action length"
            )
        return int(flat[0].item())

    # get parameters
    epochs = self.config.trainer.epochs
    actor_eps_clip = self.config.trainer.actor_eps_clip
    critic_eps_clip = self.config.trainer.critic_eps_clip
    beta_s = self.config.trainer.beta_s
    batch_size = self.config.trainer.batch_size
    device = (
        torch.device(f"cuda:{dist.get_rank()}")
        if self.is_deepspeed_init
        else self.config.trainer.device
    )

    # create dataset from memories
    dataset = ExperienceDataset(memories, device)
    engine = self.actor_model_engine
    if engine is None:
        engine = self.critic_model_engine
    if self.is_deepspeed_init and engine is not None:
        dataloader = engine.deepspeed_io(dataset)
    else:
        # also used when deepspeed wraps neither the actor nor the critic
        dataloader = DataLoader(dataset, batch_size=batch_size)

    # initialize the schedulers for actor and critic.
    # This lr_scheduler is not available in deepspeed
    # see https://deepspeed.readthedocs.io/en/latest/schedulers.html
    actor_lr = self.config.trainer.actor_lr
    critic_lr = self.config.trainer.critic_lr
    if not self.is_deepspeed_init:
        self.actor_scheduler = CosineAnnealingWarmRestarts(
            self.actor_optimizer, T_0=len(dataset), eta_min=actor_lr * 0.1
        )
        self.critic_scheduler = CosineAnnealingWarmRestarts(
            self.critic_optimizer, T_0=len(dataset), eta_min=critic_lr * 0.1
        )

    # initialize actor accelerate
    if self.config.actor.accelerate_enable:
        actor_accelerator = Accelerator()
        # prepare the dataloader built above: no `train_dataloader`
        # attribute exists before this point
        (
            actor_model,
            self.actor_optimizer,
            dataloader,
            self.actor_scheduler,
        ) = actor_accelerator.prepare(
            self.actorcritic.actor,
            self.actor_optimizer,
            dataloader,
            self.actor_scheduler,
        )
        self.train_dataloader = dataloader
        self.actorcritic.actor = actor_model

    # initialize critic accelerate
    if self.config.critic.accelerate_enable:
        critic_accelerator = Accelerator()
        (
            critic_model,
            self.critic_optimizer,
            self.critic_scheduler,
        ) = critic_accelerator.prepare(
            self.actorcritic.critic,
            self.critic_optimizer,
            self.critic_scheduler,
        )
        self.actorcritic.critic = critic_model

    # train agent-critic
    self.actorcritic.train()
    for epoch in range(epochs):
        for k, batch in enumerate(dataloader):
            (
                states_actor,
                old_actions,
                old_values,
                rewards,
                old_actions_log_probs,
                sequences_actor,
                sequences_mask_actor,
                sequences_critic,
                sequences_mask_critic,
                action_len_actor,
                action_len_critic,
            ) = [tensor.to(device) for tensor in batch]

            if self.debug:
                print(
                    f"#########################################"
                    f" batch from memories {k} \n "
                    f"#########################################"
                    f"states_actor {states_actor.shape} \n"
                    f"old_actions {old_actions.shape} \n"
                    f"old_values {old_values.shape} \n"
                    f"rewards {rewards.shape} \n"
                    f"old_actions_log_probs "
                    f"{old_actions_log_probs.shape}\n"
                    f"sequences_actor {sequences_actor.shape} \n"
                    f"sequences_mask_actor "
                    f"{sequences_mask_actor.shape} \n"
                    f"sequences_critic {sequences_critic.shape} \n"
                    f"sequences_mask_critic "
                    f"{sequences_mask_critic.shape} \n"
                    f"action_len_actor {action_len_actor} \n"
                    f"action_len_critic {action_len_critic} \n"
                    f"#########################################"
                )

            # get actor critic new probabilities and values
            actions_logits, values = self.actorcritic.forward(
                sequences_actor,
                sequences_mask_actor,
                sequences_critic,
                sequences_mask_critic,
                _batch_action_len(action_len_actor),
                _batch_action_len(action_len_critic),
            )

            # get action log prob
            actions_prob = (
                torch.softmax(actions_logits, dim=-1).max(dim=-1).values
            )
            actions_log_prob = torch.log(actions_prob + self.eps)

            # entropy term; the reduced dimension is kept so that it
            # broadcasts over the tokens for any batch size
            entropies = (actions_prob * actions_log_prob).sum(
                dim=-1, keepdim=True
            )

            # compute KL divergence
            kl_div_loss = (
                (actions_prob * (old_actions_log_probs - actions_log_prob))
                .sum(dim=-1)
                .mean()
            )

            # compute ratios
            ratios = (actions_log_prob - old_actions_log_probs).exp()

            # compute PPO loss
            if check_model_family(self.config.actor, self.config.critic):
                # compute discounted rewards as in TRL
                gamma = self.config.trainer.gamma_discounted
                discounted_rewards = torch.zeros_like(old_values)
                for i in range(discounted_rewards.shape[1]):
                    for j in range(i, discounted_rewards.shape[1]):
                        discounted_rewards[:, i] += (
                            gamma ** (j - i) * rewards[:, j]
                        )
                advantages = (
                    discounted_rewards - old_values
                )  # TRL has opposite sign for old values
                # keepdim so that the per-sequence mean is subtracted
                # row by row whatever the batch size is
                advantages = (
                    advantages - advantages.mean(dim=-1, keepdim=True)
                ) / (advantages.std() + self.eps)
                surr1 = advantages * ratios
            else:
                # slice (instead of index) to keep the batch dimension
                # aligned with the one of the rewards
                advantages = rewards - old_values[:, -1:]
                surr1 = advantages * ratios

            surr2 = (
                torch.clamp(ratios, 1 - actor_eps_clip, 1 + actor_eps_clip)
                * advantages
            )

            policy_loss = -torch.min(surr1, surr2) - beta_s * entropies
            policy_loss = policy_loss.mean()
            loss = policy_loss + kl_div_loss

            # check if loss item is NaN
            if torch.isnan(loss):
                raise ValueError("Loss is nan")

            # update actor with loss
            if self.config.actor.deepspeed_enable:
                self.actor_model_engine.backward(loss)
                self.actor_model_engine.step()
            else:
                self.actor_optimizer.zero_grad()
                if self.config.actor.accelerate_enable:
                    actor_accelerator.backward(loss)
                else:
                    loss.backward()
                self.actor_optimizer.step()
                # no scheduler is created when deepspeed is initialized
                if self.actor_scheduler is not None:
                    self.actor_scheduler.step()

            # compute value loss
            # the loss is the distance between the rewards and the values:
            # this distance has to be small so that the values are
            # representative of the rewards, which is why the maximum
            # between the clipped and the unclipped loss is taken.
            # The clip is limiting the slew-rate of value_loss_clipped
            value_loss_clipped = old_values + (values - old_values).clamp(
                -critic_eps_clip, critic_eps_clip
            )
            value_loss1 = (value_loss_clipped - rewards) ** 2
            value_loss2 = (values - rewards) ** 2
            value_loss = torch.max(value_loss1, value_loss2).mean()
            if torch.isnan(value_loss):
                raise ValueError("Value loss is nan")

            # update critic with the value loss
            if self.config.critic.deepspeed_enable:
                self.critic_model_engine.backward(value_loss)
                self.critic_model_engine.step()
            else:
                self.critic_optimizer.zero_grad()
                if self.config.critic.accelerate_enable:
                    critic_accelerator.backward(value_loss)
                else:
                    value_loss.backward()
                self.critic_optimizer.step()
                # no scheduler is created when deepspeed is initialized
                if self.critic_scheduler is not None:
                    self.critic_scheduler.step()

            # append the losses to the training stats
            loss_value = loss.detach().cpu().item()
            value_loss_value = value_loss.detach().cpu().item()
            self.training_stats.training_loss.append(loss_value)
            self.training_stats.value_loss.append(value_loss_value)

            # print iteration info; len(dataloader) already is the
            # number of batches
            print(
                f"Epoch {epoch+1}/{epochs}",
                f"Step {k+1}/{len(dataloader)}",
                f"Loss {loss_value:.4f}",
                f"Value Loss {value_loss_value:.4f}",
            )

    self.actorcritic.eval()
    print("End Learning")
def train(
    self,
) -> None:
    """Run the RL training loop.

    At every timestep of every episode some examples are sampled, the
    actor-critic generates actions and values for them, the reward model
    scores the result and one memory per example is stored. Every
    `update_timesteps` timesteps `learn` is called on the stored memories,
    which are then discarded. Checkpoints are saved every
    `checkpoint_steps` episodes and the models are saved at the end.

    Raises:
        AssertionError: If the memories collected before a learn iteration
            are not a multiple of the batch size, or if the total number
            of timesteps is not a multiple of `update_timesteps`.
    """
    print("Start RL Training")

    # initialize settings
    trainer_cfg = self.config.trainer
    num_episodes = trainer_cfg.num_episodes
    max_timesteps = trainer_cfg.max_timesteps
    num_examples = trainer_cfg.num_examples
    update_timesteps = trainer_cfg.update_timesteps
    batch_size = trainer_cfg.batch_size
    checkpoint_steps = trainer_cfg.checkpoint_steps
    if self.is_deepspeed_init:
        device = torch.device(f"cuda:{dist.get_rank()}")
    else:
        device = self.config.trainer.device

    # number of elements that the memories should contain when learning;
    # it must be a multiple of the batch size
    number_of_memories_per_learn_iteration = (
        num_examples * update_timesteps
    )
    assert (
        number_of_memories_per_learn_iteration % batch_size == 0
    ), "The number of memories must be a multiple of the batch size"

    # the total number of timesteps done in train() should be a multiple
    # of the update timesteps
    total_number_of_timesteps = num_episodes * max_timesteps
    assert total_number_of_timesteps % update_timesteps == 0, (
        "The number of timesteps (num_episodes*max_timesteps)"
        "must be a multiple of the update_timesteps"
    )

    # initialize memories
    memories = deque([])

    # resume from the checkpoints; a training that starts from scratch
    # also starts with an empty conversation log
    start_episode = self.load_checkpoint()
    if start_episode == 0:
        self.conversation_log.clear()

    # initialize counters
    cnt_timesteps = 0
    cnt_learn_iter = 0

    actor_tokenizer = self.actorcritic.actor.tokenizer

    # loop over episodes and timesteps
    self.actorcritic.eval()
    for episode in range(start_episode, num_episodes):
        for timestep in range(max_timesteps):
            # print the iteration info
            print(
                f"Episode: {episode + 1}/{num_episodes}, "
                f"Timestep: {timestep + 1}/{max_timesteps}",
                f"Learning Cnt: {cnt_timesteps + 1}/{update_timesteps}",
            )

            # counter used to count timesteps into memory
            cnt_timesteps += 1

            # sample num_examples examples from example dataset
            inputs = self.example_sampler.sample(num_examples)

            # tokenize examples for the actor
            tok_inputs_act = self.actorcritic.actor.tokenizer(
                inputs, padding=True, return_tensors="pt", truncation=True
            )
            states_actor = tok_inputs_act["input_ids"].to(device)
            states_mask_actor = tok_inputs_act["attention_mask"].to(device)

            # tokenize examples for the critic
            tok_inputs_crt = self.actorcritic.critic.tokenizer(
                inputs, padding=True, return_tensors="pt", truncation=True
            )
            states_critic = tok_inputs_crt["input_ids"].to(device)

            # generate sequences of actions and values
            generated = self.actorcritic.generate(
                states_actor, states_mask_actor, states_critic
            )
            (
                actions,
                actions_logits,
                values,
                sequences_actor,
                sequences_mask_actor,
                sequences_critic,
                sequences_mask_critic,
                action_len_actor,
                action_len_critic,
            ) = generated

            # compute action log probs
            action_prob = (
                torch.softmax(actions_logits, dim=-1).max(dim=-1).values
            )
            actions_log_probs = torch.log(action_prob + self.eps)

            # get tokenized sequence for the reward models, reusing the
            # actor or the critic tensors when the tokenizer is shared
            if self.use_same_tokenizer:
                reward_sequence = sequences_actor
                reward_mask = sequences_mask_actor
            elif check_model_family(
                self.config.critic, self.config.reward
            ):
                reward_sequence = sequences_critic
                reward_mask = sequences_mask_critic
            else:
                tokenized_responses = change_tokenization(
                    sequences_actor,
                    self.actorcritic.actor.tokenizer,
                    self.reward.tokenizer,
                )
                # get tokens and mask
                reward_sequence = tokenized_responses["input_ids"].to(
                    device
                )
                reward_mask = tokenized_responses["attention_mask"].to(
                    device
                )

            # compute rewards
            rewards = self.reward.forward(
                reward_sequence,
                reward_mask,
            )
            rewards = rewards[:, -action_len_critic:]
            reward = rewards[:, -1]

            # store memories of the episode / timestep
            n_samples = states_actor.shape[0]
            for i in range(n_samples):
                memory = Memory(
                    states_actor[i, :].detach().cpu(),
                    actions[i, :].detach().cpu(),
                    values[i, :].detach().cpu(),
                    rewards[i, :].detach().cpu(),
                    actions_log_probs[i, :].detach().cpu(),
                    sequences_actor[i, :].detach().cpu(),
                    sequences_mask_actor[i, :].detach().cpu(),
                    sequences_critic[i, :].detach().cpu(),
                    sequences_mask_critic[i, :].detach().cpu(),
                    int(action_len_actor),
                    int(action_len_critic),
                )
                memories.append(memory)

            # decode completions to be logged in the conversation log,
            # removing pad and eos tokens from the decoded text
            completions = []
            for action in actions:
                text = actor_tokenizer.decode(action)
                text = text.replace(actor_tokenizer.pad_token, "")
                text = text.replace(actor_tokenizer.eos_token, "")
                # NOTE(review): as written this replaces the empty string
                # with the empty string, so it changes nothing; the
                # literal that was meant to be removed may have been
                # lost — confirm.
                text = text.replace("", "")
                completions.append(text)

            # log the memories in the conversation log
            for i in range(states_actor.shape[0]):
                self.conversation_log.append(
                    inputs[i],
                    completions[i],
                    reward[i].detach().cpu().item(),
                    cnt_learn_iter,
                )

            # learn from memories
            time_to_learn = (cnt_timesteps % update_timesteps == 0) and (
                cnt_timesteps != 0
            )
            if time_to_learn:
                print("len memories", len(memories))
                # without deepspeed there is one process, with deepspeed
                # only rank 0 writes the log
                if not self.is_deepspeed_init or (dist.get_rank() == 0):
                    self.conversation_log.save()
                self.learn(memories)
                last_rewards = (m.rewards[-1] for m in memories)
                mean_reward = sum(last_rewards) / len(memories)
                print(f"Mean Reward: {mean_reward}")
                memories.clear()
                cnt_timesteps = 0
                cnt_learn_iter += 1
                if not self.is_deepspeed_init or (dist.get_rank() == 0):
                    self.conversation_log.save()

        # save checkpoints
        if (episode % checkpoint_steps == 0) and (episode != 0):
            self.save_checkpoint(
                current_episode=episode, max_episode=num_episodes
            )
            if not self.is_deepspeed_init or (dist.get_rank() == 0):
                self.conversation_log.save()

    # save the models
    if self.is_deepspeed_init:
        self.actorcritic.save_deepspeed(self.actor_model_engine, self.config)
        self.actorcritic.save_deepspeed(
            self.critic_model_engine, self.config.critic
        )
    else:
        self.actorcritic.save()
    print("End RL Training")
================================================
FILE: optimization/chatllama/chatllama/rlhf/utils.py
================================================
import json
import os
from beartype import beartype
from plotly import graph_objects as go
class TrainingStats:
    """Losses and accuracies collected during a training

    Attributes:
        training_loss (List): List of training losses
        training_accuracy (List): List of training accuracies
        value_loss (List): List of value losses
        validation_loss (List): List of validation losses
        validation_accuracy (List): List of validation accuracies
        path (str): Path of the json file used by save, load and clear
    """

    def __init__(self, path: str):
        """Create empty statistics

        Args:
            path (str): Path to save the stats
        """
        for name in (
            "training_loss",
            "training_accuracy",
            "value_loss",
            "validation_loss",
            "validation_accuracy",
        ):
            setattr(self, name, [])
        self.path = path

    def plot(self):
        """Plot every non-empty list of statistics using plotly"""
        fig = go.Figure()
        series = (
            (self.training_loss, "Training loss"),
            (self.training_accuracy, "Training accuracy"),
            (self.value_loss, "Value loss"),
            (self.validation_loss, "Validation loss"),
            (self.validation_accuracy, "Validation accuracy"),
        )
        for values, label in series:
            if len(values) > 0:
                fig.add_trace(go.Scatter(y=values, name=label))
        fig.update_layout(
            showlegend=True, xaxis_type="log", xaxis_title="steps"
        )
        fig.show()

    def save(
        self,
    ):
        """Write the stats to the json file in path.

        When the file already exists the values in memory are appended to
        the ones stored in the file; since the values in memory are kept,
        saving again appends them once more.
        """
        in_memory = {
            "training_loss": self.training_loss,
            "training_accuracy": self.training_accuracy,
            "value_loss": self.value_loss,
            "validation_loss": self.validation_loss,
            "validation_accuracy": self.validation_accuracy,
        }
        if not os.path.exists(self.path):
            stats = in_memory
        else:
            with open(self.path, "r") as f:
                stats = json.load(f)
            for key, values in in_memory.items():
                stats[key].extend(values)
        with open(self.path, "w") as f:
            json.dump(stats, f, indent=4)

    def load(
        self,
    ):
        """Replace the stats in memory with the ones of the file in path"""
        with open(self.path, "r") as f:
            stats = json.load(f)
        for key in (
            "training_loss",
            "training_accuracy",
            "value_loss",
            "validation_loss",
            "validation_accuracy",
        ):
            setattr(self, key, stats[key])

    def clear(
        self,
    ):
        """Empty the stats in memory and delete the file in path, if any"""
        self.training_loss = []
        self.training_accuracy = []
        self.value_loss = []
        self.validation_loss = []
        self.validation_accuracy = []
        if os.path.exists(self.path):
            os.remove(self.path)
class ConversationLog:
    """Save the conversation:
    (user input, model output, rewards and learn_counter)
    during the RL training loop.

    Attributes:
        conversation (List): Logged conversations, one dictionary each
        path (str): Path of the json file where the log is stored
    """

    def __init__(self, path: str):
        """Create an empty log

        Args:
            path (str): Path of the json file of the log; when it is None
                a default file in the current directory is used
        """
        self.conversation = []
        self.path = path
        if self.path is None:
            self.path = "./convesation_log.json"
        # number of leading entries of self.conversation that are already
        # stored in the file: they must not be written a second time
        self._n_persisted = 0

    @beartype
    def append(
        self,
        user_input: str,
        model_output: str,
        reward: float,
        learn_counter: int,
    ):
        """Add a conversation to the log

        Args:
            user_input (str): User input / initial prompt
            model_output (str): Completion of the LLM model
            reward (float): Reward of the reward model assigned to the output
            learn_counter (int): Number of the learning iteration to
                distinguish the conversations that happen at different
                points of the training loop
        """
        self.conversation.append(
            {
                "user_input": user_input,
                "model_output": model_output,
                "reward": reward,
                "learn_counter": learn_counter,
            }
        )

    def save(self):
        """Write the log to the file in path.

        When the file already exists, the conversations added since the
        last save or load are merged with the ones stored in the file and
        the result is sorted by learn_counter, so that saving repeatedly
        does not duplicate the conversations. After the call the log in
        memory matches the content of the file.
        """
        print("Saving conversations log")
        if os.path.exists(self.path):
            with open(self.path, "r") as f:
                stored = json.load(f)
            # only the entries that were never written are new
            pending = self.conversation[self._n_persisted :]
            self.conversation = sorted(
                stored + pending, key=lambda x: float(x["learn_counter"])
            )
        with open(self.path, "w") as f:
            json.dump(self.conversation, f, indent=4)
        self._n_persisted = len(self.conversation)

    def load(self):
        """Replace the log in memory with the content of the file in path"""
        with open(self.path, "r") as f:
            self.conversation = json.load(f)
        # everything that was just read is already stored in the file
        self._n_persisted = len(self.conversation)

    def clear(self):
        """Empty the log in memory and delete the file in path, if any"""
        print("Clearing conversations log")
        self.conversation = []
        self._n_persisted = 0
        # remove the file in path exists
        if os.path.exists(self.path):
            os.remove(self.path)

    def show(self, current_iteration: int = None):
        """Show the conversation log

        Args:
            current_iteration (int): Current iteration of the training loop,
                if not None, print only the conversations whose
                learn_counter is equal to it
        """
        for i, c in enumerate(self.conversation):
            if (
                current_iteration is not None
                and current_iteration != c["learn_counter"]
            ):
                continue
            print(
                f"##########################################\n"
                f"Conversation {i} at learn_counter "
                f"{c['learn_counter']}\n"
                f"##########################################\n"
                f"## User Input:\n\n{c['user_input']}\n\n"
                f"## Model Output:\n\n{c['model_output']}\n\n"
                f"## Reward: {c['reward']}\n\n"
            )
================================================
FILE: optimization/chatllama/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages
# packages installed together with chatllama
REQUIREMENTS = [
    "accelerate",
    "beartype",
    "deepspeed",
    "einops",
    "fairscale",
    "langchain>=0.0.103",
    "torch",
    "tqdm",
    "transformers",
    "datasets",
    "openai",
    "plotly",
    "peft",
]

# the README next to this file is used as long description of the package
this_directory = Path(__file__).parent
long_description = this_directory.joinpath("README.md").read_text(
    encoding="utf8"
)

setup(
    name="chatllama-py",
    version="0.0.4",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    include_package_data=True,
    long_description=long_description,
    long_description_content_type="text/markdown",
)
================================================
FILE: optimization/cloud_surfer/README.md
================================================
# 🏄 CloudSurfer (WIP)
Automatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models.
If you like this module, give us a star to show your support for the project ⭐
## 📚 Description
The CloudSurfer module allows users to automatically compare the inference performance of their deep learning model across hardware and cloud providers. It leverages state-of-the-art optimization techniques to custom-accelerate the models on each platform, providing the user with an accurate benchmark of their model performances in terms of speed, accuracy, and cost.
With CloudSurfer, users can input their model in their preferred deep learning framework and express their preferences for accuracy and performance. The library will then automatically test the model on a range of hardware and cloud platforms, using optimization techniques to ensure that the results are accurate and representative of the model's performances.
Users can then compare the results side-by-side, seeing the performance of their model on different hardware and cloud providers. This is key to make informed decisions about which platform (cloud and hardware type) to pick, without having to guess or rely on outdated information.
Overall, CloudSurfer provides a powerful and easy-to-use tool to optimize deep learning models and to choose the best inference hardware and cloud platform. Try it out today, and reach out if you have any feedback!
================================================
FILE: optimization/forward_forward/README.md
================================================
# Forward-Forward Algorithm
This module implements a complete open-source version of [Geoffrey Hinton's Forward Forward](https://www.cs.toronto.edu/~hinton/FFA13.pdf) Algorithm, an alternative approach to backpropagation.
The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes, one with positive (i.e., real) data and the other with negative data that could be generated by the network itself.
Unlike the backpropagation approach, Forward-Forward does not require calculating the gradient of the loss function with respect to the network parameters. Instead, each optimization step can be performed locally and the weights of each layer can be updated immediately after the layer has performed its forward pass.
If you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers)
## Installation
The forward-forward module is built on top of nebullvm, a framework for efficiency-based modules. The library can be easily installed from source code. First you have to clone the repository and navigate to the app directory:
```bash
git clone https://github.com/nebuly-ai/nebullvm.git
cd nebullvm/apps/accelerate/forward_forward
```
Then install the module:
```bash
pip install .
```
This process will just install the minimum requirements for running the module. If you want to run the module on a GPU you have to install the CUDA version of PyTorch. You can find the instructions on the official PyTorch website.
## Usage
At the current stage, this implementation supports the main architectures discussed by Hinton in his paper. Each architecture can be trained with the following command:
```python
from forward_forward import train_with_forward_forward_algorithm
import os
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
trained_model = train_with_forward_forward_algorithm(
model_type="progressive",
n_layers=3,
hidden_size=2000,
lr=0.03,
device=device,
epochs=100,
batch_size=5000,
theta=2.,
)
```
Three architectures are currently supported:
* `progressive`: the simplest architecture described in the paper. It has a pipeline-like structure and each layer can be trained independently of the following ones. Our implementation differs from the original one in that the labels are injected into the image by concatenating them to the flattened tensor, instead of replacing the values of the first `n_classes` pixels with a one-hot representation of the label.
* `recurrent`: the recurrent architecture described in the paper. It has a recurrent-like structure and it is based on the `GLOM` architecture proposed by Hinton.
* `nlp`: A simple network which can be used as a language model.
The recurrent and nlp network architectures are better explained below.
## Recurrent Architecture
The recurrent architecture is based on the `GLOM` architecture for videos, proposed by Hinton in the paper [How to represent part-whole hierarchies in a neural network](https://arxiv.org/pdf/2102.12627.pdf). Its application to the forward-forward algorithm aims at enabling each layer to learn not just from the previous layer's output, but from the following layers as well. This is done by concatenating the outputs of the previous layer and of the following layers computed at the previous time-step. A learned representation of the label (positive or negative) is given as input to the last layer. The following figure shows the structure of the network:
## NLP Architecture
The forward-forward architecture developed for NLP is a simple network which can be used as a language model. The network is composed of a few normalized fully connected layers, each followed by a ReLU activation. All hidden representations are then concatenated together and given as input to the softmax for predicting the next token. The network can be trained in a progressive way, i.e. each layer can be sequentially trained separately from the following ones. The following figure shows the structure of the network:
## What is missing
This app implements the main architectures presented by Hinton in his paper. However, some features are not implemented yet. In particular, the following features are missing:
* [ ] Implementation of unsupervised training.
* [ ] Implementation of the `progressive` architecture using local receptive fields instead of fully connected layers.
* [ ] Training on CIFAR-10 for CV-based architectures.
And don't forget to [leave a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers) if you appreciate the project!
If you have any questions about the implementation, [open an issue](https://github.com/nebuly-ai/nebullvm/issues) or contact us in the [community chat](https://discord.gg/RbeQMu886J).
## Contributing
We welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the linked page for more information on how to get involved.
A special thanks to [Additi Pandey](https://github.com/cyclotomicextension) for her amazing contribution to the Forward-Forward module.
================================================
FILE: optimization/forward_forward/forward_forward/__init__.py
================================================
from forward_forward.api.functions import ( # noqa F401
train_with_forward_forward_algorithm,
)
================================================
FILE: optimization/forward_forward/forward_forward/api/__init__.py
================================================
================================================
FILE: optimization/forward_forward/forward_forward/api/functions.py
================================================
from torchvision import datasets
from forward_forward.root_op import (
ForwardForwardRootOp,
ForwardForwardModelType,
)
def train_with_forward_forward_algorithm(
    n_layers: int = 2,
    model_type: str = "progressive",
    device: str = "cpu",
    hidden_size: int = 2000,
    lr: float = 0.03,
    epochs: int = 100,
    batch_size: int = 5000,
    theta: float = 2.0,
    shuffle: bool = True,
    **kwargs,
):
    """Build and train a forward-forward model of the requested type.

    Args:
        n_layers: Number of hidden layers of the network.
        model_type: One of the ``ForwardForwardModelType`` values
            ("progressive", "recurrent" or "nlp").
        device: Device the training runs on.
        hidden_size: Width of each hidden layer.
        lr: Learning rate passed to the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Batch size of the training data loader.
        theta: Goodness threshold used by the loss functions.
        shuffle: Whether the training data is shuffled.
        **kwargs: Extra trainer options forwarded to the root operation.
            ``predicted_tokens`` is required when ``model_type`` is "nlp".

    Returns:
        The result exposed by the root operation (``root_op.get_result()``).

    Raises:
        ValueError: If ``model_type`` is not a valid model type value.
        AssertionError: If the NLP model is requested without
            ``predicted_tokens``.
    """
    model_type = ForwardForwardModelType(model_type)
    root_op = ForwardForwardRootOp(model_type)

    output_size = None
    if model_type is ForwardForwardModelType.PROGRESSIVE:
        # The one-hot label is concatenated to the flattened 28x28 image.
        input_size = 28 * 28 + len(datasets.MNIST.classes)
    elif model_type is ForwardForwardModelType.RECURRENT:
        input_size = 28 * 28
        output_size = len(datasets.MNIST.classes)
    else:  # model_type is ForwardForwardModelType.NLP
        input_size = 10  # number of characters
        output_size = 30  # length of vocabulary
        assert (
            kwargs.get("predicted_tokens") is not None
        ), "predicted_tokens must be specified for NLP model"

    # Forward the extra keyword arguments: the root operation hands them to
    # the trainer, which is the component that consumes ``predicted_tokens``.
    root_op.execute(
        input_size=input_size,
        n_layers=n_layers,
        hidden_size=hidden_size,
        optimizer_name="Adam",
        optimizer_params={"lr": lr},
        loss_fn_name="alternative_loss_fn",
        batch_size=batch_size,
        epochs=epochs,
        device=device,
        shuffle=shuffle,
        theta=theta,
        output_size=output_size,
        **kwargs,
    )

    return root_op.get_result()
================================================
FILE: optimization/forward_forward/forward_forward/app.py
================================================
from nebullvm.apps.base import App
from forward_forward.root_op import ForwardForwardRootOp
class ForwardForwardApp(App):
    """App wrapper that delegates execution to a ``ForwardForwardRootOp``."""

    def __init__(self, model_type="progressive"):
        """Create the app for the given model type.

        Args:
            model_type: A ``ForwardForwardModelType`` member or its string
                value. Defaults to "progressive" so that the no-argument
                constructor keeps working; the root operation itself
                requires a model type.
        """
        super().__init__()
        # Local import: this module only imports ForwardForwardRootOp at
        # file level.
        from forward_forward.root_op import ForwardForwardModelType

        self.root_op = ForwardForwardRootOp(
            ForwardForwardModelType(model_type)
        )

    def execute(self, *args, **kwargs):
        """Forward all arguments to the root operation's ``execute``."""
        return self.root_op.execute(*args, **kwargs)
================================================
FILE: optimization/forward_forward/forward_forward/operations/__init__.py
================================================
================================================
FILE: optimization/forward_forward/forward_forward/operations/build_models.py
================================================
from abc import ABC, abstractmethod
import torch
from nebullvm.operations.base import Operation
from forward_forward.utils.modules import (
FCNetFFProgressive,
RecurrentFCNetFF,
LMFFNet,
)
class BaseModelBuildOperation(Operation, ABC):
    """Common interface of the operations that build a model.

    Concrete subclasses implement ``execute`` and store the model they
    build in ``self.model``.
    """

    def __init__(self):
        super().__init__()
        # Stays None until a subclass ``execute`` assigns the built model.
        self.model = None

    def get_result(self):
        """Return the stored model (``None`` when nothing was built yet)."""
        return self.model

    @abstractmethod
    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Build the model; must be implemented by subclasses."""
        raise NotImplementedError
class FCNetFFProgressiveBuildOperation(BaseModelBuildOperation):
    """Build operation for the progressive fully connected network."""

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Create an ``FCNetFFProgressive`` and store it in ``self.model``.

        When ``output_size`` is given, a linear layer mapping the last
        hidden size to ``output_size`` is appended and the pair is wrapped
        in a ``torch.nn.Sequential``.
        """
        widths = [input_size, *([hidden_size] * n_layers)]
        network = FCNetFFProgressive(
            layer_sizes=widths,
            optimizer_name=optimizer_name,
            optimizer_kwargs=optimizer_params,
            loss_fn_name=loss_fn_name,
            # Placeholder value; the trainer sets the real number of epochs.
            epochs=-1,
        )
        if output_size is not None:
            head = torch.nn.Linear(widths[-1], output_size)
            network = torch.nn.Sequential(network, head)
        self.model = network
class RecurrentFCNetFFBuildOperation(BaseModelBuildOperation):
    """Build operation for the recurrent fully connected network."""

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Create a ``RecurrentFCNetFF`` and store it in ``self.model``.

        The layer sizes are the input size, ``n_layers`` copies of
        ``hidden_size`` and finally ``output_size``.
        """
        widths = [input_size, *([hidden_size] * n_layers), output_size]
        self.model = RecurrentFCNetFF(
            layer_sizes=widths,
            optimizer_name=optimizer_name,
            optimizer_kwargs=optimizer_params,
            loss_fn_name=loss_fn_name,
        )
class LMFFNetBuildOperation(BaseModelBuildOperation):
    """Build operation for the language-model network."""

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        output_size: int = None,
    ):
        """Create an ``LMFFNet`` and store it in ``self.model``.

        ``input_size`` is passed as the sequence length and ``output_size``
        as the number of tokens.
        """
        settings = dict(
            token_num=output_size,
            hidden_size=hidden_size,
            n_layers=n_layers,
            seq_len=input_size,
            optimizer_name=optimizer_name,
            optimizer_kwargs=optimizer_params,
            loss_fn_name=loss_fn_name,
            # Placeholder values; the trainer overwrites both attributes.
            epochs=-1,
            predicted_tokens=-1,
        )
        self.model = LMFFNet(**settings)
================================================
FILE: optimization/forward_forward/forward_forward/operations/data.py
================================================
import urllib.request
from typing import Any
import torch
import torch.utils.data
from nebullvm.operations.base import Operation
from torchvision import datasets, transforms
class MNISTDataLoaderOperation(Operation):
    """Operation creating the MNIST train and test data loaders."""

    def __init__(self):
        super().__init__()
        self.train_data = None
        self.test_data = None

    def get_result(self) -> Any:
        """Return ``(train_loader, test_loader)`` or ``None`` if not run."""
        if self.train_data is None:
            return None
        return self.train_data, self.test_data

    def execute(self, batch_size: int, shuffle: bool):
        """Create both loaders and store them on the operation.

        The training set is downloaded into the ``data`` directory when
        missing. The test loader always uses a batch size of 1000 and no
        shuffling.
        """

        def _preprocessing():
            # Same tensor conversion + normalization for both splits.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,)),
                ]
            )

        train_set = datasets.MNIST(
            "data",
            train=True,
            download=True,
            transform=_preprocessing(),
        )
        train_loader = torch.utils.data.DataLoader(
            train_set, batch_size=batch_size, shuffle=shuffle
        )

        test_set = datasets.MNIST(
            "data",
            train=False,
            transform=_preprocessing(),
        )
        test_loader = torch.utils.data.DataLoader(
            test_set, batch_size=1000, shuffle=False
        )

        self.train_data = train_loader
        self.test_data = test_loader
def download_fables():
    """Download the Aesop's fables text file and return it as a string."""
    url = "http://classics.mit.edu/Aesop/fab.mb.txt"
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return payload.decode("utf-8")
def get_fables():
    """Return the downloaded fables as a list of text chunks.

    Only the text between the "SECTION 1" and "THE END" markers is kept.
    It is split on blank lines and chunks shorter than 100 characters are
    discarded.
    """
    text = download_fables()
    body = text.split("SECTION 1")[1].split("THE END")[0]
    return [chunk for chunk in body.split("\n\n") if len(chunk) >= 100]
# Character-level vocabulary: space, three punctuation marks and the 26
# lowercase letters, mapped to consecutive ids starting at 0.
VOCABULARY = {
    char: index
    for index, char in enumerate(" !,.abcdefghijklmnopqrstuvwxyz")
}
def tokenize(fable, max_len=100):
    """Convert a fable to a list of at most ``max_len`` token ids.

    The text is lowercased and characters missing from ``VOCABULARY`` are
    dropped before truncation.
    """
    token_ids = []
    for char in fable.lower():
        if char in VOCABULARY:
            token_ids.append(VOCABULARY[char])
    return token_ids[:max_len]
def get_tokenized_fables():
    """Return the tokenized fables stacked into a single tensor.

    Fables whose tokenization is not exactly 100 tokens long are skipped,
    so every row of the returned tensor has length 100.
    """
    rows = []
    for fable in get_fables():
        tokens = tokenize(fable)
        if len(tokens) == 100:
            rows.append(torch.tensor(tokens))
    return torch.stack(rows)
def get_dataloader(batch_size=32, test_size=0.2, shuffle=True):
    """Create the train and test loaders over the tokenized fables.

    Args:
        batch_size: Batch size of the training loader.
        test_size: Fraction of the fables (taken from the beginning) used
            as test set.
        shuffle: Whether the training loader shuffles its data.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The test loader yields
        the whole test set as a single batch.
    """
    tokenized_fables = get_tokenized_fables()
    n_test = int(len(tokenized_fables) * test_size)
    test_set = torch.utils.data.TensorDataset(tokenized_fables[:n_test])
    train_set = torch.utils.data.TensorDataset(tokenized_fables[n_test:])
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=shuffle
    )
    # DataLoader rejects a batch size of 0, which is what ``n_test`` becomes
    # for a tiny corpus or ``test_size=0``; fall back to 1 so an empty test
    # set simply produces an empty loader.
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=max(n_test, 1), shuffle=False
    )
    return train_loader, test_loader
class AesopFablesDataLoaderOperation(Operation):
    """Operation creating the Aesop's fables train and test data loaders."""

    def __init__(self):
        super().__init__()
        self.train_data = None
        self.test_data = None

    def get_result(self) -> Any:
        """Return ``(train_loader, test_loader)`` or ``None`` if not run."""
        if self.train_data is None:
            return None
        return self.train_data, self.test_data

    def execute(self, batch_size: int, shuffle: bool):
        """Build the loaders with a fixed 20% test split and store them."""
        loaders = get_dataloader(
            batch_size=batch_size, test_size=0.2, shuffle=shuffle
        )
        self.train_data, self.test_data = loaders
================================================
FILE: optimization/forward_forward/forward_forward/operations/fetch_operations.py
================================================
from typing import Any
from nebullvm.operations.base import Operation
from torch.utils.data import DataLoader
class FetchTrainingDataFromLocal(Operation):
    """Operation keeping already-built data loaders in its state."""

    def execute(self, train_data: DataLoader, test_data: DataLoader):
        """Store both loaders in the operation state."""
        self.state["train_data"] = train_data
        self.state["test_data"] = test_data

    def get_train_data(self) -> DataLoader:
        """Return the stored training loader (``None`` when not set)."""
        return self.state.get("train_data")

    def get_test_data(self) -> DataLoader:
        """Return the stored test loader (``None`` when not set)."""
        return self.state.get("test_data")

    def get_result(self) -> Any:
        """Return nothing; use the dedicated getters instead."""
        return None
================================================
FILE: optimization/forward_forward/forward_forward/operations/trainers.py
================================================
from abc import ABC, abstractmethod
import torch
from nebullvm.operations.base import Operation
from nebullvm.operations.fetch_operations.local import FetchModelFromLocal
from torch.utils.data import DataLoader
from torchvision import datasets
from forward_forward.operations.data import VOCABULARY
from forward_forward.operations.fetch_operations import (
FetchTrainingDataFromLocal,
)
from forward_forward.utils.labels import LabelsInjector
from forward_forward.utils.modules import FCNetFFProgressive
from forward_forward.utils.utils import (
ProgressiveTrainingDataset,
compute_perplexity,
)
class BaseForwardForwardTrainer(Operation, ABC):
    """Base class of the forward-forward trainers.

    ``execute`` fetches the model and the data loaders, then delegates the
    actual training to the ``_train`` method implemented by subclasses.
    """

    def __init__(self):
        super().__init__()
        self.model = None
        self.train_data = None
        self.test_data = None
        self.fetch_model_op = FetchModelFromLocal()
        self.fetch_data_op = FetchTrainingDataFromLocal()

    def get_result(self):
        """Return the model once training has completed, otherwise ``None``."""
        if self.state.get("model_is_trained"):
            return self.model
        return None

    def execute(
        self,
        model: FCNetFFProgressive,
        train_data: DataLoader,
        test_data: DataLoader,
        epochs: int,
        theta: float,
        device: str,
        **kwargs,
    ):
        """Fetch model and data, then train the model.

        Args:
            model: Model to be trained.
            train_data: Loader of the training data.
            test_data: Loader of the test data.
            epochs: Number of training epochs.
            theta: Goodness threshold.
            device: Device the training runs on.
            **kwargs: Extra options forwarded unchanged to ``_train``.
        """
        if self.fetch_model_op.get_model() is None:
            self.fetch_model_op.execute(model)
        if self.fetch_data_op.get_train_data() is None:
            self.fetch_data_op.execute(train_data, test_data)

        self.model = self.fetch_model_op.get_model()
        self.train_data = self.fetch_data_op.get_train_data()
        self.test_data = self.fetch_data_op.get_test_data()

        if (
            self.model is not None
            and self.train_data is not None
            and self.test_data is not None
        ):
            self._train(epochs, theta, device, **kwargs)
            # ``get_result`` only exposes the model when this flag is set;
            # without it the trained model would never be returned.
            self.state["model_is_trained"] = True

    @abstractmethod
    def _train(self, *args, **kwargs):
        """Run the training loop; must be implemented by subclasses."""
        raise NotImplementedError
class ForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer of the progressive network on label-injected MNIST data."""

    def _train(self, epochs: int, theta: float, device: str, **kwargs):
        """Train the model progressively and log the test accuracy.

        At evaluation time every image is paired with each candidate label
        and the label with the highest accumulated goodness is predicted.
        """
        # Define model
        network = self.model.to(device)
        network.epochs = epochs

        # TODO: SELECT THE N_CLASSES OUTSIDE THE OPERATION
        injector = LabelsInjector(datasets.MNIST.classes)
        injected_batches = (
            injector.inject_train(images, labels)
            for images, labels in self.train_data
        )
        # Each injected batch holds a positive and a negative copy of the
        # images, hence the doubled batch size.
        train_loader = torch.utils.data.DataLoader(
            ProgressiveTrainingDataset(injected_batches),
            batch_size=2 * self.train_data.batch_size,
            shuffle=False,
        )

        network.train()
        network.progressive_train(train_loader, theta)

        network.eval()
        n_correct = 0
        with torch.no_grad():
            for images, labels in self.test_data:
                candidates = injector.inject_eval(images).to(device)
                labels = labels.to(device)
                leading_shape = candidates.shape[:-1]
                flat_candidates = candidates.reshape(
                    -1, candidates.shape[-1]
                )
                _, goodness = network.positive_eval(flat_candidates, theta)
                predicted = goodness.reshape(*leading_shape).argmax(dim=1)
                n_correct += (predicted == labels).float().sum().item()

        n_samples = len(self.test_data.dataset)
        self.logger.info(
            "Test set: Accuracy: {}/{} ({:.0f}%)".format(
                n_correct,
                n_samples,
                100.0 * n_correct / n_samples,
            )
        )
class RecurrentForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer of the recurrent network on MNIST."""

    def _train(self, epochs: int, theta: float, device: str, **kwargs):
        """Train the model and log goodness and test accuracy per epoch.

        Args:
            epochs: Number of passes over the training data.
            theta: Goodness threshold.
            device: Device the training runs on.
            **kwargs: Ignored.
        """
        model = self.model.to(device)
        n_classes = len(datasets.MNIST.classes)
        # Use the real size of the test set instead of a hard-coded 10000
        # so the reported accuracy stays correct for any test loader.
        n_test = len(self.test_data.dataset)

        for epoch in range(epochs):
            accumulated_goodness = None
            model.train()
            for data, target in self.train_data:
                # TODO: THE IMAGE SHAPE SHOULD NOT BE DEFINED HERE
                data = data.to(device).reshape(-1, 28 * 28)
                target = torch.nn.functional.one_hot(
                    target.to(device),
                    num_classes=n_classes,
                )
                _, goodness = model.ff_train(data, target, theta)
                if accumulated_goodness is None:
                    # Copy so the in-place sums below never touch the list
                    # returned by the model.
                    accumulated_goodness = list(goodness)
                else:
                    accumulated_goodness[0] += goodness[0]
                    accumulated_goodness[1] += goodness[1]

            goodness_ratio = (
                accumulated_goodness[0] - accumulated_goodness[1]
            ) / abs(max(accumulated_goodness))
            self.logger.info(f"Epoch {epoch + 1}")
            self.logger.info(f"Accumulated goodness: {accumulated_goodness}")
            self.logger.info(f"Goodness ratio: {goodness_ratio}")

            model.eval()
            correct = 0
            with torch.no_grad():
                for data, target in self.test_data:
                    data = data.to(device).reshape(-1, 28 * 28)
                    target = target.to(device)
                    pred, _ = model.positive_eval(data, theta)
                    correct += pred.eq(target.view_as(pred)).sum().item()

            accuracy = correct / n_test * 100 if n_test else 0.0
            self.logger.info(
                f"Test accuracy: {correct} / {n_test} ({accuracy}%)"
            )
class NLPForwardForwardTrainer(BaseForwardForwardTrainer):
    """Trainer of the language-model network on the tokenized fables."""

    def _train(
        self,
        epochs: int,
        theta: float,
        device: str,
        predicted_tokens: int,
        **kwargs,
    ):
        """Train the model batch by batch, then log the test perplexity.

        ``epochs`` and ``predicted_tokens`` are stored on the model before
        training starts.
        """
        network = self.model.to(device)
        self.model.epochs = epochs
        self.model.predicted_tokens = predicted_tokens
        vocab_size = len(VOCABULARY)
        seq_len = self.model.seq_len

        for batch in self.train_data:
            # Each batch is a tuple holding the token-id tensor.
            one_hot_tokens = torch.functional.F.one_hot(
                batch[0].to(device), num_classes=vocab_size
            ).float()
            accumulated_goodness = network.LM_ff_train(
                one_hot_tokens, theta=theta
            )
            spread = accumulated_goodness[0] - accumulated_goodness[1]
            goodness_ratio = spread / abs(max(accumulated_goodness))
            self.logger.info("Trained on batch")
            self.logger.info(f"Accumulated goodness: {accumulated_goodness}")
            self.logger.info(f"Accumulated goodness ratio: {goodness_ratio}")

        for batch in self.test_data:
            one_hot_tokens = torch.functional.F.one_hot(
                batch[0].to(device), num_classes=vocab_size
            ).float()
            flat_tokens = one_hot_tokens.reshape(-1, vocab_size * seq_len)
            predictions, _ = network.positive_eval(flat_tokens, theta)
            perplexity = compute_perplexity(predictions)
            self.logger.info(f"Perplexity: {perplexity}")
================================================
FILE: optimization/forward_forward/forward_forward/root_op.py
================================================
from enum import Enum
from nebullvm.operations.base import Operation
from forward_forward.operations.build_models import (
FCNetFFProgressiveBuildOperation,
RecurrentFCNetFFBuildOperation,
LMFFNetBuildOperation,
)
from forward_forward.operations.data import (
MNISTDataLoaderOperation,
AesopFablesDataLoaderOperation,
)
from forward_forward.operations.trainers import (
ForwardForwardTrainer,
RecurrentForwardForwardTrainer,
NLPForwardForwardTrainer,
)
class ForwardForwardModelType(Enum):
    """Model architectures the root operation knows how to build and train."""

    # Each value is the string accepted by the public training function.
    PROGRESSIVE = "progressive"
    RECURRENT = "recurrent"
    NLP = "nlp"
class ForwardForwardRootOp(Operation):
    """Root operation chaining model building, data loading and training."""

    def __init__(self, model_type: ForwardForwardModelType):
        """Select the build, train and data operations for ``model_type``.

        Args:
            model_type: A ``ForwardForwardModelType`` member or its string
                value.

        Raises:
            ValueError: If ``model_type`` is not a supported model type.
        """
        super().__init__()
        # Coerce so that plain strings work and invalid values fail here
        # instead of leaving the operation without its sub-operations.
        model_type = ForwardForwardModelType(model_type)
        if model_type is ForwardForwardModelType.PROGRESSIVE:
            self.build_model = FCNetFFProgressiveBuildOperation()
            self.train_model = ForwardForwardTrainer()
            self.load_data = MNISTDataLoaderOperation()
        elif model_type is ForwardForwardModelType.RECURRENT:
            self.build_model = RecurrentFCNetFFBuildOperation()
            self.train_model = RecurrentForwardForwardTrainer()
            self.load_data = MNISTDataLoaderOperation()
        elif model_type is ForwardForwardModelType.NLP:
            self.build_model = LMFFNetBuildOperation()
            self.train_model = NLPForwardForwardTrainer()
            self.load_data = AesopFablesDataLoaderOperation()
        else:
            raise ValueError(f"Unsupported model type: {model_type!r}")

    def execute(
        self,
        input_size: int,
        n_layers: int,
        hidden_size: int,
        optimizer_name: str,
        optimizer_params: dict,
        loss_fn_name: str,
        batch_size: int,
        epochs: int,
        shuffle: bool,
        theta: float,
        device: str,
        output_size: int = None,
        **kwargs,
    ):
        """Build the model, load the data and train the model.

        Every step is skipped when its operation already holds a result.
        The trained model, when available, is stored in the operation
        state under the ``"model"`` key.

        Args:
            input_size: Input size passed to the build operation.
            n_layers: Number of hidden layers.
            hidden_size: Width of each hidden layer.
            optimizer_name: Name of the optimizer.
            optimizer_params: Keyword arguments of the optimizer.
            loss_fn_name: Name of the loss function.
            batch_size: Batch size of the training loader.
            epochs: Number of training epochs.
            shuffle: Whether the training data is shuffled.
            theta: Goodness threshold.
            device: Device the training runs on.
            output_size: Optional output size passed to the build operation.
            **kwargs: Extra options forwarded to the trainer.
        """
        if self.build_model.get_result() is None:
            self.build_model.execute(
                input_size=input_size,
                n_layers=n_layers,
                hidden_size=hidden_size,
                optimizer_name=optimizer_name,
                optimizer_params=optimizer_params,
                loss_fn_name=loss_fn_name,
                output_size=output_size,
            )
        if self.load_data.get_result() is None:
            self.load_data.execute(batch_size=batch_size, shuffle=shuffle)

        if (
            self.build_model.get_result() is None
            or self.load_data.get_result() is None
        ):
            # Nothing to train without both a model and the data.
            return

        if self.train_model.get_result() is None:
            train_loader, test_loader = self.load_data.get_result()
            self.train_model.execute(
                model=self.build_model.get_result(),
                train_data=train_loader,
                test_data=test_loader,
                epochs=epochs,
                theta=theta,
                device=device,
                **kwargs,
            )
        if self.train_model.get_result() is not None:
            self.state["model"] = self.train_model.get_result()

    def get_result(self):
        """Return the trained model stored in the state, if any."""
        return self.state.get("model")
================================================
FILE: optimization/forward_forward/forward_forward/utils/__init__.py
================================================
================================================
FILE: optimization/forward_forward/forward_forward/utils/labels.py
================================================
from typing import List
import torch
class LabelsInjector:
    """Append one-hot label encodings to flattened input images."""

    def __init__(self, labels: List):
        # Keep the label names and, for every label index, a 1-D one-hot
        # tensor of length len(labels) with a single one at that index.
        self.label_names = labels
        n_labels = len(labels)
        self.labels = [
            torch.nn.functional.one_hot(
                torch.tensor([index]), n_labels
            ).reshape(-1)
            for index in range(n_labels)
        ]

    @torch.no_grad()
    def inject_train(self, input_image: torch.Tensor, labels: torch.Tensor):
        """Build the positive and negative training samples of a batch.

        Every image is flattened and concatenated once with the encoding
        of its own label (positive sample) and once with the encoding of a
        label produced by ``select_random_different_label`` (negative
        sample).

        Returns:
            A tuple ``(images, signs)``: the positive samples followed by
            the negative ones, and a tensor holding ``1`` for the former
            and ``-1`` for the latter.
        """
        batch = input_image.shape[0]
        flat_images = input_image.reshape(batch, -1)

        true_encodings = torch.stack(
            [self.labels[label] for label in labels]
        )
        wrong_labels = select_random_different_label(
            labels, len(self.labels)
        )
        wrong_encodings = torch.stack(
            [self.labels[label] for label in wrong_labels]
        )

        positives = torch.cat([flat_images, true_encodings], dim=1)
        negatives = torch.cat(
            [input_image.reshape(batch, -1), wrong_encodings], dim=1
        )
        images = torch.cat([positives, negatives], dim=0)
        signs = torch.cat([torch.ones(batch), -torch.ones(batch)], dim=0)
        return images, signs

    @torch.no_grad()
    def inject_eval(self, input_image: torch.Tensor):
        """Pair every image of the batch with every candidate label.

        Returns:
            A tensor of shape ``(batch, n_labels, features + n_labels)``
            where entry ``[b, k]`` is the flattened image ``b`` followed
            by the one-hot encoding of label ``k``.
        """
        # TODO: FIX THIS BEHAVIOUR (the original note says the input is
        # expected to have batch size 1).
        batch = input_image.shape[0]
        n_labels = len(self.labels)
        encodings = torch.stack(self.labels).unsqueeze(0)
        encodings = encodings.expand(batch, -1, -1)
        flat_images = input_image.reshape(batch, -1).unsqueeze(1)
        tiled_images = flat_images.expand(-1, n_labels, -1)
        return torch.cat([tiled_images, encodings], dim=2)
def select_random_different_label(labels: torch.Tensor, n_classes: int):
    """Yield, for every label, a random class index different from it.

    Args:
        labels: 1-D tensor of class indices.
        n_classes: Total number of classes; samples are drawn from
            ``[0, n_classes)``.

    Yields:
        A 0-d tensor holding a class index different from the
        corresponding entry of ``labels``.

    Raises:
        ValueError: If ``n_classes`` is smaller than 2, since no different
            label exists in that case.
    """
    if n_classes < 2:
        raise ValueError("n_classes must be at least 2")
    # Iterate over the labels themselves: iterating over ``enumerate``
    # compares the sample with an (index, label) tuple, which never
    # matches, so the true label could be returned as a negative.
    for label in labels:
        samples = torch.randint(0, n_classes, (1,))
        while samples[0] == label:
            samples = torch.randint(0, n_classes, (1,))
        yield samples[0]
================================================
FILE: optimization/forward_forward/forward_forward/utils/modules.py
================================================
from abc import ABC, abstractmethod
from typing import List
import torch
import torch.utils.data
from forward_forward.utils.utils import ProgressiveTrainingDataset
def loss_fn(y, theta, sign):
    """Linear forward-forward loss.

    The goodness margin of each sample is the mean of its squared
    activations minus ``theta``. The loss is the batch mean of the margin
    multiplied by ``-sign``.

    Returns:
        A tuple ``(loss, mean_margin)`` where ``mean_margin`` is the batch
        mean of the margin as a Python float.
    """
    margin = torch.square(y).mean(dim=1) - theta
    with torch.no_grad():
        mean_margin = margin.mean().item()
    signed_loss = -margin * sign
    return signed_loss.mean(), mean_margin
def probabilistic_loss_fn(y, theta, sign):
    """Log-sigmoid forward-forward loss.

    The goodness margin (mean squared activation minus ``theta``) is
    squashed with a sigmoid; the loss is the batch mean of
    ``-log(prob + 1e-6) * sign``.

    Returns:
        A tuple ``(loss, mean_margin)`` where ``mean_margin`` is the batch
        mean of the margin as a Python float.
    """
    margin = torch.square(y).mean(dim=1) - theta
    with torch.no_grad():
        mean_margin = margin.mean().item()
    # The small constant keeps the logarithm finite when prob reaches 0.
    log_prob = torch.log(torch.sigmoid(margin) + 1e-6)
    per_sample = -log_prob * sign
    return per_sample.mean(), mean_margin
def alternative_loss_fn(y, theta, sign):
    """Forward-forward loss computed as ``log(1 + exp(-sign * margin))``.

    The margin of each sample is its mean squared activation minus
    ``theta``. Non-finite exponentials are replaced through
    ``torch.nan_to_num`` before the logarithm is taken.

    Returns:
        A tuple ``(loss, mean_margin)`` where ``mean_margin`` is the batch
        mean of the (unsigned) margin as a Python float.
    """
    margin = y.pow(2).mean(dim=1) - theta
    with torch.no_grad():
        mean_margin = margin.mean().item()
    exponent = -margin * sign
    safe_exp = torch.nan_to_num(torch.exp(exponent))
    per_sample = torch.log(1 + safe_exp)
    return per_sample.mean(), mean_margin
class BaseFFLayer(torch.nn.Module, ABC):
    """Abstract interface shared by the forward-forward layers."""

    @property
    def requires_training(self):
        """Whether the layer has to be trained; ``True`` by default."""
        return True

    @abstractmethod
    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Run a training step; must be implemented by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the layer; must be implemented by subclasses."""
        raise NotImplementedError
class FFLayer(BaseFFLayer):
    """Layer wrapper for efficient forward-forward layers."""

    def __init__(
        self,
        layer,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Wrap ``layer`` with its own optimizer and loss function.

        Args:
            layer: Module whose parameters are trained by this wrapper.
            optimizer_name: Name of a ``torch.optim`` optimizer class.
            optimizer_kwargs: Keyword arguments of the optimizer.
            loss_fn_name: One of "loss_fn", "alternative_loss_fn" or
                "probabilistic_loss_fn".

        Raises:
            ValueError: If ``loss_fn_name`` is not a known loss function.
        """
        super().__init__()
        self.layer = layer
        self.optimizer = getattr(torch.optim, optimizer_name)(
            layer.parameters(), **optimizer_kwargs
        )
        available_losses = {
            "loss_fn": loss_fn,
            "alternative_loss_fn": alternative_loss_fn,
            "probabilistic_loss_fn": probabilistic_loss_fn,
        }
        # Fail at construction time: an unknown name used to leave
        # ``self.loss_fn`` unset and only surfaced later inside ``ff_train``.
        if loss_fn_name not in available_losses:
            raise ValueError(
                f"Unknown loss function {loss_fn_name!r}; expected one of "
                f"{sorted(available_losses)}"
            )
        self.loss_fn = available_losses[loss_fn_name]

    def forward(self, x):
        return self.layer(x)

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Run one optimization step on a batch of signed samples.

        Args:
            input_tensor: Batch of inputs; it is detached before use so no
                gradient flows to previous layers.
            signs: Tensor with ``1`` for positive and ``-1`` for negative
                samples.
            theta: Goodness threshold.

        Returns:
            A tuple ``(output, separation)``: the detached layer output
            (rows whose sign is neither ``1`` nor ``-1`` are zeros) and the
            list ``[positive_goodness, negative_goodness]`` returned by
            the loss function.
        """
        positive_rows = torch.where(signs == 1)
        negative_rows = torch.where(signs == -1)

        y = self(input_tensor.detach())
        y_pos = y[positive_rows]
        y_neg = y[negative_rows]
        loss_pos, cumulated_logits_pos = self.loss_fn(y_pos, theta, sign=1)
        loss_neg, cumulated_logits_neg = self.loss_fn(y_neg, theta, sign=-1)

        # A single step on the sum of the positive and negative losses.
        self.optimizer.zero_grad()
        loss = loss_pos + loss_neg
        loss.backward()
        self.optimizer.step()

        separation = [cumulated_logits_pos, cumulated_logits_neg]
        y = torch.zeros(
            input_tensor.shape[0], *y_pos.shape[1:], device=input_tensor.device
        )
        y[positive_rows] = y_pos
        y[negative_rows] = y_neg
        return y.detach(), separation

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the layer with the given input and theta.

        Returns:
            A tuple ``(output, goodness)`` where goodness is the mean
            squared output of each sample minus ``theta``.
        """
        y = self(input_tensor)
        return y, torch.square(y).mean(dim=1) - theta
class FFNormalization(BaseFFLayer):
    """Parameter-free layer dividing every sample by its L2 norm."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # The norm is computed over all the non-batch dimensions; the small
        # constant avoids dividing by zero.
        l2_norm = (
            torch.norm(x.reshape(x.shape[0], -1), p=2, dim=1, keepdim=True)
            + 1e-8
        )
        return x / l2_norm

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Normalize the input; there is nothing to train.

        Returns:
            A tuple ``(output, None)``: the normalized input and no
            separation value.
        """
        with torch.no_grad():
            # The input must be passed explicitly: calling the module
            # without arguments raises a TypeError.
            output = self(input_tensor)
        return output, None

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Normalize the input and report a zero goodness for each sample."""
        output = self(input_tensor)
        return output, torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )

    @property
    def requires_training(self):
        """The layer has no trainable behavior, so this is ``False``."""
        return False
class LinearReLU(torch.nn.Module):
    """Fully connected layer (with bias) followed by a ReLU activation."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features, bias=True)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Apply the linear projection and then the ReLU."""
        projected = self.linear(x)
        return self.relu(projected)
class FCNetFFProgressive(BaseFFLayer):
    """FCNet trained using forward-forward algorithm. The network is trained
    in a progressive manner, i.e. the first layer is trained, then the
    second layer, and so on.
    """

    def __init__(
        self,
        layer_sizes: list,
        optimizer_name: str,
        optimizer_kwargs: dict,
        epochs: int,
        loss_fn_name: str = "loss_fn",
    ):
        """Create the network.

        For every consecutive pair of ``layer_sizes`` a normalization
        layer followed by a trainable linear + ReLU layer is added.

        Args:
            layer_sizes: Input size followed by the size of every layer.
            optimizer_name: Name of a ``torch.optim`` optimizer class.
            optimizer_kwargs: Keyword arguments of the optimizers.
            epochs: Number of epochs each layer is trained for by
                ``progressive_train``.
            loss_fn_name: Name of the loss function used by the layers.
        """
        super().__init__()
        self.epochs = epochs
        self.layers = torch.nn.ModuleList()
        for i in range(len(layer_sizes) - 1):
            self.layers.append(FFNormalization())
            self.layers.append(
                FFLayer(
                    LinearReLU(layer_sizes[i], layer_sizes[i + 1]),
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def progressive_train(self, dl: torch.utils.data.DataLoader, theta: float):
        """Train the network in a progressive manner.

        Each trainable layer is trained for ``self.epochs`` epochs on the
        current data loader; afterwards a new loader holding the outputs
        of that layer is created and used for the next layer.
        """
        print("Training the network in a progressive manner.")
        for i, layer in enumerate(self.layers):
            if layer.requires_training:
                for epoch in range(self.epochs):
                    accumulated_separation = None
                    for j, (data, signs) in enumerate(dl):
                        data = data.to(self.device)
                        signs = signs.to(self.device)
                        _, separation = layer.ff_train(data, signs, theta)
                        if accumulated_separation is None:
                            accumulated_separation = separation
                        else:
                            accumulated_separation[0] += separation[0]
                            accumulated_separation[1] += separation[1]
                        if j % 100 == 0:
                            print(f"Epoch: {epoch}, Batch: {j}, Layer: {i}")
                    print(f"Epoch {epoch} of layer {i} done.")
                    accumulated_separation[0] /= len(dl.dataset)
                    accumulated_separation[1] /= len(dl.dataset)
                    separation_ratio = (
                        accumulated_separation[0] - accumulated_separation[1]
                    ) / abs(max(accumulated_separation))
                    print("Goodness: ", accumulated_separation)
                    print(f"Accumulated separation: {separation_ratio}")
                print(f"Finished training layer {i} / {len(self.layers)}.")

            # create a new dataloader for the next layer
            dataset = ProgressiveTrainingDataset(
                (
                    (layer(x.to(self.device)), sign.to(self.device))
                    for x, sign in dl
                )
            )
            batch_size = dl.batch_size
            dl = torch.utils.data.DataLoader(
                dataset, batch_size=batch_size, shuffle=False
            )
        print("Finished training the network.")

    def ff_train(
        self, input_tensor: torch.Tensor, signs: torch.Tensor, theta: float
    ):
        """Run one training step on every trainable layer of the network.

        Layers that do not require training are only applied to the input
        and contribute no separation value.

        Returns:
            A tuple ``(output, accumulated_separation)``: the output of
            the last layer and the element-wise sum of the separations of
            the trained layers (``None`` when no layer was trained).
        """
        accumulated_separation = None
        for layer in self.layers:
            if not layer.requires_training:
                # Non-trainable layers return no separation; indexing it
                # would fail, so they are just applied.
                with torch.no_grad():
                    input_tensor = layer(input_tensor)
                continue
            input_tensor, separation = layer.ff_train(
                input_tensor, signs, theta
            )
            if accumulated_separation is None:
                # Copy so the sums below do not mutate the layer's list.
                accumulated_separation = list(separation)
            else:
                accumulated_separation[0] += separation[0]
                accumulated_separation[1] += separation[1]
        return input_tensor, accumulated_separation

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the network with the given input and theta.

        Returns:
            A tuple ``(output, accumulated_goodness)``. The goodness of
            the layers at index 0 and 1 is not accumulated.
        """
        accumulated_goodness = torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )
        for i, layer in enumerate(self.layers):
            input_tensor, goodness = layer.positive_eval(input_tensor, theta)
            if i > 1:
                accumulated_goodness += goodness
        return input_tensor, accumulated_goodness

    @property
    def device(self):
        """Device of the first parameter of the network."""
        return next(self.parameters()).device
class NormLinearReLU(torch.nn.Module):
    """Input normalization followed by a linear + ReLU block."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.norm = FFNormalization()
        self.linear_relu = LinearReLU(in_features, out_features)

    def forward(self, x):
        """Normalize ``x`` and feed it to the linear + ReLU block."""
        normalized = self.norm(x)
        return self.linear_relu(normalized)
class RecurrentFFLayer(BaseFFLayer):
    """Hidden layer of the recurrent network.

    The new state is computed from the states of the previous and next
    layers and blended with the layer's own previous state.
    """

    def __init__(
        self,
        hidden_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        # The input is the concatenation of two hidden states.
        self.layer = NormLinearReLU(2 * hidden_size, hidden_size)
        optimizer_cls = getattr(torch.optim, optimizer_name)
        self.optimizer = optimizer_cls(
            self.layer.parameters(), **optimizer_kwargs
        )
        # NOTE(review): the loss is resolved with eval(), so the name must
        # come from trusted code only.
        self.loss_fn = eval(loss_fn_name)

    def forward(self, x_prev, x_same, x_next):
        """Return ``0.3 * x_same + 0.7 * layer(cat(x_prev, x_next))``."""
        neighbours = torch.cat((x_prev, x_next), dim=1)
        candidate = self.layer(neighbours)
        return 0.3 * x_same + 0.7 * candidate

    def ff_train(
        self,
        x_prev: torch.Tensor,
        x_same: torch.Tensor,
        x_next: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimization step on detached inputs.

        Returns:
            A tuple ``(new_state, [positive_goodness, negative_goodness])``.
        """
        new_state = self(x_prev.detach(), x_same.detach(), x_next.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        loss_pos, goodness_pos = self.loss_fn(
            new_state[positive_mask], theta, 1
        )
        loss_neg, goodness_neg = self.loss_fn(
            new_state[negative_mask], theta, -1
        )
        total_loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        return new_state, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(
        self,
        x_prev: torch.Tensor,
        x_same: torch.Tensor,
        x_next: torch.Tensor,
        theta: float,
    ):
        """Return the new state and its goodness.

        The goodness is the mean squared activation of each sample minus
        ``theta``.
        """
        new_state = self(x_prev, x_same, x_next)
        return new_state, new_state.pow(2).mean(dim=1) - theta
class RecurrentProjectionFFLayer(BaseFFLayer):
    """Normalized linear + ReLU projection trained with forward-forward."""

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        self.layer = NormLinearReLU(input_size, output_size)
        optimizer_cls = getattr(torch.optim, optimizer_name)
        self.optimizer = optimizer_cls(
            self.layer.parameters(), **optimizer_kwargs
        )
        # NOTE(review): the loss is resolved with eval(), so the name must
        # come from trusted code only.
        self.loss_fn = eval(loss_fn_name)

    def forward(self, x: torch.Tensor):
        return self.layer(x)

    def ff_train(
        self,
        x: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimization step on the detached input.

        Returns:
            A tuple ``(projection, [positive_goodness, negative_goodness])``.
        """
        projection = self(x.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        loss_pos, goodness_pos = self.loss_fn(
            projection[positive_mask], theta, 1
        )
        loss_neg, goodness_neg = self.loss_fn(
            projection[negative_mask], theta, -1
        )
        total_loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        return projection, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor, theta: float):
        """Return the projection and its goodness.

        The goodness is the mean squared activation of each sample minus
        ``theta``.
        """
        projection = self(x)
        return projection, projection.pow(2).mean(dim=1) - theta
class RecurrentProjectedSoftmaxFFLayer(BaseFFLayer):
    """Normalized linear projection followed by a softmax."""

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str,
    ):
        super().__init__()
        # NOTE(review): the loss is resolved with eval(), so the name must
        # come from trusted code only.
        self.loss_fn = eval(loss_fn_name)
        self.norm = FFNormalization()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        # Only the linear projection has parameters to optimize.
        optimizer_cls = getattr(torch.optim, optimizer_name)
        self.optimizer = optimizer_cls(
            self.linear.parameters(), **optimizer_kwargs
        )

    def forward(self, x: torch.Tensor):
        """Apply normalization, linear projection and softmax in order."""
        return self.softmax(self.linear(self.norm(x)))

    def ff_train(
        self,
        x: torch.Tensor,
        signs: torch.Tensor,
        theta: float,
    ):
        """Run one optimization step on the detached input.

        Returns:
            A tuple ``(output, [positive_goodness, negative_goodness])``.
        """
        output = self(x.detach())
        positive_mask = signs == 1
        negative_mask = signs == -1
        loss_pos, goodness_pos = self.loss_fn(output[positive_mask], theta, 1)
        loss_neg, goodness_neg = self.loss_fn(
            output[negative_mask], theta, -1
        )
        total_loss = loss_pos + loss_neg
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        return output, [goodness_pos, goodness_neg]

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor, theta: float):
        """Return the output and its goodness.

        The goodness is the mean squared output of each sample minus
        ``theta``.
        """
        output = self(x)
        return output, output.pow(2).mean(dim=1) - theta
class RecurrentFCNetFF(BaseFFLayer):
    """Recurrent FCNet trained using forward-forward algorithm.

    The network keeps a list of per-layer states. Each recurrent layer is
    updated from the states of the layer below, of itself and of the layer
    above; the state above the last recurrent layer is the label vector
    projected by ``proj_y``.
    """

    def __init__(
        self,
        layer_sizes: list,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Build the projector, the recurrent layers and the label heads.

        Args:
            layer_sizes: Sizes used to build the sub-layers; the first entry
                is the projector input size and the last entry is the number
                of labels.
            optimizer_name: Name of an optimizer class in ``torch.optim``,
                forwarded to every sub-layer.
            optimizer_kwargs: Keyword arguments for the optimizers.
            loss_fn_name: Name of the loss function, forwarded to every
                sub-layer.
        """
        super().__init__()
        # Number of recurrent iterations used by ff_train.
        self.time_steps = 8
        # Number of recurrent iterations used by positive_eval.
        self.test_time_steps = 8
        # Time steps whose goodness is accumulated in positive_eval. With
        # test_time_steps == 8 the loop index runs over 0..7, so step 0 is
        # skipped and the entries 8, 9 and 10 are never reached.
        self.storable_time_steps = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        # self.storable_time_steps = [3, 4, 5]
        # NOTE(review): not read anywhere in this class.
        self.states = []
        self.layers = torch.nn.ModuleList()
        # Maps the raw input (layer_sizes[0]) to the first hidden size.
        self.projector = RecurrentProjectionFFLayer(
            layer_sizes[0],
            layer_sizes[1],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        # One recurrent layer for each entry of layer_sizes[1:-1].
        for i in range(1, len(layer_sizes) - 1):
            self.layers.append(
                RecurrentFFLayer(
                    layer_sizes[i],
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
            )
        # Projects a label vector (layer_sizes[-1]) to layer_sizes[-2] so it
        # can be used as the "next" state of the last recurrent layer.
        self.proj_y = RecurrentProjectionFFLayer(
            layer_sizes[-1],
            layer_sizes[-2],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        # Maps a layer_sizes[-2] state to a distribution over the labels.
        self.softmax = RecurrentProjectedSoftmaxFFLayer(
            layer_sizes[-2],
            layer_sizes[-1],
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        self.num_labels = layer_sizes[-1]

    @property
    def device(self):
        """Device of the first parameter of the module."""
        return next(self.parameters()).device

    @torch.no_grad()
    def bottom_up(self, x: torch.Tensor, y: torch.Tensor):
        """Initialise the states with one pass and sample negative labels.

        Every recurrent layer is called with the state below it and zeros
        for its own and its upper state.

        Returns:
            ``states``: the projector output, the output of every recurrent
            layer and finally ``y``; ``neg_samples``: one-hot labels sampled
            from ``self.softmax`` applied to the masked last state.
        """
        states = []
        x_proj = self.projector(x)
        for layer in self.layers:
            states.append(x_proj)
            x_proj = layer(
                x_proj,
                torch.zeros_like(x_proj, device=self.device),
                torch.zeros_like(x_proj, device=self.device),
            )
        states.append(x_proj)
        states.append(y)
        y_arg = torch.argmax(y, dim=1)
        x_proj_ = x_proj.clone()
        # NOTE(review): the label index is masked on the last layer's output
        # (the input of self.softmax), not on the softmax scores themselves;
        # confirm this is the intended way to exclude the true label.
        x_proj_[torch.arange(x_proj.shape[0]), y_arg] = -1e6
        neg_prob = self.softmax(x_proj_)
        # Sample one label per row: argmax returns the first column whose
        # cumulative probability exceeds a uniform random number.
        cumulative_neg_prob = torch.cumsum(neg_prob, dim=1)
        neg_samples = torch.argmax(
            1.0
            * (
                cumulative_neg_prob > torch.rand(x.shape[0], 1).to(self.device)
            ),
            dim=1,
        )
        neg_samples = torch.functional.F.one_hot(
            neg_samples, num_classes=self.num_labels
        )
        return states, neg_samples

    def forward(self, x: torch.Tensor, prev_states: List[torch.Tensor]):
        """Compute the next list of states from ``prev_states``.

        The returned list has the same layout as the one built by
        ``bottom_up``, except that its last entry is the output of
        ``self.softmax`` instead of a label tensor.
        """
        x_proj = self.projector(x)
        new_states = []
        for i, layer in enumerate(self.layers):
            if i < len(self.layers) - 1:
                next_state = prev_states[i + 2]
            else:
                # Above the last layer sits the label entry, which is
                # projected to the hidden size first.
                next_state = self.proj_y(prev_states[i + 2].float())
            new_states.append(x_proj)
            x_proj = layer(prev_states[i], prev_states[i + 1], next_state)
        new_states.append(x_proj)
        y = self.softmax(x_proj)
        new_states.append(y)
        return new_states

    def ff_train(
        self, input_tensor: torch.Tensor, labels: torch.Tensor, theta: float
    ):
        """Train the network with the given target.

        The batch is doubled: the first half is paired with ``labels``
        (sign ``1``), the second half with sampled negative labels
        (sign ``-1``). The projector is trained once, then every recurrent
        layer is trained at each of ``self.time_steps`` iterations.

        Returns:
            The states of the positive half of the batch and the averaged
            ``[positive, negative]`` goodness.
        """
        with torch.no_grad():
            states, neg_samples = self.bottom_up(input_tensor, labels)
            neg_states, _ = self.bottom_up(input_tensor, neg_samples)
            # Stack positive and negative states along the batch dimension.
            states = [
                torch.cat([s, ns], dim=0) for s, ns in zip(states, neg_states)
            ]
            signs = torch.cat(
                [
                    torch.ones(input_tensor.shape[0], device=self.device),
                    -torch.ones(input_tensor.shape[0], device=self.device),
                ],
                dim=0,
            )
            input_tensor = torch.cat([input_tensor, input_tensor], dim=0)
        # states have been created, now we can train the network
        x_proj, accumulated_goodness = self.projector.ff_train(
            input_tensor, signs, theta
        )
        for _ in range(self.time_steps):
            new_states = []
            # The first state is always the projector output computed above.
            x = x_proj
            for j, layer in enumerate(self.layers):
                if j < len(self.layers) - 1:
                    next_state = states[j + 2]
                else:
                    next_state = self.proj_y(states[j + 2].float())
                new_states.append(x)
                x, goodnesses = layer.ff_train(
                    states[j], states[j + 1], next_state, signs, theta
                )
                accumulated_goodness[0] += goodnesses[0]
                accumulated_goodness[1] += goodnesses[1]
            new_states.append(x)
            with torch.no_grad():
                # Resample the negative labels from the negative half of the
                # last hidden state, masking the index of the true label.
                x_ = states[-2][torch.where(signs == -1)]
                real_y = states[-1][torch.where(signs == 1)]
                x_[
                    torch.arange(x_.shape[0]), torch.argmax(real_y, dim=1)
                ] = -1e6
                y = self.softmax(x_)
                cumulative_y = torch.cumsum(y, dim=1)
                neg_samples = torch.argmax(
                    1.0
                    * (
                        cumulative_y
                        > torch.rand(x_.shape[0], 1).to(self.device)
                    ),
                    dim=1,
                )
                neg_samples = torch.functional.F.one_hot(
                    neg_samples, num_classes=self.num_labels
                )
                # replace just negative samples
                next_labels = states[-1].clone()
                next_labels[torch.where(signs == -1)] = neg_samples
            new_states.append(next_labels)
            states = new_states
        # Average over every update that contributed: one per recurrent
        # layer and time step, plus the single projector update.
        accumulated_goodness[0] /= self.time_steps * len(self.layers) + 1
        accumulated_goodness[1] /= self.time_steps * len(self.layers) + 1
        with torch.no_grad():
            # Keep only the positive half of the doubled batch.
            states = [t[: input_tensor.shape[0] // 2] for t in states]
        return states, accumulated_goodness

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Evaluate the network with the given input and theta.

        Every input row is paired with each of the ``num_labels`` one-hot
        labels; the goodness accumulated for each pair is reshaped to
        ``(batch, num_labels)`` and the label with the highest value is the
        prediction.

        Returns:
            The predicted label indices and the accumulated goodness.
        """
        labels = torch.arange(0, self.num_labels, device=self.device)
        labels = torch.functional.F.one_hot(
            labels, num_classes=self.num_labels
        )
        original_bs = input_tensor.shape[0]
        # Repeat each input once per candidate label.
        input_tensor = (
            input_tensor.unsqueeze(1)
            .repeat(1, self.num_labels, 1)
            .reshape(-1, input_tensor.shape[-1])
        )
        # Tile the candidate labels so they line up with the repeated inputs.
        labels = (
            labels.unsqueeze(0)
            .repeat(original_bs, 1, 1)
            .reshape(-1, labels.shape[-1])
        )
        states, _ = self.bottom_up(input_tensor, labels)
        x_proj, goodness = self.projector.positive_eval(input_tensor, theta)
        # accumulated_goodness aliases the projector goodness tensor and is
        # updated in place below.
        accumulated_goodness = goodness
        for time_step in range(self.test_time_steps):
            new_states = []
            x = x_proj
            for j, layer in enumerate(self.layers):
                if j < len(self.layers) - 1:
                    next_state = states[j + 2]
                else:
                    next_state = self.proj_y(states[j + 2].float())
                new_states.append(x)
                x, goodnesses = layer.positive_eval(
                    states[j], states[j + 1], next_state, theta
                )
                if time_step in self.storable_time_steps:
                    accumulated_goodness += goodnesses
            new_states.append(x)
            if time_step in self.storable_time_steps:
                _, goodness = self.softmax.positive_eval(x, theta)
                accumulated_goodness += goodness
            # The candidate labels stay fixed across time steps.
            new_states.append(states[-1])
            states = new_states
        accumulated_goodness = accumulated_goodness.reshape(
            original_bs, self.num_labels
        )
        prediction = torch.argmax(accumulated_goodness, dim=1)
        return prediction, accumulated_goodness
class LMFFLinearSoftmax(BaseFFLayer):
    """Normalisation, linear map and softmax trained with a supervised loss.

    ``forward`` returns probabilities (softmax over dim 1). ``ff_train``
    fits the head on the positive rows of a batch with ``NLLLoss``.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
    ):
        """Build the sub-modules, the loss and the optimizer.

        Args:
            input_size: Number of input features of the linear map.
            output_size: Number of output classes.
            optimizer_name: Name of an optimizer class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments forwarded to the optimizer.
        """
        super().__init__()
        self.loss_fn = torch.nn.NLLLoss()
        self.norm = FFNormalization()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.softmax = torch.nn.Softmax(dim=1)
        self.optimizer = getattr(torch.optim, optimizer_name)(
            self.parameters(), **optimizer_kwargs
        )

    def forward(self, x: torch.Tensor):
        """Return class probabilities for ``x``."""
        x = self.norm(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x

    def ff_train(
        self,
        input_tensor: torch.Tensor,
        labels: torch.Tensor,
        signs: torch.Tensor,
    ):
        """Run one supervised step on the positive rows of the batch.

        Args:
            input_tensor: Batch of input features.
            labels: Per-row label scores; the target class is the argmax
                over dim 1.
            signs: ``1`` marks positive rows (used for training), ``-1``
                marks negative rows (only evaluated).

        Returns:
            A tensor with the probabilities of every row (positive rows as
            computed before the update, negative rows after it) and the
            loss value as a Python float.
        """
        positive_idx = torch.where(signs == 1)
        negative_idx = torch.where(signs == -1)

        probs_pos = self(input_tensor[positive_idx])
        targets = torch.argmax(labels[positive_idx], dim=1)
        # NLLLoss expects log-probabilities, while forward() returns
        # probabilities: take the log first. The clamp keeps the log finite
        # when a probability underflows to zero.
        log_probs = torch.log(
            probs_pos.clamp_min(torch.finfo(probs_pos.dtype).tiny)
        )
        loss = self.loss_fn(log_probs, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        with torch.no_grad():
            probs_neg = self(input_tensor[negative_idx])
            new_x = torch.zeros(
                len(input_tensor),
                *probs_pos.shape[1:],
                device=input_tensor.device,
            )
            new_x[positive_idx] = probs_pos
            new_x[negative_idx] = probs_neg
        return new_x, loss.item()

    @torch.no_grad()
    def positive_eval(self, x: torch.Tensor):
        """Return class probabilities for ``x`` without tracking gradients."""
        pred = self(x)
        return pred
class LMFFNet(BaseFFLayer):
    """Token-level language model trained layer by layer.

    The model is made of an embedding projection (``token2emb``), a stack of
    ``FFLayer`` blocks and a softmax read-out (``emb2token``) that receives
    the concatenated outputs of every block.
    """

    def __init__(
        self,
        token_num: int,
        hidden_size: int,
        n_layers: int,
        seq_len: int,
        predicted_tokens: int,
        epochs: int,
        optimizer_name: str,
        optimizer_kwargs: dict,
        loss_fn_name: str = "loss_fn",
    ):
        """Build the sub-layers.

        Args:
            token_num: Size of the one-hot encoding of a single token.
            hidden_size: Width of every hidden block.
            n_layers: Number of hidden blocks.
            seq_len: Number of tokens in an input window; the flattened
                input has ``token_num * seq_len`` features.
            predicted_tokens: Number of tokens generated by positive_eval.
            epochs: Number of optimisation steps run for each sub-layer
                inside a single ``ff_train`` call.
            optimizer_name: Name of an optimizer class in ``torch.optim``.
            optimizer_kwargs: Keyword arguments for the optimizers.
            loss_fn_name: Name of the loss function used by the sub-layers.
        """
        super().__init__()
        self.token_num = token_num
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.predicted_tokens = predicted_tokens
        self.token2emb = RecurrentProjectionFFLayer(
            token_num * seq_len,
            hidden_size,
            optimizer_name,
            optimizer_kwargs,
            loss_fn_name,
        )
        self.layers = torch.nn.ModuleList(
            [
                FFLayer(
                    NormLinearReLU(hidden_size, hidden_size),
                    optimizer_name,
                    optimizer_kwargs,
                    loss_fn_name,
                )
                for _ in range(n_layers)
            ]
        )
        # The read-out sees the outputs of all blocks side by side.
        self.emb2token = LMFFLinearSoftmax(
            n_layers * hidden_size, token_num, optimizer_name, optimizer_kwargs
        )
        self.epochs = epochs

    def forward(self, input_tensor: torch.Tensor):
        """Return the read-out output for a flattened window of tokens."""
        x = self.token2emb(input_tensor)
        xs = []
        for layer in self.layers:
            x = layer(x)
            xs.append(x)
        x = torch.cat(xs, dim=1)
        x = self.emb2token(x)
        return x

    def ff_train(
        self,
        input_tensor: torch.Tensor,
        prev_pred: torch.Tensor,
        labels: torch.Tensor,
        theta: float,
    ):
        """Train every sub-layer in turn on positive and negative data.

        ``input_tensor`` supplies the positive rows and ``prev_pred`` the
        negative rows; each sub-layer is trained for ``self.epochs`` steps
        before the next one is trained on its output.

        Returns:
            The next input window for the positive rows (shifted by one
            token, with the predicted token appended) and the accumulated
            ``[positive, negative]`` goodness.
        """
        signs = torch.cat(
            [
                torch.ones(input_tensor.shape[0], device=input_tensor.device),
                -torch.ones(input_tensor.shape[0], device=input_tensor.device),
            ]
        )
        input_tensor = torch.cat([input_tensor, prev_pred], dim=0)
        labels = torch.cat([labels, labels], dim=0)
        # NOTE(review): with self.epochs == 0 the loops below never bind
        # ``goodness`` / ``x_new`` and the method fails with a NameError.
        for idx in range(self.epochs):
            x, goodness = self.token2emb.ff_train(input_tensor, signs, theta)
            if idx % 20 == 0:
                print(f"Epoch {idx}: {goodness}")
        accumulated_goodness = goodness
        xs = []
        for layer in self.layers:
            for epoch in range(self.epochs):
                x_new, goodness = layer.ff_train(x, signs, theta)
                if epoch % 20 == 0:
                    print(f"Epoch {epoch}: {goodness}")
            # Only the output of the last step feeds the next block.
            x = x_new
            xs.append(x)
            accumulated_goodness[0] += goodness[0]
            accumulated_goodness[1] += goodness[1]
        x = torch.cat(xs, dim=1)
        for epoch in range(self.epochs):
            x_new, loss = self.emb2token.ff_train(x, labels, signs)
            if epoch % 20 == 0 or epoch < 20:
                print(f"Epoch {epoch}: {loss}")
        x = x_new
        # Drop the oldest token of each positive row and append the one-hot
        # encoding of the predicted token.
        next_input = input_tensor[signs == 1].roll(-self.token_num, dims=1)
        next_input[
            :, -self.token_num :  # noqa E203
        ] = torch.functional.F.one_hot(
            torch.argmax(x[signs == 1], dim=1), num_classes=self.token_num
        )
        return next_input, accumulated_goodness

    def LM_ff_train(self, input_tensor: torch.Tensor, theta: float):
        """Build labels and negative data from a batch, then train.

        Returns:
            The accumulated ``[positive, negative]`` goodness of ff_train.
        """
        with torch.no_grad():
            input_tensor = input_tensor.reshape(
                -1, self.token_num * self.seq_len
            )
            # The label of row i is the last token of row i + 1.
            labels = input_tensor[:, -self.token_num :].roll(  # noqa E203
                -1, dims=0
            )
            temp = torch.argmax(labels, dim=1)
            print(temp.shape, torch.sum(temp == 0))
            pred = self(input_tensor)
            new_char = torch.functional.F.one_hot(
                torch.argmax(pred, dim=1), num_classes=self.token_num
            )
            # Negative data: a shifted copy of the input whose last token is
            # replaced by the model's own prediction.
            # NOTE(review): roll(1) has no ``dims`` argument, so the tensor
            # is flattened and shifted by a single element rather than by a
            # whole token; confirm this is intended.
            prev_pred = input_tensor.clone().roll(1)
            prev_pred[:, -self.token_num :] = new_char  # noqa E203
        _, accumulated_goodness = self.ff_train(
            input_tensor, prev_pred, labels, theta
        )
        return accumulated_goodness

    @torch.no_grad()
    def positive_eval(self, input_tensor: torch.Tensor, theta: float):
        """Generate ``predicted_tokens`` tokens autoregressively.

        Returns:
            ``prediction`` with one read-out output per generated token, and
            the goodness summed over all sub-layers and averaged over the
            generated tokens.
        """
        cumulated_goodness = torch.zeros(
            input_tensor.shape[0], device=input_tensor.device
        )
        prediction = torch.zeros(
            input_tensor.shape[0],
            self.predicted_tokens,
            self.token_num,
            device=input_tensor.device,
        )
        for idx in range(self.predicted_tokens):
            x, goodness = self.token2emb.positive_eval(input_tensor, theta)
            cumulated_goodness += goodness
            xs = []
            for layer in self.layers:
                x, goodness = layer.positive_eval(x, theta)
                xs.append(x)
                cumulated_goodness += goodness
            x = torch.cat(xs, dim=1)
            x = self.emb2token.positive_eval(x)
            prediction[:, idx] = x
            # Slide the window: drop the oldest token and append the
            # predicted one. roll() returns a new tensor, so the caller's
            # input is not modified.
            input_tensor = input_tensor.roll(-self.token_num, dims=1)
            input_tensor[
                :, -self.token_num :  # noqa E203
            ] = torch.functional.F.one_hot(
                torch.argmax(x, dim=1), num_classes=self.token_num
            )
        cumulated_goodness /= self.predicted_tokens
        return prediction, cumulated_goodness
================================================
FILE: optimization/forward_forward/forward_forward/utils/utils.py
================================================
try:
    from collections.abc import Generator
except ImportError:  # pragma: no cover - interpreters without collections.abc
    from collections import Generator

import torch.utils.data
class ProgressiveTrainingDataset(torch.utils.data.Dataset):
    """In-memory dataset built by flattening a generator of batches.

    The generator yields ``(data, sign)`` pairs; every batch is split into
    its individual ``(data_i, sign_i)`` samples, which are stored in a list.
    """

    def __init__(self, dataset_generator: Generator):
        """Consume ``dataset_generator`` and store its samples."""
        samples = []
        # Gradients are disabled while the generator runs.
        with torch.no_grad():
            for data, sign in dataset_generator:
                samples.extend(zip(data, sign))
        self.internal_dataset = samples

    def __getitem__(self, index):
        """Return the ``(data, sign)`` sample stored at ``index``."""
        return self.internal_dataset[index]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.internal_dataset)
def compute_perplexity(tensor: torch.Tensor):
    """Compute perplexity of a tensor. The tensor has shape (batch_size,
    sequence_length, vocab_size).
    The softmax has already been computed over the vocab dimension.

    The perplexity of each distribution is ``exp(entropy)``, where the
    entropy is taken over the last dimension; the mean over all remaining
    dimensions is returned as a scalar tensor.
    """
    # xlogy(p, p) equals p * log(p) but is defined as 0 where p == 0, so
    # probabilities that are exactly zero no longer turn the result into NaN.
    entropy = -torch.xlogy(tensor, tensor).sum(dim=-1)
    return torch.exp(entropy).mean()
================================================
FILE: optimization/forward_forward/requirements.txt
================================================
torch>=1.9
torchvision>=0.10
nebullvm>=0.6
================================================
FILE: optimization/forward_forward/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages
# Runtime dependencies installed together with the package.
REQUIREMENTS = ["torch>=1.9", "torchvision>=0.10", "nebullvm>=0.6"]

# The README that sits next to this script is used as the long description.
this_directory = Path(__file__).parent
with open(this_directory / "README.md", encoding="utf8") as readme_file:
    long_description = readme_file.read()

setup(
    name="forward_forward",
    version="0.0.1",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIREMENTS,
)
================================================
FILE: optimization/large_speedster/README.md
================================================
# ⚡ LargeSpeedster App (WIP)
Automatically apply SOTA optimization techniques on large AI models to achieve the maximum acceleration on your hardware.
If you like this App, give us a star to show your support for the project ⭐
## 📚 Description
The LargeSpeedster App is a powerful tool to optimize large AI models (LMs). Leveraging state-of-the-art open-source optimization tools, LargeSpeedster enables the acceleration of large models, i.e. models with a number of parameters in excess of what could be stored on a single GPU. The workflow consists of three steps: select, search, and serve.
In the select step, users input their large model in their preferred deep learning framework and express their preferences regarding the maximum acceptable accuracy loss. This information is used to guide the optimization process and ensure that the resulting model meets the user's needs.
In the search step, the App automatically tests multiple LM-specific optimization techniques across the software-to-hardware stack, such as SmoothQuant quantization, FlashAttention, and inference-specific kernels. The App also tunes the parallelization strategy and its configuration parameters, allowing it to find the optimal configuration of techniques for accelerating the model.
Finally, in the serve step, the App returns an accelerated version of the user's model in the DL framework of choice, providing a significant boost in performance.
Overall, LargeSpeedster is an easy-to-use tool that allows users to optimize their large AI models and get the most out of their software-to-hardware stack. Try it out today, and reach out if you have any feedback!
================================================
FILE: optimization/nebullvm/.pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/ambv/black
rev: 22.3.0
hooks:
- id: black
args: [--line-length=79]
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
hooks:
- id: flake8
args: [--exclude=nebullvm/tools/diffusers.py]
================================================
FILE: optimization/nebullvm/CONTRIBUTING.md
================================================
# Guidelines for Contributing to Nebullvm 🚀
Hello coder 👋
We are very happy that you have decided to contribute to the library and we thank you for your efforts. Here you can find guidelines on how to standardize your code with the style we adopted for `nebullvm`. But remember, there are various ways to help the community other than submitting code contributions: answering questions and improving the documentation are also very valuable.
It also helps us if you mention our library in your blog posts to show off the cool things it has made possible, or just give the repository a ⭐️ to show us that you appreciate the project.
This guide was inspired by the awesome [Transformers](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) guide to contributing.
We hope to come across your pull request soon!
Happy coding 💫 The nebullvm Team
## How to submit an issue
Did you spot a bug? Did you come up with a cool idea that you think should be implemented in nebullvm? Well, GitHub issues are the best way to let us know!
We don't have a strict policy on issue generation: just use a meaningful title and describe the problem or your proposal in the first comment. Then, you can use GitHub labels to let us know what kind of proposal you are making, for example `bug` if you are reporting a bug or `enhancement` if you are proposing a library improvement.
## How to contribute to solve an issue
We are always delighted to welcome other people to the contributors section of nebullvm! We are looking forward to welcoming you to the community, here are some guidelines to follow:
1. Please [fork](https://github.com/nebuly-ai/nebullvm/fork) the [library](https://github.com/nebuly-ai/nebullvm) by clicking on the Fork button on the repository's page. This will create a copy of the repository in your GitHub account.
2. Clone your fork to your local machine, and add the base repository as a remote:
```bash
$ git clone git@github.com:<your-github-handle>/nebullvm.git
$ cd nebullvm
$ git remote add upstream https://github.com/nebuly-ai/nebullvm.git
```
3. Install the library in editable mode with the following command:
```bash
$ pip install -e .
```
4. Work on your fork to develop the feature you have in mind.
5. Nebullvm relies on `black` to format its source code consistently. To use the formatting style defined for nebullvm, run the following commands:
```bash
$ pip install pre-commit black autoflake
$ pre-commit install
# the following command is optional, but needed if you have already
# committed some files to your forked repo.
$ pre-commit run --all-files
```
As for the naming convention, we follow [PEP 8](https://peps.python.org/pep-0008/) for code and a slight variation of [Google convention](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for docstrings. For docstrings we redundantly express the input type in both the function definition and the function docstring.
6. Once you're happy with your changes, add changed files with git add and commit your code:
```bash
$ git add edited_file.py
$ git commit -m "Add a cool feature"
```
7. Push your changes to your repo:
```bash
$ git push
```
8. Now you can go to the repo you have forked on your GitHub profile and click **Pull Request** to open a pull request. In the pull request, specify which problems it solves. For instance, if the pull request solves `Issue #1`, the comment should be `Closes #1`. Also make the title of the pull request meaningful and self-explanatory.
---
See you soon in the list of nebullvm contributors 🌈
================================================
FILE: optimization/nebullvm/Dockerfile
================================================
# Image with nebullvm, speedster and the selected deep learning compilers.
#
# Build arguments:
#   STARTING_IMAGE    base image (defaults to an NVIDIA TensorRT image)
#   NEBULLVM_VERSION  "latest" installs from the copied sources, any other
#                     value installs that nebullvm release from PyPI
#   COMPILER          "all", "tensorrt", "openvino", "onnxruntime" or "tvm"
ARG STARTING_IMAGE=nvcr.io/nvidia/tensorrt:23.03-py3
FROM ${STARTING_IMAGE}
WORKDIR /
# Set frontend as non-interactive
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update && apt-get -y upgrade
RUN apt-get install ffmpeg libsm6 libxext6 -y
# Install other libraries
RUN apt-get install -y sudo wget
# Install libraries
# PyTorch is pulled from the CUDA 11.8 wheel index.
RUN python3 -m pip install --upgrade pip \
&& pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118 \
&& pip install --no-cache-dir tensorflow \
&& pip install --no-cache-dir xformers \
&& pip install --no-cache-dir accelerate \
&& python3 -m pip install --no-cache-dir --upgrade tensorrt
# Copy the working dir to the container
# NOTE(review): the source path `../..` points above the build context root;
# confirm the build is invoked in a way that makes this resolve as intended.
COPY ../.. /nebullvm
# Install nebullvm
# NOTE(review): the commands below are chained with `;`, so a failing `cd` or
# `pip install` does not abort the build; confirm this is intended.
ARG NEBULLVM_VERSION=latest
RUN if [ "$NEBULLVM_VERSION" = "latest" ] ; then \
cd nebullvm ; \
pip install . ; \
cd apps/accelerate/speedster ; \
pip install . ; \
cd ../../../.. ; \
rm -rf nebullvm ; \
else \
pip install --no-cache-dir nebullvm==${NEBULLVM_VERSION} ; \
fi
# Install required python modules
RUN pip install --no-cache-dir cmake
# Install default deep learning compilers
# COMPILER="tvm" matches none of the branches below; TVM is handled by the
# next RUN instruction.
ARG COMPILER=all
RUN if [ "$COMPILER" = "all" ] ; then \
python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers all ; \
elif [ "$COMPILER" = "tensorrt" ] ; then \
python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers tensorrt ; \
elif [ "$COMPILER" = "openvino" ] ; then \
python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers openvino ; \
elif [ "$COMPILER" = "onnxruntime" ] ; then \
python3 -m nebullvm.installers.auto_installer --frameworks all --extra-backends all --compilers onnxruntime ; \
fi
# Install TVM
# The wheel is a prebuilt tlcpack build for CPython 3.8 (cp38) and CUDA 11.6
# (cu116); the final import is a smoke test of the installation.
RUN if [ "$COMPILER" = "all" ] || [ "$COMPILER" = "tvm" ] ; then \
pip install --no-cache-dir https://github.com/tlc-pack/tlcpack/releases/download/v0.11.1/tlcpack_cu116-0.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ; \
pip install --no-cache-dir xgboost ; \
python3 -c "from tvm.runtime import Module" ; \
fi
# Runtime environment.
# NOTE(review): the TensorRT library path is hard-coded for Python 3.8;
# confirm it matches the interpreter shipped in STARTING_IMAGE.
ENV SIGOPT_PROJECT="tmp"
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/tensorrt
ENV CUDA_MODULE_LOADING="LAZY"
================================================
FILE: optimization/nebullvm/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: optimization/nebullvm/MANIFEST.in
================================================
recursive-include nebullvm/installers/tvm_installers *.cmake
recursive-include nebullvm/installers *.sh
================================================
FILE: optimization/nebullvm/README.md
================================================
A framework for building optimization modules to boost the performance of your AI systems
---
**Documentation**: docs.nebuly.com/
---
`Nebullvm` is a framework for building the optimization modules needed to optimize the performance of your AI systems. The optimization modules are stack-agnostic and work with any library. They are designed to be easily integrated into your system, providing a quick and seamless boost to its performance. Simply plug and play to start realizing the benefits of optimized performance right away.
If you like the idea, give us a star to show your support for the project ⭐
## **What can this help with?**
We currently provide several modules built on top of the framework:
✅ [Speedster](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster): Automatically apply the best set of SOTA optimization techniques to achieve the maximum inference speed-up on your hardware.
✅ [OpenAlphaTensor](https://github.com/nebuly-ai/nebuly/tree/main/optimization/open_alpha_tensor): Increase the computational performance of an AI model with custom-generated matrix multiplication algorithms fine-tuned for your specific hardware.
✅ [Forward-Forward](https://github.com/nebuly-ai/nebuly/tree/main/optimization/forward_forward): The Forward Forward algorithm is a method for training deep neural networks that replaces the backpropagation forward and backward passes with two forward passes.
## Next modules and roadmap
We are actively working on incorporating the following modules, as requested by members of our community, in upcoming releases:
- [ ] [CloudSurfer](https://github.com/nebuly-ai/nebuly/blob/main/optimization/cloud_surfer): Automatically discover the optimal cloud configuration and hardware on AWS, GCP and Azure to run your AI models.
- [ ] [OptiMate](https://github.com/nebuly-ai/nebuly/blob/main/optimization/optimate): Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.
## Contributing
As an open source project in a rapidly evolving field, we welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved.
---
================================================
FILE: optimization/nebullvm/azure-pipelines.yml
================================================
trigger:
branches:
include:
- main
paths:
exclude:
- .github/*
- docs/**
- README.md
- notebooks/*
pool:
name: gpu-t4-pool
variables:
imageName: 'nebulydocker/nebullvm'
steps:
- script: |
nvidia-smi
displayName: 'Ensure cuda is installed correctly'
- script: |
pip uninstall -y nebullvm
pip install .
displayName: 'Install nebullvm'
- script: |
cd apps/accelerate/speedster
pip uninstall -y speedster
pip install .
cd ../../..
displayName: 'Install speedster'
- script: python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
displayName: 'Install PyTorch'
- script: |
export PATH=$PATH:/home/AzDevOps/.local/bin
python -m nebullvm.installers.auto_installer --compilers all
displayName: 'Install deep learning compilers'
- script: |
python -m pip install -r "requirements-dev.txt"
pip install pytest-azurepipelines
displayName: 'Install requirements for testing'
- script: |
res=$(python -c "from nebullvm.tools.utils import check_device; print(check_device().type.name == 'GPU')")
if [ "$res" = "False" ]; then
echo "GPU is not available"
exit 1
fi
echo "GPU is available: $res"
res=$(python -c "import torch; print(torch.cuda.is_available())")
if [ "$res" = "False" ]; then
echo "CUDA is not available for PyTorch"
exit 1
fi
echo "CUDA is available for PyTorch: $res"
res=$(python -c "import torch; num_devices = torch.cuda.device_count(); print(num_devices is not None and isinstance(num_devices, int) and num_devices > 0)")
if [ "$res" = "False" ]; then
echo "No CUDA devices found"
exit 1
fi
echo "CUDA devices found: $res"
displayName: 'Check GPU is available'
- script: |
export SPEEDSTER_DISABLE_TELEMETRY=1
export PATH=$PATH:/home/AzDevOps/.local/bin
cd apps/accelerate/speedster
pytest
cd ../../..
displayName: 'Run api tests'
failOnStderr: true
- script: |
export PATH=$PATH:/home/AzDevOps/.local/bin
cd nebullvm
pytest
cd ../
displayName: 'Run components tests'
failOnStderr: true
================================================
FILE: optimization/nebullvm/docker_build.sh
================================================
# Build the nebullvm Docker images from the Dockerfile in the current
# directory. Every image shares the same base tag prefix and differs only in
# the COMPILER build argument passed to the Dockerfile.

# Create image with all compilers installed (no COMPILER build argument)
docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-allcompilers .
# Create one image per compiler, selected through the COMPILER build argument
docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-onnxruntime . --build-arg COMPILER="onnxruntime"
docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-openvino . --build-arg COMPILER="openvino"
docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-tvm . --build-arg COMPILER="tvm"
docker build -t nebulydocker/nebullvm:cuda11.2.0-nebullvm0.3.1-tensorrt . --build-arg COMPILER="tensorrt"
================================================
FILE: optimization/nebullvm/docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: optimization/nebullvm/docs/README.md
================================================
# Documentation
Nebullvm documentation is built using Sphinx and furo! You can follow the guide below to build it locally.
## Build the docs:
1. Install nebullvm according to [README.md](../../../README.md#step-1-installation-of-nebullvm-library).
2. Install additional libraries required to build docs:
```
pip install -r requirements-docs.txt
```
3. Run `make html` from this directory.
================================================
FILE: optimization/nebullvm/docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# flake8: noqa
import os
import sys
sys.path.insert(0, os.path.abspath("../../../"))
# import sphinx_rtd_theme
# -- Project information -----------------------------------------------------
project = "nebullvm"
copyright = "2022, nebuly"
author = "nebuly"
# The full version, including alpha/beta/rc tags
# release = "0.3.0"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.napoleon",
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.viewcode",
"sphinx.ext.githubpages",
]
# -- Configurations for plugins ------------
napoleon_google_docstring = True
napoleon_include_init_with_doc = True
napoleon_include_special_with_doc = True
napoleon_numpy_docstring = False
napoleon_use_rtype = False
autodoc_inherit_docstrings = False
autodoc_member_order = "bysource"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = "sphinx_rtd_theme"
html_theme = "furo"
html_theme_options = {
"light_css_variables": {
"color-brand-primary": "#dark",
"color-brand-content": "#dark",
"color-admonition-background": "#dark",
"font-stack": "Montserrat, sans-serif",
"font-stack--monospace": "Courier, monospace",
},
"footer_icons": [
{
"name": "GitHub",
"url": "https://github.com/nebuly-ai/nebullvm",
"html": """
""",
"class": "",
},
],
"light_logo": "Logo_azure.svg",
"dark_logo": "Logo_azure.svg",
}
html_static_path = ["_static"]
html_title = ""
# html_theme_options = {
# "announcement": "Important announcement!",
# }
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
================================================
FILE: optimization/nebullvm/docs/index.rst
================================================
Welcome to nebullvm's documentation!
======================================
.. toctree::
:maxdepth: 2
modules/index
================================================
FILE: optimization/nebullvm/docs/modules/api.rst
================================================
nebullvm.api
=============
.. automodule:: nebullvm
:members:
.. automodule:: nebullvm.api.frontend.huggingface
:members:
================================================
FILE: optimization/nebullvm/docs/modules/converters.rst
================================================
nebullvm.converters
===================
.. automodule:: nebullvm.converters
:members:
================================================
FILE: optimization/nebullvm/docs/modules/index.rst
================================================
API Documentation
==================
.. toctree::
api
converters
inference_learners
installers
optimizers
================================================
FILE: optimization/nebullvm/docs/modules/inference_learners.rst
================================================
nebullvm.inference_learners
===========================
.. automodule:: nebullvm.inference_learners
:members:
================================================
FILE: optimization/nebullvm/docs/modules/installers.rst
================================================
nebullvm.installers
===================
.. automodule:: nebullvm.installers
:members:
================================================
FILE: optimization/nebullvm/docs/modules/optimizers.rst
================================================
nebullvm.optimizers
===================
.. automodule:: nebullvm.optimizers
:members:
================================================
FILE: optimization/nebullvm/docs/requirements-docs.txt
================================================
Sphinx==4.5.0
coloredlogs
sympy
furo
================================================
FILE: optimization/nebullvm/nebullvm/__init__.py
================================================
# torch must be imported here, before tensorflow: with CUDA 11.8,
# importing torch after tensorflow crashes the interpreter with a
# "core dumped" error.
from nebullvm.optional_modules.torch import torch # noqa F401
from nebullvm.tools.logger import setup_logger
setup_logger()
__all__ = [k for k in globals().keys() if not k.startswith("_")]
================================================
FILE: optimization/nebullvm/nebullvm/api/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/apps/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/apps/base.py
================================================
import abc
class App(abc.ABC):
    """Abstract base class for applications exposing an ``execute`` method."""

    def __init__(self):
        super().__init__()

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the application.

        Concrete subclasses must override this method; the base
        implementation only raises ``NotImplementedError``.
        """
        raise NotImplementedError
================================================
FILE: optimization/nebullvm/nebullvm/config.py
================================================
from nebullvm.optional_modules.torch import torch
VERSION = "0.10.0"
LEARNER_METADATA_FILENAME = "metadata.json"
ONNX_OPSET_VERSION = 13
NEBULLVM_DEBUG_FILE = "nebullvm_debug.json"
AUTO_TVM_TUNING_OPTION = {
"tuner": "xgb",
"trials": 10,
"early_stopping": 100,
}
# TODO: remove the min_repeat_ms key
AUTO_TVM_PARAMS = {
"number": 10,
"repeat": 1,
"min_repeat_ms": 0, # since we're tuning on a CPU, can be set to 0
"timeout": 10, # in seconds
}
NVIDIA_FILENAMES = {
"engine": "tensor_rt.engine",
"metadata": LEARNER_METADATA_FILENAME,
}
TVM_FILENAMES = {"engine": "compiled_lib.so"}
ONNX_FILENAMES = {"model_name": "model.onnx"}
ONNX_PROVIDERS = {
"cuda": [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
],
"cpu": [
"CPUExecutionProvider",
],
}
OPENVINO_FILENAMES = {
"metadata": LEARNER_METADATA_FILENAME,
"description_file": "description.xml",
"weights": "weights.bin",
}
TENSORFLOW_BACKEND_FILENAMES = {
"tflite_model": "tf_model.tflite",
"tf_model": "tf_model.h5",
}
TORCH_TENSORRT_PRECISIONS = {
"torch.float32": {torch.float},
"torch.float16": {torch.float, torch.half},
"torch.int8": {torch.float, torch.half, torch.int8},
}
MIN_DIM_INPUT_DATA = 100
QUANTIZATION_DATA_NUM = 300
CONSTRAINED_METRIC_DROP_THS = 1e-2
TRAIN_TEST_SPLIT_RATIO = 0.8
COMPILER_LIST = [
"deepsparse",
"tensor_rt",
"torchscript",
"onnxruntime",
"tflite",
"xla",
"tvm",
"openvino",
"bladedisc",
"intel_neural_compressor",
"torch_neuron",
"torch_xla",
"torch_dynamo",
"faster_transformer",
]
COMPRESSOR_LIST = [
"sparseml",
"intel_pruning",
]
ONNX_MODULES = ["openvino", "tensor_rt"]
TORCH_MODULES = [
"deepsparse",
"intel_neural_compressor",
"tensor_rt",
"torch_tensor_rt",
"faster_transformer",
]
TENSORFLOW_MODULES = []
HUGGING_FACE_MODULES = []
DIFFUSERS_MODULES = []
LIBRARIES_GPU = ["tensor_rt", "torch_tensor_rt", "faster_transformer"]
MIN_NUMBER = 1e-4
DEFAULT_METRIC_DROP_THS = 1e-2
ACTIVATION_METRIC_DROP_THS = 2e-2
================================================
FILE: optimization/nebullvm/nebullvm/core/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/core/models.py
================================================
import subprocess
from dataclasses import dataclass
from enum import Enum
from functools import cached_property
from typing import Optional, Any, Union, Tuple, List, Dict
import numpy as np
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
class DeepLearningFramework(Enum):
    """Frameworks known to the library, keyed by a lowercase string value."""

    PYTORCH = "torch"
    TENSORFLOW = "tensorflow"
    NUMPY = "numpy"
class QuantizationType(Enum):
    """Quantization modes; each value is the member name as a string."""

    DYNAMIC = "DYNAMIC"
    STATIC = "STATIC"
    HALF = "HALF"
class Status(Enum):
    """Two-state outcome flag (``OK`` / ``ERROR``)."""

    OK = "OK"
    ERROR = "ERROR"
class DeviceType(Enum):
    """Kinds of device, keyed by a lowercase string value."""

    CPU = "cpu"
    GPU = "gpu"
    TPU = "tpu"
    NEURON = "neuron"
class DataType(str, Enum):
    """Framework-independent tensor data types.

    Members are also ``str`` instances. Conversions to and from the
    framework-specific dtype objects go through
    ``FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT``.
    """

    FLOAT16 = "float16"
    FLOAT32 = "float32"
    INT32 = "int32"
    INT64 = "int64"

    @classmethod
    def from_framework_format(
        cls, dtype: Union[torch.dtype, tf.dtypes.DType, np.dtype]
    ):
        """Return the ``DataType`` matching a torch, tensorflow or numpy dtype.

        Anything that is neither a torch nor a tensorflow dtype is treated
        as a numpy dtype and looked up through its ``.type`` attribute.

        Raises:
            KeyError: If the dtype has no entry in the conversion table.
        """
        if isinstance(dtype, torch.dtype):
            return FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT["torch"][dtype]
        if isinstance(dtype, tf.dtypes.DType):
            return FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT["tensorflow"][dtype]
        return FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT["numpy"][dtype.type]

    def _to_framework_format(self, framework):
        """Reverse lookup: first framework dtype mapped to this member.

        Returns ``None`` when the framework table has no matching entry.
        """
        table = FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT[framework]
        return next(
            (native for native, data_type in table.items() if data_type == self),
            None,
        )

    def to_torch_format(self):
        """Return the torch dtype for this member, or ``None`` if unmapped."""
        return self._to_framework_format("torch")

    def to_tf_format(self):
        """Return the tensorflow dtype for this member, or ``None``."""
        return self._to_framework_format("tensorflow")

    def to_numpy_format(self):
        """Return the numpy scalar type for this member, or ``None``."""
        return self._to_framework_format("numpy")
class ModelCompiler(Enum):
    """Identifiers of the supported compilers.

    Some compilers appear several times: a generic entry plus
    framework-specific variants whose value is prefixed with ``onnx_`` or
    ``torch_`` (e.g. ``tensor_rt``, ``onnx_tensor_rt``, ``torch_tensor_rt``).
    """

    TENSOR_RT = "tensor_rt"
    TENSOR_RT_ONNX = "onnx_tensor_rt"
    TENSOR_RT_TORCH = "torch_tensor_rt"
    OPENVINO = "openvino"
    APACHE_TVM = "tvm"
    APACHE_TVM_TORCH = "torch_tvm"
    APACHE_TVM_ONNX = "onnx_tvm"
    ONNX_RUNTIME = "onnxruntime"
    DEEPSPARSE = "deepsparse"
    TORCHSCRIPT = "torchscript"
    XLA = "xla"
    TFLITE = "tflite"
    BLADEDISC = "bladedisc"
    INTEL_NEURAL_COMPRESSOR = "intel_neural_compressor"
    TORCH_NEURON = "torch_neuron"
    TORCH_XLA = "torch_xla"
    TORCH_DYNAMO = "torch_dynamo"
    FASTER_TRANSFORMER = "faster_transformer"
class ModelCompressor(Enum):
    """Identifiers of the supported model compressors."""

    SPARSE_ML = "sparseml"
    INTEL_PRUNING = "intel_pruning"
class OptimizationTime(Enum):
    """Optimization time modes (``constrained`` / ``unconstrained``)."""

    CONSTRAINED = "constrained"
    UNCONSTRAINED = "unconstrained"
@dataclass
class HardwareSetup:
    """Description of the machine a model runs on."""

    cpu: str
    operating_system: str
    memory_gb: int
    # Optional accelerator description; None when not provided.
    accelerator: Optional[str] = None
@dataclass
class OptimizedModel:
    """An optimized model together with its measured characteristics."""

    # The optimized, runnable model object (typed as Any here).
    inference_learner: Any
    latency_seconds: float
    metric_drop: float
    technique: str
    compiler: str
    throughput: float
    size_mb: float
@dataclass
class OriginalModel:
    """The un-optimized model together with its measured characteristics."""

    model: Any
    latency_seconds: float
    throughput: float
    name: str
    size_mb: float
    framework: DeepLearningFramework
@dataclass
class BenchmarkOriginalModelResult:
    """The result of the LatencyOriginalModelMeasureOp"""

    # Measured latency of the original model, in seconds.
    latency_seconds: float
    # Outputs collected from the original model (typed as Any here).
    model_outputs: Any
@dataclass
class OptimizeInferenceResult:
    """The result of the OptimizeInferenceOp"""

    original_model: OriginalModel
    hardware_setup: HardwareSetup
    # None when no optimized model is available.
    optimized_model: Optional[OptimizedModel]

    @property
    def metric_drop(self) -> Optional[float]:
        """Metric drop of the optimized model, or None without one."""
        optimized = self.optimized_model
        return None if optimized is None else optimized.metric_drop

    @cached_property
    def latency_improvement_rate(self) -> Optional[float]:
        """Original latency divided by optimized latency.

        Returns None without an optimized model and -1 when the optimized
        latency is zero (the ratio is undefined).
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        optimized_latency = optimized.latency_seconds
        if optimized_latency == 0:
            return -1
        return self.original_model.latency_seconds / optimized_latency

    @cached_property
    def throughput_improvement_rate(self) -> Optional[float]:
        """Optimized throughput divided by original throughput.

        Returns None without an optimized model and -1 when the original
        throughput is zero (the ratio is undefined).
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        original_throughput = self.original_model.throughput
        if original_throughput == 0:
            return -1
        return optimized.throughput / original_throughput

    @cached_property
    def size_improvement_rate(self) -> Optional[float]:
        """Original size divided by optimized size.

        Returns None without an optimized model and 1 when the optimized
        size is zero.
        """
        optimized = self.optimized_model
        if optimized is None:
            return None
        optimized_size = optimized.size_mb
        if optimized_size == 0:
            return 1
        return self.original_model.size_mb / optimized_size
class InputInfo:
    """Class for storing all the information needed for creating an input
    tensor for AI models.

    Any extra keyword argument is stored as an instance attribute. Reading
    an attribute that was never set yields ``None`` instead of raising, so
    optional fields such as ``min_value`` / ``max_value`` can be accessed
    unconditionally.

    Attributes:
        size (tuple): Tuple with the input size (batch size excluded)
        dtype (str): Data type of the tensor.
        min_value (int or float, optional): Min value the tensor elements can
            have.
        max_value (int or float, optional): Max value the tensor elements can
            have.
    """

    def __init__(self, size: Tuple[int, ...], dtype: str, **extra_info):
        self.dtype = DataType(dtype)
        self.size = size
        self.__dict__.update(extra_info)

    def __getattr__(self, item):
        """Return ``None`` for unset attributes, except dunder names.

        ``__getattr__`` only runs after normal lookup has failed. Dunder
        names are protocol probes (``copy``, ``pickle``, ...): answering
        them with ``None`` makes ``hasattr`` always true and can break
        copying/pickling on interpreters where ``object`` does not define
        the probed hook, so they are reported as missing.

        Raises:
            AttributeError: If ``item`` is an unset dunder name.
        """
        if item.startswith("__") and item.endswith("__"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {item!r}"
            )
        return self.__dict__.get(item)

    def dict(self):
        """Return the public (non-underscore) attributes as a new dict."""
        return {
            k: v for k, v in self.__dict__.items() if not k.startswith("_")
        }
@dataclass
class DynamicAxisInfo:
    """Dynamic-axis tags for the inputs and outputs of a model.

    ``inputs`` and ``outputs`` each hold one dict per tensor, mapping an
    axis index to the tag of that dynamic axis.
    """

    inputs: List[Dict[int, str]]
    outputs: List[Dict[int, str]]

    def dict(self):
        """Return the public (non-underscore) attributes as a new dict."""
        public = {}
        for name, value in self.__dict__.items():
            if not name.startswith("_"):
                public[name] = value
        return public

    def retrieve_output_dim(
        self,
        input_shapes: List[Tuple[int, ...]],
        output_idx: int,
        dimension_idx: int,
        default_output_value: int,
    ) -> int:
        """Resolve a dynamic output dimension from the actual input shapes.

        The tag of the requested output axis is searched among the input
        axis tags; the size of the first matching input axis is returned.
        An input tag matches when it equals the output tag, or when it is a
        dict whose ``"name"`` entry equals the output tag.

        Args:
            input_shapes: Actual shapes of the inputs, in input order.
            output_idx: Index of the output tensor.
            dimension_idx: Axis of that output to resolve.
            default_output_value: Value returned when no input axis matches.

        Returns:
            The matching input dimension, or ``default_output_value``.
        """
        wanted_tag = self.outputs[output_idx][dimension_idx]
        for axis_tags, shape in zip(self.inputs, input_shapes):
            for axis, tag in axis_tags.items():
                same_tag = tag == wanted_tag
                same_name = (
                    isinstance(tag, dict) and tag.get("name") == wanted_tag
                )
                if same_name or same_tag:
                    return shape[axis]
        return default_output_value
@dataclass
class ModelParams:
    """Batch size plus input/output descriptions of a model.

    After construction, plain dicts are promoted to their dedicated types:
    ``dynamic_info`` to ``DynamicAxisInfo``, each ``input_infos`` entry to
    ``InputInfo``, and each ``output_types`` entry to ``DataType``.
    """

    batch_size: int
    input_infos: List[InputInfo]
    output_sizes: List[Tuple[int, ...]]
    output_types: List[DataType]
    dynamic_info: Union[DynamicAxisInfo, Dict] = None

    def __post_init__(self):
        if isinstance(self.dynamic_info, dict):
            self.dynamic_info = DynamicAxisInfo(**self.dynamic_info)
        normalized_inputs = []
        for info in self.input_infos:
            if isinstance(info, dict):
                info = InputInfo(**info)
            normalized_inputs.append(info)
        self.input_infos = normalized_inputs
        self.output_types = [DataType(kind) for kind in self.output_types]

    def dict(self):
        """Return the public attributes as a dict.

        Lists are walked recursively, and any value exposing a ``dict``
        attribute is replaced by the result of calling it.
        """

        def to_plain(value):
            if isinstance(value, list):
                return [to_plain(item) for item in value]
            if hasattr(value, "dict"):
                return value.dict()
            return value

        plain = {}
        for name, value in self.__dict__.items():
            if name.startswith("_"):
                continue
            plain[name] = to_plain(value)
        return plain

    @property
    def input_sizes(self):
        """Lazily yield the ``size`` of every entry in ``input_infos``."""
        yield from (info.size for info in self.input_infos)
class Device:
    """A compute device identified by its type and index.

    Attributes:
        type (DeviceType): Kind of device.
        idx (int): Index of the device among devices of the same type.
    """

    def __init__(self, type: DeviceType, idx: int = 0):
        self.type = type
        self.idx = idx

    @classmethod
    def from_str(cls, string: str) -> "Device":
        """Build a device from a string such as ``"cuda:1"`` or ``"tpu"``.

        Strings starting with ``cuda`` or ``gpu`` map to a GPU, strings
        starting with ``tpu`` map to a TPU, and anything else maps to the
        CPU. The index is read after the ``:`` when present, else it is 0.

        Raises:
            ValueError: If the text after ``:`` is not an integer.
        """
        if string.startswith(("cuda", "gpu")):
            device_type = DeviceType.GPU
        elif string.startswith("tpu"):
            device_type = DeviceType.TPU
        else:
            return cls(DeviceType.CPU)
        idx = int(string.split(":")[1]) if ":" in string else 0
        return cls(device_type, idx)

    def to_torch_format(self) -> str:
        """Return the torch device string (``cuda:N``, ``xla:N`` or ``cpu``)."""
        if self.type is DeviceType.GPU:
            return f"cuda:{self.idx}"
        elif self.type is DeviceType.TPU:
            return f"xla:{self.idx}"
        return "cpu"

    def to_tf_format(self) -> str:
        """Return the tensorflow device string (``GPU:N`` or ``CPU``)."""
        if self.type is DeviceType.GPU:
            return f"GPU:{self.idx}"
        return "CPU"

    def _query_gpu_memory(self, field: str) -> int:
        """Return one nvidia-smi memory field for this device, as reported.

        nvidia-smi prints one value per GPU; the entry at ``self.idx`` is
        selected. The command is passed as an argument list, so no shell is
        involved.
        """
        output = subprocess.check_output(
            [
                "nvidia-smi",
                f"--query-gpu={field}",
                "--format=csv,nounits,noheader",
            ]
        )
        return int(output.decode("utf-8").split()[self.idx])

    def get_total_memory(self) -> int:
        """Return the total memory of this GPU in bytes.

        Raises:
            Exception: If the device is not a GPU, or if nvidia-smi could
                not be queried (the original error is chained as cause).
        """
        if self.type is not DeviceType.GPU:
            raise Exception("Device type must be GPU")
        try:
            # The reported value is scaled by 1024 * 1024 to get bytes.
            return self._query_gpu_memory("memory.total") * 1024 * 1024
        except Exception as err:
            raise Exception(
                "Unable to get total memory of device. "
                "Please make sure nvidia-smi is available."
            ) from err

    def get_free_memory(self) -> int:
        """Return the free memory of this GPU in bytes.

        Raises:
            Exception: If the device is not a GPU, or if nvidia-smi could
                not be queried (the original error is chained as cause).
        """
        if self.type is not DeviceType.GPU:
            raise Exception("Device type must be GPU")
        try:
            # The reported value is scaled by 1024 * 1024 to get bytes.
            return self._query_gpu_memory("memory.free") * 1024 * 1024
        except Exception as err:
            raise Exception(
                "Unable to get free memory of device. "
                "Please make sure nvidia-smi is available."
            ) from err
FRAMEWORK_TO_DATA_TYPE_CONVERSION_DICT = {
"torch": {
torch.float16: DataType.FLOAT16,
torch.float32: DataType.FLOAT32,
torch.int32: DataType.INT32,
torch.int64: DataType.INT64,
},
"tensorflow": {
tf.float16: DataType.FLOAT16,
tf.float32: DataType.FLOAT32,
tf.int32: DataType.INT32,
tf.int64: DataType.INT64,
},
"numpy": {
np.float16: DataType.FLOAT16,
np.float32: DataType.FLOAT32,
np.int32: DataType.INT32,
np.int64: DataType.INT64,
},
}
================================================
FILE: optimization/nebullvm/nebullvm/core/tests/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/core/tests/test_models.py
================================================
import unittest
from unittest.mock import MagicMock
from nebullvm.core.models import OptimizeInferenceResult
class TestOptimizeInferenceResult(unittest.TestCase):
    """Tests for the derived metrics of ``OptimizeInferenceResult``."""

    @staticmethod
    def _make_result(original_attrs=None, optimized_attrs=None):
        """Build a result backed by mocks.

        ``optimized_attrs=None`` means "no optimized model"; pass a dict
        (possibly empty) to get a mocked optimized model.
        """
        if optimized_attrs is None:
            optimized_model = None
        else:
            optimized_model = MagicMock(**optimized_attrs)
        return OptimizeInferenceResult(
            original_model=MagicMock(**(original_attrs or {})),
            hardware_setup=MagicMock(),
            optimized_model=optimized_model,
        )

    # -- latency ------------------------------------------------------------

    def test_latency_improvement_rate__optimized_model_is_none(self):
        res = self._make_result()
        self.assertIsNone(res.latency_improvement_rate)

    def test_latency_improvement_rate__optimized_latency_is_zero(self):
        res = self._make_result(
            {"latency_seconds": 1.0}, {"latency_seconds": 0.0}
        )
        self.assertEqual(-1, res.latency_improvement_rate)

    def test_latency_improvement_rate__original_latency_is_zero(self):
        res = self._make_result(
            {"latency_seconds": 0.0}, {"latency_seconds": 1.0}
        )
        self.assertEqual(0, res.latency_improvement_rate)

    def test_latency_improvement_rate__rate_gt_1(self):
        res = self._make_result(
            {"latency_seconds": 1.0}, {"latency_seconds": 0.5}
        )
        self.assertGreater(res.latency_improvement_rate, 1)

    def test_latency_improvement_rate__rate_lt_1(self):
        res = self._make_result(
            {"latency_seconds": 0.5}, {"latency_seconds": 1.0}
        )
        self.assertLess(res.latency_improvement_rate, 1)

    # -- throughput ---------------------------------------------------------

    def test_th_improvement_rate__optimized_model_is_none(self):
        res = self._make_result()
        self.assertIsNone(res.throughput_improvement_rate)

    def test_th_improvement_rate__optimized_th_is_zero(self):
        res = self._make_result({"throughput": 1.0}, {"throughput": 0.0})
        self.assertEqual(0, res.throughput_improvement_rate)

    def test_th_improvement_rate__original_th_is_zero(self):
        res = self._make_result({"throughput": 0.0}, {"throughput": 1.0})
        self.assertEqual(-1, res.throughput_improvement_rate)

    def test_th_improvement_rate__rate_gt_1(self):
        res = self._make_result({"throughput": 0.5}, {"throughput": 1})
        self.assertGreater(res.throughput_improvement_rate, 1)

    def test_th_improvement_rate__rate_lt_1(self):
        res = self._make_result({"throughput": 1.0}, {"throughput": 0.5})
        self.assertLess(res.throughput_improvement_rate, 1)

    # -- size ---------------------------------------------------------------

    def test_size_improvement_rate__optimized_model_is_none(self):
        res = self._make_result()
        self.assertIsNone(res.size_improvement_rate)

    def test_size_improvement_rate__optimized_size_is_zero(self):
        res = self._make_result({"size_mb": 1.0}, {"size_mb": 0.0})
        self.assertEqual(1, res.size_improvement_rate)

    def test_size_improvement_rate__original_size_is_zero(self):
        res = self._make_result({"size_mb": 0.0}, {"size_mb": 1.0})
        self.assertEqual(0, res.size_improvement_rate)

    def test_size_improvement_rate__rate_gt_1(self):
        res = self._make_result({"size_mb": 1}, {"size_mb": 0.5})
        self.assertGreater(res.size_improvement_rate, 1)

    def test_size_improvement_rate__rate_lt_1(self):
        res = self._make_result({"size_mb": 0.5}, {"size_mb": 1})
        self.assertLess(res.size_improvement_rate, 1)

    # -- metric drop --------------------------------------------------------

    def test_metric_drop__optimized_model_is_none(self):
        res = self._make_result()
        self.assertIsNone(res.metric_drop)

    def test_metric_drop(self):
        metric_drop = 0.1
        res = self._make_result(optimized_attrs={"metric_drop": metric_drop})
        self.assertEqual(metric_drop, res.metric_drop)
================================================
FILE: optimization/nebullvm/nebullvm/core/types.py
================================================
from typing import Union, Iterable, Sequence
from nebullvm.tools.data import DataManager
InputData = Union[Iterable, Sequence, DataManager]
================================================
FILE: optimization/nebullvm/nebullvm/installers/__init__.py
================================================
# flake8: noqa
__all__ = [k for k in globals().keys() if not k.startswith("_")]
================================================
FILE: optimization/nebullvm/nebullvm/installers/auto_installer.py
================================================
import argparse
from typing import List, Union
from loguru import logger
from nebullvm.config import (
ONNX_MODULES,
TENSORFLOW_MODULES,
TORCH_MODULES,
HUGGING_FACE_MODULES,
DIFFUSERS_MODULES,
)
from nebullvm.installers.installers import (
ONNXInstaller,
PytorchInstaller,
TensorflowInstaller,
HuggingFaceInstaller,
DiffusersInstaller,
)
SUPPORTED_BACKENDS_DICT = {
"torch": ["onnx"],
"tensorflow": ["onnx"],
"huggingface": ["torch", "tensorflow", "onnx"],
"diffusers": ["torch", "onnx"],
"onnx": [],
}
INSTALLERS = {
"onnx": ONNXInstaller,
"torch": PytorchInstaller,
"tensorflow": TensorflowInstaller,
"huggingface": HuggingFaceInstaller,
"diffusers": DiffusersInstaller,
}
MODULES = {
"onnx": ONNX_MODULES,
"torch": TORCH_MODULES,
"tensorflow": TENSORFLOW_MODULES,
"huggingface": HUGGING_FACE_MODULES,
"diffusers": DIFFUSERS_MODULES,
}
def select_frameworks_to_install(
    include_frameworks: Union[List[str], str],
    include_backends: Union[List[str], str],
) -> List[str]:
    """Resolve which frameworks (base ones plus backends) must be installed.

    Args:
        include_frameworks: ``"all"`` or a list of framework names.
            Unsupported names are skipped with a warning.
        include_backends: ``"all"`` or a list of backend names. Only used
            when ``include_frameworks`` is a list; a backend is kept when at
            least one selected framework lists it in
            ``SUPPORTED_BACKENDS_DICT``.

    Returns:
        The sorted, de-duplicated list of frameworks to install.

    Raises:
        ValueError: If ``include_frameworks`` is neither ``"all"`` nor a
            list, if none of the requested frameworks is supported, or if
            ``include_backends`` is neither ``"all"`` nor a list.
    """
    supported_frameworks = list(INSTALLERS.keys())

    if isinstance(include_frameworks, str) and include_frameworks == "all":
        # Every framework is selected, so backends cannot add anything.
        return sorted(set(supported_frameworks))
    if not isinstance(include_frameworks, list):
        raise ValueError("Invalid frameworks list")

    selected = []
    for framework in include_frameworks:
        if framework in supported_frameworks:
            selected.append(framework)
        else:
            logger.warning(f"Framework {framework} not supported")
    if not selected:
        raise ValueError("No supported frameworks selected")

    # Backends are collected separately so the list of selected frameworks
    # is never mutated while it is being iterated.
    backends = []
    if isinstance(include_backends, str) and include_backends == "all":
        for framework in selected:
            backends.extend(SUPPORTED_BACKENDS_DICT[framework])
    elif isinstance(include_backends, list):
        for backend in include_backends:
            if backend not in supported_frameworks:
                logger.warning(f"Backend {backend} not supported")
            elif any(
                backend in SUPPORTED_BACKENDS_DICT[framework]
                for framework in selected
            ):
                backends.append(backend)
            else:
                logger.warning(
                    f"Backend {backend} not supported for selected "
                    f"frameworks"
                )
    else:
        raise ValueError("Invalid backends list")

    return sorted(set(selected) | set(backends))
def select_compilers_to_install(
    include_compilers: Union[List[str], str], framework_list: List[str]
) -> List[str]:
    """Resolve which compilers must be installed for the given frameworks.

    Args:
        include_compilers: ``"all"`` to take every compiler listed in
            ``MODULES`` for the selected frameworks, or an iterable of
            compiler names. Names that are unknown, or that no selected
            framework lists, are skipped with a warning.
        framework_list: Frameworks already selected for installation.

    Returns:
        The sorted, de-duplicated list of compilers to install.
    """
    if isinstance(include_compilers, str) and include_compilers == "all":
        selected = {
            compiler
            for framework, compilers in MODULES.items()
            if framework in framework_list
            for compiler in compilers
        }
        return sorted(selected)

    known_compilers = {
        compiler for compilers in MODULES.values() for compiler in compilers
    }
    selected = set()
    for compiler in include_compilers:
        if compiler not in known_compilers:
            logger.warning(f"Compiler {compiler} not supported")
        elif any(
            compiler in MODULES[framework] for framework in framework_list
        ):
            selected.add(compiler)
        else:
            logger.warning(
                f"Compiler {compiler} not supported for selected "
                f"frameworks"
            )
    return sorted(selected)
def auto_install_libraries(
    include_frameworks: Union[List[str], str] = "all",
    include_backends: Union[List[str], str] = "all",
    include_compilers: Union[List[str], str] = "all",
):
    """Install the selected frameworks, their dependencies and compilers.

    The frameworks and compilers are first resolved with
    ``select_frameworks_to_install`` and ``select_compilers_to_install``.
    Each framework's installer then installs the framework itself when
    ``check_framework()`` reports it missing, followed by its dependencies
    and the selected compilers.
    """
    logger.info("Running auto install of nebullvm dependencies")

    frameworks = select_frameworks_to_install(
        include_frameworks, include_backends
    )
    compilers = select_compilers_to_install(include_compilers, frameworks)

    for name in frameworks:
        installer = INSTALLERS[name](MODULES[name])
        already_installed = installer.check_framework()
        if not already_installed:
            installer.install_framework()
        installer.install_dependencies(frameworks)
        installer.install_compilers(compilers)
def main():
    """Command-line entry point: parse the options and run the installer.

    Each option accepts one or more values and defaults to ``"all"``. A
    lone ``all`` value is forwarded as the string ``"all"``; for
    ``--extra-backends`` a lone ``none`` is forwarded as an empty list.
    """
    parser = argparse.ArgumentParser(
        description="Auto install dl frameworks and dependencies"
    )
    option_specs = (
        ("-f", "--frameworks", "The base dl frameworks to be installed"),
        (
            "-b",
            "--extra-backends",
            "additional dl frameworks to be installed to "
            "gain the optimal speedup",
        ),
        ("-c", "--compilers", "Compilers to be installed"),
    )
    for short_flag, long_flag, help_text in option_specs:
        parser.add_argument(
            short_flag,
            long_flag,
            help=help_text,
            default="all",
            nargs="+",
        )
    options = vars(parser.parse_args())

    # Values are either the untouched default string "all" or a list coming
    # from the command line; a one-element ["all"] collapses to "all".
    frameworks = options["frameworks"]
    framework_list = "all" if frameworks == ["all"] else frameworks

    backends = options["extra_backends"]
    if backends == ["all"]:
        backend_list = "all"
    elif backends == ["none"]:
        backend_list = []
    else:
        backend_list = backends

    compilers = options["compilers"]
    compilers_list = "all" if compilers == ["all"] else compilers

    auto_install_libraries(framework_list, backend_list, compilers_list)
if __name__ == "__main__":
main()
================================================
FILE: optimization/nebullvm/nebullvm/installers/install_bladedisc.sh
================================================
#!/bin/bash
# Clone BladeDISC (if not already present), install bazel and a JDK through
# apt, then build pytorch_blade from source.
#
# Usage: install_bladedisc.sh [true]
#   "true"         run the build script without overriding its CUDA settings
#   anything else  (or no argument) build with CUDA support disabled

# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive

if [ ! -d "BladeDISC" ]
then
    git clone https://github.com/alibaba/BladeDISC.git
fi

cd BladeDISC && git submodule update --init --recursive

# Install bazel. "-y" keeps apt from stopping at its confirmation prompt,
# which would abort the script when it runs without a terminal.
sudo apt install -y apt-transport-https curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list

sudo apt update && sudo apt install -y bazel
sudo apt install -y default-jdk

# "${1:-}" is quoted so a missing argument compares as an empty string
# instead of making the test itself fail.
if [ "${1:-}" != "true" ]
then
    # Build without CUDA; the torch version depends on the platform.
    export TORCH_BLADE_BUILD_WITH_CUDA_SUPPORT=OFF
    if [[ $OSTYPE == "darwin"* ]]
    then
        export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.10.0+aarch64
    else
        export TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.8.1+cpu
    fi
fi

cd pytorch_blade && bash ./scripts/build_pytorch_blade.sh

# Go back to the directory the script was started from.
cd ../..
================================================
FILE: optimization/nebullvm/nebullvm/installers/install_fastertransformer.sh
================================================
#!/bin/bash
# Clone (if needed) and build NVIDIA FasterTransformer with the PyTorch
# bindings enabled. The target GPU architecture is read from the
# COMPUTE_CAPABILITY environment variable, which this script never sets
# itself, so the caller has to export it.
# TODO: check requirements
# https://github.com/NVIDIA/FasterTransformer/blob/main/docs/bert_guide.md
# Requirements
#CMake >= 3.8 for Tensorflow, CMake >= 3.13 for PyTorch
#CUDA 11.0 or newer version
#Python: Only verify on python 3
#Tensorflow: Verify on 1.15, 1.13 and 1.14 should work.
#PyTorch: Verify on 1.8.0, >= 1.5.0 should work.
# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive
# Bail out early on macOS with a non-zero exit status.
if [[ $OSTYPE == "darwin"* ]]
then
echo "MacOS is not supported for FasterTransformer"
exit 1
fi
# Reuse an existing checkout in the current directory when there is one.
if [ ! -d "FasterTransformer" ]
then
git clone --recursive https://github.com/NVIDIA/FasterTransformer FasterTransformer
fi
# TODO: checkout to latest release
# The steps below are chained with &&: the marker file is only created when
# every previous step succeeded, and the script's exit status is that of the
# first failing step.
cd FasterTransformer &&
mkdir -p build &&
cd build &&
cmake -DSM=$COMPUTE_CAPABILITY -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON .. &&
make -j8 &&
touch ../../FasterTransformer_build_success # create a file to indicate that the build was successful
# TODO: enable multi gpu if possible
#-DBUILD_MULTI_GPU=OFF
================================================
FILE: optimization/nebullvm/nebullvm/installers/install_tensor_rt.sh
================================================
#!/bin/bash
# Install TensorRT together with polygraphy.
#
# The pip wheels are tried first. If the installed `tensorrt` module cannot be
# imported, or cannot create a Builder, the script falls back to the
# distribution packages (yum on CentOS-like systems, apt-get everywhere else).

if [[ "$(grep '^ID_LIKE' /etc/os-release)" == *"centos"* ]]
then
  # Installation for centos type linux distribution
  # Try installation with pip if fails then install from source
  pip3 install --upgrade "setuptools<=65.7.0" pip

  # If cuda version is less than 12.0 then install tensorrt<=8.5.3.1
  if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' -f 1) -lt 12 ]]
  then
    python3 -m pip install --upgrade "tensorrt<=8.5.3.1"
  else
    python3 -m pip install --upgrade "tensorrt<=8.6.1"
  fi

  pip3 install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com

  # Decide on the interpreter's exit status: it is non-zero both when the
  # import fails and when the Builder assertion fails after the version has
  # already been printed.
  if ! python3 -c "import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())"
  then
    # Uninstall previous version (-y: do not wait for a confirmation)
    pip3 uninstall -y nvidia-tensorrt

    # install pre-requisites
    pip3 install numpy
    yum -y update && \
      yum -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \
        libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \
      rm -rf /var/lib/apt/lists/*
  fi
else
  # Try installation with pip if fails then install from source
  pip install --upgrade "setuptools<=65.7.0" pip

  # If cuda version is less than 12.0 then install tensorrt<=8.5.3.1
  if [[ $(nvidia-smi | grep CUDA | awk '{print $9}' | cut -d '.' -f 1) -lt 12 ]]
  then
    python3 -m pip install --upgrade "tensorrt<=8.5.3.1"
  else
    python3 -m pip install --upgrade "tensorrt<=8.6.1"
  fi

  pip install colored polygraphy --extra-index-url https://pypi.ngc.nvidia.com

  # Same exit-status based check as in the CentOS branch.
  if ! python3 -c "import tensorrt; print(tensorrt.__version__); assert tensorrt.Builder(tensorrt.Logger())"
  then
    # Uninstall previous version (-y: do not wait for a confirmation)
    pip uninstall -y nvidia-tensorrt

    # install pre-requisites
    pip install numpy
    apt-get update && \
      apt-get -y install libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev \
        libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer && \
      rm -rf /var/lib/apt/lists/*
  fi
fi
================================================
FILE: optimization/nebullvm/nebullvm/installers/install_tvm.sh
================================================
#!/bin/bash
# Clone (if needed), build and install Apache TVM from source.
#
# Expects CONFIG_PATH in the environment: the path of the config.cmake file
# that is copied into the build directory before running cmake.

# Set non interactive mode for apt-get
export DEBIAN_FRONTEND=noninteractive

if [ ! -d "tvm" ]
then
  git clone --recursive https://github.com/apache/tvm tvm
fi

# Abort if the checkout is missing; otherwise the build steps below would run
# in whatever directory the script was started from.
cd tvm || exit 1
mkdir -p build
# Quoted so that a path containing spaces is copied as a single file.
cp "$CONFIG_PATH" build/
cd build || exit 1
cmake ..
make -j8

# Python-side dependencies of the TVM package
if [[ $OSTYPE == "darwin"* ]]
then
  pip install tornado
  brew install openblas gfortran
  pip install pybind11 cython pythran
  conda install -y scipy
  pip install xgboost decorator
  export MACOSX_DEPLOYMENT_TARGET=10.9
else
  pip3 install decorator attrs tornado psutil xgboost cloudpickle
fi

cd ../python
python3 setup.py install --user

# Return to the directory the script was started from.
cd ../..
================================================
FILE: optimization/nebullvm/nebullvm/installers/install_tvm_prerequisites.sh
================================================
#!/bin/bash
# Install the system packages needed before TVM can be built from source,
# using whichever package manager matches the platform.

# Keep apt-get from asking questions
export DEBIAN_FRONTEND=noninteractive

if [[ $OSTYPE == "darwin"* ]]; then
  # macOS: Homebrew for the toolchain, conda-forge for clang
  brew install gcc git cmake
  #brew install llvm
  conda install -y -c conda-forge clangdev
elif [[ "$(grep '^ID_LIKE' /etc/os-release)" == *"centos"* ]]; then
  # CentOS-like distributions: yum packages, cmake comes as cmake3
  sudo yum update -y && sudo yum install -y gcc gcc-c++ llvm-devel cmake3 git
  if [[ -f "/usr/bin/cmake" ]]; then
    # A cmake binary already exists: register both it and cmake3 as
    # alternatives, giving cmake3 the higher priority (20 vs 10).
    sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 \
      --slave /usr/local/bin/ctest ctest /usr/bin/ctest \
      --slave /usr/local/bin/cpack cpack /usr/bin/cpack \
      --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake \
      --family cmake
    sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 \
      --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 \
      --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 \
      --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 \
      --family cmake
  else
    # No cmake yet: expose cmake3 under the plain name
    sudo ln -s /usr/bin/cmake3 /usr/bin/cmake
  fi
else
  # Everything else is treated as a Debian-like system
  sudo apt-get update && sudo apt-get install -y libpython3.8 gcc libtinfo-dev zlib1g-dev \
    build-essential cmake libedit-dev libxml2-dev llvm-12
fi
================================================
FILE: optimization/nebullvm/nebullvm/installers/installers.py
================================================
import os
import platform
import subprocess
import sys
from abc import ABC
from pathlib import Path
from typing import List
import cpuinfo
from loguru import logger
from nebullvm.config import LIBRARIES_GPU
from nebullvm.operations.optimizations.compilers.utils import (
deepsparse_is_available,
get_faster_transformer_repo_path,
intel_neural_compressor_is_available,
openvino_is_available,
tensorrt_is_available,
torch_tensorrt_is_available,
)
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.utils import check_module_version, gpu_is_available
def get_cpu_arch():
    """Classify the host CPU as ``"x86"`` or ``"arm"``.

    The architecture string reported by ``cpuinfo`` is lower-cased; any value
    that does not contain ``"x86"`` is reported as ``"arm"``.
    """
    reported_arch = cpuinfo.get_cpu_info()["arch"].lower()
    return "x86" if "x86" in reported_arch else "arm"
def _get_os():
    """Return the operating system name given by ``platform.system()``."""
    os_name = platform.system()
    return os_name
def install_tvm(
    working_dir: str = None,
):
    """Helper function for installing ApacheTVM.

    This function needs some prerequisites for running, as a valid `git`
    installation and having MacOS or a Linux-distribution as OS.

    Args:
        working_dir (str, optional): The directory where the tvm repo will be
            cloned and installed. Defaults to the user's home directory.

    Returns:
        bool: True if ``tvm`` can be imported once the installation scripts
            have run, False otherwise.
    """
    path = Path(__file__).parent
    run_dir = working_dir or Path.home()

    # install pre-requisites
    installation_file_prerequisites = str(
        path / "install_tvm_prerequisites.sh"
    )
    subprocess.run(
        ["bash", installation_file_prerequisites],
        cwd=run_dir,
    )

    installation_file = str(path / "install_tvm.sh")
    # Pick the cmake configuration matching the CPU family, with a "_cuda"
    # suffix when a GPU is available.
    hardware_config = get_cpu_arch()
    if gpu_is_available():
        hardware_config = f"{hardware_config}_cuda"
    # os.environ is unpacked last, so a CONFIG_PATH already present in the
    # environment takes precedence over the bundled configuration.
    env_dict = {
        "CONFIG_PATH": str(
            path / f"tvm_installers/{hardware_config}/config.cmake"
        ),
        **dict(os.environ.copy()),
    }
    subprocess.run(
        ["bash", installation_file],
        cwd=run_dir,
        env=env_dict,
    )

    try:
        import tvm  # noqa F401
    except ImportError:
        # The build did not produce an importable package: report failure,
        # like the other install helpers in this module do.
        return False

    return True
def install_bladedisc():
    """Helper function for installing BladeDisc.

    Runs ``install_bladedisc.sh`` from this package, passing ``"true"`` or
    ``"false"`` depending on whether a GPU is available.

    Returns:
        bool: True if ``torch_blade`` can be imported after the script has
            finished, False otherwise.
    """
    has_cuda = bool(gpu_is_available())

    path = Path(__file__).parent
    installation_file = str(path / "install_bladedisc.sh")
    # subprocess.run blocks until the script has finished. The previous
    # Popen call returned immediately, so the import check below ran before
    # the build had a chance to complete.
    subprocess.run(["bash", installation_file, str(has_cuda).lower()])

    try:
        import torch_blade  # noqa F401
    except ImportError:
        return False

    return True
def install_torch_tensor_rt():
    """Install Torch-TensorRT on a machine with an Nvidia GPU.

    TensorRT itself is installed first when it cannot be imported. When the
    CUDA major version reported by ``nvidia-smi`` is 12 or newer, TensorRT
    8.6 is installed on top.

    Returns:
        bool: False when the PyTorch version is unsupported or
            ``torch_tensorrt`` cannot be imported afterwards, True otherwise.

    Raises:
        RuntimeError: If no GPU is available.
    """
    if not gpu_is_available():
        raise RuntimeError(
            "Torch-TensorRT can run just on Nvidia machines. "
            "No available cuda driver has been found."
        )

    torch_supported = check_module_version(
        torch, min_version="1.12.0", max_version="1.13.1+cu117"
    )
    if not torch_supported:
        logger.warning(
            "Torch-TensorRT can be installed only for "
            "'PyTorch>=1.12, <=1.13.1'. Please update your Pytorch "
            "version accordingly if you want to use Torch-TensorRT."
        )
        return False

    # TensorRT is a prerequisite: install it when it is missing
    try:
        import tensorrt  # noqa F401
    except ImportError:
        install_tensor_rt()

    subprocess.run(
        [
            "pip3",
            "install",
            "torch-tensorrt",
            "--find-links",
            "https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0",
        ]
    )

    # The CUDA version is the last "key: value" cell of the third line of
    # the nvidia-smi output; only its major component is needed.
    smi_report = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
    header_line = smi_report.split("\n")[2]
    version_text = header_line.split("|")[-2].split(":")[-1].strip()
    cuda_major = int(version_text.split(".")[0])

    if cuda_major >= 12:
        subprocess.run(["pip3", "install", "tensorrt>=8.6.0,<=8.6.1"])

    try:
        import torch_tensorrt  # noqa F401
    except ImportError:
        return False

    return True
def install_tf2onnx():
    """Install ``tf2onnx`` with conda on Apple silicon, with pip elsewhere.

    Returns:
        bool: False if importing ``tf2onnx`` raises ImportError afterwards,
            True otherwise (an AttributeError during the import is ignored).
    """
    on_apple_silicon = _get_os() == "Darwin" and get_cpu_arch() == "arm"
    if on_apple_silicon:
        install_cmds = [["conda", "install", "-y", "tf2onnx>=1.8.4"]]
    else:
        install_cmds = [
            ["pip3", "install", "--user", "protobuf<4,>=3.20.2"],
            ["pip3", "install", "tf2onnx>=1.8.4"],
        ]
    for install_cmd in install_cmds:
        subprocess.run(install_cmd)

    try:
        import tf2onnx  # noqa F401
    except ImportError:
        return False
    except AttributeError:
        # The import can fail with an AttributeError when the installation
        # went wrong; this case is still reported as success.
        pass

    return True
def install_tensor_rt():
    """Install TensorRT by running the bundled ``install_tensor_rt.sh``.

    Returns:
        bool: True if both ``polygraphy`` and ``tensorrt`` can be imported
            after the script has run, False otherwise.

    Raises:
        RuntimeError: If no GPU is available.
    """
    if not gpu_is_available():
        raise RuntimeError(
            "TensorRT can run just on Nvidia machines. "
            "No available cuda driver has been found."
        )

    script = Path(__file__).parent / "install_tensor_rt.sh"
    subprocess.run(["bash", str(script)])

    try:
        import polygraphy  # noqa F401
        import tensorrt  # noqa F401
    except ImportError:
        return False

    return True
def install_openvino(with_optimization: bool = True):
    """Install the OpenVino compiler; only Intel CPUs are accepted.

    Args:
        with_optimization (bool): When True the ``openvino-dev`` package is
            installed, otherwise the plain ``openvino`` package.

    Returns:
        bool: True if the ``openvino.runtime`` classes can be imported after
            the installation, False otherwise.

    Raises:
        RuntimeError: If the CPU brand string does not contain "intel".
    """
    cpu_brand = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" not in cpu_brand:
        raise RuntimeError(
            f"Openvino can run just on Intel machines. "
            f"You are trying to install it on {cpu_brand}"
        )

    package = "openvino-dev" if with_optimization else "openvino"

    # On Windows the package goes into the user site
    pip_cmd = ["pip3", "install"]
    if _get_os() == "Windows":
        pip_cmd.append("--user")
    pip_cmd.append(f"{package}>=2022.1.0")
    subprocess.run(pip_cmd)

    subprocess.run(["pip3", "install", "scipy>=1.7.3"])

    try:
        from openvino.runtime import (  # noqa F401
            CompiledModel,
            Core,
            InferRequest,
            Model,
        )
    except ImportError:
        return False

    return True
def install_onnxruntime():
    """Install onnxruntime (the ``-gpu`` build when a GPU is available).

    conda is used on Apple silicon and pip elsewhere. ``coloredlogs`` and
    ``sympy`` are installed as well, as requirements of
    ``onnxruntime.transformers``.

    Returns:
        bool: True if ``onnxruntime`` can be imported afterwards.
    """
    package = "onnxruntime-gpu" if gpu_is_available() else "onnxruntime"

    if _get_os() == "Darwin" and get_cpu_arch() == "arm":
        installer_prefix = ["conda", "install", "-y"]
    else:
        installer_prefix = ["pip3", "install"]
    subprocess.run([*installer_prefix, package])

    # install requirements for onnxruntime.transformers
    subprocess.run(["pip3", "install", "coloredlogs", "sympy"])

    try:
        import onnxruntime  # noqa F401
    except ImportError:
        return False

    return True
def install_deepsparse():
    """Install DeepSparse with pip.

    Returns:
        bool: True if ``compile_model`` and ``cpu`` can be imported from
            ``deepsparse`` afterwards, False otherwise.

    Raises:
        RuntimeError: On MacOS, on Windows and on ARM CPUs.
    """
    minor = sys.version_info.minor

    if platform.system() in ["Darwin", "Windows"] or get_cpu_arch() == "arm":
        raise RuntimeError(
            "DeepSparse is not supported on this platform. "
            "It won't be installed."
        )

    # Best effort: the venv package for the running interpreter
    try:
        subprocess.run(["apt-get", "install", f"python3.{minor}-venv"])
    except Exception:
        pass

    subprocess.run(["pip3", "install", "deepsparse"])

    try:
        subprocess.run(["pip3", "install", "numpy>=1.22.0,<1.24.0"])
    except Exception:
        # For python 3.7 numpy 1.22.0 is not available
        pass

    try:
        from deepsparse import compile_model, cpu  # noqa F401
    except ImportError:
        return False

    return True
def install_intel_neural_compressor():
    """Install Intel Neural Compressor into the user site with pip.

    Returns:
        bool: True if ``MixedPrecision`` and ``Quantization`` can be imported
            from ``neural_compressor.experimental`` afterwards.

    Raises:
        RuntimeError: If the CPU brand string does not contain "intel".
    """
    cpu_brand = cpuinfo.get_cpu_info()["brand_raw"].lower()
    if "intel" not in cpu_brand:
        raise RuntimeError(
            f"Intel Neural Compressor can run just on Intel machines. "
            f"You are trying to install it on {cpu_brand}"
        )

    subprocess.run(["pip3", "install", "--user", "neural-compressor"])

    try:
        from neural_compressor.experimental import (  # noqa F401
            MixedPrecision,
            Quantization,
        )
    except ImportError:
        return False

    return True
def install_onnx_simplifier():
    """Install ONNX simplifier; the pip install is skipped on ARM CPUs.

    Returns:
        bool: True if ``onnxsim`` can be imported afterwards.
    """
    is_arm = get_cpu_arch() == "arm"
    if not is_arm:
        # Install onnx simplifier
        subprocess.run(["pip3", "install", "onnxsim"])

    try:
        import onnxsim  # noqa F401
    except ImportError:
        return False

    return True
def install_faster_transformer(
    working_dir: str = None,
):
    """Helper function for installing FasterTransformer.
    https://github.com/NVIDIA/FasterTransformer

    This function needs a valid `git` installation and a GPU. The install
    script it runs exits with an error on MacOS.

    Args:
        working_dir (str, optional): The directory where the FasterTransformer
            repo will be cloned and installed. Default: None.
            NOTE(review): this argument is not read in the body; the script
            is run from the parent of ``get_faster_transformer_repo_path()``.

    Returns:
        bool: True if the install script exits with status 0. False if no
            GPU is available, torch cannot be imported, the compute
            capability is malformed, or the script fails.
    """
    if not gpu_is_available():
        return False
    path = Path(__file__).parent

    # install faster transformer
    try:
        import torch

        compute_capability = torch.cuda.get_device_capability()
    except ImportError:
        return False
    # Explicit check instead of an assert, which is stripped under
    # ``python -O``: a (major, minor) pair is expected.
    if len(compute_capability) != 2:
        return False

    major, minor = compute_capability
    installation_file = str(path / "install_fastertransformer.sh")
    # os.environ is unpacked last, so a COMPUTE_CAPABILITY already present in
    # the environment takes precedence over the detected one.
    env_dict = {
        "COMPUTE_CAPABILITY": f"{major}{minor}",
        **dict(os.environ.copy()),
    }
    result = subprocess.run(
        ["bash", installation_file],
        cwd=get_faster_transformer_repo_path().parent,
        env=env_dict,
    )

    # check result
    return result.returncode == 0
class BaseInstaller(ABC):
    """Base class of the framework installers.

    Holds the names of the compilers associated with a framework and knows
    how to install them; the framework-specific static methods must be
    provided by subclasses.
    """

    def __init__(self, module_list: List[str]):
        self.modules = module_list

    def install_compilers(
        self,
        include_libraries: List[str],
    ):
        """Install every compiler in ``self.modules`` that is not filtered out.

        A compiler is skipped when ``include_libraries`` is a list that does
        not contain it, or when it is listed in ``LIBRARIES_GPU`` and no GPU
        is available. Any exception raised while checking or installing a
        compiler is treated as a failed installation and only logged as a
        warning.

        Args:
            include_libraries: List of compiler names to install; a
                non-list value disables the name filter.
        """
        for library in self.modules:
            if (
                isinstance(include_libraries, List)
                and library not in include_libraries
            ):
                continue
            if not gpu_is_available() and library in LIBRARIES_GPU:
                continue

            logger.info(f"Trying to install {library} on the platform...")

            try:
                already_available = COMPILERS_AVAILABLE[library]()
                if already_available:
                    install_ok = True
                else:
                    install_ok = COMPILER_INSTALLERS[library]()
            except Exception:
                install_ok = False

            if install_ok:
                logger.info(f"{library} installed successfully!")
            else:
                logger.warning(
                    f"Unable to install {library} on this platform. "
                    f"The compiler will be skipped. "
                )

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install the framework's extra dependencies (subclass hook)."""
        raise NotImplementedError

    @staticmethod
    def check_framework():
        """Report whether the framework is usable (subclass hook)."""
        raise NotImplementedError

    @staticmethod
    def install_framework():
        """Install the framework itself (subclass hook)."""
        raise NotImplementedError
class PytorchInstaller(BaseInstaller):
    """Installer for PyTorch."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Do nothing: no extra dependencies are installed for PyTorch."""
        return None

    @staticmethod
    def check_framework():
        """Check that PyTorch is importable.

        An unsupported version only triggers a warning.

        Returns:
            bool: Always True when the import succeeds.

        Raises:
            ImportError: If ``torch`` cannot be imported.
        """
        try:
            import torch  # noqa F401
        except ImportError:
            raise ImportError(
                "No PyTorch found in your python environment. Please install "
                "it from https://pytorch.org/get-started/locally/."
            )

        version_ok = check_module_version(
            torch, min_version="1.12.0", max_version="2.0.1+cu118"
        )
        if not version_ok:
            logger.warning(
                "PyTorch version is not supported. Please install "
                "PyTorch >= 1.12.0 and <= 2.0.1."
            )

        return True

    @staticmethod
    def install_framework():
        """Install PyTorch with pip and report whether it can be imported."""
        subprocess.run(["pip3", "install", "torch>=1.12.0, <=2.0.1"])

        try:
            import torch  # noqa F401
        except ImportError:
            return False

        return True
class TensorflowInstaller(BaseInstaller):
    """Installer for TensorFlow."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install ``tf2onnx`` when ONNX is among the requested frameworks.

        Args:
            include_framework: Names of the frameworks being installed.
        """
        if "onnx" in include_framework:
            install_tf2onnx()

    @staticmethod
    def check_framework():
        """Check that a supported TensorFlow version is importable.

        Returns:
            bool: False if ``tensorflow`` cannot be imported or its version
                is outside 2.7.0 - 2.12.0 (a warning is logged in that case),
                True otherwise.
        """
        try:
            import tensorflow  # noqa F401
        except ImportError:
            return False

        if not check_module_version(
            tensorflow, min_version="2.7.0", max_version="2.12.0"
        ):
            logger.warning(
                "TensorFlow version is not supported. Please install "
                "TensorFlow >= 2.7.0 and <= 2.12.0."
            )
            return False

        return True

    @staticmethod
    def install_framework():
        """Install TensorFlow (conda on Apple silicon, pip elsewhere).

        Returns:
            bool: True if ``tensorflow`` can be imported afterwards.
        """
        if _get_os() == "Darwin" and get_cpu_arch() == "arm":
            cmd = [
                "conda",
                "install",
                "-y",
                # The upper bound was missing its "<=" operator, which left
                # the spec malformed; it now matches the pip requirement.
                "tensorflow>=2.7.0,<=2.12.0",
                "numpy<1.24",
            ]
            subprocess.run(cmd)
        else:
            cmd = ["pip3", "install", "--user", "tensorflow>=2.7.0, <=2.12.0"]
            subprocess.run(cmd)

        try:
            import tensorflow  # noqa F401
        except ImportError:
            return False

        return True
class ONNXInstaller(BaseInstaller):
    """Installer for ONNX."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Install onnxruntime, onnxmltools and the ONNX simplifier."""
        install_onnxruntime()
        subprocess.run(["pip3", "install", "onnxmltools>=1.11.0"])
        install_onnx_simplifier()

    @staticmethod
    def check_framework():
        """Check that a supported ONNX version is importable.

        Returns:
            bool: False if ``onnx`` cannot be imported or its version is
                outside 1.10.0 - 1.14.0 (a warning is logged in that case),
                True otherwise.
        """
        try:
            import onnx  # noqa F401
        except ImportError:
            return False

        version_ok = check_module_version(
            onnx, min_version="1.10.0", max_version="1.14.0"
        )
        if version_ok:
            return True

        logger.warning(
            "ONNX version is not supported. Please install "
            "ONNX >= 1.10.0 and <= 1.14.0."
        )
        return False

    @staticmethod
    def install_framework():
        """Install ONNX with pip (cmake first on Apple silicon).

        Returns:
            bool: True if ``onnx`` can be imported afterwards.
        """
        if _get_os() == "Darwin" and get_cpu_arch() == "arm":
            subprocess.run(["pip3", "install", "cmake"])

        subprocess.run(["pip3", "install", "onnx>=1.10.0, <=1.14.0"])

        try:
            import onnx  # noqa F401
        except ImportError:
            return False

        return True
class HuggingFaceInstaller(BaseInstaller):
    """Installer for HuggingFace ``transformers``."""

    @staticmethod
    def install_dependencies(include_framework: List[str]):
        """Do nothing: no extra dependencies are installed."""
        return None

    @staticmethod
    def check_framework():
        """Return True if ``transformers`` can be imported."""
        try:
            import transformers  # noqa F401
        except ImportError:
            return False
        else:
            return True

    @staticmethod
    def install_framework():
        """Install ``transformers`` with pip and report if it is importable."""
        subprocess.run(["pip3", "install", "transformers<=4.28.0"])

        try:
            import transformers  # noqa F401
        except ImportError:
            return False
        else:
            return True
# Installer for the `diffusers` framework and the pip packages it relies on.
class DiffusersInstaller(BaseInstaller):
@staticmethod
def install_dependencies(include_framework: List[str]):
# `transformers` (capped at 4.28.0) is always installed first.
cmd = ["pip3", "install", "transformers<=4.28.0"]
subprocess.run(cmd)
# `cuda-python` is only installed when a GPU is available.
# NOTE(review): confirm whether the onnx / onnx_graphsurgeon installs below
# are meant to be GPU-only as well or to run unconditionally.
if gpu_is_available():
cmd = ["pip3", "install", "cuda-python"]
subprocess.run(cmd)
cmd = ["pip3", "install", "onnx>=1.10.0, <=1.14.0"]
subprocess.run(cmd)
# onnx_graphsurgeon is fetched from the NVIDIA package index.
cmd = [
"pip3",
"install",
"onnx_graphsurgeon",
"--index-url",
"https://pypi.ngc.nvidia.com",
]
subprocess.run(cmd)
@staticmethod
def check_framework():
# Returns False when `diffusers` is missing or older than 0.13.0. Only the
# lower bound is checked here, while install_framework also caps the
# version at 0.15.0.
try:
import diffusers # noqa F401
except ImportError:
return False
if not check_module_version(diffusers, min_version="0.13.0"):
return False
return True
@staticmethod
def install_framework():
# Installs diffusers with pip and reports whether it can then be imported.
cmd = ["pip3", "install", "diffusers>=0.13.0, <=0.15.0"]
subprocess.run(cmd)
try:
import diffusers # noqa F401
except ImportError:
return False
return True
# Compiler name -> helper that installs it. BaseInstaller.install_compilers
# looks the helper up by name and calls it without arguments.
COMPILER_INSTALLERS = {
"openvino": install_openvino,
"tensor_rt": install_tensor_rt,
"torch_tensor_rt": install_torch_tensor_rt,
"deepsparse": install_deepsparse,
"intel_neural_compressor": install_intel_neural_compressor,
# "faster_transformer": install_faster_transformer,
}
# Compiler name -> callable reporting whether the compiler is already
# available. It is consulted before the matching installer is run, so both
# tables are expected to share the same keys.
COMPILERS_AVAILABLE = {
"openvino": openvino_is_available,
"tensor_rt": tensorrt_is_available,
"torch_tensor_rt": torch_tensorrt_is_available,
"deepsparse": deepsparse_is_available,
"intel_neural_compressor": intel_neural_compressor_is_available,
# "faster_transformer": faster_transformer_is_available,
}
================================================
FILE: optimization/nebullvm/nebullvm/installers/tests/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/installers/tests/test_install_frameworks.py
================================================
from nebullvm.installers.auto_installer import (
select_frameworks_to_install,
select_compilers_to_install,
)
def test_install_default_option():
    """Asking for all frameworks and all backends selects every framework."""
    selected = select_frameworks_to_install("all", "all")
    expected = ["diffusers", "huggingface", "onnx", "tensorflow", "torch"]
    assert selected == expected
def test_install_torch_full():
    """Torch with all backends also pulls in onnx."""
    selected = select_frameworks_to_install(["torch"], "all")
    assert selected == ["onnx", "torch"]
def test_install_torch_base():
    """Torch without extra backends selects torch only."""
    selected = select_frameworks_to_install(["torch"], [])
    assert selected == ["torch"]
def test_install_tensorflow_full():
    """TensorFlow with all backends also pulls in onnx."""
    selected = select_frameworks_to_install(["tensorflow"], "all")
    assert selected == ["onnx", "tensorflow"]
def test_install_tensorflow_base():
    """TensorFlow without extra backends selects tensorflow only."""
    selected = select_frameworks_to_install(["tensorflow"], [])
    assert selected == ["tensorflow"]
def test_install_onnx_full():
    """ONNX with all backends still selects onnx only."""
    selected = select_frameworks_to_install(["onnx"], "all")
    assert selected == ["onnx"]
def test_install_onnx_base():
    """ONNX without extra backends selects onnx only."""
    selected = select_frameworks_to_install(["onnx"], [])
    assert selected == ["onnx"]
def test_install_diffusers_full():
    """Diffusers with all backends also pulls in onnx and torch."""
    selected = select_frameworks_to_install(["diffusers"], "all")
    assert selected == ["diffusers", "onnx", "torch"]
def test_install_huggingface_full():
    """HuggingFace with all backends pulls in onnx, tensorflow and torch."""
    selected = select_frameworks_to_install(["huggingface"], "all")
    expected = ["huggingface", "onnx", "tensorflow", "torch"]
    assert selected == expected
def test_install_huggingface_full_tf():
    """HuggingFace with the onnx and tensorflow backends."""
    selected = select_frameworks_to_install(
        ["huggingface"], ["onnx", "tensorflow"]
    )
    assert selected == ["huggingface", "onnx", "tensorflow"]
def test_install_huggingface_full_torch():
    """HuggingFace with the onnx and torch backends."""
    selected = select_frameworks_to_install(
        ["huggingface"], ["onnx", "torch"]
    )
    assert selected == ["huggingface", "onnx", "torch"]
def test_install_huggingface_tf():
    """HuggingFace with only the tensorflow backend."""
    selected = select_frameworks_to_install(["huggingface"], ["tensorflow"])
    assert selected == ["huggingface", "tensorflow"]
def test_install_huggingface_torch():
    """HuggingFace with only the torch backend."""
    selected = select_frameworks_to_install(["huggingface"], ["torch"])
    assert selected == ["huggingface", "torch"]
def test_install_huggingface_compilers_all():
    """HuggingFace alone yields no compilers, even when all are requested."""
    selected = select_compilers_to_install("all", ["huggingface"])
    assert selected == []
def test_install_huggingface_torch_compilers_all():
    """HuggingFace plus torch yields the torch compilers."""
    expected = [
        "deepsparse",
        "faster_transformer",
        "intel_neural_compressor",
        "tensor_rt",
        "torch_tensor_rt",
    ]
    selected = select_compilers_to_install("all", ["huggingface", "torch"])
    assert selected == expected
def test_install_torch_compilers_all():
    """Torch with all compilers requested yields the torch compilers."""
    expected = [
        "deepsparse",
        "faster_transformer",
        "intel_neural_compressor",
        "tensor_rt",
        "torch_tensor_rt",
    ]
    selected = select_compilers_to_install("all", ["torch"])
    assert selected == expected
def test_install_torch_compilers_deepsparse():
    """An explicit compiler list is honoured for torch."""
    selected = select_compilers_to_install(["deepsparse"], ["torch"])
    assert selected == ["deepsparse"]
def test_install_torch_compilers_invalid():
    """An unknown compiler name is dropped from the selection."""
    selected = select_compilers_to_install(["best_compiler"], ["torch"])
    assert selected == []
def test_install_torch_onnx_compilers_all():
    """Torch plus onnx adds openvino to the torch compilers."""
    expected = [
        "deepsparse",
        "faster_transformer",
        "intel_neural_compressor",
        "openvino",
        "tensor_rt",
        "torch_tensor_rt",
    ]
    selected = select_compilers_to_install("all", ["torch", "onnx"])
    assert selected == expected
def test_install_tensorflow_compilers_all():
    """TensorFlow alone yields no compilers, even when all are requested."""
    selected = select_compilers_to_install("all", ["tensorflow"])
    assert selected == []
================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/arm/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#--------------------------------------------------------------------
# Template custom cmake configuration for compiling
#
# This file is used to override the build options in build.
# If you want to change the configuration, please use the following
# steps. Assume you are in the root directory. First copy this
# file so that any local changes will be ignored by git
#
# $ mkdir build
# $ cp cmake/config.cmake build
#
# Next modify the corresponding entries, and then compile by
#
# $ cd build
# $ cmake ..
#
# Then build in parallel with 8 threads
#
# $ make -j8
#--------------------------------------------------------------------
#---------------------------------------------
# Backend runtimes.
#---------------------------------------------
# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA OFF)
# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)
# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)
# Whether enable Metal runtime
set(USE_METAL OFF)
# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)
# Whether enable OpenGL runtime
set(USE_OPENGL OFF)
# Whether enable MicroTVM runtime
set(USE_MICRO OFF)
# Whether enable RPC runtime
set(USE_RPC ON)
# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)
# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)
# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)
# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)
# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)
# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)
# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)
# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)
# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
# which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)
#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)
# Whether use BLAS, choices: openblas, atlas, apple
set(USE_BLAS none)
# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL ) if using `pip install mkl`
set(USE_MKL OFF)
# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)
# Whether use OpenMP thread pool, choices: gnu, intel
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)
# Whether use contrib.random in runtime
set(USE_RANDOM ON)
# Whether use NNPack
set(USE_NNPACK OFF)
# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)
# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)
# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)
# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)
# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)
# Whether use cuBLAS
set(USE_CUBLAS OFF)
# Whether use MIOpen
set(USE_MIOPEN OFF)
# Whether use MPS
set(USE_MPS OFF)
# Whether use rocBlas
set(USE_ROCBLAS OFF)
# Whether use contrib sort
set(USE_SORT ON)
# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)
# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
# operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
# runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)
# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
# Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine
# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure
set(USE_ETHOSN_HW OFF)
# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)
# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
# offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
# TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)
# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)
# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)
# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)
# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)
# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)
# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)
# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)
# Whether use Thrust
set(USE_THRUST OFF)
# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)
# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)
# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)
# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
# Path to the Hexagon SDK. The value below is a literal placeholder, not a real
# location; presumably it only matters when Hexagon support is enabled and must
# then point at an actual SDK install -- TODO confirm against the TVM build scripts.
set(USE_HEXAGON_SDK /path/to/sdk)
# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)
# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")
# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)
# Whether enable BNNS runtime
set(USE_BNNS OFF)
# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)
# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)
# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)
# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)
# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
# to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
# be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)
# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)
================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/arm_cuda/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#--------------------------------------------------------------------
# Template custom cmake configuration for compiling
#
# This file is used to override the build options in build.
# If you want to change the configuration, please use the following
# steps. Assume you are in the root directory. First copy this
# file so that any local changes will be ignored by git
#
# $ mkdir build
# $ cp cmake/config.cmake build
#
# Next modify the according entries, and then compile by
#
# $ cd build
# $ cmake ..
#
# Then build in parallel with 8 threads
#
# $ make -j8
#--------------------------------------------------------------------
#---------------------------------------------
# Backend runtimes.
#---------------------------------------------
# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA ON)
# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)
# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)
# Whether enable Metal runtime
set(USE_METAL OFF)
# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)
# Whether enable OpenGL runtime
set(USE_OPENGL OFF)
# Whether enable MicroTVM runtime
set(USE_MICRO OFF)
# Whether enable RPC runtime
set(USE_RPC ON)
# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)
# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)
# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)
# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)
# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)
# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)
# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)
# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)
# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
# which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)
#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)
# Whether to use BLAS, choices: openblas, atlas, apple, none (the value set below)
set(USE_BLAS none)
# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL ) if using `pip install mkl`
set(USE_MKL OFF)
# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)
# Whether to use OpenMP thread pool, choices: gnu, intel, none (the value set below)
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)
# Whether use contrib.random in runtime
set(USE_RANDOM ON)
# Whether use NNPack
set(USE_NNPACK OFF)
# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)
# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)
# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)
# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)
# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)
# Whether use cuBLAS
set(USE_CUBLAS OFF)
# Whether use MIOpen
set(USE_MIOPEN OFF)
# Whether use MPS
set(USE_MPS OFF)
# Whether use rocBlas
set(USE_ROCBLAS OFF)
# Whether use contrib sort
set(USE_SORT ON)
# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)
# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
# operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
# runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)
# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
# Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine
# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure
set(USE_ETHOSN_HW OFF)
# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)
# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
# offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
# TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)
# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)
# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)
# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)
# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)
# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)
# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)
# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)
# Whether use Thrust
set(USE_THRUST OFF)
# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)
# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)
# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)
# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
# Path to the Hexagon SDK. The value below is a literal placeholder, not a real
# location; presumably it only matters when Hexagon support is enabled and must
# then point at an actual SDK install -- TODO confirm against the TVM build scripts.
set(USE_HEXAGON_SDK /path/to/sdk)
# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)
# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")
# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)
# Whether enable BNNS runtime
set(USE_BNNS OFF)
# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)
# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)
# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)
# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)
# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
# to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
# be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)
# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)
================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/x86/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#--------------------------------------------------------------------
# Template custom cmake configuration for compiling
#
# This file is used to override the build options in build.
# If you want to change the configuration, please use the following
# steps. Assume you are in the root directory. First copy this
# file so that any local changes will be ignored by git
#
# $ mkdir build
# $ cp cmake/config.cmake build
#
# Next modify the according entries, and then compile by
#
# $ cd build
# $ cmake ..
#
# Then build in parallel with 8 threads
#
# $ make -j8
#--------------------------------------------------------------------
#---------------------------------------------
# Backend runtimes.
#---------------------------------------------
# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA OFF)
# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)
# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)
# Whether enable Metal runtime
set(USE_METAL OFF)
# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)
# Whether enable OpenGL runtime
set(USE_OPENGL OFF)
# Whether enable MicroTVM runtime
set(USE_MICRO OFF)
# Whether enable RPC runtime
set(USE_RPC ON)
# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)
# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)
# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)
# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)
# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)
# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)
# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)
# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)
# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
# which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)
#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)
# Whether to use BLAS, choices: openblas, atlas, apple, none (the value set below)
set(USE_BLAS none)
# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL ) if using `pip install mkl`
set(USE_MKL OFF)
# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)
# Whether to use OpenMP thread pool, choices: gnu, intel, none (the value set below)
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)
# Whether use contrib.random in runtime
set(USE_RANDOM ON)
# Whether use NNPack
set(USE_NNPACK OFF)
# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)
# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)
# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)
# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)
# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)
# Whether use cuBLAS
set(USE_CUBLAS OFF)
# Whether use MIOpen
set(USE_MIOPEN OFF)
# Whether use MPS
set(USE_MPS OFF)
# Whether use rocBlas
set(USE_ROCBLAS OFF)
# Whether use contrib sort
set(USE_SORT ON)
# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)
# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
# operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
# runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)
# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
# Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine
# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure
set(USE_ETHOSN_HW OFF)
# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)
# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
# offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
# TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)
# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)
# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)
# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)
# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)
# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)
# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)
# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)
# Whether use Thrust
set(USE_THRUST OFF)
# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)
# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)
# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)
# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
# Path to the Hexagon SDK. The value below is a literal placeholder, not a real
# location; presumably it only matters when Hexagon support is enabled and must
# then point at an actual SDK install -- TODO confirm against the TVM build scripts.
set(USE_HEXAGON_SDK /path/to/sdk)
# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)
# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")
# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)
# Whether enable BNNS runtime
set(USE_BNNS OFF)
# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)
# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)
# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)
# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)
# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
# to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
# be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)
# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)
================================================
FILE: optimization/nebullvm/nebullvm/installers/tvm_installers/x86_cuda/config.cmake
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#--------------------------------------------------------------------
# Template custom cmake configuration for compiling
#
# This file is used to override the build options in build.
# If you want to change the configuration, please use the following
# steps. Assume you are in the root directory. First copy this
# file so that any local changes will be ignored by git
#
# $ mkdir build
# $ cp cmake/config.cmake build
#
# Next modify the according entries, and then compile by
#
# $ cd build
# $ cmake ..
#
# Then build in parallel with 8 threads
#
# $ make -j8
#--------------------------------------------------------------------
#---------------------------------------------
# Backend runtimes.
#---------------------------------------------
# Whether enable CUDA during compile,
#
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
set(USE_CUDA ON)
# Whether enable ROCM runtime
#
# Possible values:
# - ON: enable ROCM with cmake's auto search
# - OFF: disable ROCM
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
set(USE_AOCL OFF)
# Whether enable OpenCL runtime
#
# Possible values:
# - ON: enable OpenCL with cmake's auto search
# - OFF: disable OpenCL
# - /path/to/opencl-sdk: use specific path to opencl-sdk
set(USE_OPENCL OFF)
# Whether enable Metal runtime
set(USE_METAL OFF)
# Whether enable Vulkan runtime
#
# Possible values:
# - ON: enable Vulkan with cmake's auto search
# - OFF: disable vulkan
# - /path/to/vulkan-sdk: use specific path to vulkan-sdk
set(USE_VULKAN OFF)
# Whether enable OpenGL runtime
set(USE_OPENGL OFF)
# Whether enable MicroTVM runtime
set(USE_MICRO OFF)
# Whether enable RPC runtime
set(USE_RPC ON)
# Whether to build the C++ RPC server binary
set(USE_CPP_RPC OFF)
# Whether to build the iOS RPC server application
set(USE_IOS_RPC OFF)
# Whether embed stackvm into the runtime
set(USE_STACKVM_RUNTIME OFF)
# Whether enable tiny embedded graph executor.
set(USE_GRAPH_EXECUTOR ON)
# Whether enable tiny graph executor with CUDA Graph
set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)
# Whether enable pipeline executor.
set(USE_PIPELINE_EXECUTOR OFF)
# Whether to enable the profiler for the graph executor and vm
set(USE_PROFILER ON)
# Whether enable microTVM standalone runtime
set(USE_MICRO_STANDALONE_RUNTIME OFF)
# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
# Possible values:
# - ON: enable llvm with cmake's find search
# - OFF: disable llvm, note this will disable CPU codegen
# which is needed for most cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
set(USE_LLVM ON)
#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
#
# set(UNIVERSAL_PATH /path/to/stillwater-universal) for ON
set(USE_BYODT_POSIT OFF)
# Whether to use BLAS, choices: openblas, atlas, apple, none (the value set below)
set(USE_BLAS none)
# Whether to use MKL
# Possible values:
# - ON: Enable MKL
# - /path/to/mkl: mkl root path
# - OFF: Disable MKL
# set(USE_MKL /opt/intel/mkl) for UNIX
# set(USE_MKL ../IntelSWTools/compilers_and_libraries_2018/windows/mkl) for WIN32
# set(USE_MKL ) if using `pip install mkl`
set(USE_MKL OFF)
# Whether use MKLDNN library, choices: ON, OFF, path to mkldnn library
set(USE_MKLDNN OFF)
# Whether to use OpenMP thread pool, choices: gnu, intel, none (the value set below)
# Note: "gnu" uses gomp library, "intel" uses iomp5 library
set(USE_OPENMP none)
# Whether use contrib.random in runtime
set(USE_RANDOM ON)
# Whether use NNPack
set(USE_NNPACK OFF)
# Possible values:
# - ON: enable tflite with cmake's find search
# - OFF: disable tflite
# - /path/to/libtensorflow-lite.a: use specific path to tensorflow lite library
set(USE_TFLITE OFF)
# /path/to/tensorflow: tensorflow root path when use tflite library
set(USE_TENSORFLOW_PATH none)
# Required for full builds with TFLite. Not needed for runtime with TFLite.
# /path/to/flatbuffers: flatbuffers root path when using tflite library
set(USE_FLATBUFFERS_PATH none)
# Possible values:
# - OFF: disable tflite support for edgetpu
# - /path/to/edgetpu: use specific path to edgetpu library
set(USE_EDGETPU OFF)
# Possible values:
# - ON: enable cuDNN with cmake's auto search in CUDA directory
# - OFF: disable cuDNN
# - /path/to/cudnn: use specific path to cuDNN path
set(USE_CUDNN OFF)
# Whether use cuBLAS
set(USE_CUBLAS OFF)
# Whether use MIOpen
set(USE_MIOPEN OFF)
# Whether use MPS
set(USE_MPS OFF)
# Whether use rocBlas
set(USE_ROCBLAS OFF)
# Whether use contrib sort
set(USE_SORT ON)
# Whether use MKL-DNN (DNNL) codegen
set(USE_DNNL_CODEGEN OFF)
# Whether to use Arm Compute Library (ACL) codegen
# We provide 2 separate flags since we cannot build the ACL runtime on x86.
# This is useful for cases where you want to cross-compile a relay graph
# on x86 then run on AArch.
#
# An example of how to use this can be found here: docs/deploy/arm_compute_lib.rst.
#
# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
# operators to Arm Compute Library. OFF/ON
# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL
# runtime. OFF/ON/"path/to/ACL"
set(USE_ARM_COMPUTE_LIB OFF)
set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)
# Whether to build with Arm Ethos-N support
# Possible values:
# - OFF: disable Arm Ethos-N support
# - path/to/arm-ethos-N-stack: use a specific version of the
# Ethos-N driver stack
set(USE_ETHOSN OFF)
# If USE_ETHOSN is enabled, use ETHOSN_HW (ON) if Ethos-N hardware is available on this machine
# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure
set(USE_ETHOSN_HW OFF)
# Whether to build with Arm(R) Ethos(TM)-U NPU codegen support
set(USE_ETHOSU OFF)
# Whether to build with TensorRT codegen or runtime
# Examples are available here: docs/deploy/tensorrt.rst.
#
# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
# offloaded to TensorRT. OFF/ON
# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
# TensorRT library. OFF/ON/"path/to/TensorRT"
set(USE_TENSORRT_CODEGEN OFF)
set(USE_TENSORRT_RUNTIME OFF)
# Whether use VITIS-AI codegen
set(USE_VITIS_AI OFF)
# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)
# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
# - OFF: disable ANTLR
# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
set(USE_ANTLR OFF)
# Whether use Relay debug mode
set(USE_RELAY_DEBUG OFF)
# Whether to build fast VTA simulator driver
set(USE_VTA_FSIM OFF)
# Whether to build cycle-accurate VTA simulator driver
set(USE_VTA_TSIM OFF)
# Whether to build VTA FPGA driver (device side only)
set(USE_VTA_FPGA OFF)
# Whether use Thrust
set(USE_THRUST OFF)
# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)
# Whether to build the PyTorch custom class module
set(USE_PT_TVMDSOOP OFF)
# Whether to use STL's std::unordered_map or TVM's POD compatible Map
set(USE_FALLBACK_STL_MAP OFF)
# Whether to use hexagon device
set(USE_HEXAGON_DEVICE OFF)
set(USE_HEXAGON_SDK /path/to/sdk)
# Whether to build the hexagon launcher
set(USE_HEXAGON_LAUNCHER OFF)
# Hexagon architecture to target when compiling TVM itself (not the target for
# compiling _by_ TVM). This applies to components like the TVM runtime, but is
# also used to select correct include/library paths from the Hexagon SDK when
# building offloading runtime for Android.
# Valid values are v60, v62, v65, v66, v68.
set(USE_HEXAGON_ARCH "v66")
# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)
# Whether enable BNNS runtime
set(USE_BNNS OFF)
# Whether to use libbacktrace
# Libbacktrace provides line and column information on stack traces from errors.
# It is only supported on linux and macOS.
# Possible values:
# - AUTO: auto set according to system information and feasibility
# - ON: enable libbacktrace
# - OFF: disable libbacktrace
set(USE_LIBBACKTRACE AUTO)
# Whether to build static libtvm_runtime.a, the default is to build the dynamic
# version: libtvm_runtime.so.
#
# The static runtime library needs to be linked into executables with the linker
# option --whole-archive (or its equivalent). The reason is that the TVM registry
# mechanism relies on global constructors being executed at program startup.
# Global constructors alone are not sufficient for the linker to consider a
# library member to be used, and some of such library members (object files) may
# not be included in the final executable. This would make the corresponding
# runtime functions to be unavailable to the program.
set(BUILD_STATIC_RUNTIME OFF)
# Caches the build so that building is faster when switching between branches.
# If you switch branches, build and then encounter a linking error, you may
# need to regenerate the build tree through "cmake .." (the cache will
# still provide significant speedups).
# Possible values:
# - AUTO: search for path to ccache, disable if not found.
# - ON: enable ccache by searching for the path to ccache, report an error if not found
# - OFF: disable ccache
# - /path/to/ccache: use specific path to ccache
set(USE_CCACHE AUTO)
# Whether to enable PAPI support in profiling. PAPI provides access to hardware
# counters while profiling.
# Possible values:
# - ON: enable PAPI support. Will search PKG_CONFIG_PATH for a papi.pc
# - OFF: disable PAPI support.
# - /path/to/folder/containing/: Path to folder containing papi.pc.
set(USE_PAPI OFF)
# Whether to use GoogleTest for C++ unit tests. When enabled, the generated
# build file (e.g. Makefile) will have a target "cpptest".
# Possible values:
# - ON: enable GoogleTest. The package `GTest` will be required for cmake
# to succeed.
# - OFF: disable GoogleTest.
# - AUTO: cmake will attempt to find the GTest package, if found GTest will
# be enabled, otherwise it will be disabled.
# Note that cmake will use `find_package` to find GTest. Please use cmake's
# predefined variables to specify the path to the GTest package if needed.
set(USE_GTEST AUTO)
# Enable using CUTLASS as a BYOC backend
# Need to have USE_CUDA=ON
set(USE_CUTLASS OFF)
================================================
FILE: optimization/nebullvm/nebullvm/operations/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/base.py
================================================
import abc
from typing import Dict, Union
from loguru import logger
from nebullvm.core.models import Device, DeviceType
from nebullvm.tools.feedback_collector import FeedbackCollector
from nebullvm.tools.utils import check_device
class Operation(abc.ABC):
    """Abstract base class shared by all operations.

    Each instance owns a state dictionary, the device it targets (CPU by
    default), an execution counter, a logger and an optional feedback
    collector.
    """

    def __init__(self):
        self._state = {}
        self.device = Device(DeviceType.CPU)
        self.execute_count = 0
        self.logger = logger
        self.feedback_collector = None

    def set_feedback_collector(self, feedback_collector: FeedbackCollector):
        """Attach the collector to this operation and its nested operations.

        Every instance attribute that is itself an ``Operation`` receives
        the same collector through a recursive call.
        """
        self.feedback_collector = feedback_collector
        nested_operations = [
            attribute
            for attribute in vars(self).values()
            if isinstance(attribute, Operation)
        ]
        for nested_operation in nested_operations:
            nested_operation.set_feedback_collector(feedback_collector)

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the operation; concrete subclasses must implement this."""
        raise NotImplementedError()

    @property
    def state(self) -> Dict[str, any]:
        """Mutable dictionary holding the state of the operation."""
        return self._state

    def to(self, device: Union[str, Device]):
        """Select the target device and return ``self`` for chaining.

        A string is resolved through ``check_device``; any other value is
        stored unchanged.
        """
        resolved = check_device(device) if isinstance(device, str) else device
        self.device = resolved
        return self
================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/converters.py
================================================
import abc
from pathlib import Path
from typing import Optional, List, Union
from nebullvm.core.models import DeviceType, DeepLearningFramework, ModelParams
from nebullvm.operations.base import Operation
from nebullvm.operations.conversions.pytorch import convert_torch_to_onnx
from nebullvm.operations.conversions.tensorflow import convert_tf_to_onnx
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
class Converter(Operation, abc.ABC):
    """Base operation for converting a model to other formats.

    Subclasses fill ``converted_models`` inside ``execute``.
    """

    ONNX_EXTENSION = ".onnx"
    TORCH_EXTENSION = ".pt"
    TF_EXTENSION = ".pb"
    SUPPORTED_DEVICES = [DeviceType.GPU, DeviceType.CPU]

    def __init__(self, model_name: Optional[str] = None):
        super().__init__()
        # Falls back to "temp" when no (or an empty) name is given.
        self.model_name = model_name or "temp"
        self.model = None
        self.data = None
        self.converted_models = None
        self.model_params = None
        # Overrides the CPU default assigned by Operation.__init__.
        self.device = None

    def set_state(
        self, model: Union[torch.nn.Module, tf.Module, str], data: DataManager
    ):
        """Store the model and the data to convert; returns ``self``."""
        self.model, self.data = model, data
        return self

    def get_result(self) -> List:
        """Return the converted models, leaving out ``None`` entries."""
        return list(
            filter(lambda model: model is not None, self.converted_models)
        )
class PytorchConverter(Converter):
    """Converter starting from a PyTorch model."""

    DEST_FRAMEWORKS = [DeepLearningFramework.NUMPY]

    def execute(
        self,
        save_path: Path,
        model_params: ModelParams,
    ):
        """Convert the stored model into every destination framework.

        The original model is always the first entry of
        ``converted_models``. Nothing else is produced when the current
        device type is not in ``SUPPORTED_DEVICES``.

        Args:
            save_path (Path): Directory where converted files are written.
            model_params (ModelParams): Parameters of the model.
        """
        self.converted_models = [self.model]
        if self.device.type not in self.SUPPORTED_DEVICES:
            return
        for target_framework in self.DEST_FRAMEWORKS:
            if target_framework is not DeepLearningFramework.NUMPY:
                raise NotImplementedError()
            self.onnx_conversion(save_path, model_params)

    def onnx_conversion(self, save_path, model_params):
        """Export the model to ONNX and record the conversion result.

        The value returned by ``convert_torch_to_onnx`` is appended as-is
        to ``converted_models``.
        """
        destination = save_path / f"{self.model_name}{self.ONNX_EXTENSION}"
        conversion_result = convert_torch_to_onnx(
            torch_model=self.model,
            input_data=self.data,
            model_params=model_params,
            output_file_path=destination,
            device=self.device,
        )
        if self.converted_models is None:
            self.converted_models = []
        self.converted_models.append(conversion_result)

    def tensorflow_conversion(self):
        # TODO: Implement conversion from Pytorch to Tensorflow
        raise NotImplementedError()
class TensorflowConverter(Converter):
    """Converter starting from a TensorFlow model."""

    DEST_FRAMEWORKS = [DeepLearningFramework.NUMPY]

    def execute(
        self,
        save_path: Path,
        model_params: ModelParams,
    ):
        """Convert the stored model into every destination framework.

        The original model is always the first entry of
        ``converted_models``. Nothing else is produced when the current
        device type is not in ``SUPPORTED_DEVICES``.

        Args:
            save_path (Path): Directory where converted files are written.
            model_params (ModelParams): Parameters of the model.
        """
        self.converted_models = [self.model]
        if self.device.type not in self.SUPPORTED_DEVICES:
            return
        for target_framework in self.DEST_FRAMEWORKS:
            if target_framework is not DeepLearningFramework.NUMPY:
                raise NotImplementedError()
            self.onnx_conversion(save_path, model_params)

    def onnx_conversion(self, save_path, model_params):
        """Export the model to ONNX and record the conversion result.

        The value returned by ``convert_tf_to_onnx`` is appended as-is to
        ``converted_models``.
        """
        destination = save_path / f"{self.model_name}{self.ONNX_EXTENSION}"
        conversion_result = convert_tf_to_onnx(
            model=self.model,
            model_params=model_params,
            output_file_path=destination,
        )
        if self.converted_models is None:
            self.converted_models = []
        self.converted_models.append(conversion_result)

    def pytorch_conversion(self):
        # TODO: Implement conversion from Tensorflow to Pytorch
        raise NotImplementedError()
class ONNXConverter(Converter):
    """Converter for models that are already stored in ONNX format."""

    DEST_FRAMEWORKS = []

    def execute(self, save_path, model_params):
        """Load the ONNX file at ``self.model`` and re-save it in save_path.

        On success ``converted_models`` holds the path of the saved copy.
        When loading or saving fails an error is logged and
        ``converted_models`` is left empty, so that a path to a file that
        was never written is not handed to later steps.

        Args:
            save_path (Path): Directory where the copy is written.
            model_params: Unused; kept for signature parity with the
                other converters.
        """
        onnx_path = save_path / f"{self.model_name}{self.ONNX_EXTENSION}"
        try:
            model_onnx = onnx.load(str(self.model))
            onnx.save(model_onnx, str(onnx_path))
        except Exception:
            self.logger.error(
                "The provided onnx model path is invalid. Please provide"
                " a valid path to a model in order to use Nebullvm."
            )
            self.converted_models = []
        else:
            # Only report the copy when it has actually been written.
            self.converted_models = [str(onnx_path)]

    def tensorflow_conversion(self):
        # TODO: Implement conversion from ONNX to Tensorflow
        raise NotImplementedError()

    def pytorch_conversion(self):
        # TODO: Implement conversion from ONNX to Pytorch
        raise NotImplementedError()
================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/huggingface.py
================================================
from typing import (
List,
Dict,
Sequence,
Optional,
)
import numpy as np
from nebullvm.core.models import Device
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.huggingface import (
PreTrainedTokenizer,
PreTrainedModel,
)
from nebullvm.tools.huggingface import (
get_output_structure_from_dict,
get_output_structure_from_text,
PyTorchTransformerWrapper,
TensorFlowTransformerWrapper,
)
from nebullvm.tools.utils import is_dict_type
class _HFTextDataset(Sequence):
    """Sequence yielding tokenized mini-batches built from raw texts.

    Each item is a pair ``(inputs, labels)`` where ``inputs`` is a tuple
    with one tokenizer output per keyword and ``labels`` is the matching
    slice of ``ys`` (``None`` when no labels were given).
    """

    def __init__(
        self,
        input_texts: List,
        ys: Optional[List],
        keywords: List[str],
        batch_size: int,
        tokenizer: PreTrainedTokenizer,
        tokenizer_args: Dict,
    ):
        self._input_texts = input_texts
        self._ys = ys
        self._bs = batch_size
        self._keys = keywords
        self._tokenizer = tokenizer
        # Padding needs a pad token: reuse the EOS token when missing.
        if self._tokenizer.pad_token is None:
            self._tokenizer.pad_token = self._tokenizer.eos_token
        # User supplied arguments take precedence over the defaults.
        merged_args = dict(truncation=True, padding=True)
        merged_args.update(tokenizer_args)
        self._tokenizer_args = merged_args

    def __getitem__(self, item: int):
        start = self._bs * item
        if start >= len(self._input_texts):
            raise IndexError
        stop = start + self._bs
        texts = self._input_texts[start:stop]
        labels = None if self._ys is None else self._ys[start:stop]
        encoded = self._tokenizer(texts, **self._tokenizer_args)
        return tuple(encoded[key] for key in self._keys), labels

    def __len__(self):
        # NOTE(review): floor division does not count a trailing partial
        # batch, although __getitem__ can still return it - confirm that
        # this is intended.
        return len(self._input_texts) // self._bs
class _HFDictDataset(Sequence):
def __init__(
self,
input_data: List,
ys: Optional[List],
keywords: List[str],
):
self._input_data = input_data
self._ys = ys
self._keys = keywords
def __getitem__(self, item: int):
pointer = item
if pointer >= len(self._input_data):
raise IndexError
mini_batch = self._input_data[pointer]
if self._ys is not None:
mini_batch_y = self._ys[pointer]
else:
mini_batch_y = None
return (
tuple(self._concatenate(mini_batch, key) for key in self._keys),
mini_batch_y,
)
def __len__(self):
return len(self._input_data)
@staticmethod
def _concatenate(mini_batch, key):
if isinstance(mini_batch[key], torch.Tensor):
return torch.concat([mini_batch[key]])
elif isinstance(mini_batch[key], tf.Tensor):
return tf.concat([mini_batch[key]], 0)
else:
return np.concatenate([mini_batch[key]])
def convert_hf_model(
    model: PreTrainedModel,
    input_data: List,
    device: Device,
    tokenizer: Optional[PreTrainedTokenizer] = None,
    tokenizer_args: Optional[Dict] = None,
    batch_size: int = 1,
    **kwargs,
):
    """Wrap a HuggingFace model and its data for the conversion steps.

    Args:
        model (PreTrainedModel): Model to wrap.
        input_data (List): Either dictionary-like, already tokenized
            samples or raw texts (plain strings, or indexable items whose
            element 0 is the text and element 1 the label).
        device (Device): Device forwarded to the output-structure helpers.
        tokenizer (PreTrainedTokenizer, optional): Needed for raw texts.
        tokenizer_args (Dict, optional): Extra tokenizer arguments.
        batch_size (int): Batch size used for raw texts. Default: 1.
        **kwargs: Accepted but not used here.

    Returns:
        Tuple of the wrapped model, the dataset built from the input
        data, the list of input names of the wrapper, the output
        structure and the output type.
    """
    if is_dict_type(input_data[0]):
        # already tokenized data
        # The labels are popped from (i.e. removed in place from) the
        # samples provided by the caller.
        has_labels = "labels" in input_data[0]
        labels = (
            [sample.pop("labels") for sample in input_data]
            if has_labels
            else None
        )
        input_example = input_data[0]
        output_structure, output_type = get_output_structure_from_dict(
            input_example=input_example,
            model=model,
            device=device,
        )
        input_data = _HFDictDataset(
            input_data=input_data,
            ys=labels,
            keywords=list(input_example.keys()),
        )
    else:
        assert tokenizer is not None, (
            "Tokenizer is needed when passing data in string format. Please "
            "provide the tokenizer as keyword argument."
        )
        tokenizer_args = {} if tokenizer_args is None else tokenizer_args
        if isinstance(input_data[0], str):
            ys = None
        else:
            # Items are (text, label) pairs: split them in two lists.
            ys = [pair[1] for pair in input_data]
            input_data = [pair[0] for pair in input_data]
        output_structure, output_type = get_output_structure_from_text(
            text=input_data[0],
            model=model,
            tokenizer=tokenizer,
            tokenizer_args=tokenizer_args,
            device=device,
        )
        input_example = tokenizer(input_data, **tokenizer_args)
        input_data = _HFTextDataset(
            input_texts=input_data,
            ys=ys,
            keywords=list(input_example.keys()),
            batch_size=batch_size,
            tokenizer=tokenizer,
            tokenizer_args=tokenizer_args,
        )

    wrapper_class = (
        PyTorchTransformerWrapper
        if isinstance(model, torch.nn.Module)
        else TensorFlowTransformerWrapper
    )
    wrapper_model = wrapper_class(
        core_model=model, encoded_input=input_example
    )
    input_names = list(wrapper_model.inputs_types.keys())
    return (
        wrapper_model,
        input_data,
        input_names,
        output_structure,
        output_type,
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/pytorch.py
================================================
from contextlib import nullcontext
from pathlib import Path
from loguru import logger
from nebullvm.config import ONNX_OPSET_VERSION
from nebullvm.core.models import ModelParams, Device, DeviceType, DataType
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.pytorch import (
create_model_inputs_torch,
)
@torch.inference_mode()
def convert_torch_to_onnx(
    torch_model: Module,
    input_data: DataManager,
    model_params: ModelParams,
    output_file_path: Path,
    device: Device,
):
    """Export a PyTorch model to an ONNX file.

    The export is first attempted with model and inputs on CPU. When that
    fails and the target device is a GPU, a second attempt is made on the
    GPU (under autocast when the first output type is float16).

    Args:
        torch_model (Module): Pytorch model.
        input_data (DataManager): Custom data provided by user to be
            used as input for the converter. When None, inputs are
            created from ``model_params.input_infos``.
        model_params (ModelParams): Model Parameters as input sizes and
            dynamic axis information.
        output_file_path (str or Path): Path where storing the output
            ONNX file.
        device (Device): Device where the model will be run.

    Returns:
        ``output_file_path`` when the export succeeds, None otherwise.
    """
    if input_data is None:
        input_tensors = create_model_inputs_torch(model_params.input_infos)
    else:
        input_tensors = list(input_data.get_list(1)[0])

    output_sizes = model_params.output_sizes
    output_types = model_params.output_types

    input_names = [f"input_{i}" for i in range(len(input_tensors))]
    output_names = [f"output_{i}" for i in range(len(output_sizes))]

    dynamic_info = model_params.dynamic_info
    if dynamic_info is not None:
        # This check is needed to enable backward compatibility with
        # previous versions of nebullvm
        first_axis_value = list(dynamic_info.inputs[0].values())[0]
        if isinstance(first_axis_value, str):
            onnx_format_inputs = dynamic_info.inputs
        else:
            onnx_format_inputs = [
                {axis: spec["name"] for (axis, spec) in axes.items()}
                for axes in dynamic_info.inputs
            ]

        assert len(dynamic_info.outputs) == len(output_names), (
            f"The number of dynamic outputs provided in the dynamic info "
            f"dict ({len(dynamic_info.outputs)}) is not equal to the number "
            f"of outputs of the model ({len(output_names)}), Detected model "
            f"output shapes are: {output_sizes} "
        )

        # Map every input/output name to its dynamic axes description.
        dynamic_info = dict(
            zip(
                input_names + output_names,
                onnx_format_inputs + dynamic_info.outputs,
            )
        )

    def _export(tensors):
        # Single place holding the export options shared by both attempts.
        torch.onnx.export(
            torch_model,  # model being run
            tuple(tensors),  # model input (or a tuple for multiple inputs)
            str(output_file_path),
            # where to save the model (can be a file or file-like object)
            export_params=True,
            # store the trained parameter weights inside the model file
            opset_version=ONNX_OPSET_VERSION,
            # the ONNX version to export the model to
            do_constant_folding=True,
            # whether to execute constant folding for optimization
            input_names=input_names,
            # the model's input names
            output_names=output_names,
            # the model's output names
            dynamic_axes=dynamic_info,
        )

    try:
        # try conversion with model on cpu
        if device.type is DeviceType.GPU:
            input_tensors = [tensor.cpu() for tensor in input_tensors]
            torch_model.cpu()

        _export(input_tensors)

        # Put again model on gpu
        if device.type is DeviceType.GPU:
            torch_model.to(device.to_torch_format())

        return output_file_path
    except Exception:
        if device.type is not DeviceType.GPU:
            logger.warning(
                "Exception raised during conversion from torch"
                " to onnx model. ONNX pipeline will be unavailable."
            )
            return None

        # try conversion with model on gpu
        input_tensors = [
            tensor.to(device.to_torch_format()) for tensor in input_tensors
        ]
        torch_model.to(device.to_torch_format())

        try:
            if output_types[0] is DataType.FLOAT16:
                export_context = torch.autocast("cuda")
            else:
                export_context = nullcontext()
            with export_context:
                _export(input_tensors)

            return output_file_path
        except Exception:
            logger.warning(
                "Exception raised during conversion from torch"
                " to onnx model. ONNX pipeline will be unavailable."
            )
            return None
================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/tensorflow.py
================================================
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Union
from loguru import logger
from nebullvm.config import ONNX_OPSET_VERSION
from nebullvm.core.models import ModelParams
from nebullvm.optional_modules.tensorflow import tensorflow as tf, tf2onnx
from nebullvm.optional_modules.onnx import onnx
from nebullvm.tools.huggingface import TensorFlowTransformerWrapper
def convert_tf_to_onnx(
    model: Union[tf.Module, tf.keras.Model],
    model_params: ModelParams,
    output_file_path: Union[str, Path],
):
    """Convert TF models into ONNX.

    Keras models (also when wrapped in a ``TensorFlowTransformerWrapper``)
    go through ``convert_keras_to_onnx``; every other model goes through
    ``convert_tf_saved_model_to_onnx``.

    Args:
        model (Union[tf.Module, tf.keras.Model]): TF model.
        model_params (ModelParams): Info about model parameters.
        output_file_path (Path): Path where storing the output file.

    Returns:
        The value returned by the selected converter, or None when the
        conversion raises an exception (a warning is logged).
    """
    try:
        wraps_keras_model = isinstance(
            model, TensorFlowTransformerWrapper
        ) and isinstance(model.core_model, tf.keras.Model)
        if isinstance(model, tf.keras.Model) or wraps_keras_model:
            return convert_keras_to_onnx(model, model_params, output_file_path)
        return convert_tf_saved_model_to_onnx(model, output_file_path)
    except Exception:
        logger.warning(
            "Something went wrong during conversion from tensorflow"
            " to onnx model. ONNX pipeline will be unavailable."
        )
        return None
def convert_tf_saved_model_to_onnx(
    model: tf.Module, output_file_path: Union[str, Path]
):
    """Convert TF models into ONNX.

    The model is written as a SavedModel into a temporary directory and
    converted by running ``tf2onnx.convert`` in a subprocess. The produced
    file is then loaded with ``onnx.load`` as a sanity check.

    Args:
        model (tf.Module): TF model.
        output_file_path (Path): Path where storing the output file.

    Returns:
        The ``output_file_path`` received as input.

    Raises:
        subprocess.CalledProcessError: If the tf2onnx process exits with
            a non-zero status.
    """
    import shutil
    import sys

    # Prefer the running interpreter: it is the one whose environment is
    # known to be in use. Probing "python3" could pick another
    # installation, and a missing executable raises FileNotFoundError
    # rather than CalledProcessError.
    python_cmd = sys.executable or shutil.which("python3") or "python"

    with TemporaryDirectory() as temp_dir:
        tf.saved_model.save(model, export_dir=temp_dir)

        onnx_cmd = [
            python_cmd,
            "-m",
            "tf2onnx.convert",
            "--saved-model",
            f"{temp_dir}",
            "--output",
            f"{output_file_path}",
            "--opset",
            f"{ONNX_OPSET_VERSION}",
        ]
        # check=True surfaces a failed conversion immediately instead of
        # relying on the load below to fail (or to read a stale file).
        subprocess.run(onnx_cmd, check=True)
        onnx.load(output_file_path)

    return output_file_path
def convert_keras_to_onnx(
    model: tf.keras.Model,
    model_params: ModelParams,
    output_file_path: Union[str, Path],
):
    """Convert keras models into ONNX.

    Args:
        model (tf.keras.Model): keras model.
        model_params (ModelParams): Model Parameters as input sizes and
            dynamic axis information.
        output_file_path (Path): Path where storing the output file.

    Returns:
        The ``output_file_path`` received as input.
    """
    input_infos = model_params.input_infos
    dynamic_info = model_params.dynamic_info

    # data type and integer shape of each input
    dtypes = [info.dtype.value for info in input_infos]
    shapes = [[int(size) for size in info.size] for info in input_infos]

    # name of each input
    if isinstance(model, TensorFlowTransformerWrapper):
        names = list(model.inputs_types.keys())
    else:
        names = [f"input_{i}" for i in range(len(input_infos))]

    def _signature_dims(input_index, shape):
        # Axes listed in the dynamic info of this input are left free
        # (None); all the other axes keep their size.
        return (
            None
            if dynamic_info is not None
            and axis in dynamic_info.inputs[input_index]
            else size
            for axis, size in enumerate(shape)
        )

    input_signature = tuple(
        tf.TensorSpec(_signature_dims(index, shape), dtype, name=name)
        for index, (shape, dtype, name) in enumerate(
            zip(shapes, dtypes, names)
        )
    )

    _onnx_model, _ = tf2onnx.convert.from_keras(
        model,
        input_signature,
        opset=ONNX_OPSET_VERSION,
        output_path=output_file_path,
    )
    return output_file_path
================================================
FILE: optimization/nebullvm/nebullvm/operations/conversions/utils.py
================================================
from nebullvm.core.models import DeepLearningFramework
from nebullvm.operations.conversions.converters import (
PytorchConverter,
TensorflowConverter,
ONNXConverter,
Converter,
)
def get_conversion_op(framework: DeepLearningFramework) -> Converter:
    """Return a new converter for the given source framework.

    PyTorch and TensorFlow have a dedicated converter; every other value
    gets an ``ONNXConverter``.
    """
    if framework == DeepLearningFramework.PYTORCH:
        return PytorchConverter()
    if framework == DeepLearningFramework.TENSORFLOW:
        return TensorflowConverter()
    return ONNXConverter()
================================================
FILE: optimization/nebullvm/nebullvm/operations/fetch_operations/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/fetch_operations/local.py
================================================
from typing import Any, Union, Iterable, Sequence
from nebullvm.operations.base import Operation
class FetchModelFromLocal(Operation):
    """Operation storing a model object in the operation state."""

    def execute(self, model: Any):
        """Store ``model`` in the state under the "model" key."""
        self.state["model"] = model

    def get_model(self) -> Any:
        """Return the stored model, or None when nothing was stored."""
        return self.state.get("model")

    def get_result(self) -> Any:
        """Return None; the model is exposed through ``get_model``."""
        return None
class FetchDataFromLocal(Operation):
    """Operation storing input data in the operation state."""

    def execute(self, data: Union[Iterable, Sequence]):
        """Store ``data`` in the state under the "data" key."""
        self.state["data"] = data

    def get_data(self) -> Any:
        """Return the stored data, or None when nothing was stored."""
        return self.state.get("data")

    def get_result(self) -> Any:
        """Return None; the data is exposed through ``get_data``."""
        return None
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/base.py
================================================
import json
import os
import shutil
from abc import ABC, abstractmethod
from dataclasses import dataclass, InitVar
from pathlib import Path
from tempfile import mkdtemp, TemporaryDirectory
from typing import Union, Dict, Any, List, Optional
import numpy as np
from nebullvm.config import LEARNER_METADATA_FILENAME
from nebullvm.core.models import ModelParams, Device, QuantizationType
from nebullvm.operations.base import Operation
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.onnx import create_model_inputs_onnx
from nebullvm.tools.pytorch import (
create_model_inputs_torch,
get_torch_model_size,
)
from nebullvm.tools.tf import create_model_inputs_tf
from nebullvm.tools.transformations import MultiStageTransformation
class BuildInferenceLearner(Operation, ABC):
    """Base operation exposing an inference learner as its result.

    Nothing in this class assigns the learner after construction;
    ``inference_learner`` starts as None.
    """

    def __init__(self):
        super().__init__()
        self.inference_learner = None

    @abstractmethod
    def execute(self, **kwargs):
        """Run the operation; concrete subclasses must implement this."""
        raise NotImplementedError()

    def get_result(self) -> Any:
        """Return the current value of ``inference_learner``."""
        learner = self.inference_learner
        return learner
@dataclass
class BaseInferenceLearner(ABC):
    """Base class for Inference Learners."""

    # Parameters of the wrapped model.
    network_parameters: ModelParams
    # Transformations applied by __call__ to every positional input.
    input_tfms: Optional[MultiStageTransformation] = None
    # Init-only value, stored on the instance as ``_input_data``.
    input_data: InitVar[List[Any]] = None
    device: Device = None
    quantization_type: QuantizationType = None

    @property
    @abstractmethod
    def name(self) -> str:
        """The name of the InferenceLearner"""

    def __post_init__(self, input_data):
        # An empty set of transformations is normalised to None so that
        # the "is not None" checks skip it (``len(...) < 0`` could never
        # be true).
        if self.input_tfms is not None and len(self.input_tfms) == 0:
            self.input_tfms = None
        self._tmp_folder = Path(mkdtemp())
        self._input_data = input_data

    def _store_file(self, file_path: Union[str, Path]):
        """Copy a file into the temporary folder of the learner."""
        return shutil.copy(str(file_path), str(self._tmp_folder))

    def _store_dir(self, dir_path: Union[str, Path]):
        """Copy a directory tree into the temporary folder."""
        try:
            # For python >= 3.8
            return shutil.copytree(
                str(dir_path), str(self._tmp_folder), dirs_exist_ok=True
            )
        except TypeError:
            # For python <=3.7
            if os.path.isdir(self._tmp_folder):
                shutil.rmtree(str(self._tmp_folder))
            return shutil.copytree(str(dir_path), str(self._tmp_folder))

    # ``shutil`` is bound as a default argument so that it is still
    # reachable if the module globals are cleared at interpreter shutdown.
    def __del__(self, shutil=shutil):
        try:
            shutil.rmtree(self._tmp_folder, ignore_errors=True)
        except Exception:
            # Best-effort cleanup: never raise from a finalizer.
            pass

    def predict_from_files(
        self, input_files: List[str], output_files: List[str]
    ):
        """Get a model prediction from file.

        The input file is read, processed and a prediction is run on top of it.
        The prediction is then returned into another file (in the same
        directory of the input file itself).

        Args:
            input_files (List[str]): List of paths to the input file.
            output_files (List[str]): List of paths to the file storing
                the prediction.
        """
        inputs = (self._read_file(input_file) for input_file in input_files)
        preds = self(*inputs)
        for pred, output_file in zip(preds, output_files):
            self._save_file(pred, output_file)

    def predict_from_listified_tensors(self, *listified_tensors: List):
        """Predict from listified tensor.

        Method useful to be used in services receiving the input tensor
        from an HTTP call.

        Args:
            listified_tensors (List): List of list-like version of the
                input tensors. Note that each element of the external list is
                a listified input tensor.

        Returns:
            List: List of list-like predictions.
        """
        inputs = (
            self.list2tensor(listified_tensor)
            for listified_tensor in listified_tensors
        )
        # ``predict`` goes through ``__call__``, which already applies
        # ``input_tfms``; applying them here as well would transform every
        # input twice.
        preds = self.predict(*inputs)
        return [self.tensor2list(pred) for pred in preds]

    def list2tensor(self, listified_tensor: List) -> Any:
        """Convert list to tensor.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            Any: Tensor for the prediction.
        """
        raise NotImplementedError()

    def tensor2list(self, tensor: Any) -> List:
        """Convert tensor to list.

        Args:
            tensor (any): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        raise NotImplementedError()

    def _read_file(self, input_file: str) -> Any:
        """Read tensor from file.

        Args:
            input_file (str): Path to the file containing the input tensor.

        Returns:
            Any: Tensor read from the file.
        """
        raise NotImplementedError()

    def _save_file(self, prediction: Any, output_file: str):
        """Save prediction in the appropriate format.

        Args:
            prediction (any): The predicted tensor.
            output_file (str): Path to the file where storing the prediction.
        """
        raise NotImplementedError

    def predict(self, *args, **kwargs) -> Any:
        """Take as input a tensor and returns a prediction"""
        out = self(*args, **kwargs)
        # TensorFlow predict method must return a np array
        if isinstance(out[0], tf.Tensor):
            out = tuple(t.numpy() for t in out)
        return out

    @abstractmethod
    def run(self, *args, **kwargs) -> Any:
        """Abstract method implementing the prediction code."""
        raise NotImplementedError()

    def forward(self, *args, **kwargs):
        """Alternative method to the predict one."""
        return self(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Apply ``input_tfms`` to the positional inputs, then ``run``."""
        if self.input_tfms is not None:
            args = (self.input_tfms(_input) for _input in args)
        return self.run(*args, **kwargs)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path): Path to the directory where saving the model.
        """
        raise NotImplementedError()

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path): Path to the directory where the model is stored.

        Returns:
            BaseInferenceLearner: Loaded model.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_size(self):
        """The function returns the size of the optimized model."""
        raise NotImplementedError()

    @abstractmethod
    def free_gpu_memory(self):
        """The function cleans the gpu occupied by the inference learner."""
        raise NotImplementedError

    @abstractmethod
    def get_inputs_example(self):
        """The function returns an example of the input for the optimized
        model predict method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def output_format(self):
        """Extension of the files storing the predictions."""
        return ".txt"

    @property
    @abstractmethod
    def input_format(self):
        """Extension of the files storing the inputs."""
        return ".txt"
class LearnerMetadata:
    """Class for storing all the metadata about a model.

    The stored information can be used for loading the appropriate model.

    Attributes:
        class_name (str): Name of the model class. For instance, for the model
            object `CustomModel()`, the class name is 'CustomModel'.
        module_name (str): Path to the python module where the model class
            is defined.
        network_parameters (Dict): Dictionary containing the network
            parameters, i.e. batch_size, input_size and output_size.
        kwargs: External attributes that will be stored in the Metadata file.
    """

    NAME: str = LEARNER_METADATA_FILENAME
    class_name: str
    module_name: str
    device: str
    quantization_type: str

    def __init__(
        self,
        class_name: str,
        module_name: str,
        network_parameters: Union[ModelParams, Dict],
        input_tfms: Union[MultiStageTransformation, Dict] = None,
        **kwargs,
    ):
        """Build the metadata object.

        Args:
            class_name (str): Name of the model class.
            module_name (str): Module where the model class is defined.
            network_parameters (ModelParams or Dict): Network parameters;
                a `ModelParams` instance is converted with its `dict()`
                method.
            input_tfms (MultiStageTransformation or Dict, optional): Input
                transformations; a `MultiStageTransformation` instance is
                converted with its `to_dict()` method.
            kwargs: Extra attributes attached to the instance as they are.
        """
        self.class_name = class_name
        self.module_name = module_name
        # Plain dictionaries are stored, since `save` dumps the attributes
        # with `json.dump`.
        self.network_parameters = (
            network_parameters.dict()
            if isinstance(network_parameters, ModelParams)
            else network_parameters
        )
        self.input_tfms = (
            input_tfms.to_dict()
            if isinstance(input_tfms, MultiStageTransformation)
            else input_tfms
        )
        self.__dict__.update(**kwargs)

    def __getitem__(self, item):
        """Return the public attribute named `item`.

        Returns:
            The attribute value, or None when the attribute is not set.

        Raises:
            TypeError: If `item` is not a string.
            ValueError: If `item` starts with an underscore.
        """
        if not isinstance(item, str):
            raise TypeError(
                f"Error in key type. Expected str got {type(item)}"
            )
        elif item.startswith("_"):
            raise ValueError("Trying to access a private attribute.")
        return self.__dict__.get(item)

    @classmethod
    def from_model(cls, model: BaseInferenceLearner, **kwargs):
        """Create the metadata from the Inference Learner.

        Args:
            model (BaseInferenceLearner): Model from which extract the
                metadata.
            kwargs: External attributes that will be stored in the Metadata
                file.

        Returns:
            LearnerMetadata: Metadata associated with the model.
        """
        return cls(
            class_name=model.__class__.__name__,
            module_name=model.__module__,
            network_parameters=model.network_parameters,
            input_tfms=model.input_tfms,
            device=model.device.type.value
            if model.device is not None
            else None,
            quantization_type=model.quantization_type.value
            if model.quantization_type is not None
            else None,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, dictionary: Dict):
        """Create the metadata file from a dictionary.

        This method is the reverse one of `to_dict`.

        Args:
            dictionary (Dict): Dictionary containing the metadata.

        Returns:
            LearnerMetadata: Metadata associated with the model.

        Raises:
            ValueError: If one of the required keys is missing.
        """
        if any(
            key not in dictionary
            for key in ("class_name", "module_name", "network_parameters")
        ):
            raise ValueError(
                "The input dictionary should contain the model class "
                "name, the module name and the network parameters."
            )
        return cls(**dictionary)

    def to_dict(self) -> Dict:
        """Method for converting the LearnerMetadata in a python dictionary.

        Only the instance attributes whose name starts with a lowercase
        letter and whose value is not None are exported.

        Returns:
            Dict: Dictionary containing the metadata.
        """
        return {
            key: value
            for key, value in self.__dict__.items()
            if (
                len(key) > 0
                and key[0].islower()
                and not key.startswith("_")
                and value is not None
            )
        }

    @classmethod
    def read(cls, path: Union[Path, str]):
        """Read the metadata file and store it into a LearnerMetadata object.

        Args:
            path (Path): Path to the directory containing the metadata file.

        Returns:
            LearnerMetadata: Metadata associated with the model.
        """
        path = Path(path)
        with open(path / cls.NAME, "r") as fin:
            metadata_dict = json.load(fin)
        return cls(**metadata_dict)

    def save(self, path: Union[Path, str]):
        """Save the metadata of the model in a file.

        Args:
            path (Path): Path to the directory where saving the model metadata.
                The directory (and its missing parents) is created when
                needed.
        """
        path = Path(path)
        path.mkdir(exist_ok=True, parents=True)
        metadata_dict = self.to_dict()
        with open(path / self.NAME, "w") as fout:
            json.dump(metadata_dict, fout)

    def load_model(
        self, path: Union[Path, str], **kwargs
    ) -> BaseInferenceLearner:
        """Method for loading the InferenceLearner from its metadata.

        The ModelMetadata file contains all the information necessary for
        loading the Learner, as it contains both the module where the model
        is defined and the class name of the model object. This method calls
        the appropriate class method of the Model object, thus the actual
        model loading is delegated to its methods.

        Args:
            path (Path): Path to the directory containing the files where
                the model optimization is saved.
            kwargs: Dictionary containing the arguments for the model's load
                function.

        Returns:
            BaseInferenceLearner: The learner returned by the `load` class
                method of the class named in the metadata.

        Raises:
            ImportError: If the module cannot be imported or does not define
                the class.
        """
        # Function-scope import keeps this block self-contained.
        import importlib

        # The class is resolved with importlib instead of exec/eval: names
        # bound by exec() inside a function are not reliably visible to a
        # later eval() (PEP 667), and the strings come from a metadata file,
        # so they should never be executed as code.
        module = importlib.import_module(self.module_name)
        try:
            learner_class = getattr(module, self.class_name)
        except AttributeError as err:
            # Same exception type raised by `from module import name`.
            raise ImportError(
                f"cannot import name {self.class_name!r} "
                f"from {self.module_name!r}"
            ) from err
        return learner_class.load(path=path, **kwargs)
class PytorchBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base class for the learners exchanging ``torch.Tensor`` objects."""

    @property
    def input_format(self):
        """File extension used for the serialized inputs."""
        return ".pt"

    @property
    def output_format(self):
        """File extension used for the serialized outputs."""
        return ".pt"

    def list2tensor(self, listified_tensor: List) -> torch.Tensor:
        """Build a tensor from its nested-list representation.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            torch.Tensor: Tensor for the prediction.
        """
        tensor = torch.tensor(listified_tensor)
        return tensor

    def tensor2list(self, tensor: torch.Tensor) -> List:
        """Turn a tensor into nested python lists.

        Args:
            tensor (torch.Tensor): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        as_array = tensor.cpu().detach().numpy()
        return as_array.tolist()

    def free_gpu_memory(self):
        """Move the model to the CPU and mark the learner as not GPU-ready."""
        self.model.cpu()
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """Move the model to the learner's device and mark it as GPU-ready."""
        torch_device = self.device.to_torch_format()
        self.model.to(torch_device)
        self._is_gpu_ready = True

    def _read_file(self, input_file: Union[str, Path]) -> torch.Tensor:
        """Load a tensor from `input_file` using ``torch.load``."""
        return torch.load(input_file)

    def _save_file(
        self, prediction: torch.Tensor, output_file: Union[str, Path]
    ):
        """Store `prediction` in `output_file` using ``torch.save``."""
        torch.save(prediction, output_file)

    def get_inputs_example(self, random=False):
        """Return an example of the inputs accepted by the model.

        The stored input data is returned when available, unless `random`
        is truthy; otherwise new inputs are generated from the input infos
        of the network parameters.
        """
        if self._input_data is not None and not random:
            return self._input_data
        generated = create_model_inputs_torch(
            input_infos=self.network_parameters.input_infos,
        )
        return tuple(generated)

    def get_size(self):
        """Return the size of the model.

        The size is computed on ``self.model.core_model`` when that
        attribute exists, on ``self.model`` otherwise. If the computation
        raises a RuntimeError, the learner is saved in a temporary directory
        and the sizes of the files written at its top level are summed.
        """
        try:
            target = (
                self.model.core_model
                if hasattr(self.model, "core_model")
                else self.model  # Normal torch model
            )
            return get_torch_model_size(target)
        except RuntimeError:
            with TemporaryDirectory() as tmp_dir:
                self.save(tmp_dir)
                root = Path(tmp_dir)
                total = 0
                for entry in os.listdir(root):
                    if os.path.isfile(root / entry):
                        total += os.path.getsize(root / entry)
                return total
class TensorflowBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base class for the learners exchanging ``tf.Tensor`` objects."""

    @property
    def input_format(self):
        """File extension used for the serialized inputs."""
        return ".npy"

    @property
    def output_format(self):
        """File extension used for the serialized outputs."""
        return ".npy"

    def free_gpu_memory(self):
        """Clear the Keras session and mark the learner as not GPU-ready."""
        tf.keras.backend.clear_session()
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """Mark the learner as GPU-ready (no model transfer is done here)."""
        self._is_gpu_ready = True

    def list2tensor(self, listified_tensor: List) -> tf.Tensor:
        """Convert list to tensor.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            tf.Tensor: Tensor ready to be used for prediction.
        """
        return tf.convert_to_tensor(listified_tensor)

    def tensor2list(self, tensor: tf.Tensor) -> List:
        """Convert tensor to list.

        Args:
            tensor (tf.Tensor): Input tensor.

        Returns:
            List: Listified version of the tensor.
        """
        return tensor.numpy().tolist()

    def _read_file(self, input_file: Union[str, Path]) -> tf.Tensor:
        """Load a numpy file and convert its content to a tensor."""
        numpy_array = np.load(input_file)
        input_tensor = tf.convert_to_tensor(numpy_array)
        return input_tensor

    def _save_file(self, prediction: tf.Tensor, output_file: Union[str, Path]):
        """Store `prediction` in `output_file` in the numpy format.

        Args:
            prediction (tf.Tensor): Tensor to be saved.
            output_file (str or Path): Destination file.
        """
        # numpy arrays have no `save` method: the module-level `np.save`
        # must be used, as done when reading with `np.load`.
        np.save(output_file, prediction.numpy())

    def get_inputs_example(self, random=False):
        """Return an example of the inputs accepted by the model.

        The stored input data is returned when available, unless `random`
        is truthy; otherwise new inputs are generated from the input infos
        of the network parameters.
        """
        if self._input_data is None or random:
            return tuple(
                create_model_inputs_tf(
                    input_infos=self.network_parameters.input_infos,
                )
            )
        else:
            return self._input_data
class NumpyBaseInferenceLearner(BaseInferenceLearner, ABC):
    """Base class for the learners exchanging ``np.ndarray`` objects."""

    @property
    def input_format(self):
        """File extension used for the serialized inputs."""
        return ".npy"

    @property
    def output_format(self):
        """File extension used for the serialized outputs."""
        return ".npy"

    def list2tensor(self, listified_tensor: List) -> np.ndarray:
        """Build a numpy array from its nested-list representation.

        Args:
            listified_tensor (List): Listified version of the input tensor.

        Returns:
            np.ndarray: Array ready to be used for prediction.
        """
        array = np.array(listified_tensor)
        return array

    def tensor2list(self, tensor: np.ndarray) -> List:
        """Turn a numpy array into nested python lists.

        Args:
            tensor (np.ndarray): Input array.

        Returns:
            List: Listified version of the array.
        """
        listified = tensor.tolist()
        return listified

    def _read_file(self, input_file: Union[str, Path]) -> np.ndarray:
        """Load the array stored in `input_file` with ``np.load``."""
        return np.load(input_file)

    def _save_file(
        self, prediction: np.ndarray, output_file: Union[str, Path]
    ):
        """Store `prediction` in `output_file` with ``np.save``."""
        np.save(output_file, prediction)

    def get_inputs_example(self, random=False):
        """Return an example of the inputs accepted by the model.

        The stored input data is returned when available, unless `random`
        is truthy; otherwise new inputs are generated from the input infos
        of the network parameters.
        """
        if self._input_data is not None and not random:
            return self._input_data
        generated = create_model_inputs_onnx(
            input_infos=self.network_parameters.input_infos,
        )
        return tuple(generated)
class InferenceLearnerWrapper(BaseInferenceLearner, ABC):
    """Base wrapper around an InferenceLearner; it cannot be instantiated.

    Tensor conversion, file reading/writing, GPU memory handling, input
    examples and input/output formats are all delegated to the wrapped
    learner. The wrapper re-implements only `save` and `load`, which rely
    on hooks the child classes have to provide.

    Attributes:
        network_parameters (ModelParams): Model parameters.
        core_inference_learner (BaseInferenceLearner): Inference Learner.
    """

    CORE_MODEL_SAVE_DIR = "core_model"

    def __init__(self, core_inference_learner: BaseInferenceLearner):
        core_params = core_inference_learner.network_parameters
        super().__init__(network_parameters=core_params)
        self.core_inference_learner = core_inference_learner

    def list2tensor(self, listified_tensor: List) -> Any:
        """Delegate the conversion to the wrapped learner."""
        return self.core_inference_learner.list2tensor(listified_tensor)

    def tensor2list(self, tensor: Any) -> List:
        """Delegate the conversion to the wrapped learner."""
        return self.core_inference_learner.tensor2list(tensor)

    def _read_file(self, input_file: str) -> Any:
        """Delegate the file reading to the wrapped learner."""
        return self.core_inference_learner._read_file(input_file)

    def _save_file(self, prediction: Any, output_file: str):
        """Delegate the file writing to the wrapped learner."""
        self.core_inference_learner._save_file(prediction, output_file)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the wrapped learner and the wrapper metadata.

        The wrapped learner is stored in the `CORE_MODEL_SAVE_DIR`
        sub-directory of `path`, while the wrapper metadata is stored in
        `path` itself.

        Args:
            path (Path or str): Destination directory.
            kwargs: Arguments forwarded to the wrapped learner's `save`.
        """
        core_dir = Path(path) / self.CORE_MODEL_SAVE_DIR
        core_dir.mkdir(exist_ok=True, parents=True)
        self.core_inference_learner.save(core_dir, **kwargs)
        wrapper_metadata = LearnerMetadata.from_model(
            self, **self._get_extra_metadata_kwargs()
        )
        wrapper_metadata.save(path)
        self._save_wrapper_extra_info()

    def _get_extra_metadata_kwargs(self) -> Dict:
        """Hook returning the extra attributes stored in the metadata."""
        raise NotImplementedError

    def _save_wrapper_extra_info(self):
        """Hook called at the end of `save`."""
        raise NotImplementedError

    @staticmethod
    def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict:
        """Hook turning the metadata into constructor arguments."""
        raise NotImplementedError

    @staticmethod
    def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:
        """Hook post-processing the constructor arguments in `load`."""
        raise NotImplementedError

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the wrapped learner and rebuild the wrapper around it.

        Args:
            path (Path or str): Directory used when saving the wrapper.
            kwargs: Arguments forwarded to the wrapped learner's loading.
        """
        core_dir = Path(path) / cls.CORE_MODEL_SAVE_DIR
        core_metadata = LearnerMetadata.read(core_dir)
        core_learner = core_metadata.load_model(core_dir, **kwargs)
        wrapper_metadata = LearnerMetadata.read(path)
        builder_inputs = cls._load_wrapper_extra_info(
            cls._convert_metadata_to_inputs(wrapper_metadata)
        )
        builder_inputs.update({"core_inference_learner": core_learner})
        return cls(**builder_inputs)

    def free_gpu_memory(self):
        """Delegate the GPU memory release to the wrapped learner."""
        return self.core_inference_learner.free_gpu_memory()

    def get_inputs_example(self):
        """Return the input example of the wrapped learner."""
        return self.core_inference_learner.get_inputs_example()

    @property
    def output_format(self):
        """Output format of the wrapped learner."""
        return self.core_inference_learner.output_format

    @property
    def input_format(self):
        """Input format of the wrapped learner."""
        return self.core_inference_learner.input_format
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/blade_disc.py
================================================
from typing import Optional
from nebullvm.core.models import ModelParams, Device
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
from nebullvm.optional_modules.torch import ScriptModule
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class BladeDISCInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript-based learner registered under the name ``BladeDISC``."""

    name = "BladeDISC"

    @classmethod
    def from_torch_model(
        cls,
        model: ScriptModule,
        network_parameters: ModelParams,
        device: Device,
        input_tfms: Optional[MultiStageTransformation] = None,
        input_data: DataManager = None,
    ):
        """Alternative constructor taking the model as `model`.

        Args:
            model (ScriptModule): Model passed to the constructor as
                `torch_model`.
            network_parameters (ModelParams): Model parameters.
            device (Device): Device of the learner.
            input_tfms (MultiStageTransformation, optional): Input
                transformations.
            input_data (DataManager, optional): Input data.
        """
        init_kwargs = {
            "torch_model": model,
            "network_parameters": network_parameters,
            "input_tfms": input_tfms,
            "input_data": input_data,
            "device": device,
        }
        return cls(**init_kwargs)
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/builders.py
================================================
from pathlib import Path
from typing import Any, Union
from nebullvm.core.models import (
ModelParams,
DeepLearningFramework,
QuantizationType,
DeviceType,
)
from nebullvm.operations.inference_learners.base import BuildInferenceLearner
from nebullvm.operations.inference_learners.deepsparse import (
PytorchDeepSparseInferenceLearner,
)
from nebullvm.operations.inference_learners.faster_transformer import (
FasterTransformerInferenceLearner,
)
from nebullvm.operations.inference_learners.neural_compressor import (
PytorchNeuralCompressorInferenceLearner,
)
from nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS
from nebullvm.operations.inference_learners.openvino import (
OPENVINO_INFERENCE_LEARNERS,
)
from nebullvm.operations.inference_learners.tensor_rt import (
TENSOR_RT_INFERENCE_LEARNERS,
PytorchTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.tensorflow import (
TensorflowBackendInferenceLearner,
TFLiteBackendInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_dynamo import (
TorchDynamoInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_neuron import (
TorchNeuronInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_xla import (
TorchXLAInferenceLearner,
)
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
APACHE_TVM_INFERENCE_LEARNERS,
PytorchApacheTVMInferenceLearner,
)
from nebullvm.optional_modules.tensor_rt import tensorrt as trt
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import (
ScriptModule,
Module,
GraphModule,
torch,
)
from nebullvm.optional_modules.tvm import tvm, ExecutorFactoryModule
from nebullvm.tools.onnx import get_input_names, get_output_names
from nebullvm.tools.transformations import (
MultiStageTransformation,
VerifyContiguity,
)
class TorchScriptBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``TorchScriptInferenceLearner``."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = TorchScriptInferenceLearner(
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
            torch_model=model,
        )
        self.inference_learner = learner
class TorchXLABuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``TorchXLAInferenceLearner``."""

    def execute(
        self,
        model: torch.nn.Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (torch.nn.Module): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = TorchXLAInferenceLearner(
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
            torch_model=model,
        )
        self.inference_learner = learner
class TorchNeuronBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``TorchNeuronInferenceLearner``."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = TorchNeuronInferenceLearner(
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
            torch_model=model,
        )
        self.inference_learner = learner
class TorchDynamoBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``TorchDynamoInferenceLearner``."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = TorchDynamoInferenceLearner(
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
            torch_model=model,
        )
        self.inference_learner = learner
class TensorflowBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``TensorflowBackendInferenceLearner``."""

    def execute(
        self,
        model: tf.Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (tf.Module): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = TensorflowBackendInferenceLearner(
            model,
            device=self.device,
            input_tfms=input_tfms,
            network_parameters=model_params,
        )
        self.inference_learner = learner
class TFLiteBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``TFLiteBackendInferenceLearner``."""

    def execute(
        self,
        model: bytes,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (bytes): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = TFLiteBackendInferenceLearner(
            model,
            device=self.device,
            input_tfms=input_tfms,
            network_parameters=model_params,
        )
        self.inference_learner = learner
class DeepSparseBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``PytorchDeepSparseInferenceLearner``."""

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        The input and output names are read from the ONNX file.

        Args:
            model (str or Path): Path to the ONNX model.
            model_params (ModelParams): Model parameters.
            kwargs: Ignored extra arguments.
        """
        onnx_file = str(model)
        names = {
            "input_names": get_input_names(onnx_file),
            "output_names": get_output_names(onnx_file),
        }
        self.inference_learner = PytorchDeepSparseInferenceLearner(
            onnx_path=model,
            network_parameters=model_params,
            device=self.device,
            **names,
        )
class ONNXBuildInferenceLearner(BuildInferenceLearner):
    """Operation building one of the ``ONNX_INFERENCE_LEARNERS``."""

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        quantization_type: QuantizationType,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        The learner class is selected through `source_dl_framework`; the
        input and output names are read from the ONNX file.

        Args:
            model (str or Path): Path to the ONNX model.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            source_dl_framework (DeepLearningFramework): Key used to select
                the learner class.
            quantization_type (QuantizationType): Quantization type passed
                to the learner.
            kwargs: Ignored extra arguments.
        """
        onnx_file = str(model)
        input_names = get_input_names(onnx_file)
        output_names = get_output_names(onnx_file)
        learner_class = ONNX_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_class(
            onnx_path=model,
            network_parameters=model_params,
            input_names=input_names,
            output_names=output_names,
            input_tfms=input_tfms,
            device=self.device,
            quantization_type=quantization_type,
        )
class OpenVINOBuildInferenceLearner(BuildInferenceLearner):
    """Operation building one of the ``OPENVINO_INFERENCE_LEARNERS``."""

    def execute(
        self,
        model: str,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        Args:
            model (str): Common prefix of the model files; ".xml" and
                ".bin" are appended to get the model and weights names.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            source_dl_framework (DeepLearningFramework): Key used to select
                the learner class.
            kwargs: Ignored extra arguments.
        """
        learner_class = OPENVINO_INFERENCE_LEARNERS[source_dl_framework]
        xml_file = model + ".xml"
        weights_file = model + ".bin"
        self.inference_learner = learner_class.from_model_name(
            model_name=xml_file,
            model_weights=weights_file,
            input_tfms=input_tfms,
            network_parameters=model_params,
            device=self.device,
        )
class PyTorchTensorRTBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``PytorchTensorRTInferenceLearner``."""

    def execute(
        self,
        model: ScriptModule,
        input_tfms: MultiStageTransformation,
        model_params: ModelParams,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model to be wrapped.
            input_tfms (MultiStageTransformation): Input transformations.
            model_params (ModelParams): Model parameters.
            kwargs: Ignored extra arguments.
        """
        learner = PytorchTensorRTInferenceLearner(
            network_parameters=model_params,
            device=self.device,
            input_tfms=input_tfms,
            torch_model=model,
        )
        self.inference_learner = learner
class ONNXTensorRTBuildInferenceLearner(BuildInferenceLearner):
    """Operation building one of the ``TENSOR_RT_INFERENCE_LEARNERS``."""

    def execute(
        self,
        model: Any,
        model_orig: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        Args:
            model (Any): Serialized engine, deserialized with the TensorRT
                runtime.
            model_orig (str or Path): Path to the ONNX model from which the
                input and output names are read.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
                Note that a `VerifyContiguity` step is appended to this
                object in place.
            source_dl_framework (DeepLearningFramework): Key used to select
                the learner class.
            kwargs: Ignored extra arguments.
        """
        trt_logger = trt.Logger(trt.Logger.ERROR)
        onnx_file = str(model_orig)
        input_names = get_input_names(onnx_file)
        output_names = get_output_names(onnx_file)
        input_tfms.append(VerifyContiguity())
        engine = trt.Runtime(trt_logger).deserialize_cuda_engine(model)
        learner_class = TENSOR_RT_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_class(
            engine=engine,
            input_tfms=input_tfms,
            network_parameters=model_params,
            input_names=input_names,
            output_names=output_names,
            nvidia_logger=trt_logger,
            device=self.device,
        )
class IntelNeuralCompressorBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``PytorchNeuralCompressorInferenceLearner``."""

    def execute(
        self,
        model: GraphModule,
        model_orig: Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        Args:
            model (GraphModule): Model passed to the learner as
                `model_quant`.
            model_orig (Module): Model passed to the learner as `model`.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = PytorchNeuralCompressorInferenceLearner(
            model_quant=model,
            model=model_orig,
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
        )
        self.inference_learner = learner
class PyTorchApacheTVMBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``PytorchApacheTVMInferenceLearner``."""

    def execute(
        self,
        model: ExecutorFactoryModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        Args:
            model (ExecutorFactoryModule): Compiled library, also used to
                create the graph executor module.
            model_params (ModelParams): Model parameters; one input name
                ("input_0", "input_1", ...) is generated per input info.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        if self.device.type is DeviceType.GPU:
            target_device = str(tvm.target.cuda())
        else:
            target_device = "llvm"
        dev = tvm.device(str(target_device), 0)
        input_names = [
            f"input_{idx}" for idx, _ in enumerate(model_params.input_infos)
        ]
        executor = tvm.contrib.graph_executor.GraphModule(
            model["default"](dev)
        )
        self.inference_learner = PytorchApacheTVMInferenceLearner(
            input_tfms=input_tfms,
            network_parameters=model_params,
            graph_executor_module=executor,
            input_names=input_names,
            lib=model,
            target=target_device,
            device=self.device,
        )
class ONNXApacheTVMBuildInferenceLearner(BuildInferenceLearner):
    """Operation building one of the ``APACHE_TVM_INFERENCE_LEARNERS``."""

    def execute(
        self,
        model: ExecutorFactoryModule,
        model_orig: str,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        source_dl_framework: DeepLearningFramework,
        **kwargs,
    ):
        """Build the learner and store it in ``self.inference_learner``.

        Args:
            model (ExecutorFactoryModule): Compiled library, also used to
                create the graph executor module.
            model_orig (str): Path to the ONNX model from which the input
                names are read. When None, the names "input_0", "input_1",
                ... are generated, one per input info.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            source_dl_framework (DeepLearningFramework): Key used to select
                the learner class.
            kwargs: Ignored extra arguments.
        """
        if self.device.type is DeviceType.GPU:
            target_device = str(tvm.target.cuda())
        else:
            target_device = "llvm"
        dev = tvm.device(str(target_device), 0)
        if model_orig is None:
            input_names = [
                f"input_{idx}"
                for idx, _ in enumerate(model_params.input_infos)
            ]
        else:
            input_names = get_input_names(model_orig)
        executor = tvm.contrib.graph_executor.GraphModule(
            model["default"](dev)
        )
        learner_class = APACHE_TVM_INFERENCE_LEARNERS[source_dl_framework]
        self.inference_learner = learner_class(
            input_tfms=input_tfms,
            network_parameters=model_params,
            graph_executor_module=executor,
            input_names=input_names,
            lib=model,
            target=target_device,
            device=self.device,
        )
class FasterTransformerBuildInferenceLearner(BuildInferenceLearner):
    """Operation building a ``FasterTransformerInferenceLearner``."""

    def execute(
        self,
        model: ScriptModule,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation,
        **kwargs,
    ):
        """Wrap `model` in a learner stored in ``self.inference_learner``.

        Args:
            model (ScriptModule): Model to be wrapped.
            model_params (ModelParams): Model parameters.
            input_tfms (MultiStageTransformation): Input transformations.
            kwargs: Ignored extra arguments.
        """
        learner = FasterTransformerInferenceLearner(
            network_parameters=model_params,
            input_tfms=input_tfms,
            device=self.device,
            torch_model=model,
        )
        self.inference_learner = learner
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/deepsparse.py
================================================
import os
import shutil
from abc import ABC
from pathlib import Path
from typing import Union, List, Generator, Tuple, Dict, Type
import numpy as np
from loguru import logger
from nebullvm.config import ONNX_FILENAMES
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
LearnerMetadata,
PytorchBaseInferenceLearner,
)
from nebullvm.optional_modules.deepsparse import cpu, compile_model
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.transformations import MultiStageTransformation
class DeepSparseInferenceLearner(BaseInferenceLearner, ABC):
    """Learner running an ONNX model through the DeepSparse engine.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    name = "DeepSparse"

    def __init__(
        self,
        onnx_path: Union[str, Path],
        input_names: List[str],
        output_names: List[str],
        device: Device,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.onnx_path = self._store_file(onnx_path)
        # The engine is compiled with the number of cores per socket
        # reported by `cpu.cpu_details()`.
        cores_per_socket, _, _ = cpu.cpu_details()
        batch_size = kwargs["network_parameters"].batch_size
        self.engine = compile_model(onnx_path, batch_size, cores_per_socket)
        self.input_names = input_names
        self.output_names = output_names
        self.device = device

    def get_size(self):
        """Return the size in bytes of the stored ONNX file."""
        return os.path.getsize(self.onnx_path)

    def save(self, path: Union[str, Path], **kwargs):
        """Store the metadata and a copy of the ONNX file in `path`.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        learner_metadata = LearnerMetadata.from_model(
            self,
            input_names=self.input_names,
            output_names=self.output_names,
            **kwargs,
        )
        learner_metadata.save(path)
        destination = Path(path) / ONNX_FILENAMES["model_name"]
        shutil.copy(self.onnx_path, destination)

    def free_gpu_memory(self):
        """Not available for this learner: always raises."""
        raise NotImplementedError("DeepSparse does not support GPU inference.")

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Rebuild the learner from the files stored in `path`.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners. A warning is logged when it is not
                empty.

        Returns:
            DeepSparseInferenceLearner: The optimized model.
        """
        if len(kwargs) > 0:
            logger.warning(
                "No extra keywords expected for the load method. "
                f"Got {kwargs}."
            )
        onnx_path = os.path.join(str(path), ONNX_FILENAMES["model_name"])
        metadata = LearnerMetadata.read(path)
        input_tfms = metadata.input_tfms
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        device = Device.from_str(metadata.device)
        network_parameters = ModelParams(**metadata.network_parameters)
        return cls(
            input_tfms=input_tfms,
            network_parameters=network_parameters,
            onnx_path=onnx_path,
            input_names=metadata["input_names"],
            output_names=metadata["output_names"],
            device=device,
        )

    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):
        """Run the engine on the list built from `input_arrays`."""
        return self.engine(list(input_arrays))
class PytorchDeepSparseInferenceLearner(
    DeepSparseInferenceLearner, PytorchBaseInferenceLearner
):
    """DeepSparse learner with a ``torch.Tensor`` interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:
        """Predict on the input tensors.

        The tensors are converted to numpy arrays, given to the engine, and
        the outputs are converted back to tensors.

        Note that the input tensors must belong to the same batch. If a
        sequence of tensors is given when the model is expecting a single
        input tensor (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected to have dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors do
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        predictions = self._predict_arrays(numpy_inputs)
        return tuple(map(torch.from_numpy, predictions))
# Mapping from the deep learning framework to the DeepSparse learner class
# handling it.
DEEPSPARSE_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[DeepSparseInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchDeepSparseInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/faster_transformer.py
================================================
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
class FasterTransformerInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript learner registered under the name ``FasterTransformer``.

    It only overrides the `MODEL_NAME` and `name` class attributes of
    `TorchScriptInferenceLearner`.
    """

    name = "FasterTransformer"
    MODEL_NAME = "faster_transformer_model_scripted.pt"
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/huggingface.py
================================================
from abc import ABC
from collections import OrderedDict
from pathlib import Path
from typing import List, Any, Dict, Union
from nebullvm.operations.inference_learners.base import (
InferenceLearnerWrapper,
PytorchBaseInferenceLearner,
LearnerMetadata,
BaseInferenceLearner,
)
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.diffusers import postprocess_diffusers
from nebullvm.tools.huggingface import restructure_output
from nebullvm.tools.pytorch import get_torch_model_size
class HuggingFaceInferenceLearner(InferenceLearnerWrapper):
    """Class wrapping an InferenceLearner model and giving to it the
    huggingface interface.

    The class fuses both the InferenceLearner and HuggingFace interfaces,
    giving to the final user a model which can be used with the preferred
    API without the need of adapting the previous code.

    Attributes:
        network_parameters (ModelParams): Model parameters of the model.
        core_inference_learner (PytorchBaseInferenceLearner): Inference learner
            built using the Pytorch interface.
        output_structure (Dict): Original output structure of the HuggingFace
            model.
        input_names (List[str]): List of all the input keys used for the
            original HuggingFace model.
        output_type (Any, optional): Original output type of the HuggingFace
            model.
    """

    @property
    def name(self) -> str:
        """Name of the wrapped learner."""
        return self.core_inference_learner.name

    def __init__(
        self,
        core_inference_learner: PytorchBaseInferenceLearner,
        output_structure: OrderedDict,
        input_names: List[str],
        output_type: Any = None,
    ):
        super().__init__(core_inference_learner)
        self.output_structure = output_structure
        self.input_names = input_names
        self.output_type = output_type

    def _save_wrapper_extra_info(self):
        """Nothing to save besides the metadata."""
        pass

    def get_size(self):
        """Return the size of the wrapped learner."""
        return self.core_inference_learner.get_size()

    @staticmethod
    def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:
        """Return the constructor arguments unchanged."""
        return builder_inputs

    def run(self, *args, **kwargs) -> Any:
        """Run the underlying optimized model for getting a prediction.

        The method has a hybrid interface. It accepts inputs either as
        positional or keyword arguments. If only positional arguments are given
        the method expects the inputs to be in the canonical
        nebullvm interface. If only keyword arguments are given the method
        expects them to be in the HuggingFace interface. Mixed representation
        is not allowed and will result in an error.

        Returns:
            The raw outputs of the wrapped learner when positional arguments
            are used or when `output_type` is `tuple`; otherwise the outputs
            processed by `restructure_output`.

        Raises:
            RuntimeError: If both positional and keyword arguments are given.
        """
        if len(args) > 0 and len(kwargs) > 0:
            raise RuntimeError(
                "Not allowed usage of the predict method. "
                "Either the positional or the keyword arguments must be given."
            )
        if len(args) > 0:
            return self.core_inference_learner(*args)
        # The keyword inputs are ordered following `input_names`.
        inputs = (kwargs.pop(name) for name in self.input_names)
        outputs = self.core_inference_learner(*inputs)
        if self.output_type is tuple:
            return outputs
        else:
            return restructure_output(
                outputs, self.output_structure, self.output_type
            )

    def _get_extra_metadata_kwargs(self) -> Dict:
        """Return the wrapper attributes to be stored in the metadata.

        The output type is stored as its name and module, so that it can be
        imported again when loading.
        """
        metadata_kwargs = {
            "output_structure": self.output_structure,
            "output_structure_keys": list(self.output_structure.keys()),
            "input_names": self.input_names,
        }
        if self.output_type is not None:
            metadata_kwargs.update(
                {
                    "output_type": self.output_type.__name__,
                    "output_type_module": self.output_type.__module__,
                }
            )
        return metadata_kwargs

    @staticmethod
    def _convert_metadata_to_inputs(metadata: LearnerMetadata) -> Dict:
        """Convert the stored metadata into constructor arguments.

        Args:
            metadata (LearnerMetadata): Metadata of the wrapper.

        Returns:
            Dict: Dictionary with the `output_structure`, the `input_names`
                and, when stored, the `output_type`.

        Raises:
            ImportError: If the output type cannot be imported from the
                stored module.
        """
        # we need to guarantee the preservation of the output structure
        # elements order.
        output_structure = OrderedDict()
        for key in metadata["output_structure_keys"]:
            output_structure[key] = metadata["output_structure"][key]

        inputs = {
            "output_structure": output_structure,
            "input_names": metadata["input_names"],
        }
        output_type_name = metadata["output_type"]
        if output_type_name is not None:
            # Function-scope import keeps this block self-contained.
            import importlib

            # The type is resolved with importlib instead of exec/eval:
            # names bound by exec() inside a function are not reliably
            # visible to a later eval() (PEP 667), and the strings come from
            # a metadata file, so they should never be executed as code.
            module_name = metadata["output_type_module"]
            module = importlib.import_module(module_name)
            try:
                inputs["output_type"] = getattr(module, output_type_name)
            except AttributeError as err:
                # Same exception type raised by `from module import name`.
                raise ImportError(
                    f"cannot import name {output_type_name!r} "
                    f"from {module_name!r}"
                ) from err
        return inputs
class DiffusionInferenceLearner(BaseInferenceLearner, ABC):
    """Learner wrapping a diffusion pipeline.

    Most operations are delegated to ``pipeline.unet.model``.

    Attributes:
        pipeline (StableDiffusionPipeline): Wrapped pipeline.
    """

    @property
    def name(self) -> str:
        """Name of the model stored in the pipeline's unet."""
        return self.pipeline.unet.model.name

    def __init__(self, pipeline: StableDiffusionPipeline):
        self.pipeline = pipeline

    def __call__(self, *args, **kwargs):
        """Call the pipeline with the given arguments."""
        return self.pipeline(*args, **kwargs)

    def run(self, *args, **kwargs) -> Any:
        """Call the pipeline and return its result."""
        # The result was previously computed and dropped.
        return self.pipeline(*args, **kwargs)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model stored in the pipeline's unet in `path`."""
        self.pipeline.unet.model.save(path)

    @classmethod
    def load(
        cls,
        path: Union[Path, str],
        **kwargs,
    ):
        """Load the optimized model and plug it into a pipeline.

        Args:
            path (Path or str): Directory where the model is stored.
            kwargs: Must contain `pipe`, which is passed to
                `postprocess_diffusers` together with the loaded model.

        Raises:
            TypeError: If `pipe` is not given.
        """
        try:
            pipe = kwargs["pipe"]
        except KeyError:
            raise TypeError("Missing required argument 'pipe'")
        optimized_model = LearnerMetadata.read(path).load_model(path)
        return postprocess_diffusers(
            optimized_model,
            pipe,
            optimized_model.device,
        )

    def get_size(self):
        """Return the size of the unet model plus the other torch modules.

        The other modules are the `torch.nn.Module` attributes of the
        pipeline, excluding the one named "unet".
        """
        # NOTE(review): because of operator precedence only the sum over the
        # non-unet modules is divided by 1e6; the expression is kept as it
        # was, confirm that the two terms are meant to have different scales.
        return (
            self.pipeline.unet.model.get_size()
            + sum(
                [
                    get_torch_model_size(v)
                    for (k, v) in self.pipeline.__dict__.items()
                    if isinstance(v, torch.nn.Module) and k != "unet"
                ]
            )
            / 1e6
        )

    def free_gpu_memory(self):
        """Delegate the GPU memory release to the unet model."""
        # `raise` on the delegate's return value would fail with a TypeError
        # whenever that value is not an exception.
        return self.pipeline.unet.model.free_gpu_memory()

    def get_inputs_example(self):
        """Not implemented for this learner."""
        raise NotImplementedError()

    @property
    def output_format(self):
        """File extension used for the serialized outputs."""
        return ".pt"

    @property
    def input_format(self):
        """File extension used for the serialized inputs."""
        return ".pt"

    def list2tensor(self, listified_tensor: List) -> Any:
        """Not implemented for this learner."""
        raise NotImplementedError()
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/neural_compressor.py
================================================
from abc import ABC
from pathlib import Path
from typing import Union, Tuple, Dict, Type
from loguru import logger
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
LearnerMetadata,
PytorchBaseInferenceLearner,
)
from nebullvm.optional_modules.neural_compressor import (
cfgs_to_fx_cfgs,
cfg_to_qconfig,
)
from nebullvm.optional_modules.torch import (
torch,
prepare_fx,
convert_fx,
Module,
GraphModule,
)
from nebullvm.tools.pytorch import (
save_with_torch_fx,
load_with_torch_fx,
create_model_inputs_torch,
get_torch_model_size,
)
from nebullvm.tools.transformations import MultiStageTransformation
from nebullvm.tools.utils import check_module_version
class NeuralCompressorInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized on CPU using IntelNeuralCompressor.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        model (torch.nn.Module or torch.fx.GraphModule): The original
            (non-quantized) model, kept so that it can be saved and used
            to rebuild the quantized graph at load time.
        model_quant (torch.fx.GraphModule): The quantized model.
        device (Device): Device associated with the learner.
    """

    name = "IntelNeuralCompressor"

    def __init__(
        self,
        model: Union[Module, GraphModule],
        model_quant: GraphModule,
        device: Device,
        **kwargs,
    ):
        """Store the original model, its quantized version and the device.

        Args:
            model (Module or GraphModule): Original model.
            model_quant (GraphModule): Quantized model.
            device (Device): Device associated with the learner.
            kwargs (Dict): Forwarded to the base learner constructor.
        """
        super().__init__(**kwargs)
        self.model = model
        self.model_quant = model_quant
        self.device = device

    def get_size(self):
        """Return the summed size of the quantized and original models."""
        return get_torch_model_size(self.model_quant) + get_torch_model_size(
            self.model
        )

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        The metadata is written to ``path``; the original model goes to
        ``path/model_orig`` and the quantized one to ``path/model_quant``.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)

        root = Path(path)
        save_with_torch_fx(self.model, root / "model_orig")
        # NOTE(review): this relies on ``model_quant`` exposing ``save``;
        # a learner rebuilt by ``load`` holds the object returned by
        # ``convert_fx`` -- confirm that object provides it too.
        self.model_quant.save(str(root / "model_quant"))

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        The original model is read from ``path/model_orig``; the quantized
        graph is rebuilt from it with the tuning configuration stored in
        ``path/model_quant/best_model.pt`` and then filled with the saved
        quantized weights.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners.

        Returns:
            NeuralCompressorInferenceLearner: The optimized model.
        """
        if len(kwargs) > 0:
            logger.warning(
                f"No extra keywords expected for the load method. "
                f"Got {kwargs}."
            )

        metadata = LearnerMetadata.read(path)
        input_tfms = metadata.input_tfms
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        # Built once and reused for both the example inputs and the learner.
        network_parameters = ModelParams(**metadata.network_parameters)

        root = Path(path)
        path_orig_model = root / "model_orig"
        path_quant_model = root / "model_quant" / "best_model.pt"

        model = load_with_torch_fx(path_orig_model, "state_dict.pt").eval()

        # SECURITY: torch.load unpickles the file; only load checkpoints
        # coming from a trusted source.
        state_dict = torch.load(path_quant_model)
        tune_cfg = state_dict.pop("best_configure")
        approach = tune_cfg["approach"]
        op_cfgs = cfg_to_qconfig(tune_cfg, approach)
        fx_op_cfgs = cfgs_to_fx_cfgs(op_cfgs, approach)

        # From torch 1.13.0 on, example inputs are passed to prepare_fx.
        additional_arguments = {}
        if check_module_version(torch, min_version="1.13.0"):
            additional_arguments["example_inputs"] = tuple(
                create_model_inputs_torch(
                    input_infos=network_parameters.input_infos,
                )
            )

        q_model = prepare_fx(
            model,
            fx_op_cfgs,
            **additional_arguments,
        )
        q_model = convert_fx(q_model)
        q_model.load_state_dict(state_dict)

        device = Device.from_str(metadata.device)
        return cls(
            model=model,
            model_quant=q_model,
            device=device,
            input_tfms=input_tfms,
            network_parameters=network_parameters,
        )
class PytorchNeuralCompressorInferenceLearner(
    NeuralCompressorInferenceLearner, PytorchBaseInferenceLearner
):
    """IntelNeuralCompressor-optimized model exposed with a PyTorch interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        model (torch.fx.GraphModule): Torch fx graph model.
    """

    def free_gpu_memory(self):
        """Unsupported: this learner does not run on GPU.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "NeuralCompressor does not support GPU inference."
        )

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:
        """Predict on the input tensors.

        Every input is moved to the CPU before the quantized model is
        called.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. A single tensor returned by the
                model is wrapped in a one-element tuple; any other return
                value is passed through unchanged.
        """
        cpu_inputs = [tensor.cpu() for tensor in input_tensors]
        result = self.model_quant(*cpu_inputs)
        return (result,) if isinstance(result, torch.Tensor) else result
# Registry mapping each deep learning framework to the NeuralCompressor
# learner class implementing its interface (PyTorch only).
NEURAL_COMPRESSOR_INFERENCE_LEARNERS: Dict[
DeepLearningFramework, Type[NeuralCompressorInferenceLearner]
] = {DeepLearningFramework.PYTORCH: PytorchNeuralCompressorInferenceLearner}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/onnx.py
================================================
import multiprocessing
import os
import shutil
from abc import ABC
from pathlib import Path
from typing import Union, List, Generator, Tuple, Dict, Type
import cpuinfo
import numpy as np
from loguru import logger
from nebullvm.config import (
ONNX_FILENAMES,
ONNX_PROVIDERS,
)
from nebullvm.core.models import (
QuantizationType,
Device,
DeviceType,
ModelParams,
DeepLearningFramework,
)
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
LearnerMetadata,
PytorchBaseInferenceLearner,
TensorflowBaseInferenceLearner,
NumpyBaseInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import (
tensorrt_is_available,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.onnxruntime import onnxruntime as ort
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.transformations import MultiStageTransformation
def _running_on_intel_cpu(use_gpu):
    """Tell whether inference is going to run on an Intel CPU.

    Args:
        use_gpu (bool): True when the model runs on GPU, in which case the
            CPU brand is not inspected at all.

    Returns:
        bool: True only when not running on GPU and the CPU brand string
            reported by ``cpuinfo`` contains "intel" (case-insensitive).
    """
    if use_gpu:
        return False  # running on GPU
    # The brand may be missing from the reported info; treat that as
    # "not Intel" instead of failing with a KeyError.
    brand = cpuinfo.get_cpu_info().get("brand_raw") or ""
    return "intel" in brand.lower()
def _get_ort_session_options(use_gpu) -> ort.SessionOptions:
    """Build the onnxruntime session options used by the ONNX learners.

    All graph optimizations are always enabled. When not running on GPU the
    session additionally uses the parallel execution mode with a single
    inter-op thread, and the number of intra-op threads is taken from the
    ``NEBULLVM_THREADS_PER_MODEL`` environment variable, falling back to
    the machine CPU count (never less than 1).

    Args:
        use_gpu (bool): Whether the session will run on GPU.

    Returns:
        ort.SessionOptions: The configured options.
    """
    options = ort.SessionOptions()
    options.graph_optimization_level = (
        ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    )
    if use_gpu:
        return options

    options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
    options.inter_op_num_threads = 1
    requested_threads = (
        os.environ.get("NEBULLVM_THREADS_PER_MODEL")
        or multiprocessing.cpu_count()
    )
    options.intra_op_num_threads = max(int(requested_threads), 1)
    return options
class ONNXInferenceLearner(BaseInferenceLearner, ABC):
    """Model converted to ONNX and run with Microsoft's onnxruntime.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
        device (Device): Device used to run the model.
        quantization_type (QuantizationType): Quantization applied to the
            model, or None.
    """

    name = "ONNXRuntime"

    def __init__(
        self,
        onnx_path: Union[str, Path],
        input_names: List[str],
        output_names: List[str],
        device: Device,
        quantization_type: QuantizationType,
        **kwargs,
    ):
        """Store the model files and create the onnxruntime session.

        Args:
            onnx_path (str or Path): Path to the onnx model. Its parent
                directory is handed to ``self._store_dir`` and the model
                is then referenced from the directory that call returns.
            input_names (List[str]): Names of the model inputs.
            output_names (List[str]): Names of the model outputs.
            device (Device): Device used to run the model.
            quantization_type (QuantizationType): Quantization applied to
                the model, or None.
            kwargs (Dict): Forwarded to the base learner constructor.
        """
        super().__init__(**kwargs)
        filename = Path(onnx_path).name
        dir_path = str(Path(onnx_path).parent)
        self.device = device
        on_gpu = self.device.type is DeviceType.GPU
        self.onnx_path = Path(self._store_dir(dir_path)) / filename
        self.sess_options = _get_ort_session_options(on_gpu)
        self.quantization_type = quantization_type
        if _running_on_intel_cpu(on_gpu):
            self.sess_options.add_session_config_entry(
                "session.set_denormal_as_zero", "1"
            )
        self.set_model_on_gpu()
        # set_model_on_gpu always raises the flag; it is only meaningful
        # for GPU devices.
        self._is_gpu_ready = self.device.type is DeviceType.GPU
        self.input_names = input_names
        self.output_names = output_names

    @staticmethod
    def _provider_index(providers: List, provider_name: str) -> int:
        """Return the position of ``provider_name`` in ``providers``.

        Entries may be plain provider names or ``(name, options)`` tuples.

        Returns:
            int: Index of the first matching entry, or -1 when absent.
        """
        for idx, entry in enumerate(providers):
            entry_name = entry[0] if isinstance(entry, tuple) else entry
            if entry_name == provider_name:
                return idx
        return -1

    @staticmethod
    def _setup_tensorrt(quantization_type: QuantizationType, device: Device):
        """Configure (or drop) the TensorRT entry of the CUDA providers.

        The shared ``ONNX_PROVIDERS["cuda"]`` list is updated in place.
        When TensorRT is available and ``LD_LIBRARY_PATH`` mentions
        "tensorrt", the TensorRT provider is configured for ``device``;
        otherwise a warning is logged and the provider is removed.

        Args:
            quantization_type (QuantizationType): Quantization applied to
                the model; fp16 is enabled whenever it is not None and
                int8 only for static quantization.
            device (Device): Device providing the id and the free memory
                used as TensorRT workspace size.
        """
        providers = ONNX_PROVIDERS["cuda"]
        # The entry is located by name (it can be a plain string or an
        # already configured tuple) instead of assuming a fixed position.
        trt_idx = ONNXInferenceLearner._provider_index(
            providers, "TensorrtExecutionProvider"
        )
        trt_installed = tensorrt_is_available()

        if trt_installed and "tensorrt" in os.environ.get(
            "LD_LIBRARY_PATH", ""
        ):
            trt_entry = (
                "TensorrtExecutionProvider",
                {
                    "device_id": device.idx,
                    "trt_max_workspace_size": device.get_free_memory(),
                    "trt_fp16_enable": quantization_type is not None,
                    "trt_int8_enable": (
                        quantization_type is QuantizationType.STATIC
                    ),
                },
            )
            if trt_idx >= 0:
                providers[trt_idx] = trt_entry
            else:
                # Dropped by an earlier call: put it back in first position.
                providers.insert(0, trt_entry)
            return

        if trt_installed:
            logger.warning(
                "TensorrtExecutionProvider for onnx is not "
                "available. If you want to use it, please  "
                "add the path to tensorrt to the "
                "LD_LIBRARY_PATH environment variable. "
                "CUDA provider will be used instead. "
            )
        else:
            logger.warning(
                "TensorRT is not available. "
                "If you want to use it, please install it and "
                "add the path to the LD_LIBRARY_PATH "
                "environment variable. "
                "CUDA provider will be used instead. "
            )
        if trt_idx >= 0:
            del providers[trt_idx]

    def get_size(self):
        """Return the total size in bytes of the files stored next to the
        onnx model (sub-directories are ignored)."""
        model_dir = self.onnx_path.parents[0]
        return sum(
            os.path.getsize(model_dir / f)
            for f in os.listdir(model_dir)
            if os.path.isfile(model_dir / f)
        )

    def free_gpu_memory(self):
        """Drop the onnxruntime session and mark the model as not ready."""
        del self._session
        self._is_gpu_ready = False

    def set_model_on_gpu(self):
        """Create the onnxruntime session for the learner device.

        Despite its name this also builds the session for CPU devices, in
        which case the CPU providers are used.
        """
        if self.device.type is DeviceType.GPU:
            # Located by name so that the device id keeps being refreshed
            # after the TensorRT entry has been removed from the list.
            cuda_idx = self._provider_index(
                ONNX_PROVIDERS["cuda"], "CUDAExecutionProvider"
            )
            if cuda_idx >= 0:
                ONNX_PROVIDERS["cuda"][cuda_idx] = (
                    "CUDAExecutionProvider",
                    {
                        "device_id": self.device.idx,
                    },
                )

        self._setup_tensorrt(self.quantization_type, self.device)

        ort_session = ort.InferenceSession(
            str(self.onnx_path),
            sess_options=self.sess_options,
            providers=ONNX_PROVIDERS["cuda"]
            if self.device.type is DeviceType.GPU
            else ONNX_PROVIDERS["cpu"],
        )
        self._session = ort_session
        self._is_gpu_ready = True

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        metadata = LearnerMetadata.from_model(
            self,
            input_names=self.input_names,
            output_names=self.output_names,
            **kwargs,
        )

        path = Path(path)
        path.mkdir(exist_ok=True)

        metadata.save(path)

        saved_model = os.path.join(str(path), ONNX_FILENAMES["model_name"])
        shutil.copy(self.onnx_path, saved_model)

        try:
            # Tries to load the model
            onnx.load(saved_model)
        except FileNotFoundError:
            # If missing files, it means it's saved in onnx external_data
            # format: copy everything but the onnx file itself.
            src_dir = str(Path(self.onnx_path).parent)
            for fname in os.listdir(src_dir):
                if ".onnx" not in fname:
                    shutil.copy2(
                        os.path.join(src_dir, fname), os.path.join(path, fname)
                    )

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for consistency
                with other Learners.

        Returns:
            ONNXInferenceLearner: The optimized model.
        """
        if len(kwargs) > 0:
            logger.warning(
                f"No extra keywords expected for the load method. "
                f"Got {kwargs}."
            )
        path = Path(path)
        onnx_path = path / ONNX_FILENAMES["model_name"]
        metadata = LearnerMetadata.read(path)
        input_tfms = metadata.input_tfms
        device = Device.from_str(metadata.device)
        # Metadata without the attribute is treated as "no quantization".
        quantization_type = (
            QuantizationType(metadata.quantization_type)
            if hasattr(metadata, "quantization_type")
            else None
        )
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )

        return cls(
            input_tfms=input_tfms,
            network_parameters=ModelParams(**metadata.network_parameters),
            onnx_path=onnx_path,
            input_names=metadata["input_names"],
            output_names=metadata["output_names"],
            device=device,
            quantization_type=quantization_type,
        )

    def _predict_arrays(self, input_arrays: Generator[np.ndarray, None, None]):
        """Run the session on the arrays, paired by position with
        ``self.input_names``, and return the requested outputs."""
        input_dict = dict(zip(self.input_names, input_arrays))
        outputs = self._session.run(self.output_names, input_dict)
        return outputs
class PytorchONNXInferenceLearner(
    ONNXInferenceLearner, PytorchBaseInferenceLearner
):
    """onnxruntime-backed model exposed through a PyTorch interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor]:
        """Predict on the input tensors.

        Inputs are converted to numpy arrays on the CPU, fed to the
        session, and the outputs are converted back to torch tensors placed
        on the learner device. On GPU devices the session is re-created
        first if it has been released.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors, i.e. the (possibly multiple)
                outputs of the model for the given (multi-)tensor input;
                they do not map 1 to 1 onto the inputs.
        """
        session_released = not self._is_gpu_ready
        if self.device.type is DeviceType.GPU and session_released:
            self.set_model_on_gpu()

        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        raw_outputs = self._predict_arrays(numpy_inputs)
        return tuple(
            torch.from_numpy(array).to(self.device.to_torch_format())
            for array in raw_outputs
        )
class TensorflowONNXInferenceLearner(
    ONNXInferenceLearner, TensorflowBaseInferenceLearner
):
    """onnxruntime-backed model exposed through a TensorFlow interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Inputs that are already numpy arrays are used as they are; every
        other input is converted through its ``numpy()`` method. On GPU
        devices the session is re-created first if it has been released.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors, i.e. the (possibly multiple)
                outputs of the model for the given (multi-)tensor input;
                they do not map 1 to 1 onto the inputs.
        """
        session_released = not self._is_gpu_ready
        if self.device.type is DeviceType.GPU and session_released:
            self.set_model_on_gpu()

        numpy_inputs = (
            tensor if isinstance(tensor, np.ndarray) else tensor.numpy()
            for tensor in input_tensors
        )
        raw_outputs = self._predict_arrays(numpy_inputs)
        # noinspection PyTypeChecker
        return tuple(tf.convert_to_tensor(array) for array in raw_outputs)
class NumpyONNXInferenceLearner(
    ONNXInferenceLearner, NumpyBaseInferenceLearner
):
    """onnxruntime-backed model exposed through a numpy interface.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        onnx_path (str or Path): Path to the onnx model.
        input_names (List[str]): Input names used when the onnx model
            was produced.
        output_names (List[str]): Output names used when the onnx model
            was produced.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Predict on the input arrays.

        The arrays are fed to the session unchanged. On GPU devices the
        session is re-created first if it has been released.

        Args:
            input_tensors (Tuple[np.ndarray, ...]): Input tensors belonging to
                the same batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[np.ndarray, ...]: Output arrays, i.e. the (possibly
                multiple) outputs of the model for the given (multi-)array
                input; they do not map 1 to 1 onto the inputs.
        """
        session_released = not self._is_gpu_ready
        if self.device.type is DeviceType.GPU and session_released:
            self.set_model_on_gpu()

        raw_outputs = self._predict_arrays(iter(input_tensors))
        return tuple(raw_outputs)
# Registry mapping each deep learning framework to the onnxruntime learner
# class exposing the matching tensor interface.
ONNX_INFERENCE_LEARNERS: Dict[
DeepLearningFramework, Type[ONNXInferenceLearner]
] = {
DeepLearningFramework.PYTORCH: PytorchONNXInferenceLearner,
DeepLearningFramework.TENSORFLOW: TensorflowONNXInferenceLearner,
DeepLearningFramework.NUMPY: NumpyONNXInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/openvino.py
================================================
import json
import shutil
from abc import ABC
from pathlib import Path
from typing import Dict, Union, Type, Generator, Tuple, List, Optional
import numpy as np
from loguru import logger
from nebullvm.config import OPENVINO_FILENAMES
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
LearnerMetadata,
PytorchBaseInferenceLearner,
TensorflowBaseInferenceLearner,
NumpyBaseInferenceLearner,
)
from nebullvm.optional_modules.openvino import (
Core,
Model,
CompiledModel,
InferRequest,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class OpenVinoInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized using OpenVINO.

    The class cannot be directly instantiated, but implements all the core
    methods needed for using OpenVINO at inference time.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        compiled_model (CompiledModel): The OpenVINO compiled model.
        infer_request (InferRequest): Request object created from the
            compiled model and used to run the inference.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
        device (Device): Device associated with the learner.
    """

    MODEL_NAME = "model.bin"
    name = "OpenVINO"

    def __init__(
        self,
        compiled_model: CompiledModel,
        infer_request: InferRequest,
        input_keys: List,
        output_keys: List,
        description_file: str,
        weights_file: str,
        device: Device,
        **kwargs,
    ):
        """Store the compiled model, its infer request and the model files.

        Args:
            compiled_model (CompiledModel): The OpenVINO compiled model.
            infer_request (InferRequest): Request used for the inference.
            input_keys (List): Keys associated to the inputs.
            output_keys (List): Keys associated to the outputs.
            description_file (str): Model description file; it is handed to
                ``self._store_file`` and the returned value is kept.
            weights_file (str): Model weights file; handled like the
                description file.
            device (Device): Device associated with the learner.
            kwargs (Dict): Forwarded to the base learner constructor.
        """
        super().__init__(**kwargs)
        self.compiled_model = compiled_model
        self.infer_request = infer_request
        self.input_keys = input_keys
        self.output_keys = output_keys
        self.device = device
        self.description_file = self._store_file(description_file)
        self.weights_file = self._store_file(weights_file)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Dictionary of additional arguments for the
                `from_model_name` class method. They override the values
                read from the metadata file.

        Returns:
            OpenVinoInferenceLearner: The optimized model.
        """
        path = Path(path)
        with open(path / OPENVINO_FILENAMES["metadata"], "r") as fin:
            metadata = json.load(fin)
        metadata.update(kwargs)
        metadata["network_parameters"] = ModelParams(
            **metadata["network_parameters"]
        )
        input_tfms = metadata.get("input_tfms")
        if input_tfms is not None:
            metadata["input_tfms"] = MultiStageTransformation.from_dict(
                input_tfms
            )
        model_name = str(path / OPENVINO_FILENAMES["description_file"])
        model_weights = str(path / OPENVINO_FILENAMES["weights"])
        metadata["device"] = Device.from_str(metadata["device"])
        return cls.from_model_name(
            model_name=model_name, model_weights=model_weights, **metadata
        )

    def get_size(self):
        """Return the length of the exported compiled model."""
        return len(self.compiled_model.export_model())

    def free_gpu_memory(self):
        """Unsupported: this learner does not run on GPU.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("OpenVino does not support GPU inference.")

    @classmethod
    def from_model_name(
        cls,
        network_parameters: ModelParams,
        model_name: str,
        model_weights: str,
        device: Device,
        input_tfms: MultiStageTransformation = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Build the optimized model from the network description and its
        weights.

        The model is always compiled for the "CPU" OpenVINO device.

        Args:
            network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
            model_name (str): File containing a description of the optimized
                model.
            model_weights (str): File containing the model weights.
            device (Device): Device used to run the model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            input_data (DataManager, optional): User defined data.
            kwargs (Dict): Unexpected extra parameters; they are only
                logged with a warning.

        Returns:
            OpenVinoInferenceLearner: The optimized model.
        """
        if len(kwargs) > 0:
            logger.warning(f"Found extra parameters: {kwargs}")

        core = Core()
        model = core.read_model(model=model_name, weights=model_weights)

        dynamic_shape = cls._get_dynamic_shape(model, network_parameters)

        if dynamic_shape is not None:
            model.reshape(dynamic_shape)

        compiled_model = core.compile_model(model=model, device_name="CPU")
        infer_request = compiled_model.create_infer_request()

        input_keys = [port.get_any_name() for port in compiled_model.inputs]
        output_keys = [port.get_any_name() for port in compiled_model.outputs]

        return cls(
            compiled_model,
            infer_request,
            input_keys,
            output_keys,
            input_tfms=input_tfms,
            network_parameters=network_parameters,
            description_file=model_name,
            weights_file=model_weights,
            input_data=input_data,
            device=device,
        )

    @staticmethod
    def _get_dynamic_shape(
        model: Model, network_parameters: ModelParams
    ) -> Optional[Dict[str, Tuple[int]]]:
        """Compute the shapes to use when the model has dynamic axes.

        Args:
            model (Model): The OpenVINO model; the first name of each of
                its inputs is used as key of the result.
            network_parameters (ModelParams): Model parameters providing
                the static input sizes and the dynamic-axis information.

        Returns:
            Dict[str, Tuple[int]] or None: For each input name, its shape
                with every dynamic axis replaced by -1. None when no
                dynamic info is defined.

        Raises:
            AssertionError: If the number of model inputs differs from the
                number of inputs described in the dynamic info.
        """
        dynamic_info = network_parameters.dynamic_info
        if dynamic_info is None:
            return None

        input_names = [
            list(model_input.names)[0] for model_input in model.inputs
        ]
        input_shapes = [
            input_info.size for input_info in network_parameters.input_infos
        ]
        # The two counts used to be reported swapped in this message.
        assert len(input_shapes) == len(dynamic_info.inputs), (
            f"Number of inputs defined in dynamic info "
            f"({len(dynamic_info.inputs)}) is different from the one "
            f"expected from the model "
            f"({len(input_shapes)})."
        )

        dynamic_shapes = []
        for input_shape, dynamic_axes in zip(
            input_shapes, dynamic_info.inputs
        ):
            shape = list(input_shape)
            # Keys of the dynamic info are the indexes of the dynamic axes.
            for axis in dynamic_axes.keys():
                shape[int(axis)] = -1
            dynamic_shapes.append(tuple(shape))

        return dict(zip(input_names, dynamic_shapes))

    def _get_metadata(self, **kwargs) -> LearnerMetadata:
        """Build the learner metadata, adding ``kwargs`` as extra entries."""
        metadata = dict(kwargs)
        return LearnerMetadata.from_model(self, **metadata)

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        The metadata, the description file and the weights file are written
        into ``path``.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = self._get_metadata(**kwargs)

        metadata.save(path)

        shutil.copy(
            self.description_file,
            path / OPENVINO_FILENAMES["description_file"],
        )
        shutil.copy(self.weights_file, path / OPENVINO_FILENAMES["weights"])

    def _predict_array(
        self,
        input_arrays: Generator[np.ndarray, None, None],
    ) -> Generator[np.ndarray, None, None]:
        """Run the inference on the given arrays.

        Args:
            input_arrays (Generator[np.ndarray]): Input arrays, paired by
                position with ``self.input_keys``.

        Returns:
            Generator[np.ndarray]: The outputs, yielded in the order of
                ``self.output_keys``.
        """
        feed = dict(zip(self.input_keys, input_arrays))
        raw_results = self.infer_request.infer(inputs=feed)
        # Results are keyed by output objects: index them by name instead.
        named_results = {
            output_port.get_any_name(): output_arr
            for output_port, output_arr in raw_results.items()
        }
        return (named_results[output_key] for output_key in self.output_keys)
class PytorchOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, PytorchBaseInferenceLearner
):
    """Model optimized using OpenVINO with a Pytorch interface.

    This class can be used exactly in the same way as a pytorch Module object.
    At prediction time it takes as input pytorch tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        compiled_model (CompiledModel): The OpenVINO compiled model.
        infer_request (InferRequest): Request used to run the inference.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Inputs are converted to numpy arrays on the CPU and the outputs are
        wrapped back into torch tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors, i.e. the (possibly multiple)
                outputs of the model for the given (multi-)tensor input;
                they do not map 1 to 1 onto the inputs.
        """
        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        numpy_outputs = self._predict_array(numpy_inputs)
        return tuple(torch.from_numpy(array) for array in numpy_outputs)
class TensorflowOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, TensorflowBaseInferenceLearner
):
    """Model optimized using OpenVINO with a tensorflow interface.

    This class can be used exactly in the same way as a tf.Module or
    keras.Model object.
    At prediction time it takes as input tensorflow tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        compiled_model (CompiledModel): The OpenVINO compiled model.
        infer_request (InferRequest): Request used to run the inference.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Inputs are converted through their ``numpy()`` method and the
        outputs are converted back to tensorflow tensors.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors, i.e. the (possibly multiple)
                outputs of the model for the given (multi-)tensor input;
                they do not map 1 to 1 onto the inputs.
        """
        numpy_inputs = (tensor.numpy() for tensor in input_tensors)
        numpy_outputs = self._predict_array(numpy_inputs)
        # noinspection PyTypeChecker
        return tuple(tf.convert_to_tensor(array) for array in numpy_outputs)
class NumpyOpenVinoInferenceLearner(
    OpenVinoInferenceLearner, NumpyBaseInferenceLearner
):
    """Model optimized using OpenVINO with a numpy interface.

    This class can be used exactly in the same way as a sklearn or
    numpy-based model.
    At prediction time it takes as input numpy arrays given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        compiled_model (CompiledModel): The OpenVINO compiled model.
        infer_request (InferRequest): Request used to run the inference.
        input_keys (List): Keys associated to the inputs.
        output_keys (List): Keys associated to the outputs.
        description_file (str): File containing a description of the optimized
            model.
        weights_file (str): File containing the model weights.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Predict on the input arrays.

        The arrays are passed to the inference unchanged.

        Args:
            input_tensors (Tuple[np.ndarray]): Input tensors belonging to
                the same batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[np.ndarray]: Output arrays, i.e. the (possibly multiple)
                outputs of the model for the given (multi-)array input;
                they do not map 1 to 1 onto the inputs.
        """
        numpy_outputs = self._predict_array(iter(input_tensors))
        return tuple(numpy_outputs)
# Registry mapping each deep learning framework to the OpenVINO learner
# class exposing the matching tensor interface.
OPENVINO_INFERENCE_LEARNERS: Dict[
DeepLearningFramework, Type[OpenVinoInferenceLearner]
] = {
DeepLearningFramework.PYTORCH: PytorchOpenVinoInferenceLearner,
DeepLearningFramework.TENSORFLOW: TensorflowOpenVinoInferenceLearner,
DeepLearningFramework.NUMPY: NumpyOpenVinoInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tensor_rt.py
================================================
import json
import os
from abc import ABC
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Union, Dict, Type, List, Tuple, Generator, Optional
import numpy as np
from loguru import logger
from nebullvm.config import NVIDIA_FILENAMES
from nebullvm.core.models import (
Device,
DeviceType,
ModelParams,
DeepLearningFramework,
)
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
LearnerMetadata,
PytorchBaseInferenceLearner,
TensorflowBaseInferenceLearner,
NumpyBaseInferenceLearner,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.tensor_rt import tensorrt as trt, polygraphy
from nebullvm.optional_modules.torch import torch, ScriptModule
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import (
MultiStageTransformation,
VerifyContiguity,
)
class ONNXTensorRTInferenceLearner(BaseInferenceLearner, ABC):
"""Model optimized using TensorRT.
The class cannot be directly instantiated, but implements all the core
methods needed for using TensorRT at inference time.
Attributes:
network_parameters (ModelParams): The model parameters as batch
size, input and output sizes.
engine (any): The tensorRT engine.
input_names (List[str]): Names associated to the model input tensors.
output_names (List[str]): Names associated to the model output tensors.
cuda_stream (any, optional): Stream used for communication with Nvidia
GPUs.
nvidia_logger (any, optional): Logger used by the Nvidia service
"""
name = "TensorRT"
def __init__(
    self,
    engine: Any,
    input_names: List[str],
    output_names: List[str],
    device: Device,
    cuda_stream: Any = None,
    nvidia_logger: Any = None,
    **kwargs,
):
    """Store the engine, create its execution context and set up CUDA.

    Args:
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input
            tensors.
        output_names (List[str]): Names associated to the model output
            tensors.
        device (Device): Device where the model is run; it must be a GPU,
            otherwise the environment check raises.
        cuda_stream (any, optional): Stream used for communication with
            Nvidia GPUs. A default one is requested when None.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
            A WARNING-level one is created when None.
        kwargs (Dict): Forwarded to the base learner constructor.
    """
    super().__init__(**kwargs)

    # Engine and the execution context derived from it.
    self.engine = engine
    self.context = self.engine.create_execution_context()

    # Tensor names and runtime handles.
    self.input_names = input_names
    self.output_names = output_names
    self.cuda_stream = cuda_stream
    self.nvidia_logger = nvidia_logger
    self.output_tensors = None
    self.device = device

    # Validates the device and fills in the missing logger / stream.
    running_on_gpu = device.type is DeviceType.GPU
    self._set_cuda_env(running_on_gpu)
def _get_metadata(self, **kwargs) -> LearnerMetadata:
    """Build the learner metadata.

    The metadata always carries the learner's ``input_names`` and
    ``output_names``; entries given in ``kwargs`` are added on top and
    override them on name clashes.
    """
    instance_attrs = self.__dict__
    extra_fields = {
        "input_names": instance_attrs["input_names"],
        "output_names": instance_attrs["output_names"],
    }
    extra_fields.update(kwargs)
    return LearnerMetadata.from_model(self, **extra_fields)
def _synchronize_stream(self):
    """Hook left unimplemented in this class.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError
@property
def stream_ptr(self):
    """Property left unimplemented in this class.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError
@staticmethod
def _get_default_cuda_stream() -> Any:
    """Hook left unimplemented in this class.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError
@staticmethod
def check_env(use_gpu):
    """Ensure that a GPU is being used.

    Args:
        use_gpu (bool): Whether the learner runs on a GPU.

    Raises:
        SystemError: If ``use_gpu`` is falsy.
    """
    if use_gpu:
        return
    raise SystemError(
        "You are trying to run an optimizer developed for NVidia gpus "
        "on a machine not connected to any GPU supporting CUDA."
    )
def _set_cuda_env(self, use_gpu):
    """Validate the environment and fill in missing CUDA handles.

    After the environment check, a WARNING-level TensorRT logger is created
    when ``self.nvidia_logger`` is None and the default CUDA stream is
    requested when ``self.cuda_stream`` is None.

    Args:
        use_gpu (bool): Whether the learner runs on a GPU.
    """
    self.check_env(use_gpu)

    logger_missing = self.nvidia_logger is None
    if logger_missing:
        self.nvidia_logger = trt.Logger(trt.Logger.WARNING)

    stream_missing = self.cuda_stream is None
    if stream_missing:
        self.cuda_stream = self._get_default_cuda_stream()
@classmethod
def from_engine_path(
    cls,
    network_parameters: ModelParams,
    engine_path: Union[str, Path],
    input_names: List[str],
    output_names: List[str],
    device: Device,
    nvidia_logger: Any = None,
    cuda_stream: Any = None,
    input_tfms: MultiStageTransformation = None,
    input_data: DataManager = None,
    **kwargs,
):
    """Build the model from the serialised engine.

    Args:
        network_parameters (ModelParams): Model parameters.
        engine_path (str or Path): Path to the serialised engine. The
            serialised engine is the serialised version of the engine
            used for accelerating the inference.
        input_names (List[str]): Names associated to the model input
            tensors.
        output_names (List[str]): Names associated to the model output
            tensors.
        device: (Device): Device where the model will be run.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
            A WARNING-level one is created when None.
        cuda_stream (any, optional): Stream used for communication with
            Nvidia GPUs.
        input_tfms (MultiStageTransformation, optional): Transformations
            to be performed to the model's input tensors in order to
            get the prediction. A contiguity check is appended to them.
        input_data (DataManager, optional): User defined data.
        kwargs (Dict): Unexpected extra keywords; they are only logged.

    Returns:
        ONNXTensorRTInferenceLearner: The optimized model.
    """
    if kwargs:
        logger.warning(
            f"Debug: Got extra keywords in "
            f"NvidiaInferenceLearner::from_engine_path: {kwargs}"
        )

    nvidia_logger = (
        trt.Logger(trt.Logger.WARNING)
        if nvidia_logger is None
        else nvidia_logger
    )

    if input_tfms is None:
        input_tfms = MultiStageTransformation([])
    # NOTE(review): a transformation object supplied by the caller is
    # extended in place here -- confirm callers expect that.
    input_tfms.append(VerifyContiguity())

    runtime = trt.Runtime(nvidia_logger)
    with open(engine_path, "rb") as engine_file:
        engine_bytes = engine_file.read()
    engine = runtime.deserialize_cuda_engine(engine_bytes)

    learner_kwargs = dict(
        input_tfms=input_tfms,
        network_parameters=network_parameters,
        engine=engine,
        input_names=input_names,
        output_names=output_names,
        nvidia_logger=nvidia_logger,
        cuda_stream=cuda_stream,
        input_data=input_data,
        device=device,
    )
    return cls(**learner_kwargs)
def _predict_tensors(
self,
input_ptrs: Generator[Any, None, None],
output_ptrs: Generator[Any, None, None],
input_shapes: Generator[Any, None, None] = None,
):
buffers = [None] * (len(self.input_names) + len(self.output_names))
input_idxs = (
self.engine[input_name] for input_name in self.input_names
)
output_idxs = (
self.engine[output_name] for output_name in self.output_names
)
input_shapes = input_shapes or [None] * len(self.input_names)
for input_idx, input_ptr, input_shape in zip(
input_idxs, input_ptrs, input_shapes
):
buffers[input_idx] = input_ptr
if input_shape is not None:
# If the input shape is empty, we set it to (1,) because
# TensorRT doesn't accept empty shapes.
if input_shape == torch.Size([]):
input_shape = torch.Size((1,))
self.context.set_binding_shape(input_idx, input_shape)
for output_idx, output_ptr in zip(output_idxs, output_ptrs):
buffers[output_idx] = output_ptr
self.context.execute_async_v2(buffers, self.stream_ptr)
self._synchronize_stream()
def get_size(self):
return self.engine.serialize().nbytes
def free_gpu_memory(self):
# ONNXtensorrt doesn't need to release gpu memory
pass
def save(self, path: Union[str, Path], **kwargs):
"""Save the model.
Args:
path (Path or str): Path to the directory where the model will
be stored.
kwargs (Dict): Dictionary of key-value pairs that will be saved in
the model metadata file.
"""
path = Path(path)
path.mkdir(exist_ok=True)
serialized_engine = self.engine.serialize()
with open(path / NVIDIA_FILENAMES["engine"], "wb") as fout:
fout.write(serialized_engine)
metadata = self._get_metadata(**kwargs)
with open(path / NVIDIA_FILENAMES["metadata"], "w") as fout:
json.dump(metadata.to_dict(), fout)
@classmethod
def load(cls, path: Union[Path, str], **kwargs):
"""Load the model.
Args:
path (Path or str): Path to the directory where the model is
stored.
kwargs (Dict): Dictionary of additional arguments for the
`from_engine_path` class method.
Returns:
ONNXTensorRTInferenceLearner: The optimized model.
"""
path = Path(path)
with open(path / NVIDIA_FILENAMES["metadata"], "r") as fin:
metadata = json.load(fin)
metadata.update(kwargs)
metadata["network_parameters"] = ModelParams(
**metadata["network_parameters"]
)
input_tfms = metadata.get("input_tfms")
if input_tfms is not None:
metadata["input_tfms"] = MultiStageTransformation.from_dict(
input_tfms
)
metadata["device"] = Device(DeviceType.GPU)
return cls.from_engine_path(
engine_path=path / NVIDIA_FILENAMES["engine"],
**metadata,
)
class PytorchTensorRTInferenceLearner(PytorchBaseInferenceLearner):
    """Learner wrapping a TorchScript module for the PyTorch-TensorRT path.

    The module is stored with ``torch.jit.save`` under ``MODEL_NAME`` and is
    always reloaded on the GPU.
    """

    MODEL_NAME = "model_optimized.pt"
    name = "TensorRT"

    def __init__(
        self,
        torch_model: ScriptModule,
        device: Device,
        **kwargs,
    ):
        """Put the model in eval mode and move it to the GPU if requested."""
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        on_gpu = device.type is DeviceType.GPU
        if on_gpu:
            self.model.to(device.to_torch_format())
        self.use_gpu = on_gpu
        self.device = device
        self._is_gpu_ready = on_gpu

    def get_size(self):
        """Return the on-disk size in bytes of the saved learner.

        The learner is saved into a temporary directory and the sizes of the
        regular files found directly inside it are summed.
        """
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            total_bytes = 0
            for entry in os.listdir(Path(tmp_dir)):
                candidate = Path(tmp_dir) / entry
                if os.path.isfile(candidate):
                    total_bytes += os.path.getsize(candidate)
            return total_bytes

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Run the model on the given tensors.

        Inputs are moved to the learner device; int64 inputs are first cast
        to int32. The result is always returned as a tuple of tensors placed
        on the learner device.
        """
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()

        def _prepare(tensor):
            # PyTorch-TensorRT does not support int64
            if tensor.dtype == torch.int64:
                tensor = tensor.to(torch.int32)
            return tensor.to(self.device.to_torch_format())

        with torch.no_grad():
            res = self.model(*map(_prepare, input_tensors))

        if isinstance(res, tuple):
            return tuple(
                out.to(self.device.to_torch_format()) for out in res
            )
        return (res.to(self.device.to_torch_format()),)

    def save(self, path: Union[str, Path], **kwargs):
        """Save metadata and the scripted model into ``path``.

        Args:
            path (Path or str): Destination directory, created when missing.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        target_dir = Path(path)
        target_dir.mkdir(exist_ok=True)
        LearnerMetadata.from_model(self, **kwargs).save(target_dir)
        torch.jit.save(self.model, target_dir / self.MODEL_NAME)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner stored with ``save``; the device is always the GPU.

        Args:
            path (Path or str): Directory containing the saved learner.
            kwargs (Dict): Accepted for interface compatibility; unused.
        """
        source_dir = Path(path)
        model = torch.jit.load(source_dir / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(source_dir)
        device = Device(DeviceType.GPU)

        if metadata.input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        else:
            input_tfms = None

        return cls(
            torch_model=model,
            network_parameters=ModelParams(**metadata.network_parameters),
            input_tfms=input_tfms,
            device=device,
        )
class PytorchONNXTensorRTInferenceLearner(
    ONNXTensorRTInferenceLearner, PytorchBaseInferenceLearner
):
    """Model optimized using TensorRT with a Pytorch interface.

    This class can be used exactly in the same way as a pytorch Module object.
    At prediction time it takes as input pytorch tensors given as positional
    arguments. The CUDA stream is a ``torch.cuda`` stream.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
    """

    def _synchronize_stream(self):
        """Block until the work queued on the torch stream is done."""
        self.cuda_stream.synchronize()

    @staticmethod
    def _get_default_cuda_stream() -> Any:
        """Return the default torch CUDA stream."""
        return torch.cuda.default_stream()

    @property
    def stream_ptr(self):
        """Raw CUDA stream handle exposed by the torch stream."""
        return self.cuda_stream.cuda_stream

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        With static shapes the output buffers are allocated on the first
        call and reused afterwards; with dynamic shapes they are
        re-allocated on every call from the current input sizes.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors does
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        input_tensors = [
            tensor.to(self.device.to_torch_format())
            for tensor in input_tensors
        ]
        params = self.network_parameters

        if params.dynamic_info is None:
            input_sizes = None
            if self.output_tensors is None:
                allocated = []
                for size, out_type in zip(
                    params.output_sizes, params.output_types
                ):
                    buffer = torch.Tensor(*size)
                    buffer = buffer.to(self.device.to_torch_format())
                    allocated.append(buffer.to(out_type.to_torch_format()))
                self.output_tensors = allocated
        else:
            dynamic_info = params.dynamic_info
            input_sizes = [tensor.size() for tensor in input_tensors]
            allocated = []
            for out_idx, (size, out_type, dynamic_axis) in enumerate(
                zip(
                    params.output_sizes,
                    params.output_types,
                    dynamic_info.outputs,
                )
            ):
                dims = []
                for axis, dim in enumerate(size):
                    # Only axes declared dynamic are recomputed from the
                    # current input sizes; the others keep the stored value.
                    if axis in dynamic_axis.keys():
                        dim = dynamic_info.retrieve_output_dim(
                            input_sizes, out_idx, axis, dim
                        )
                    dims.append(dim)
                buffer = torch.Tensor(*dims)
                buffer = buffer.to(self.device.to_torch_format())
                allocated.append(buffer.to(out_type.to_torch_format()))
            self.output_tensors = allocated

        input_ptrs = (tensor.data_ptr() for tensor in input_tensors)
        output_ptrs = (buffer.data_ptr() for buffer in self.output_tensors)
        self._predict_tensors(input_ptrs, output_ptrs, input_sizes)

        return tuple(
            buffer.to(self.device.to_torch_format())
            for buffer in self.output_tensors
        )
class BaseArrayONNXTensorRTInferenceLearner(ONNXTensorRTInferenceLearner, ABC):
    """Base Model that can be used for all array-based
    NvidiaInferenceLearners.

    Device memory is handled through ``polygraphy.cuda`` arrays and the CUDA
    stream is a ``polygraphy.cuda.Stream``.
    """

    def _synchronize_stream(self):
        """Block until the work queued on the polygraphy stream is done."""
        self.cuda_stream.synchronize()

    @staticmethod
    def _get_default_cuda_stream() -> Any:
        """Create a new polygraphy CUDA stream."""
        return polygraphy.cuda.Stream()

    @property
    def stream_ptr(self):
        """Raw CUDA stream handle exposed by the polygraphy stream."""
        return self.cuda_stream.ptr

    @staticmethod
    def _convert_to_array_and_free_memory(cuda_array) -> np.ndarray:
        """Copy a device array to a numpy array, then free the device one."""
        array = cuda_array.numpy()
        cuda_array.free()
        return array

    def _predict_array(
        self,
        cuda_input_arrays: List,
        input_shapes: Optional[List[Tuple[int, ...]]],
    ) -> Generator[np.ndarray, None, None]:
        """Run the engine on device arrays and return the outputs lazily.

        The input arrays are always freed before returning, also when the
        allocation of the outputs or the inference itself raises. In that
        failure case the output arrays allocated so far are freed as well,
        so that no device memory is leaked.

        Args:
            cuda_input_arrays (List): Device arrays holding the inputs.
            input_shapes (List[Tuple[int, ...]], optional): Shapes of the
                inputs; used to size the outputs when the model has dynamic
                axes and forwarded to ``_predict_tensors``.

        Returns:
            Generator[np.ndarray]: Lazily yields one numpy array per output.
            Each device output is freed when its array is produced, so the
            generator must be consumed to release the device memory.
        """
        params = self.network_parameters
        dynamic_info = params.dynamic_info
        cuda_output_arrays = []
        try:
            if dynamic_info is None:
                for output_size, output_type in zip(
                    params.output_sizes, params.output_types
                ):
                    cuda_output_arrays.append(
                        polygraphy.cuda.DeviceArray(
                            shape=output_size,
                            dtype=output_type.to_numpy_format(),
                        )
                    )
            else:
                for j, (output_size, output_type, dyn_out_axis) in enumerate(
                    zip(
                        params.output_sizes,
                        params.output_types,
                        dynamic_info.outputs,
                    )
                ):
                    # Axes declared dynamic are resolved from the current
                    # input shapes; the others keep the stored size.
                    shape = tuple(
                        x
                        if i not in dyn_out_axis.keys()
                        else dynamic_info.retrieve_output_dim(
                            input_shapes, j, i, x
                        )
                        for i, x in enumerate(output_size)
                    )
                    cuda_output_arrays.append(
                        polygraphy.cuda.DeviceArray(
                            shape=shape,
                            dtype=output_type.to_numpy_format(),
                        )
                    )
            input_ptrs = (cuda_array.ptr for cuda_array in cuda_input_arrays)
            output_ptrs = (
                cuda_array.ptr for cuda_array in cuda_output_arrays
            )
            self._predict_tensors(input_ptrs, output_ptrs, input_shapes)
        except Exception:
            # Nobody will consume the outputs: release them right away.
            for cuda_output_array in cuda_output_arrays:
                cuda_output_array.free()
            raise
        finally:
            for cuda_input_array in cuda_input_arrays:
                cuda_input_array.free()
        return (
            self._convert_to_array_and_free_memory(array)
            for array in cuda_output_arrays
        )
class TensorflowONNXTensorRTInferenceLearner(
    BaseArrayONNXTensorRTInferenceLearner, TensorflowBaseInferenceLearner
):
    """Model optimized using TensorRT with a tensorflow interface.

    This class can be used exactly in the same way as a tf.Module or
    keras.Model object.
    At prediction time it takes as input tensorflow tensors given as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors does
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        # Convert every tensor to numpy only once. The host arrays stay
        # referenced until the end of the method, i.e. until after the
        # copies issued on the stream have been consumed by the inference.
        host_arrays = [input_tensor.numpy() for input_tensor in input_tensors]
        cuda_input_arrays = [
            polygraphy.cuda.DeviceArray(
                shape=tuple(input_tensor.shape),
                dtype=host_array.dtype,
            ).copy_from(host_array, stream=self.cuda_stream)
            for input_tensor, host_array in zip(input_tensors, host_arrays)
        ]
        # Shapes are only needed when the model has dynamic axes.
        input_shapes = (
            [tuple(input_tensor.shape) for input_tensor in input_tensors]
            if self.network_parameters.dynamic_info is not None
            else None
        )
        out_arrays = self._predict_array(cuda_input_arrays, input_shapes)
        return tuple(tf.convert_to_tensor(array) for array in out_arrays)
class NumpyONNXTensorRTInferenceLearner(
    BaseArrayONNXTensorRTInferenceLearner, NumpyBaseInferenceLearner
):
    """Model optimized using TensorRT with a numpy interface.

    At prediction time it takes as input numpy arrays given as positional
    arguments and returns numpy arrays.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        engine (any): The tensorRT engine.
        input_names (List[str]): Names associated to the model input tensors.
        output_names (List[str]): Names associated to the model output tensors.
        cuda_stream (any, optional): Stream used for communication with Nvidia
            GPUs.
        nvidia_logger (any, optional): Logger used by the Nvidia service.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Predict on the input arrays.

        Note that the input arrays must be on the same batch. If a sequence
        of arrays is given when the model is expecting a single input
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[np.ndarray]): Input tensors belonging to
                the same batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[np.ndarray]: Output tensors. Note that the output tensors
                does not correspond to the prediction on the input tensors
                with a 1 to 1 mapping. In fact the output tensors are produced
                as the multiple-output of the model given a (multi-) tensor
                input.
        """
        cuda_input_arrays = []
        for host_array in input_tensors:
            device_array = polygraphy.cuda.DeviceArray(
                shape=tuple(host_array.shape), dtype=host_array.dtype
            )
            cuda_input_arrays.append(
                device_array.copy_from(host_array, stream=self.cuda_stream)
            )

        # Shapes are only forwarded when the model has dynamic axes.
        if self.network_parameters.dynamic_info is None:
            input_shapes = None
        else:
            input_shapes = [
                tuple(host_array.shape) for host_array in input_tensors
            ]

        outputs = self._predict_array(cuda_input_arrays, input_shapes)
        return tuple(outputs)
# Registry mapping each deep learning framework to the ONNX-TensorRT
# inference learner class exposing that framework's tensor interface.
TENSOR_RT_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[ONNXTensorRTInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchONNXTensorRTInferenceLearner,
    DeepLearningFramework.TENSORFLOW: TensorflowONNXTensorRTInferenceLearner,
    DeepLearningFramework.NUMPY: NumpyONNXTensorRTInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tensorflow.py
================================================
import pickle
from pathlib import Path
from typing import Tuple, Union, Dict, Type

from nebullvm.config import TENSORFLOW_BACKEND_FILENAMES
from nebullvm.core.models import DeviceType, Device, ModelParams
from nebullvm.operations.inference_learners.base import (
    TensorflowBaseInferenceLearner,
    LearnerMetadata,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.tools.transformations import MultiStageTransformation
class TensorflowBackendInferenceLearner(TensorflowBaseInferenceLearner):
    """Learner running a TensorFlow model on the learner device."""

    name = "XLA"

    def __init__(self, tf_model: tf.Module, device: Device, **kwargs):
        """Store the model and the device it has to run on."""
        super(TensorflowBackendInferenceLearner, self).__init__(**kwargs)
        self.model = tf_model
        self.device = device
        self._is_gpu_ready = self.device.type is DeviceType.GPU

    def get_size(self):
        """Return the size in bytes of the pickled model."""
        return len(pickle.dumps(self.model, -1))

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Run the model on the input tensors.

        The inputs are passed to the model as a single tuple. The result is
        wrapped in a tuple unless the model already returns one.
        """
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        with tf.device(self.device.to_tf_format()):
            res = self.model(input_tensors)
        if not isinstance(res, tuple):
            return (res,)
        return res

    def save(self, path: Union[str, Path], **kwargs):
        """Save metadata and model into ``path``.

        Args:
            path (Path or str): Destination directory, created when missing.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        self.model.save(path / TENSORFLOW_BACKEND_FILENAMES["tf_model"])

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner stored with ``save``.

        Args:
            path (Path or str): Directory containing the saved learner.
            kwargs (Dict): Accepted for interface compatibility; unused.

        Returns:
            TensorflowBackendInferenceLearner: The loaded learner.
        """
        path = Path(path)
        metadata = LearnerMetadata.read(path)
        network_parameters = ModelParams(**metadata.network_parameters)
        # The metadata stores the transformations in serialised form; rebuild
        # the transformation object as the other learners do when loading.
        input_tfms = metadata.input_tfms
        if isinstance(input_tfms, dict):
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        model = tf.keras.models.load_model(
            path / TENSORFLOW_BACKEND_FILENAMES["tf_model"]
        )
        device = Device.from_str(metadata.device)
        return cls(
            tf_model=model,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=device,
        )
class TFLiteBackendInferenceLearner(TensorflowBaseInferenceLearner):
    """Learner running a TFLite flatbuffer through ``tf.lite.Interpreter``."""

    name = "TFLite"

    def __init__(self, tflite_file: bytes, device: Device, **kwargs):
        """Keep the serialised model and build an interpreter from it."""
        super(TFLiteBackendInferenceLearner, self).__init__(**kwargs)
        self.tflite_file = tflite_file
        self.interpreter = tf.lite.Interpreter(model_content=tflite_file)
        self.device = device

    def get_size(self):
        """Return the size in bytes of the serialised TFLite model."""
        return len(self.tflite_file)

    def free_gpu_memory(self):
        """Always raise: this learner does not run on Nvidia GPUs."""
        raise NotImplementedError(
            "TFLite does not support GPU inference on Nvidia devices"
        )

    def run(self, *input_tensors: tf.Tensor):
        """Run the interpreter on the input tensors.

        Inputs are matched positionally with the interpreter input details
        and addressed through the tensor index reported by each detail,
        which is not guaranteed to coincide with the input position.

        Returns:
            Tuple[tf.Tensor]: One tensor per interpreter output.
        """
        input_details = self.interpreter.get_input_details()
        output_details = self.interpreter.get_output_details()
        if self.network_parameters.dynamic_info:
            for input_tensor, detail in zip(input_tensors, input_details):
                if input_tensor.shape != tuple(detail["shape"]):
                    self.interpreter.resize_tensor_input(
                        detail["index"], input_tensor.shape
                    )
        # Tensors must be (re-)allocated before the inputs can be set.
        self.interpreter.allocate_tensors()
        for input_tensor, detail in zip(input_tensors, input_details):
            self.interpreter.set_tensor(detail["index"], input_tensor)
        self.interpreter.invoke()
        return tuple(
            tf.convert_to_tensor(
                self.interpreter.get_tensor(output_detail["index"])
            )
            for output_detail in output_details
        )

    def save(self, path: Union[str, Path], **kwargs):
        """Save metadata and the TFLite model into ``path``.

        Args:
            path (Path or str): Destination directory, created when missing.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        path = Path(path)
        # Create the directory as the other learners do before writing.
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        with open(
            path / TENSORFLOW_BACKEND_FILENAMES["tflite_model"], "wb"
        ) as f:
            f.write(self.tflite_file)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner stored with ``save``.

        Args:
            path (Path or str): Directory containing the saved learner.
            kwargs (Dict): Accepted for interface compatibility; unused.

        Returns:
            TFLiteBackendInferenceLearner: The loaded learner.
        """
        path = Path(path)
        tflite_file_path = str(
            path / TENSORFLOW_BACKEND_FILENAMES["tflite_model"]
        )
        with open(tflite_file_path, "rb") as f:
            tflite_file = f.read()
        metadata = LearnerMetadata.read(path)
        network_parameters = ModelParams(**metadata.network_parameters)
        # The metadata stores the transformations in serialised form; rebuild
        # the transformation object as the other learners do when loading.
        input_tfms = metadata.input_tfms
        if isinstance(input_tfms, dict):
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        device = Device.from_str(metadata.device)
        return cls(
            tflite_file=tflite_file,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            device=device,
        )
# Registry mapping a TensorFlow backend key ("tf" or "tflite") to the
# inference learner class implementing it.
TF_BACKEND_LEARNERS_DICT: Dict[
    str,
    Type[
        Union[TensorflowBackendInferenceLearner, TFLiteBackendInferenceLearner]
    ],
] = {
    "tf": TensorflowBackendInferenceLearner,
    "tflite": TFLiteBackendInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torch_dynamo.py
================================================
from pathlib import Path
from typing import Union
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
class TorchDynamoInferenceLearner(TorchScriptInferenceLearner):
    """TorchDynamo learner; persistence is not implemented yet."""

    name = "TorchDynamo"

    def save(self, path: Union[str, Path], **kwargs):
        """Not implemented: always raises ``NotImplementedError``."""
        # TODO: Implement save function
        # Saving it like a normal PyTorch model raises this error:
        # https://github.com/pytorch/pytorch/issues/93470
        raise NotImplementedError()

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Not implemented: always raises ``NotImplementedError``."""
        # TODO: Implement load function
        raise NotImplementedError()
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torch_neuron.py
================================================
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
class TorchNeuronInferenceLearner(TorchScriptInferenceLearner):
    """TorchScript-based learner registered under the TorchNeuron name."""

    name = "TorchNeuron"

    def get_size(self):
        """Return the on-disk size in bytes of the saved learner.

        The learner is saved into a temporary directory and the sizes of the
        regular files found directly inside it are summed.
        """
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            total_bytes = 0
            for entry in os.listdir(Path(tmp_dir)):
                candidate = Path(tmp_dir) / entry
                if os.path.isfile(candidate):
                    total_bytes += os.path.getsize(candidate)
            return total_bytes
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torch_xla.py
================================================
import os
import pickle
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Tuple, Union
from nebullvm.core.models import Device, DeviceType, ModelParams
from nebullvm.operations.inference_learners.base import (
PytorchBaseInferenceLearner,
LearnerMetadata,
)
from nebullvm.optional_modules.torch import (
torch,
)
from nebullvm.tools.transformations import MultiStageTransformation
class TorchXLAInferenceLearner(PytorchBaseInferenceLearner):
    """Learner running a torch module, moved to the TPU when requested.

    The whole module is persisted with ``torch.save`` under ``MODEL_NAME``.
    """

    MODEL_NAME = "model_scripted.pt"
    name = "TorchXLA"

    def __init__(self, torch_model: torch.nn.Module, device: Device, **kwargs):
        """Put the model in eval mode and move it to the TPU if requested."""
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        if device.type is DeviceType.TPU:
            self.model.to(device.to_torch_format())
        self.device = device
        # The flag keeps the "gpu" name used by the other learners even
        # though it tracks TPU readiness here.
        self._is_gpu_ready = self.device.type is DeviceType.TPU

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Run the model without gradients and return a tuple of outputs.

        On a TPU device the inputs are moved to that device first. The
        outputs are returned as produced by the model (no device move).
        """
        if self.device.type is DeviceType.TPU and not self._is_gpu_ready:
            self.set_model_on_gpu()

        if self.device.type is DeviceType.TPU:
            input_tensors = (
                tensor.to(self.device.to_torch_format())
                for tensor in input_tensors
            )

        with torch.no_grad():
            res = self.model(*input_tensors)

        if isinstance(res, tuple):
            return tuple(res)
        return (res,)

    def get_size(self):
        """Return the size in bytes of the model.

        The pickled size of the model (or of its ``core_model`` attribute,
        when present) is used. If pickling raises ``RuntimeError`` the
        learner is saved into a temporary directory and the sizes of the
        files written there are summed instead.
        """
        try:
            if hasattr(self.model, "core_model"):
                picklable = self.model.core_model
            else:
                # Normal torch model
                picklable = self.model
            return len(pickle.dumps(picklable, -1))
        except RuntimeError:
            with TemporaryDirectory() as tmp_dir:
                self.save(tmp_dir)
                total_bytes = 0
                for entry in os.listdir(Path(tmp_dir)):
                    candidate = Path(tmp_dir) / entry
                    if os.path.isfile(candidate):
                        total_bytes += os.path.getsize(candidate)
                return total_bytes

    def save(self, path: Union[str, Path], **kwargs):
        """Save metadata and model into ``path``.

        The model is moved to the CPU before being written.

        Args:
            path (Path or str): Destination directory, created when missing.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        target_dir = Path(path)
        target_dir.mkdir(exist_ok=True)
        LearnerMetadata.from_model(self, **kwargs).save(target_dir)
        self.model.cpu()
        torch.save(self.model, target_dir / self.MODEL_NAME)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner stored with ``save``.

        Args:
            path (Path or str): Directory containing the saved learner.
            kwargs (Dict): Accepted for interface compatibility; unused.
        """
        source_dir = Path(path)
        # NOTE(review): torch.load unpickles the file; only load learners
        # coming from a trusted source.
        model = torch.load(source_dir / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(source_dir)
        device = Device.from_str(metadata.device)
        model.to(device.to_torch_format())

        if metadata.input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(
                metadata.input_tfms
            )
        else:
            input_tfms = None

        return cls(
            torch_model=model,
            network_parameters=ModelParams(**metadata.network_parameters),
            input_tfms=input_tfms,
            device=device,
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/torchscript.py
================================================
from pathlib import Path
from typing import Tuple, Union, Optional, List
from nebullvm.core.models import Device, DeviceType, ModelParams
from nebullvm.operations.inference_learners.base import (
PytorchBaseInferenceLearner,
LearnerMetadata,
)
from nebullvm.optional_modules.torch import (
torch,
symbolic_trace,
Module,
ScriptModule,
GraphModule,
)
from nebullvm.tools.transformations import MultiStageTransformation
class TorchScriptInferenceLearner(PytorchBaseInferenceLearner):
    """Learner running a TorchScript module.

    The module is persisted with ``torch.jit.save`` under ``MODEL_NAME``.
    """

    MODEL_NAME = "model_scripted.pt"
    name = "TorchScript"

    def __init__(self, torch_model: ScriptModule, device: Device, **kwargs):
        """Put the model in eval mode and move it to the GPU if requested."""
        super().__init__(**kwargs)
        self.model = torch_model.eval()
        if device.type is DeviceType.GPU:
            self.model.to(device.to_torch_format())
        self.device = device
        self._is_gpu_ready = self.device.type is DeviceType.GPU

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Run the model without gradients and return a tuple of outputs.

        On a GPU device the inputs are moved to that device first. The
        outputs are always moved to the learner device.
        """
        if self.device.type is DeviceType.GPU and not self._is_gpu_ready:
            self.set_model_on_gpu()
        if self.device.type is DeviceType.GPU:
            input_tensors = (
                t.to(self.device.to_torch_format()) for t in input_tensors
            )
        with torch.no_grad():
            res = self.model(*input_tensors)
        if not isinstance(res, tuple):
            res = res.to(self.device.to_torch_format())
            return (res,)
        return tuple(out.to(self.device.to_torch_format()) for out in res)

    def save(self, path: Union[str, Path], **kwargs):
        """Save metadata and the scripted model into ``path``.

        Args:
            path (Path or str): Destination directory, created when missing.
            kwargs (Dict): Extra key-value pairs stored in the metadata.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(self, **kwargs)
        metadata.save(path)
        torch.jit.save(self.model, path / self.MODEL_NAME)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load a learner stored with ``save``.

        Args:
            path (Path or str): Directory containing the saved learner.
            kwargs (Dict): Accepted for interface compatibility; unused.
        """
        path = Path(path)
        model = torch.jit.load(path / cls.MODEL_NAME)
        metadata = LearnerMetadata.read(path)
        device = Device.from_str(metadata.device)
        return cls(
            torch_model=model,
            network_parameters=ModelParams(**metadata.network_parameters),
            input_tfms=MultiStageTransformation.from_dict(metadata.input_tfms)
            if metadata.input_tfms is not None
            else None,
            device=device,
        )

    @classmethod
    def from_torch_model(
        cls,
        model: Union[Module, GraphModule],
        network_parameters: ModelParams,
        device: Device,
        input_tfms: Optional[MultiStageTransformation] = None,
        input_data: List[torch.Tensor] = None,
    ):
        """Convert a torch model to TorchScript and wrap it in a learner.

        For a model that is not an FX ``GraphModule`` the conversion is
        attempted, in order, by scripting the symbolic trace of the model,
        by scripting the model itself and finally by tracing it with
        ``input_data``. An FX ``GraphModule`` is scripted directly.

        Args:
            model (Module or GraphModule): Model to be converted.
            network_parameters (ModelParams): The model parameters.
            device (Device): Device where the model will be run.
            input_tfms (MultiStageTransformation, optional): Transformations
                applied to the inputs before prediction.
            input_data (List[torch.Tensor], optional): Example inputs. They
                are only required by the tracing fallback; when given and
                the device is a GPU they are moved to it.

        Raises:
            TypeError: If both scripting attempts fail and ``input_data`` is
                None, since tracing needs example inputs.
        """
        # input_data defaults to None: only move it when it was provided,
        # so that scripting can still succeed without example inputs.
        if device.type is DeviceType.GPU and input_data is not None:
            input_data = [t.to(device.to_torch_format()) for t in input_data]

        if not isinstance(model, torch.fx.GraphModule):
            model.eval()
            try:
                model_scripted = symbolic_trace(model)
                model_scripted = torch.jit.script(model_scripted)
            except Exception:
                try:
                    model_scripted = torch.jit.script(model)
                except Exception:
                    # Last resort: tracing, which needs example inputs.
                    model_scripted = torch.jit.trace(model, tuple(input_data))
        else:
            model_scripted = torch.jit.script(model)

        return cls(
            torch_model=model_scripted,
            network_parameters=network_parameters,
            input_tfms=input_tfms,
            input_data=input_data,
            device=device,
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/tvm.py
================================================
import os
import shutil
from abc import ABC
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Union, Type, Dict, Any, List, Generator, Tuple, Optional
import numpy as np
from nebullvm.config import (
TVM_FILENAMES,
)
from nebullvm.core.models import Device, ModelParams, DeepLearningFramework
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
LearnerMetadata,
PytorchBaseInferenceLearner,
TensorflowBaseInferenceLearner,
NumpyBaseInferenceLearner,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.tvm import (
GraphModule,
tvm,
ExecutorFactoryModule,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import (
MultiStageTransformation,
HalfPrecisionTransformation,
)
class ApacheTVMInferenceLearner(BaseInferenceLearner, ABC):
    """Model optimized using ApacheTVM.

    The class cannot be directly instantiated, but implements all the core
    methods needed for using ApacheTVM at inference time.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or "cuda" for targeting GPUs.
        engine_path (Path, optional): Path to the serialized engine. To be used
            after loading the model (avoiding double engine serialization).
        device (Device): Device the learner has been built for.
    """

    name = "ApacheTVM"

    def __init__(
        self,
        graph_executor_module: GraphModule,
        input_names: List[str],
        lib: ExecutorFactoryModule,
        target: str,
        device: Device,
        engine_path: Path = None,
        **kwargs
    ):
        """Store the TVM components; a given engine file is stored through
        ``_store_file`` and the resulting path is kept."""
        super().__init__(**kwargs)
        self.graph_executor_module = graph_executor_module
        self.input_names = input_names
        self.lib = lib
        self.target = target
        self.engine_path = (
            self._store_file(engine_path)
            if engine_path is not None
            else engine_path
        )
        self.device = device

    def get_size(self):
        """Return the on-disk size in bytes of the saved learner.

        The learner is saved into a temporary directory and the sizes of the
        regular files found directly inside it are summed.
        """
        with TemporaryDirectory() as tmp_dir:
            self.save(tmp_dir)
            return sum(
                os.path.getsize(Path(tmp_dir) / f)
                for f in os.listdir(Path(tmp_dir))
                if os.path.isfile(Path(tmp_dir) / f)
            )

    def _has_half_precision_transformation(self):
        """Tell whether the input transformations include a half-precision
        step. A learner without input transformations has none."""
        if self.input_tfms is None:
            return False
        return any(
            isinstance(tfm, HalfPrecisionTransformation)
            for tfm in self.input_tfms.to_list()
        )

    def _predict_array(
        self, input_arrays: Generator[np.ndarray, None, None]
    ) -> Generator[np.ndarray, None, None]:
        """Set the inputs, run the graph executor and yield the outputs.

        Outputs are read lazily into buffers shaped as
        ``network_parameters.output_sizes``, with dtype float16 when a
        half-precision transformation is present and float32 otherwise.
        """
        for name, array in zip(self.input_names, input_arrays):
            self.graph_executor_module.set_input(name, array)
        self.graph_executor_module.run()

        # The dtype is the same for every output: compute it only once.
        output_dtype = (
            "float16"
            if self._has_half_precision_transformation()
            else "float32"
        )
        tvm_outputs = (
            self.graph_executor_module.get_output(
                i,
                tvm.nd.empty(
                    shape=output_size,
                    dtype=output_dtype,
                ),
            ).numpy()
            for i, output_size in enumerate(
                self.network_parameters.output_sizes
            )
        )
        return tvm_outputs

    def free_gpu_memory(self):
        """No-op kept for interface compatibility."""
        # TODO: check if tvm needs to release GPU
        pass

    def save(self, path: Union[str, Path], **kwargs):
        """Save the model.

        When the learner holds an ``engine_path`` that file is copied into
        the directory, otherwise the library is exported there.

        Args:
            path (Path or str): Path to the directory where the model will
                be stored.
            kwargs (Dict): Dictionary of key-value pairs that will be saved in
                the model metadata file.
        """
        path = Path(path)
        path.mkdir(exist_ok=True)
        metadata = LearnerMetadata.from_model(
            self, input_names=self.input_names, target=self.target, **kwargs
        )
        metadata.save(path)
        if self.engine_path is None:
            self.lib.export_library(path / TVM_FILENAMES["engine"])
        else:
            shutil.copy(self.engine_path, path)

    @classmethod
    def load(cls, path: Union[Path, str], **kwargs):
        """Load the model.

        The input transformations stored in the metadata are rebuilt and
        handed to the loaded learner.

        Args:
            path (Path or str): Path to the directory where the model is
                stored.
            kwargs (Dict): Accepted for interface compatibility; currently
                unused.

        Returns:
            ApacheTVMInferenceLearner: The optimized model.
        """
        path = Path(path)
        metadata = LearnerMetadata.read(path).to_dict()
        network_parameters = ModelParams(**metadata["network_parameters"])
        lib = tvm.runtime.load_module(path / TVM_FILENAMES["engine"])
        target_device = metadata["target"]
        input_names = metadata["input_names"]
        input_tfms = metadata.get("input_tfms")
        if input_tfms is not None:
            input_tfms = MultiStageTransformation.from_dict(input_tfms)
        device = Device.from_str(metadata["device"])
        learner = cls.from_runtime_module(
            network_parameters=network_parameters,
            lib=lib,
            target_device=target_device,
            input_names=input_names,
            device=device,
            input_tfms=input_tfms,
        )
        # Point at the engine already on disk, so that saving again copies
        # it instead of exporting the library a second time.
        learner.engine_path = path / TVM_FILENAMES["engine"]
        return learner

    @classmethod
    def from_runtime_module(
        cls,
        network_parameters: ModelParams,
        lib: ExecutorFactoryModule,
        target_device: str,
        input_names: List[str],
        device: Device,
        input_tfms: MultiStageTransformation = None,
        input_data: DataManager = None,
    ):
        """Build the model from the runtime module (lib).

        Args:
            network_parameters (ModelParams): The model parameters as batch
                size, input and output sizes.
            lib (Module): Component needed for loading the ApacheTVM optimized
                model.
            target_device (str): The target device. Either `llvm` (CPU)
                or `cuda`.
            input_names (List[str]): Names associated to the model input
                tensors.
            device (Device): The device where the model will be executed.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            input_data (DataManager, optional): User defined data.

        Returns:
            ApacheTVMInferenceLearner: The optimized model (an instance of
            the class the method is called on).
        """
        # The graph executor is always created on device index 0.
        dev = tvm.device(str(target_device), 0)
        graph_executor_module = GraphModule(lib["default"](dev))
        return cls(
            input_tfms=input_tfms,
            network_parameters=network_parameters,
            graph_executor_module=graph_executor_module,
            input_names=input_names,
            lib=lib,
            target=target_device,
            input_data=input_data,
            device=device,
        )
class BaseArrayApacheTVMInferenceLearner(ApacheTVMInferenceLearner, ABC):
    """Base Model that can be used for all array-based
    ApacheTVMInferenceLearners.

    It implements the shared numpy prediction path: optional zero-padding
    of the inputs, execution through ``self._predict_array`` and optional
    cropping of the outputs when dynamic shape information is available.
    """

    def _inner_predict(
        self,
        input_arrays: Generator[np.ndarray, None, None],
        input_shapes: Optional[List[Tuple[int, ...]]],
    ) -> Generator[np.ndarray, None, None]:
        """Run the prediction on numpy arrays, handling dynamic shapes.

        Args:
            input_arrays (Generator[np.ndarray]): Lazily produced input
                arrays, one per model input.
            input_shapes (List[Tuple[int, ...]], optional): Shapes of the
                inputs before padding. It must not be None when
                ``network_parameters.dynamic_info`` is set.

        Returns:
            Generator[np.ndarray]: When dynamic info is set, a lazy
                generator of cropped outputs; otherwise whatever
                ``self._predict_array`` returned.
        """
        if self.network_parameters.dynamic_info is not None:
            # Lazily zero-pad the end of every axis by the absolute
            # difference between the actual size and the reference size
            # stored in network_parameters.input_sizes.
            # NOTE(review): axes are paired positionally by zip; this
            # presumably assumes both shapes have the same rank - confirm.
            input_arrays = (
                np.pad(
                    input_array,
                    [
                        (0, abs(x - y))
                        for x, y in zip(
                            input_array.shape,
                            input_size,
                        )
                    ],
                    mode="constant",
                    constant_values=0,
                )
                for input_array, input_size in zip(
                    input_arrays, self.network_parameters.input_sizes
                )
            )
        output_arrays = self._predict_array(input_arrays)
        if self.network_parameters.dynamic_info is not None:
            assert input_shapes is not None
            dynamic_info = self.network_parameters.dynamic_info
            # Crop every output: an axis is kept whole (slice end None)
            # unless the membership test below succeeds, in which case the
            # end is given by dynamic_info.retrieve_output_dim.
            # NOTE(review): the membership test uses the axis *size* ``x``
            # rather than the axis index ``i`` - confirm this is intended.
            return (
                output_array[
                    tuple(
                        slice(
                            0,
                            None
                            if x not in out_dynamic_dict.keys()
                            else dynamic_info.retrieve_output_dim(
                                input_shapes, j, i, x
                            ),
                        )
                        for i, x in enumerate(output_array.shape)
                    )
                ]
                for j, (output_array, out_dynamic_dict) in enumerate(
                    zip(output_arrays, dynamic_info.outputs)
                )
            )
        return output_arrays
class PytorchApacheTVMInferenceLearner(
    BaseArrayApacheTVMInferenceLearner, PytorchBaseInferenceLearner
):
    """ApacheTVM optimized model exposing a PyTorch interface.

    The learner can be used in the same way as a pytorch Module object:
    at prediction time it receives pytorch tensors as positional arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or "cuda" for targeting GPUs.
    """

    def run(self, *input_tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors do
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        # Shapes are only needed when the model has dynamic axes.
        shapes = None
        if self.network_parameters.dynamic_info is not None:
            shapes = [tuple(tensor.shape) for tensor in input_tensors]
        # Inputs are converted lazily to numpy arrays living on the CPU.
        numpy_inputs = (
            tensor.cpu().detach().numpy() for tensor in input_tensors
        )
        predictions = []
        for array in self._inner_predict(numpy_inputs, shapes):
            # Outputs are moved back to the device of the learner.
            predictions.append(
                torch.from_numpy(array).to(self.device.to_torch_format())
            )
        return tuple(predictions)

    @staticmethod
    def _convert_device(device: Any):
        """Map integer devices to "cpu"; return any other value unchanged."""
        return "cpu" if isinstance(device, int) else device
class TensorflowApacheTVMInferenceLearner(
    BaseArrayApacheTVMInferenceLearner, TensorflowBaseInferenceLearner
):
    """ApacheTVM optimized model exposing a TensorFlow interface.

    The learner can be used in the same way as a tf.Module or keras.Model
    object: at prediction time it receives tensorflow tensors as positional
    arguments.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or "cuda" for targeting GPUs.
    """

    def run(self, *input_tensors: tf.Tensor) -> Tuple[tf.Tensor, ...]:
        """Predict on the input tensors.

        Note that the input tensors must be on the same batch. If a sequence
        of tensors is given when the model is expecting a single input tensor
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[Tensor]): Input tensors belonging to the same
                batch. The tensors are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[Tensor]: Output tensors. Note that the output tensors do
                not correspond to the prediction on the input tensors with a
                1 to 1 mapping. In fact the output tensors are produced as the
                multiple-output of the model given a (multi-) tensor input.
        """
        # Shapes are only needed when the model has dynamic axes.
        shapes = None
        if self.network_parameters.dynamic_info is not None:
            shapes = [tuple(tensor.shape) for tensor in input_tensors]
        # Inputs are converted lazily to numpy arrays.
        numpy_inputs = (tensor.numpy() for tensor in input_tensors)
        predictions = self._inner_predict(numpy_inputs, shapes)
        return tuple(map(tf.convert_to_tensor, predictions))
class NumpyApacheTVMInferenceLearner(
    BaseArrayApacheTVMInferenceLearner, NumpyBaseInferenceLearner
):
    """ApacheTVM optimized model exposing a numpy interface.

    At prediction time it receives numpy arrays as positional arguments and
    returns numpy arrays.

    Attributes:
        network_parameters (ModelParams): The model parameters as batch
            size, input and output sizes.
        graph_executor_module (GraphModule): The graph executor. This is the
            central component in the ApacheTVM optimized model execution.
        input_names (List[str]): Names associated to the model input tensors.
        lib (Module): Component needed for loading the ApacheTVM optimized
            model.
        target (str): Target device. It can be either `llvm` for targeting
            CPUs or "cuda" for targeting GPUs.
    """

    def run(self, *input_tensors: np.ndarray) -> Tuple[np.ndarray, ...]:
        """Predict on the input arrays.

        Note that the input arrays must be on the same batch. If a sequence
        of arrays is given when the model is expecting a single input array
        (with batch size >= 1) an error is raised.

        Args:
            input_tensors (Tuple[ndarray]): Input arrays belonging to the
                same batch. The arrays are expected having dimensions
                (batch_size, dim1, dim2, ...).

        Returns:
            Tuple[ndarray]: Output arrays. Note that the output arrays do
                not correspond to the prediction on the input arrays with a
                1 to 1 mapping. In fact the output arrays are produced as the
                multiple-output of the model given a (multi-) array input.
        """
        # Shapes are only needed when the model has dynamic axes.
        shapes = None
        if self.network_parameters.dynamic_info is not None:
            shapes = [tuple(array.shape) for array in input_tensors]
        # The shared prediction path consumes a lazy stream of arrays.
        array_stream = (array for array in input_tensors)
        return tuple(self._inner_predict(array_stream, shapes))
# Registry mapping each supported deep learning framework to the ApacheTVM
# inference learner class exposing the corresponding tensor interface.
APACHE_TVM_INFERENCE_LEARNERS: Dict[
    DeepLearningFramework, Type[ApacheTVMInferenceLearner]
] = {
    DeepLearningFramework.PYTORCH: PytorchApacheTVMInferenceLearner,
    DeepLearningFramework.TENSORFLOW: TensorflowApacheTVMInferenceLearner,
    DeepLearningFramework.NUMPY: NumpyApacheTVMInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/inference_learners/utils.py
================================================
from pathlib import Path
from typing import Union, Any
from nebullvm.operations.inference_learners.base import LearnerMetadata
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.tools.diffusers import postprocess_diffusers
def load_model(path: Union[Path, str], pipe: StableDiffusionPipeline = None):
    """Restore an optimized model that was previously saved to disk.

    Args:
        path (Union[Path, str]): Path to the directory where the model is
            saved.
        pipe (StableDiffusionPipeline): Diffusion pipeline to be used for
            loading the model. This parameter is only needed if the model
            to be loaded is a diffusion model. Default: None.

    Returns:
        InferenceLearner: Model optimized by Speedster. When `pipe` is
            given, the result of `postprocess_diffusers` is returned instead
            of the bare learner.
    """
    metadata = LearnerMetadata.read(path)
    learner = metadata.load_model(path)
    if pipe is None:
        return learner
    # Diffusion models need the loaded learner to be plugged back into the
    # pipeline, on the same device the learner was loaded on.
    return postprocess_diffusers(learner, pipe, learner.device)
def save_model(model: Any, path: Union[Path, str]):
    """Persist the optimized model in the given directory.

    For a `StableDiffusionPipeline` the object that gets saved is
    `model.unet.model`; any other model is saved through its own `save`
    method.

    Args:
        model (Any): Model to be saved.
        path (Union[Path, str]): Path to the directory where to
            save the model.
    """
    is_pipeline = isinstance(model, StableDiffusionPipeline)
    saveable = model.unet.model if is_pipeline else model
    saveable.save(path)
================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/base.py
================================================
import abc
from nebullvm.operations.base import Operation
class Measure(Operation, abc.ABC):
    """Abstract operation that computes a measurement.

    Subclasses implement `execute` and store the computed value in
    `measure_result`, which starts as None.
    """

    def __init__(self):
        super().__init__()
        # Filled in by the concrete `execute` implementation.
        self.measure_result = None

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Compute the measurement. Must be overridden by subclasses."""
        raise NotImplementedError
================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/measures.py
================================================
from typing import List, Tuple, Any, Callable, Dict
import numpy as np
from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import (
BenchmarkOriginalModelResult,
DeepLearningFramework,
)
from nebullvm.operations.inference_learners.base import BaseInferenceLearner
from nebullvm.operations.measures.base import Measure
from nebullvm.operations.measures.utils import (
compute_torch_latency,
compute_tf_latency,
compute_onnx_latency,
compute_relative_difference,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import run_onnx_model
from nebullvm.tools.pytorch import run_torch_model
from nebullvm.tools.tf import run_tf_model
# Function used to run the original model and collect its outputs, selected
# by framework. The NUMPY framework is served by the ONNX runner.
COMPUTE_OUTPUT_FRAMEWORK: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: run_torch_model,
    DeepLearningFramework.TENSORFLOW: run_tf_model,
    DeepLearningFramework.NUMPY: run_onnx_model,
}
# Function used to measure the latency of the original model, selected by
# framework. The NUMPY framework is served by the ONNX latency helper.
COMPUTE_LATENCY_FRAMEWORK: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: compute_torch_latency,
    DeepLearningFramework.TENSORFLOW: compute_tf_latency,
    DeepLearningFramework.NUMPY: compute_onnx_latency,
}
class MetricDropMeasure(Measure):
    """Measure how much the optimized model deviates from the original one.

    After `execute`, `measure_result` holds the aggregated metric value and
    `valid` tells whether it is within the accepted threshold.
    """

    def __init__(self):
        super().__init__()
        self.valid = None

    def execute(
        self,
        optimized_learner: BaseInferenceLearner,
        input_data: List[Tuple[Any, ...]],
        base_outputs_list: List[Tuple[Any, ...]],
        perf_loss_ths: float,
        metric_func: Callable = None,
        ys: List = None,
        aggregation_func: Callable = np.mean,
    ):
        """Compare the optimized model outputs with the reference ones.

        Args:
            optimized_learner (BaseInferenceLearner): Optimized model.
            input_data (List[Tuple]): Inputs fed to the optimized model.
            base_outputs_list (List[Tuple]): Outputs of the original model,
                one tuple per element of `input_data`.
            perf_loss_ths (float): Maximum accepted aggregated metric value.
            metric_func (Callable, optional): Function called as
                `metric_func(base_output, opt_output, y)`. Defaults to
                `compute_relative_difference`.
            ys (List, optional): Labels, one per input. Defaults to a list
                of None.
            aggregation_func (Callable, optional): Function used to reduce
                the per-sample values. Default: np.mean.
        """
        score_fn = metric_func or compute_relative_difference
        labels = [None] * len(input_data) if ys is None else ys
        assert len(input_data) == len(base_outputs_list) == len(labels), (
            "INTERNAL ASSERT FAILED: error during computation of precision "
            "of the optimized model, got wrong dimensions of the data. "
        )

        per_sample_scores = []
        samples = zip(input_data, base_outputs_list, labels)
        for inputs, base_outputs, y in samples:
            opt_outputs = optimized_learner(*inputs)
            per_output_scores = [
                score_fn(base_output, opt_output, y)
                for base_output, opt_output in zip(base_outputs, opt_outputs)
            ]
            # The worst output determines the score of the sample.
            per_sample_scores.append(max(per_output_scores))

        aggregated = aggregation_func(per_sample_scores)
        self.valid = aggregated <= perf_loss_ths
        self.measure_result = aggregated

    def get_result(self) -> Tuple[bool, float]:
        """Return the validity flag and the aggregated metric value."""
        return self.valid, self.measure_result
class LatencyOriginalModelMeasure(Measure):
    """Benchmark the original (non optimized) model.

    After `execute`, `outputs` holds the model outputs for every element of
    the input data and `measure_result` holds the measured latency.
    """

    def __init__(self):
        super().__init__()
        self.outputs = None

    def execute(
        self,
        model: Any,
        input_data: DataManager,
        dl_framework: DeepLearningFramework,
    ) -> BenchmarkOriginalModelResult:
        """Collect the outputs of the original model and measure its latency.

        Args:
            model (Any): Original model.
            input_data (DataManager): Data used for the benchmark.
            dl_framework (DeepLearningFramework): Framework of the model,
                used to select the run and latency functions.

        Returns:
            BenchmarkOriginalModelResult: Latency and outputs of the model.
        """
        self.logger.info("Benchmark performance of original model")

        collected = []
        for input_tensors in input_data:
            run_model = COMPUTE_OUTPUT_FRAMEWORK[dl_framework]
            # Element 0 of every item is what gets fed to the model
            # (presumably the item is an (inputs, label) pair).
            model_inputs = tuple(input_tensors[0])
            collected.append(tuple(run_model(model, model_inputs, self.device)))
        self.outputs = collected

        latency_inputs = input_data.get_list(QUANTIZATION_DATA_NUM)
        measure_latency = COMPUTE_LATENCY_FRAMEWORK[dl_framework]
        average_latency, _ = measure_latency(
            latency_inputs, model, self.device
        )
        self.measure_result = average_latency
        self.logger.info(
            f"Original model latency: {self.measure_result} sec/iter"
        )
        return BenchmarkOriginalModelResult(
            latency_seconds=self.measure_result,
            model_outputs=self.outputs,
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/measures/utils.py
================================================
import time
from typing import Tuple, List, Union, Any
import numpy as np
from loguru import logger
from nebullvm.config import ONNX_PROVIDERS
from nebullvm.core.models import Device, DeviceType
from nebullvm.operations.inference_learners.base import BaseInferenceLearner
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import (
convert_to_numpy,
get_input_names,
get_output_names,
)
def compute_torch_latency(
    xs: List[Tuple[torch.Tensor]],
    model: Module,
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the torch model.

    Args:
        xs (List[Tuple[torch.Tensor]]): List of tuples containing the
            input tensors (a single batch for the model).
        model (Module): Torch model.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of input data to be used to compute the
            latency of the model. It must be a number <= len(xs). Default: 100.
        warmup_steps (int, optional): Number of input data to be used to warm
            up the model. It must be a number <= len(xs). Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.
    """
    # Inputs and model are moved explicitly only for non-TPU devices.
    if device.type is not DeviceType.TPU:
        xs = [
            tuple(t.to(device.to_torch_format()) for t in tensors)
            for tensors in xs
        ]
        model = model.to(device.to_torch_format())
    model.eval()
    latencies = []
    with torch.no_grad():
        for i in range(warmup_steps):
            _ = model.forward(*xs[i])
        for i in range(steps):
            # perf_counter is monotonic and has the highest available
            # resolution, unlike the wall clock returned by time.time().
            # NOTE(review): no explicit device synchronization is performed
            # here, so asynchronous execution may not be fully accounted for.
            starting_time = time.perf_counter()
            _ = model.forward(*xs[i])
            latencies.append(time.perf_counter() - starting_time)
    latency = np.mean(latencies)
    return latency, latencies
def compute_tf_latency(
    xs: List[Tuple[tf.Tensor]],
    model: Union[tf.Module, tf.keras.Model],
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the tensorflow model.

    Args:
        xs (List[Tuple[tf.Tensor]]): List of tuples containing the
            input tensors (a single batch for the model).
        model (Module or keras.Model): TF model.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of input data to be used to compute the
            latency of the model. It must be a number <= len(xs). Default: 100.
        warmup_steps (int, optional): Number of input data to be used to warm
            up the model. It must be a number <= len(xs). Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.
    """
    latencies = []
    with tf.device(device.to_tf_format()):
        # Each batch is passed to the model as a single tuple argument.
        for i in range(warmup_steps):
            _ = model(xs[i])
        for i in range(steps):
            # perf_counter is monotonic and has the highest available
            # resolution, unlike the wall clock returned by time.time().
            starting_time = time.perf_counter()
            _ = model(xs[i])
            latencies.append(time.perf_counter() - starting_time)
    latency = np.mean(latencies)
    return latency, latencies
def compute_onnx_latency(
    xs: List[Tuple[np.array]],
    model: str,
    device: Device,
    steps: int = 100,
    warmup_steps: int = 10,
) -> Tuple[float, List[float]]:
    """Compute the latency associated with the ONNX model.

    Args:
        xs (List[Tuple[np.array]]): List of tuples containing the
            inputs (a single batch for the model).
        model (str): ONNX model path.
        device (Device): Device where computing the latency.
        steps (int, optional): Number of input data to be used to compute the
            latency of the model. It must be a number <= len(xs). Default: 100.
        warmup_steps (int, optional): Number of input data to be used to warm
            up the model. It must be a number <= len(xs). Default: 10.

    Returns:
        Float: Average latency.
        List[Float]: List of latencies obtained.
    """
    from nebullvm.optional_modules.onnxruntime import onnxruntime as ort

    input_names = get_input_names(model)
    output_names = get_output_names(model)

    # On GPU, when the "cuda" provider list has exactly three entries, its
    # second entry is overwritten in place (module-level state) so that the
    # CUDA provider targets the requested device index.
    if device.type is DeviceType.GPU and len(ONNX_PROVIDERS["cuda"]) == 3:
        ONNX_PROVIDERS["cuda"][1] = (
            "CUDAExecutionProvider",
            {
                "device_id": device.idx,
            },
        )

    # The first entry of the "cuda" provider list is skipped on GPU.
    model = ort.InferenceSession(
        model,
        providers=ONNX_PROVIDERS["cuda"][1:]
        if device.type is DeviceType.GPU
        else ONNX_PROVIDERS["cpu"],
    )

    latencies = []
    for i in range(warmup_steps):
        inputs = dict(zip(input_names, xs[i]))
        _ = model.run(output_names=output_names, input_feed=inputs)
    for i in range(steps):
        inputs = dict(zip(input_names, xs[i]))
        # perf_counter is monotonic and has the highest available
        # resolution, unlike the wall clock returned by time.time().
        starting_time = time.perf_counter()
        _ = model.run(output_names=output_names, input_feed=inputs)
        latencies.append(time.perf_counter() - starting_time)
    latency = np.mean(latencies)
    return latency, latencies
def compute_optimized_running_time(
    optimized_model: BaseInferenceLearner,
    input_data: DataManager,
    steps: int = 100,
    min_steps: int = 5,
    warmup_steps: int = 10,
) -> float:
    """Compute the running time of the optimized model.

    The measurement stops early as soon as more than `min_steps` latencies
    have been collected and the running median changed by less than 5%
    with respect to the previously computed median.

    Args:
        optimized_model (BaseInferenceLearner): Optimized model.
        input_data: (DataManager): Dataset used to compute latency. Only
            its "test" split is used.
        steps (int, optional): Maximum number of input data to be used to
            compute the latency of the model. Default: 100.
        min_steps (int, optional): Minimum number of iterations to
            be performed. Default: 5.
        warmup_steps (int, optional): Number of input data to be used
            to warm up the model. Default: 10.

    Returns:
        Float: Median latency.
    """
    latencies = []
    last_median = None

    # Warmup
    inputs_list = input_data.get_split("test").get_list(warmup_steps)
    for model_inputs in inputs_list:
        _ = optimized_model(*model_inputs)

    # Compute latency
    inputs_list = input_data.get_split("test").get_list(steps)
    for model_inputs in inputs_list:
        # perf_counter is monotonic and has the highest available
        # resolution, unlike the wall clock returned by time.time().
        starting_time = time.perf_counter()
        _ = optimized_model(*model_inputs)
        latencies.append(time.perf_counter() - starting_time)
        if len(latencies) > min_steps:
            median = np.median(latencies)
            # The first median has nothing to be compared with, so the
            # relative change is forced above the stopping threshold.
            diff = (
                np.abs(median - last_median) / last_median
                if last_median is not None
                else 1.0
            )
            if diff < 0.05:
                return median
            last_median = median
    return np.median(latencies)
def compute_relative_difference(
    tensor_1: Any,
    tensor_2: Any,
    y: Any = None,
    eps: float = 1e-5,
) -> float:
    """Return the mean element-wise relative difference of two tensors.

    Args:
        tensor_1 (Any): First tensor, converted to numpy before comparing.
        tensor_2 (Any): Second tensor, converted to numpy before comparing.
        y (Any, optional): Label. It is ignored (a debug message is logged
            when it is given).
        eps (float, optional): Value added to the denominator.
            Default: 1e-5.

    Returns:
        Float: Mean of |t1 - t2| / (max(|t1|, |t2|) + eps).
    """
    if y is not None:
        logger.debug(
            "Received a label for the precision computation. "
            "It will be ignored."
        )
    array_1 = convert_to_numpy(tensor_1)
    array_2 = convert_to_numpy(tensor_2)
    assert array_1.shape == array_2.shape, (
        "The outputs of the original and optimized models have "
        "different shapes"
    )
    absolute_error = np.abs(array_1 - array_2)
    scale = np.maximum(np.abs(array_1), np.abs(array_2)) + eps
    return float(np.mean(absolute_error / scale))
def compute_accuracy_drop(tensor_1: Any, tensor_2: Any, y: Any) -> float:
    """Return the accuracy of `tensor_1` minus the accuracy of `tensor_2`.

    Predictions are obtained with an argmax over the last axis and compared
    with the labels `y`.

    Args:
        tensor_1 (Any): Scores of the first model.
        tensor_2 (Any): Scores of the second model.
        y (Any): Labels. Must not be None.

    Returns:
        Float: Difference between the two accuracies.
    """
    assert y is not None, (
        "No label found in the dataloader provided. "
        "To use accuracy metric, you must set also the labels"
    )
    scores_1, scores_2, labels = (
        convert_to_numpy(item) for item in (tensor_1, tensor_2, y)
    )

    def _accuracy(scores):
        return np.mean(scores.argmax(axis=-1) == labels)

    return _accuracy(scores_1) - _accuracy(scores_2)
# Metric functions selectable by name. Both share the
# (tensor_1, tensor_2, y) calling convention.
QUANTIZATION_METRIC_MAP = {
    "accuracy": compute_accuracy_drop,
    "numeric_precision": compute_relative_difference,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/base.py
================================================
import abc
from typing import Any, Dict, List, Optional
from nebullvm.core.models import QuantizationType
from nebullvm.operations.base import Operation
class Compiler(Operation, abc.ABC):
    """Abstract operation that compiles (and optionally quantizes) a model.

    Concrete compilers store their output in `compiled_model`, which is
    exposed through `get_result`.
    """

    # Maps a device name to the quantization types supported on it
    # (None stands for no quantization).
    supported_ops: Dict[str, List[Optional[QuantizationType]]]

    def __init__(self):
        super().__init__()
        self.compiled_model = None

    @abc.abstractmethod
    def execute(self, **kwargs):
        """Run the compilation. Must be overridden by subclasses."""
        raise NotImplementedError

    @abc.abstractmethod
    def _compile_model(self, **kwargs) -> Any:
        """Compile the model. Must be overridden by subclasses."""
        raise NotImplementedError

    @abc.abstractmethod
    def _quantize_model(self, **kwargs) -> Any:
        """Quantize the model. Must be overridden by subclasses."""
        raise NotImplementedError

    def get_result(self) -> Any:
        """Return the compiled model (None until one has been produced)."""
        return self.compiled_model
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/deepsparse.py
================================================
from pathlib import Path
from typing import Union
from nebullvm.core.models import (
ModelParams,
QuantizationType,
)
from nebullvm.operations.conversions.converters import (
PytorchConverter,
)
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.optional_modules.torch import (
Module,
GraphModule,
)
from nebullvm.tools.data import DataManager
class DeepSparseCompiler(Compiler):
    """Compiler preparing a PyTorch model for DeepSparse.

    The compilation consists in converting the model to ONNX; the result
    stored in `compiled_model` is the path of the converted file.
    """

    # Only non-quantized compilation on CPU is supported.
    supported_ops = {
        "cpu": [None],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()
        self.conversion_op = PytorchConverter()

    def execute(
        self,
        model: Module,
        onnx_output_path: str,
        model_params: ModelParams,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using DeepSparse Compiler.

        When the requested quantization type is not supported on the
        current device, `compiled_model` is set to None and nothing else
        is done.

        Args:
            model (torch.nn.Module): The pytorch model.
            onnx_output_path (str): Path where the converted ONNX model will be
                stored.
            model_params (ModelParams): The model parameters.
            quantization_type (QuantizationType): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without input
                data.
        """
        allowed_types = self.supported_ops[self.device.type.value]
        if quantization_type not in allowed_types:
            self.compiled_model = None
            return

        needs_data = quantization_type is QuantizationType.STATIC
        if needs_data and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        self.compiled_model = self._compile_model(
            model, onnx_output_path, input_data, model_params
        )

    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        onnx_output_path: str,
        input_data: DataManager,
        model_params: ModelParams,
    ) -> str:
        """Convert the model to ONNX and return the path of the result."""
        self.conversion_op.model_name = "model_pruned"
        output_dir = Path(onnx_output_path)
        converter = self.conversion_op.to(self.device)
        converter = converter.set_state(model, input_data)
        converter.execute(output_dir, model_params)
        return str(output_dir / "model_pruned.onnx")

    @staticmethod
    def _quantize_model(**kwargs):
        """Quantization is not available for this compiler."""
        raise NotImplementedError()
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/__init__.py
================================================
from copy import deepcopy
from typing import Union
from nebullvm.core.models import QuantizationType, DeviceType
from nebullvm.operations.optimizations.compilers.faster_transformer.bert import ( # noqa: E501
detect_and_swap_bert_model,
)
from nebullvm.operations.optimizations.compilers.torchscript import (
TorchScriptCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import (
get_faster_transformer_repo_path,
)
from nebullvm.optional_modules.torch import (
GraphModule,
Module,
ScriptModule,
torch,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.huggingface import PyTorchTransformerWrapper
# Default location of the FasterTransformer torch extension library,
# resolved at import time inside the FasterTransformer repository folder.
default_lib_path = str(
    get_faster_transformer_repo_path()
    / "build"
    / "lib"
    / "libth_transformer.so"
)
def detect_and_swap_model(model, data_type="fp16", remove_padding=False):
    """Swap the supported parts of the model with FasterTransformer ones.

    Currently only supports:
    - BertModel and model with BertModel as .bert attribute

    Args:
        model: Model to be processed.
        data_type (str, optional): Either "fp16", "bf16" or any other value
            for leaving the precision of the model untouched.
            Default: "fp16".
        remove_padding (bool, optional): Forwarded to the bert swapping
            function. Default: False.

    Returns:
        The model returned by the swapping function, cast in place to the
        requested precision.
    """
    swapped = detect_and_swap_bert_model(
        model,
        data_type=data_type,
        lib_path=default_lib_path,
        remove_padding=remove_padding,
    )
    # The whole model is cast to the requested reduced precision, if any.
    if data_type == "fp16":
        swapped.half()
    elif data_type == "bf16":
        swapped.bfloat16()
    return swapped
class FasterTransformerCompiler(TorchScriptCompiler):
    """TorchScript compiler that first swaps the supported huggingface
    models with their FasterTransformer counterparts.
    """

    # Quantization types accepted for each device (None stands for no
    # quantization).
    supported_ops = {
        "cpu": [None, QuantizationType.STATIC, QuantizationType.DYNAMIC],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    @torch.no_grad()
    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> ScriptModule:
        """Swap the core model when possible, then compile with TorchScript.

        Args:
            model (Union[Module, GraphModule]): Model to be compiled. It is
                deep-copied, so the given object is not modified.
            input_data (DataManager): Data forwarded to the parent compiler.
            quantization_type (QuantizationType): Requested quantization;
                HALF selects "fp16" for the swap, anything else "fp32".

        Returns:
            ScriptModule: Result of the parent `_compile_model`.
        """
        model = deepcopy(model)  # Some operations modify the model in-place
        if isinstance(model, PyTorchTransformerWrapper):
            # .core_model is a huggingface model
            data_type = (
                "fp16"
                if quantization_type is QuantizationType.HALF
                else "fp32"
            )
            model.core_model = detect_and_swap_model(
                model.core_model, data_type=data_type, remove_padding=False
            )
            if self.device.type is DeviceType.GPU:
                model.cuda()
        return super()._compile_model(model, input_data, quantization_type)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/__init__.py
================================================
import os
from nebullvm.operations.optimizations.compilers.faster_transformer.bert.modeling_bert import ( # noqa: E501
BertModel as FasterBertModel,
)
from nebullvm.operations.optimizations.compilers.faster_transformer.bert.modeling_bert import ( # noqa: E501
CustomEncoder,
EncoderWeights,
)
from nebullvm.operations.optimizations.compilers.utils import (
get_faster_transformer_repo_path,
)
from nebullvm.optional_modules.huggingface import BertModel as HFBertModel
from nebullvm.optional_modules.torch import torch
# Default location of the FasterTransformer torch extension library,
# resolved at import time inside the FasterTransformer repository folder.
default_lib_path = str(
    get_faster_transformer_repo_path()
    / "build"
    / "lib"
    / "libth_transformer.so"
)
def swap_bert_encoder(model, data_type, lib_path, remove_padding=False):
    """Replace the encoder of the model with a custom encoder
    that uses the Faster Transformer library.

    The model is modified in place through `model.replace_encoder`.

    Args:
        model: Model exposing `config`, `state_dict` and `replace_encoder`.
        data_type (str): "fp16" or "bf16" for casting the encoder weights;
            any other value leaves their precision untouched.
        lib_path (str): Path of the Faster Transformer library. It is made
            absolute before being used.
        remove_padding (bool, optional): Forwarded to the custom encoder.
            Default: False.
    """
    config = model.config
    encoder_weights = EncoderWeights(
        config.num_hidden_layers,
        config.hidden_size,
        model.state_dict(),
    )
    # The weights are always moved to CUDA, whatever the precision.
    encoder_weights.to_cuda()
    if data_type == "fp16":
        encoder_weights.to_half()
    elif data_type == "bf16":
        encoder_weights.to_bfloat16()

    absolute_lib_path = os.path.abspath(lib_path)
    head_size = config.hidden_size // config.num_attention_heads
    custom_encoder = CustomEncoder(
        config.num_hidden_layers,
        config.num_attention_heads,
        head_size,
        encoder_weights,
        remove_padding=remove_padding,
        path=absolute_lib_path,
    )
    model.replace_encoder(torch.jit.script(custom_encoder))
def swap_model(
    model: HFBertModel, data_type, lib_path, remove_padding=False
) -> FasterBertModel:
    """Build a Faster Transformer bert model from a huggingface one.

    A custom bert class is required because calling the custom encoder
    needs some dedicated code.

    Args:
        model (HFBertModel): Source model; its config and state dict are
            used to initialize the new model.
        data_type (str): Forwarded to `swap_bert_encoder`.
        lib_path (str): Forwarded to `swap_bert_encoder`.
        remove_padding (bool, optional): Forwarded to `swap_bert_encoder`.
            Default: False.

    Returns:
        FasterBertModel: New model with the encoder already replaced.
    """
    faster_model = FasterBertModel(model.config)
    faster_model.load_state_dict(model.state_dict())
    swap_bert_encoder(
        faster_model,
        data_type,
        lib_path,
        remove_padding,
    )
    return faster_model
def detect_and_swap_bert_model(
    model, data_type, lib_path=default_lib_path, remove_padding=False
):
    """Swap huggingface bert models with their Faster Transformer version.

    Two cases are handled: the model itself is exactly a huggingface
    `BertModel`, or it has a `bert` attribute which is exactly one. Any
    other model is returned unchanged.

    Args:
        model: Model to be inspected.
        data_type (str): Forwarded to `swap_model`.
        lib_path (str, optional): Path of the Faster Transformer library.
        remove_padding (bool, optional): Forwarded to `swap_model`.
            Default: False.

    Returns:
        The swapped model, or the given model (whose `bert` attribute may
        have been replaced in place).
    """
    # Exact type matches only: subclasses are deliberately not swapped.
    if type(model) is HFBertModel:
        model = swap_model(model, data_type, lib_path, remove_padding)

    inner_bert = getattr(model, "bert", None) if hasattr(model, "bert") else None
    if hasattr(model, "bert") and type(inner_bert) is HFBertModel:
        model.bert = swap_model(
            inner_bert, data_type, lib_path, remove_padding
        )
    return model
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/checkpoint_quantization.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/checkpoint_quantization.py # noqa: E501
# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import numpy as np
from loguru import logger
from nebullvm.optional_modules.torch import torch
# Sizes summed (together with a term depending on the intermediate dense
# weight) to obtain the length of the per-layer amax list.
# NOTE(review): the meaning of each value presumably mirrors the upstream
# FasterTransformer checkpoint layout - confirm before changing them.
ACTIVATION_AMAX_NUM = 72
# Also used as the length of the int8O gemm amax lists.
INT8O_GEMM_NUM = 8
TRT_FUSED_MHA_AMAX_NUM = 3
SCALE_RESERVE_NUM = 21
def _write_amax_scales(amax_list, offset, amax):
    """Write the four scales derived from ``amax`` and return the next offset.

    The slots ``offset .. offset + 3`` receive ``amax``, ``amax / 127``,
    ``amax / 127 / 127`` and ``127 / amax`` respectively.
    """
    amax_list[offset] = amax
    amax_list[offset + 1] = amax / 127.0
    amax_list[offset + 2] = amax / 127.0 / 127.0
    amax_list[offset + 3] = 127.0 / amax
    return offset + 4


def checkpoint_quantization(
    init_dict, sparse, ths_path="./lib/libth_transformer.so", verbose=False
):
    """Quantize the encoder kernels of a checkpoint and pack its amax scales.

    For every encoder layer found in ``init_dict`` the function

    * checks that quantizers which must share a value actually do,
    * builds a float32 ``amaxList`` holding the activation scales, the
      per-channel kernel amax values, the int8-output GEMM de-quantization
      scales and the TRT fused-MHA amax values, and
    * replaces the six dense kernels with the output of the
      ``fastertransformer.weight_quantize`` op.

    Args:
        init_dict: Mapping from parameter name to tensor. It is modified in
            place. All per-layer entries are looked up under the
            ``"bert.encoder.layer.<i>."`` prefix.
        sparse: Forwarded unchanged to ``weight_quantize``.
        ths_path: Shared library loaded with ``torch.classes.load_library``.
        verbose: When true, log per-layer progress. The keyword is accepted
            because callers pass it by name; a signature without it makes
            those calls fail with ``TypeError``.

    Returns:
        The same ``init_dict`` object, with quantized kernels and one
        ``"bert.encoder.layer.<i>.amaxList"`` tensor per layer.

    Raises:
        KeyError: If an expected weight or ``_amax`` entry is missing.
        AssertionError: If quantizers that must share an amax differ.
    """
    # NOTE(review): keys are hard-wired to the "bert.encoder.layer." prefix;
    # a state dict whose encoder keys start with "encoder.layer." will raise
    # KeyError below - confirm the prefix used by the caller.
    logger.info("Quantizing checkpoint ...")
    torch.classes.load_library(ths_path)
    weight_quantize = torch.ops.fastertransformer.weight_quantize

    def init_graph():
        """Count the layers, size the amax buffer and pre-register it."""
        layer_num = 0
        regex = re.compile(r"layer\.\d+")
        amaxTotalNum = 0
        for name, tensor_value in init_dict.items():
            # The buffer size is derived once, from the first intermediate
            # kernel encountered.
            if "intermediate.dense.weight" in name and amaxTotalNum == 0:
                amaxTotalNum = (
                    ACTIVATION_AMAX_NUM
                    + 9 * tensor_value.size(1)
                    + INT8O_GEMM_NUM
                    + TRT_FUSED_MHA_AMAX_NUM
                    + SCALE_RESERVE_NUM
                )
            tmp = regex.findall(name)
            if len(tmp) < 1:
                continue
            num_tmp = int(tmp[0].replace("layer.", ""))
            if layer_num < num_tmp:
                layer_num = num_tmp
        # Highest layer index seen + 1.
        layer_num = layer_num + 1
        # add new var for amax
        for i in range(layer_num):
            init_dict[
                "bert.encoder.layer.{}.amaxList".format(i)
            ] = torch.zeros((amaxTotalNum,), dtype=torch.float32)
        return layer_num, amaxTotalNum

    layer_num, amaxTotalNum = init_graph()

    kernel_name_list = [
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
    ]
    # Order matters: entry k occupies amaxList[4 * k : 4 * k + 4].
    amax_name_list = [
        "attention.self.query._input_quantizer",
        "attention.self.query._aftergemm_quantizer",
        "attention.self.matmul_q_input_quantizer",
        "attention.self.key._aftergemm_quantizer",
        "attention.self.matmul_k_input_quantizer",
        "attention.self.value._aftergemm_quantizer",
        "attention.self.matmul_v_input_quantizer",
        "attention.self.softmax_input_quantizer",
        "attention.self.matmul_a_input_quantizer",
        "attention.output.dense._input_quantizer",
        "attention.output.dense._aftergemm_quantizer",
        "intermediate.dense._input_quantizer",
        "intermediate.dense._aftergemm_quantizer",
        "output.dense._input_quantizer",
        "output.dense._aftergemm_quantizer",
        "special_F2Bias_scale",
    ]
    # The three lists below are index-aligned: position j describes the
    # weight / input / output amax of the j-th int8-output GEMM.
    int8O_gemm_weight_amax_list = [0] * INT8O_GEMM_NUM
    int8O_gemm_weight_list = [
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.self.matmul_k_input_quantizer",
        "attention.self.matmul_v_input_quantizer",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
    ]
    int8O_gemm_input_amax_list = [0] * INT8O_GEMM_NUM
    int8O_gemm_input_list = [
        "attention.self.query._input_quantizer",
        "attention.self.key._input_quantizer",
        "attention.self.value._input_quantizer",
        "attention.self.matmul_q_input_quantizer",
        "attention.self.matmul_a_input_quantizer",
        "attention.output.dense._input_quantizer",
        "intermediate.dense._input_quantizer",
        "output.dense._input_quantizer",
    ]
    int8O_gemm_output_amax_list = [0] * INT8O_GEMM_NUM
    int8O_gemm_output_list = [
        "attention.self.query._aftergemm_quantizer",
        "attention.self.key._aftergemm_quantizer",
        "attention.self.value._aftergemm_quantizer",
        "attention.self.softmax_input_quantizer",
        "attention.output.dense._input_quantizer",
        "attention.output.dense._aftergemm_quantizer",
        "intermediate.dense._aftergemm_quantizer",
        "output.dense._aftergemm_quantizer",
    ]
    same_value_tuple_list = [
        (
            "attention.self.query._input_quantizer",
            "attention.self.key._input_quantizer",
            "attention.self.value._input_quantizer",
            "attention.output.add_residual_input_quantizer",
        ),
        (
            "intermediate.dense._input_quantizer",
            "output.add_residual_input_quantizer",
        ),
    ]

    for i in range(layer_num):
        layer_prefix = "bert.encoder.layer.{}.".format(i)
        amaxList = np.zeros([amaxTotalNum]).astype(np.float32)
        amax_id = 0

        # verify some quantizers have same value.
        # input_quantizer is per-tensor quantization
        for same_value_tuple in same_value_tuple_list:
            tmp_v = init_dict[
                layer_prefix + same_value_tuple[0] + "._amax"
            ].numpy()
            for same_value_name in same_value_tuple:
                tmp_v_2 = init_dict[
                    layer_prefix + same_value_name + "._amax"
                ].numpy()
                assert np.allclose(tmp_v, tmp_v_2)

        for amax_name in amax_name_list:
            if amax_name == "special_F2Bias_scale":
                if i != layer_num - 1:
                    # Taken from the first activation quantizer of the
                    # NEXT layer.
                    quant_max = init_dict[
                        "bert.encoder.layer.{}.{}._amax".format(
                            i + 1, amax_name_list[0]
                        )
                    ].item()
                    amax = abs(quant_max)
                else:
                    # not used, placeholder
                    amax = 1.0
                amax_id = _write_amax_scales(amaxList, amax_id, amax)
                continue

            quant_max = init_dict[layer_prefix + amax_name + "._amax"].item()
            amax = abs(quant_max)
            if amax_name in int8O_gemm_input_list:
                int8O_gemm_input_amax_list[
                    int8O_gemm_input_list.index(amax_name)
                ] = amax
                # key/value input quantizers are not in amax_name_list; they
                # take the query input amax.
                if amax_name == "attention.self.query._input_quantizer":
                    int8O_gemm_input_amax_list[
                        int8O_gemm_input_list.index(
                            "attention.self.key._input_quantizer"
                        )
                    ] = amax
                    int8O_gemm_input_amax_list[
                        int8O_gemm_input_list.index(
                            "attention.self.value._input_quantizer"
                        )
                    ] = amax
            if amax_name in int8O_gemm_output_list:
                int8O_gemm_output_amax_list[
                    int8O_gemm_output_list.index(amax_name)
                ] = amax
            if amax_name in int8O_gemm_weight_list:
                int8O_gemm_weight_amax_list[
                    int8O_gemm_weight_list.index(amax_name)
                ] = amax
            amax_id = _write_amax_scales(amaxList, amax_id, amax)

        if verbose:
            logger.info("done process layer_{} activation amax".format(i))

        # 16 activation entries * 4 scales each.
        assert amax_id == 64
        # kernel amax starts from ACTIVATION_AMAX_NUM
        amax_id = ACTIVATION_AMAX_NUM
        for kernel_name in kernel_name_list:
            weight_key = layer_prefix + kernel_name + ".weight"
            kernel = init_dict[weight_key].transpose(-1, -2).contiguous()
            quant_max2 = init_dict[
                layer_prefix + kernel_name + "._weight_quantizer._amax"
            ]
            amax2 = abs(quant_max2)
            if amax2.dim() == 0:
                # Per-tensor amax: broadcast to one value per kernel column.
                quant_max_processed = torch.full(
                    (kernel.size(1),),
                    amax2.item(),
                    dtype=amax2.dtype,
                    device=amax2.device,
                )
            else:
                quant_max_processed = amax2.view(-1)
            init_dict[weight_key] = weight_quantize(
                kernel, quant_max_processed.cuda(), sparse
            )
            if kernel_name in int8O_gemm_weight_list:
                int8O_gemm_weight_amax_list[
                    int8O_gemm_weight_list.index(kernel_name)
                ] = quant_max_processed[0]
            for e in quant_max_processed:
                amaxList[amax_id] = e
                amax_id += 1

        # for int8O gemm deQuant
        for j in range(INT8O_GEMM_NUM):
            amaxList[amax_id] = (
                int8O_gemm_input_amax_list[j] * int8O_gemm_weight_amax_list[j]
            ) / (127.0 * int8O_gemm_output_amax_list[j])
            amax_id += 1

        # for trt fused MHA amax
        # QKV_addBias_amax: max of the matmul_q / matmul_k / matmul_v amax
        # (first slot of activation entries 2, 4 and 6).
        amaxList[amax_id] = np.maximum(
            np.maximum(amaxList[8], amaxList[16]), amaxList[24]
        )
        amax_id += 1
        # softmax amax (first slot of entry 8, matmul_a_input_quantizer)
        amaxList[amax_id] = amaxList[32]
        amax_id += 1
        # bmm2 amax (first slot of entry 9, attention.output.dense input)
        amaxList[amax_id] = amaxList[36]
        amax_id += 1

        init_dict[layer_prefix + "amaxList"] = torch.tensor(
            amaxList, dtype=torch.float32
        )
        if verbose:
            logger.info("done process layer_{} kernel weight".format(i))

    logger.info("Quantizing checkpoint done.")
    return init_dict
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/bert/modeling_bert.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/modeling_bert.py # noqa: E501
# This file is mostly copied from the FasterTransformer repo
# https://github.com/NVIDIA/FasterTransformer
# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
from loguru import logger
from nebullvm.optional_modules.torch import torch, torch_distributed as dist
from nebullvm.optional_modules.huggingface import (
BertConfig,
BertEmbeddings,
BertEncoder,
BertPooler,
BertPreTrainedModel,
)
from .checkpoint_quantization import checkpoint_quantization
class EncoderWeights(object):
    """Holds the weights of a BERT encoder in the layout FasterTransformer uses.

    Weights live in ``self.weights``, a dict keyed like a HuggingFace state
    dict with per-layer entries under ``"encoder.layer.<i>."``. The class can
    generate random weights, cast them (half / bfloat16 / int8), move them to
    CUDA and export them as the ordered tensor list returned by
    ``listed_weights``.
    """

    _LAYER_PREFIX = "encoder.layer."

    # One row per per-layer parameter, in export order:
    #   (name, shape as multiples of hidden_dim, tensor-parallel split dim).
    # 2-D entries are transposed in the non-int8 export; a split dim of None
    # means the tensor is replicated on every tensor-parallel rank.
    # The order also fixes the order of random initialisation.
    _LAYER_PARAMS = (
        ("attention.self.query.weight", (1, 1), -1),
        ("attention.self.query.bias", (1,), -1),
        ("attention.self.key.weight", (1, 1), -1),
        ("attention.self.key.bias", (1,), -1),
        ("attention.self.value.weight", (1, 1), -1),
        ("attention.self.value.bias", (1,), -1),
        ("attention.output.dense.weight", (1, 1), 1),
        ("attention.output.dense.bias", (1,), None),
        ("attention.output.LayerNorm.weight", (1,), None),
        ("attention.output.LayerNorm.bias", (1,), None),
        ("intermediate.dense.weight", (4, 1), -1),
        ("intermediate.dense.bias", (4,), -1),
        ("output.dense.weight", (1, 4), 1),
        ("output.dense.bias", (1,), None),
        ("output.LayerNorm.weight", (1,), None),
        ("output.LayerNorm.bias", (1,), None),
    )

    # Quantizer entries created for generated weights by to_int8:
    #   (quantizer name, per-channel width as a multiple of hidden_dim,
    #    or None for a scalar amax).
    _GENERATED_AMAX = (
        ("attention.self.query._input_quantizer", None),
        ("attention.self.query._weight_quantizer", 1),
        ("attention.self.query._aftergemm_quantizer", None),
        ("attention.self.key._input_quantizer", None),
        ("attention.self.key._weight_quantizer", 1),
        ("attention.self.key._aftergemm_quantizer", None),
        ("attention.self.value._input_quantizer", None),
        ("attention.self.value._weight_quantizer", 1),
        ("attention.self.value._aftergemm_quantizer", None),
        ("attention.self.matmul_q_input_quantizer", None),
        ("attention.self.matmul_k_input_quantizer", None),
        ("attention.self.matmul_v_input_quantizer", None),
        ("attention.self.matmul_a_input_quantizer", None),
        ("attention.self.softmax_input_quantizer", None),
        ("attention.output.dense._input_quantizer", None),
        ("attention.output.dense._weight_quantizer", 1),
        ("attention.output.dense._aftergemm_quantizer", None),
        ("attention.output.add_local_input_quantizer", None),
        ("attention.output.add_residual_input_quantizer", None),
        ("intermediate.dense._input_quantizer", None),
        ("intermediate.dense._weight_quantizer", 4),
        ("intermediate.dense._aftergemm_quantizer", None),
        ("output.dense._input_quantizer", None),
        ("output.dense._weight_quantizer", 1),
        ("output.dense._aftergemm_quantizer", None),
        ("output.add_local_input_quantizer", None),
        ("output.add_residual_input_quantizer", None),
    )

    def __init__(
        self,
        layer_num,
        hidden_dim,
        weights=None,
        sparse=False,
        tensor_para_size=1,
        pipeline_para_size=1,
    ):
        """Collect or generate the encoder weights.

        Args:
            layer_num: Number of encoder layers.
            hidden_dim: Hidden size of the encoder.
            weights: State dict of a BERT model. ``LayerNorm.gamma`` /
                ``LayerNorm.beta`` keys are renamed to ``weight`` / ``bias``.
                When ``None``, weights are drawn uniformly from [-1, 1].
            sparse: Only used for generated weights: in every group of four
                consecutive values of the dense kernels, the two with the
                smallest magnitude are zeroed.
            tensor_para_size: Number of tensor-parallel ranks.
            pipeline_para_size: Number of pipeline-parallel ranks.

        Side effects:
            May initialise the MPI process group and selects the CUDA device
            ``rank % device_count`` (so at least one CUDA device is needed).
        """
        self.layer_num = layer_num
        self.int8 = False
        self.hidden_dim = hidden_dim
        self.weights = {}
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.use_mpi = dist.is_mpi_available()

        if self.use_mpi:
            try:
                dist.init_process_group(backend="mpi")
            except Exception:
                # Best effort: the group may already exist. Interpreter-level
                # signals (KeyboardInterrupt, SystemExit) are not swallowed.
                logger.info(
                    "[INFO] WARNING: Exception occurred in "
                    "dist.init_process_group(backend='mpi')."
                    "Maybe the process group has been initialized somewhere else."  # noqa: E501
                )
        else:
            logger.info("[INFO] MPI is not available in this PyTorch build.")
            assert (
                tensor_para_size == 1
            ), "[FATAL] MPI is required for tensor_para_size > 1."
            assert (
                pipeline_para_size == 1
            ), "[FATAL] MPI is required for pipeline_para_size > 1."

        self.rank = dist.get_rank() if self.use_mpi else 0
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        if weights is None:
            self._generated_weights = True
            for i in range(layer_num):
                pre = self._LAYER_PREFIX + str(i) + "."
                for name, shape, _ in self._LAYER_PARAMS:
                    self.weights[pre + name] = torch.zeros(
                        [mult * hidden_dim for mult in shape]
                    )
            for k, v in self.weights.items():
                if not k.endswith("_amax"):
                    self.weights[k] = torch.nn.init.uniform_(v, -1, 1)
            if sparse:
                kernel_tags = (
                    "query.weight",
                    "key.weight",
                    "value.weight",
                    "dense.weight",
                )
                for k, v in self.weights.items():
                    if any(tag in k for tag in kernel_tags):
                        v_shape = v.shape
                        v = v.view(-1, 4)
                        # Zero the two smallest-magnitude values of each
                        # group of four (in place, through the view).
                        _, indices = torch.topk(
                            torch.abs(v), 2, dim=-1, largest=False
                        )
                        v.scatter_(1, indices, 0)
                        self.weights[k] = v.view(v_shape)
        else:
            self._generated_weights = False
            for k, v in weights.items():
                ks = k.split(".")
                # Guard against keys without a dot, for which ks[-2] would
                # raise IndexError.
                if len(ks) >= 2 and ks[-2] == "LayerNorm":
                    if ks[-1] == "gamma":
                        ks[-1] = "weight"
                    elif ks[-1] == "beta":
                        ks[-1] = "bias"
                self.weights[".".join(ks)] = v

    def _stack_layers(self, name, layers, transpose=False):
        """Stack parameter ``name`` of the given layers along a new dim 0."""
        per_layer = [
            self.weights[self._LAYER_PREFIX + str(layer_idx) + "." + name]
            for layer_idx in layers
        ]
        if transpose:
            per_layer = [t.transpose(-1, -2) for t in per_layer]
        return torch.stack(per_layer, 0).contiguous()

    def listed_weights(self):
        """Return the weights as the ordered list of stacked tensors.

        Non-int8: 16 tensors, one per row of ``_LAYER_PARAMS``, stacked over
        the layers owned by this pipeline rank. 2-D kernels are transposed,
        and tensors with a split dim keep only this tensor-parallel rank's
        slice.

        Int8: the same 16 parameters stacked over all layers without
        transpose or split, followed by ``amaxList`` and ``h_amaxList``
        (18 tensors).
        """
        start_layer = (
            self.pipeline_para_rank * self.layer_num // self.pipeline_para_size
        )
        end_layer = (
            (self.pipeline_para_rank + 1)
            * self.layer_num
            // self.pipeline_para_size
        )

        if self.int8:
            all_layers = range(self.layer_num)
            names = [name for name, _, _ in self._LAYER_PARAMS]
            names += ["amaxList", "h_amaxList"]
            return [self._stack_layers(name, all_layers) for name in names]

        ret = []
        own_layers = range(start_layer, end_layer)
        for name, shape, split_dim in self._LAYER_PARAMS:
            stacked = self._stack_layers(
                name, own_layers, transpose=len(shape) == 2
            )
            if split_dim is not None:
                chunk = stacked.shape[split_dim] // self.tensor_para_size
                stacked = stacked.split(chunk, dim=split_dim)[
                    self.tensor_para_rank
                ].contiguous()
            ret.append(stacked)
        return ret

    def to_cuda(self):
        """Move the weights to the current CUDA device.

        In int8 mode, every ``amaxList`` entry is additionally kept, as it
        was before the move, under the matching ``h_amaxList`` key.
        """
        if not self.int8:
            for k, v in self.weights.items():
                self.weights[k] = v.cuda()
        else:
            h_scale_list = {}
            for k, v in self.weights.items():
                if "amaxList" in k:
                    k_h = k.replace("amaxList", "h_amaxList")
                    h_scale_list[k_h] = v
                self.weights[k] = v.cuda()
            # Added after the loop so the dict does not grow while iterated.
            for k, v in h_scale_list.items():
                self.weights[k] = v

    def to_half(self):
        """Cast every weight to float16.

        Raises:
            RuntimeError: If the weights were already converted to int8.
        """
        if self.int8:
            raise RuntimeError(
                "Cannot cast to half if the weights have been casted to int8."
            )
        for k, v in self.weights.items():
            self.weights[k] = v.half()

    def to_bfloat16(self):
        """Cast every weight to bfloat16.

        Raises:
            RuntimeError: If the weights were already converted to int8.
        """
        if self.int8:
            raise RuntimeError(
                "Cannot cast to bfloat16 if the weights have been casted to int8."  # noqa: E501
            )
        for k, v in self.weights.items():
            self.weights[k] = v.bfloat16()

    def to_int8(self, sparse=False, ths_path="./lib/libth_transformer.so"):
        """Quantize the weights to int8 via ``checkpoint_quantization``.

        For generated weights, placeholder quantizer amax entries (all
        127.0) are created first. Calling the method again once ``int8`` is
        set is a no-op.

        Args:
            sparse: Forwarded to ``checkpoint_quantization``.
            ths_path: Shared library path forwarded to
                ``checkpoint_quantization``.

        Raises:
            RuntimeError: If the weights contain no quantizer entries.
        """
        if self._generated_weights:
            amax_tensor_1 = torch.Tensor(self.hidden_dim).fill_(127.0)
            amax_tensor_2 = torch.Tensor(self.hidden_dim * 4).fill_(127.0)
            # Per-channel amax tensors are shared between layers, exactly
            # one object per width.
            per_channel = {1: amax_tensor_1, 4: amax_tensor_2}
            for i in range(self.layer_num):
                pre = self._LAYER_PREFIX + str(i) + "."
                for quantizer, width in self._GENERATED_AMAX:
                    key = pre + quantizer + "._amax"
                    if width is None:
                        self.weights[key] = torch.tensor(127.0)
                    else:
                        self.weights[key] = per_channel[width]
        if (
            "encoder.layer.0.attention.self.query._input_quantizer._amax"
            not in self.weights
        ):
            raise RuntimeError(
                "There is no quantization node in the checkpoint, cannot be quantized to int8."  # noqa: E501
            )
        if self.int8:
            return
        self.int8 = True
        for k, v in self.weights.items():
            if k.endswith(("bias", "LayerNorm.weight")):
                self.weights[k] = v.half()
            elif k.endswith("weight"):
                self.weights[k] = v.float().cuda()
            else:
                self.weights[k] = v.float().cpu()
        # Pass only the arguments checkpoint_quantization is guaranteed to
        # accept: an extra ``verbose`` keyword raised TypeError here.
        # NOTE(review): checkpoint_quantization looks entries up under
        # "bert.encoder.layer.<i>." while this class stores them under
        # "encoder.layer.<i>." - confirm the two agree before relying on
        # the int8 path.
        self.weights = checkpoint_quantization(self.weights, sparse, ths_path)
class CustomEncoder(torch.nn.Module):
    """Encoder module backed by the FasterTransformer BERT custom class.

    The constructor loads the FasterTransformer shared library and builds
    either the floating-point (``int8_mode == 0``) or the INT8 custom class
    from the tensors returned by ``weights.listed_weights()``.
    """

    def __init__(
        self,
        layer_num,
        head_num,
        head_size,
        weights,
        int8_mode=0,
        remove_padding=False,
        sparse=False,
        path="./lib/libth_transformer.so",
        tensor_para_size=1,
        pipeline_para_size=1,
    ):
        """Load the library and instantiate the custom encoder.

        Args:
            layer_num: Number of encoder layers.
            head_num: Number of attention heads.
            head_size: Size of each attention head.
            weights: Object exposing ``listed_weights()``; it must yield 16
                tensors when ``int8_mode == 0`` and 18 otherwise.
            int8_mode: 0 selects the floating-point class, any other value
                the INT8 class (and is forwarded to it).
            remove_padding: Forwarded to the custom class.
            sparse: Forwarded to the custom class.
            path: Shared library loaded with ``torch.classes.load_library``.
            tensor_para_size: Must be 1 without MPI and in INT8 mode.
            pipeline_para_size: Must be 1 without MPI and in INT8 mode.
        """
        super().__init__()
        self.layer_num = layer_num
        self.remove_padding = remove_padding
        self.int8_mode = int8_mode
        logger.info(f"loading faster transformer library from {path}")
        torch.classes.load_library(path)
        weights_ = weights.listed_weights()
        self.use_mpi = dist.is_mpi_available()
        if self.use_mpi:
            try:
                dist.init_process_group(backend="mpi")
            except Exception:
                # Best effort: the group may already exist. Interpreter-level
                # signals (KeyboardInterrupt, SystemExit) are not swallowed.
                logger.info(
                    "[INFO] WARNING: Exception occurred in"
                    "dist.init_process_group(backend='mpi')."
                    "Maybe the process group has been initialized somewhere else."  # noqa: E501
                )
        else:
            logger.info("[INFO] MPI is not available in this PyTorch build.")
            assert (
                tensor_para_size == 1
            ), "[FATAL] MPI is required for tensor_para_size > 1."
            assert (
                pipeline_para_size == 1
            ), "[FATAL] MPI is required for pipeline_para_size > 1."

        if int8_mode == 0:
            assert len(weights_) == 16
            bert_args = (
                *weights_,
                head_num,
                head_size,
                # The intermediate size is fixed to 4 * hidden size.
                4 * head_num * head_size,
                remove_padding,
                layer_num,
                sparse,
                1.0,
                tensor_para_size,
                pipeline_para_size,
            )
            try:
                self.encoders = torch.classes.FasterTransformer.Bert(
                    *bert_args
                )
            except Exception:
                # legacy ths for 20.03 image
                self.encoders = torch.classes.FasterTransformerBert(
                    *bert_args
                )
        else:
            assert len(weights_) == 18
            assert (
                tensor_para_size == 1
            ), "INT8 BERT still only support tensor_para_size = 1"
            assert (
                pipeline_para_size == 1
            ), "INT8 BERT still only support pipeline_para_size = 1"
            int8_args = (
                *weights_,
                head_num,
                head_size,
                remove_padding,
                layer_num,
                int8_mode,
                sparse,
                1.0,
            )
            try:
                self.encoders = torch.classes.FasterTransformer.INT8Bert(
                    *int8_args
                )
            except Exception:
                # legacy ths for 20.03 image
                self.encoders = torch.classes.FasterTransformerINT8Bert(
                    *int8_args
                )

    def forward(self, hidden_states, attention_mask, sequence_lengths):
        """Run the custom encoder.

        ``attention_mask`` is accepted for interface compatibility but is not
        used; only ``hidden_states`` and ``sequence_lengths`` reach the
        custom class.

        Returns:
            A one-element tuple holding the encoder output.
        """
        hidden_states = self.encoders.forward(hidden_states, sequence_lengths)
        return (hidden_states,)
class HuggingFaceEncoder(torch.nn.Module):
    """Plain HuggingFace ``BertEncoder`` built from an ``EncoderWeights``-like
    object."""

    # Prefix under which the encoder parameters are stored in the weight
    # dict; BertEncoder.load_state_dict expects the keys without it.
    _ENCODER_PREFIX = "encoder."

    def __init__(self, layer_num, head_num, head_size, weights=None):
        """Build the encoder and, when given, load ``weights`` into it.

        Args:
            layer_num: Number of hidden layers.
            head_num: Number of attention heads.
            head_size: Size of each attention head; the hidden size is
                ``head_num * head_size``.
            weights: Object exposing a ``weights`` dict whose encoder entries
                are keyed ``"encoder.layer.<i>...."``. When ``None`` the
                encoder keeps its freshly initialised parameters.
        """
        super().__init__()
        hidden_dim = head_num * head_size
        # TODO(bhsueh) The implementation of hidden_act='gelu' is different
        # to FT's (and google BERT) implementation
        # FT's implementation is equivalent to hidden_act='gelu_new',
        # but there are some issues for int8 sparse under gelu_new
        conf = BertConfig(
            hidden_size=hidden_dim,
            intermediate_size=4 * hidden_dim,
            num_attention_heads=head_num,
            num_hidden_layers=layer_num,
            hidden_act="gelu",
        )
        self.encoder = BertEncoder(conf)
        if weights is not None:
            self.encoder.load_state_dict(
                self._encoder_state_dict(weights.weights)
            )
        self.head_mask = [None] * layer_num

    @classmethod
    def _encoder_state_dict(cls, all_weights):
        """Select the encoder entries and strip the ``"encoder."`` prefix.

        Quantizer entries (keys ending in ``"_amax"``) are dropped. Slicing
        by the real prefix length matters: a fixed offset of 13 turned
        ``"encoder.layer.0.x"`` into ``".0.x"``, which no encoder parameter
        is named.
        """
        prefix = cls._ENCODER_PREFIX
        return {
            key[len(prefix):]: value
            for key, value in all_weights.items()
            if key.startswith(prefix) and not key.endswith("_amax")
        }

    def forward(self, hidden_states, attention_mask):
        """Run the encoder with an additive mask built from ``attention_mask``.

        Mask entries equal to 1 become 0 and entries equal to 0 become
        -10000 before being handed to the encoder.
        """
        extended_attention_mask = (1.0 - attention_mask) * -10000.0
        output = self.encoder(
            hidden_states,
            extended_attention_mask,
            self.head_mask,
            return_dict=False,
        )
        return output
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/bert/utils/modeling_bert.py # noqa: E501
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # noqa: E501
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model modified from HuggingFace transformers. """
class BertModel(BertPreTrainedModel):
    """BERT backbone whose encoder can be replaced by an external one.

    Until ``replace_encoder`` is called the model runs the regular
    HuggingFace ``BertEncoder``. Afterwards the replacement encoder is
    called with the embeddings, a pairwise mask and the per-sequence
    lengths.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.init_weights()
        self.use_ext_encoder = False

    def _ext_encoder_mask(self, attention_mask):
        """Build the mask and sequence lengths for the external encoder.

        Only 2-D masks are accepted. The mask is multiplied with its own
        transpose, cast to the model's parameter dtype, and the row sums of
        the original mask are returned as int32 lengths on CUDA.
        """
        assert attention_mask.dim() == 2
        row_mask = attention_mask.view(-1, 1, 1, attention_mask.size(-1))
        pairwise = row_mask * row_mask.transpose(-1, -2)
        # fp16 compatibility
        pairwise = pairwise.to(dtype=next(self.parameters()).dtype)
        lengths = torch.sum(attention_mask, 1, dtype=torch.int32).cuda()
        return pairwise, lengths

    def _additive_mask(self, attention_mask, input_shape):
        """Build the additive mask for the HuggingFace encoder.

        A 3-D mask gains a head axis and a 2-D mask gains head and query
        axes so that it broadcasts over all heads. The result is 0.0 where
        the mask is 1 and -10000.0 where it is 0; added to the raw scores
        before the softmax this effectively removes the masked positions.
        """
        rank = attention_mask.dim()
        if rank == 3:
            broadcastable = attention_mask[:, None, :, :]
        elif rank == 2:
            broadcastable = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(  # noqa: E501
                    input_shape, attention_mask.shape
                )
            )
        # fp16 compatibility
        broadcastable = broadcastable.to(dtype=next(self.parameters()).dtype)
        return (1.0 - broadcastable) * -10000.0

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Embed the inputs, run the active encoder and pool the result.

        Exactly one of ``input_ids`` and ``inputs_embeds`` must be given.
        A missing ``attention_mask`` defaults to all ones and a missing
        ``token_type_ids`` to all zeros. The passed ``head_mask`` is not
        used: the built-in encoder always receives a list of ``None``.

        Returns:
            ``(sequence_output, pooled_output)`` followed by any additional
            encoder outputs.

        Raises:
            ValueError: If both or neither of ``input_ids`` /
                ``inputs_embeds`` are given, or if the attention mask
                passed to the built-in encoder is neither 2-D nor 3-D.
        """
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None
        if has_ids and has_embeds:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"  # noqa: E501
            )
        if not (has_ids or has_embeds):
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds"
            )

        if has_ids:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(
                input_shape, dtype=torch.long, device=device
            )

        if self.use_ext_encoder:
            extended_attention_mask, seq_lens = self._ext_encoder_mask(
                attention_mask
            )
        else:
            extended_attention_mask = self._additive_mask(
                attention_mask, input_shape
            )

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )

        if self.use_ext_encoder:
            encoder_outputs = self.encoder(
                embedding_output, extended_attention_mask, seq_lens
            )
        else:
            encoder_outputs = self.encoder(
                embedding_output,
                attention_mask=extended_attention_mask,
                head_mask=[None] * self.config.num_hidden_layers,
            )

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)
        # sequence_output, pooled_output, (hidden_states), (attentions)
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    def replace_encoder(self, new_encoder):
        """Install ``new_encoder`` and switch ``forward`` to the external path."""
        self.encoder = new_encoder
        self.use_ext_encoder = True
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/__init__.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/gpt_summarization.py # noqa: E501
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
from typing import Callable, Iterable, List, Optional, Tuple, Union
from nebullvm.operations.optimizations.compilers.faster_transformer.gpt.utils import \
gpt_decoder
from nebullvm.operations.optimizations.compilers.faster_transformer.gpt.utils.huggingface_gpt_convert import ( # noqa: E501
main as convert_huggingface_gpt_to_faster_transformer,
)
from nebullvm.operations.optimizations.compilers.utils import (
get_faster_transformer_repo_path,
)
from nebullvm.optional_modules.huggingface import GPT2LMHeadModel
from nebullvm.optional_modules.torch import torch
# Location of the compiled FasterTransformer PyTorch extension inside the
# FasterTransformer repository checkout (``<repo>/build/lib``).
default_lib_path = str(
    get_faster_transformer_repo_path().joinpath(
        "build", "lib", "libth_transformer.so"
    )
)
# ``lib_path`` is the value actually handed to the FT model; it starts out
# equal to the default.
lib_path = default_lib_path
class FasterTransformerGPT2Wrapper(torch.nn.Module):
    """Expose a FasterTransformer GPT model through a HuggingFace-style
    ``generate`` method.

    Args:
        model: the FasterTransformer ``gpt_decoder.Gpt`` instance to wrap.
            Its ``device`` attribute is mirrored on the wrapper and its
            ``generate`` method performs the actual decoding.
        config: configuration object of the source model; ``config.n_ctx``
            is read as the fallback total sequence length in ``generate``.
    """

    def __init__(self, model: "gpt_decoder.Gpt", config):
        super().__init__()
        self.model = model
        self.config = config
        self.device = model.device

    @staticmethod
    def _per_sample_tensor(value, batch_size, dtype=torch.float32):
        """Broadcast a scalar to a CPU tensor of shape ``(batch_size,)``.

        ``None`` and values that already are tensors are returned unchanged.
        """
        if value is None or isinstance(value, torch.Tensor):
            return value
        return value * torch.ones(batch_size, dtype=dtype)  # cpu tensor

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        max_length: Optional[int] = None,
        min_length: Optional[int] = None,
        do_sample: Optional[bool] = None,
        early_stopping: Optional[bool] = None,
        num_beams: Optional[int] = 1,
        temperature: Optional[float] = None,
        penalty_alpha: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        typical_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        bad_words_ids: Optional[Iterable[int]] = None,
        force_words_ids: Optional[
            Union[Iterable[int], Iterable[Iterable[int]]]
        ] = None,
        bos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        length_penalty: Optional[float] = None,
        no_repeat_ngram_size: Optional[int] = None,
        encoder_no_repeat_ngram_size: Optional[int] = None,
        num_return_sequences: Optional[int] = None,
        max_time: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
        decoder_start_token_id: Optional[int] = None,
        use_cache: Optional[bool] = None,
        num_beam_groups: Optional[int] = None,
        diversity_penalty: Optional[float] = None,
        prefix_allowed_tokens_fn: Optional[
            Callable[[int, torch.Tensor], List[int]]
        ] = None,
        # logits_processor, renormalize_logits, stopping_criteria and
        # constraints from the HuggingFace signature are not accepted.
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        forced_bos_token_id: Optional[int] = None,
        forced_eos_token_id: Optional[int] = None,
        remove_invalid_values: Optional[bool] = None,
        synced_gpus: Optional[bool] = False,
        exponential_decay_length_penalty: Optional[Tuple[int, float]] = None,
        suppress_tokens: Optional[List[int]] = None,
        begin_suppress_tokens: Optional[List[int]] = None,
        forced_decoder_ids: Optional[List[List[int]]] = None,
    ):
        """Generate token ids with the wrapped FasterTransformer model.

        The signature mirrors HuggingFace's ``generate`` so the wrapper can be
        used as a drop-in replacement, but only these arguments are forwarded
        to FasterTransformer: ``inputs``, ``max_length``, ``max_new_tokens``,
        ``min_length``, ``num_beams``, ``temperature``, ``top_k``, ``top_p``,
        ``repetition_penalty``, ``length_penalty`` and ``eos_token_id``.
        Every other argument is accepted and ignored.

        Args:
            inputs: batch of input token ids. Each row is passed with its
                full length as the input length (no padding is detected).
            max_length: total length (prompt + generated). When omitted and
                ``max_new_tokens`` is not given, ``config.n_ctx`` is used.
                The prompt length is taken from the first row of ``inputs``.
            max_new_tokens: number of tokens to generate. When given it takes
                precedence over ``max_length``.

        Returns:
            A list with one tensor per batch element:
            ``output_token_ids[i, 0, :output_lengths[i]]``. The slice starts
            at 0, so it presumably includes the prompt tokens.

        Raises:
            TypeError: if ``inputs`` is not provided.
        """
        if inputs is None:
            raise TypeError(
                "generate() requires `inputs`: FasterTransformer cannot "
                "generate without input token ids"
            )

        batch_size = len(inputs)
        input_lengths = torch.tensor(
            [len(sequence) for sequence in inputs],
            dtype=torch.int32,
            device=self.model.device,
        )

        # FasterTransformer expects per-sample tensors for sampling options.
        top_k = self._per_sample_tensor(top_k, batch_size, torch.int32)
        top_p = self._per_sample_tensor(top_p, batch_size, torch.float32)
        temperature = self._per_sample_tensor(
            temperature, batch_size, torch.float32
        )
        repetition_penalty = self._per_sample_tensor(
            repetition_penalty, batch_size, torch.float32
        )
        min_length = self._per_sample_tensor(
            min_length, batch_size, torch.int32
        )
        len_penalty = self._per_sample_tensor(
            length_penalty, batch_size, torch.float32
        )

        # gen_length is required by FasterTransformer.
        if max_new_tokens is not None:
            gen_length = max_new_tokens
        else:
            if max_length is None:
                # infer the total length from the model config
                max_length = self.config.n_ctx
            gen_length = max_length - len(inputs[0])

        # FT options not exposed here: local_batch_size, top_p_decay,
        # top_p_min, top_p_reset_ids, presence_penalty,
        # beam_search_diversity_rate, stop_words_list, bad_words_list,
        # sequence_limit_lengths, random_seed, memory_length.
        output_dict = self.model.generate(
            input_token_ids=inputs,
            input_lengths=input_lengths,
            gen_length=gen_length,
            eos_token_id=eos_token_id,
            beam_width=num_beams,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            min_length=min_length,
            len_penalty=len_penalty,
            return_output_length=True,
            return_log_probs=False,
        )

        output_token_ids = output_dict["output_token_ids"]
        output_lengths = output_dict["output_lengths"]
        return [
            output_token_ids[i, 0, : output_lengths[i]]
            for i in range(batch_size)
        ]
def convert_gpt2_lm_head_model(
    model: GPT2LMHeadModel,
    tokenizer,
    weight_data_type="fp32",
    data_type="fp16",
    use_fp32_to_compute_logit=False,
):
    """Convert a HuggingFace GPT-2 LM-head model into a FasterTransformer one.

    The transformer weights are exported to a temporary FasterTransformer
    checkpoint, loaded into a ``gpt_decoder.Gpt`` instance and wrapped in a
    ``FasterTransformerGPT2Wrapper``. The temporary checkpoint is removed
    before returning.

    Currently doesn't support fp8 or multi-gpu.

    Args:
        model: the HuggingFace model; ``model.transformer`` is exported and
            ``model.config`` provides the architecture hyper-parameters.
        tokenizer: tokenizer whose ``vocab_size`` sizes the FT vocabulary.
        weight_data_type: data type used to store the exported weights.
        data_type: data type used for inference.
        use_fp32_to_compute_logit: forwarded unchanged to ``gpt_decoder.Gpt``.

    Returns:
        A ``FasterTransformerGPT2Wrapper`` around the loaded FT model.
    """
    # Single-GPU only: no tensor or pipeline parallelism.
    tensor_para_size = 1
    pipeline_para_size = 1
    int8_mode = 0  # 0: no quantization, 1: quantize weights to int8

    # The context manager guarantees the exported checkpoint is deleted even
    # if conversion or loading fails.
    with tempfile.TemporaryDirectory() as temp_dir_path:
        saved_dir = os.path.join(temp_dir_path, "gpt2")
        hf_config = model.config.to_dict()

        # convert huggingface model to faster transformer model
        convert_huggingface_gpt_to_faster_transformer(
            saved_dir=saved_dir,
            model=model.transformer,
            weight_data_type=weight_data_type,
        )

        head_num = hf_config["n_head"]
        ckpt_path = os.path.join(saved_dir, f"{tensor_para_size}-gpu")

        # load faster transformer model, note that the lm_head is not saved
        # it's reconstructed during loading from the embedding weights
        gpt = gpt_decoder.Gpt(
            num_heads=head_num,
            size_per_head=hf_config["n_embd"] // head_num,
            num_layers=hf_config["n_layer"],
            vocab_size=tokenizer.vocab_size,
            start_id=hf_config["bos_token_id"],
            end_id=hf_config["eos_token_id"],
            tensor_para_size=tensor_para_size,
            pipeline_para_size=pipeline_para_size,
            lib_path=lib_path,
            max_seq_len=hf_config["n_ctx"],
            int8_mode=int8_mode,
            inference_data_type=data_type,
            weights_data_type=weight_data_type,
            use_fp32_to_compute_logit=use_fp32_to_compute_logit,
        )
        # The weights must be read while the temporary directory exists.
        gpt.load(ckpt_path, data_type)

    return FasterTransformerGPT2Wrapper(gpt, model.config)
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token
# model = hf_model = GPT2LMHeadModel.from_pretrained("gpt2").to("cuda").eval()
# hf_config = hf_model.config.to_dict()
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# weight_data_type = weights_data_type = "fp32" # fp32 or fp16
# data_type = "fp32" # fp32 or fp16
# faster_model= convert_gpt2_lm_head_model(
# model, tokenizer,
# weight_data_type=weight_data_type,
# data_type=data_type)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/gpt_decoder.py
================================================
# Based on: https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/gpt_decoder.py # noqa: E501
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from abc import abstractmethod
from pathlib import Path
from typing import List, Literal, Optional, Union
import os
import numpy as np
from . import comm
from . import profiler
from .gpt import GptInitModelParameters
from nebullvm.optional_modules.torch import torch
# Type alias for arguments that accept a filesystem path either as a plain
# string or as a ``pathlib.Path``.
PathLike = Union[str, Path]
def to_numpy_dtype(maybe_str_dtype: Union[str, np.dtype]):
    """Resolve a data type name to a numpy data type.

    Args:
        maybe_str_dtype: one of the names ``"fp16"``, ``"float16"``,
            ``"fp32"``, ``"float32"``; or an ``np.dtype`` instance; or a
            numpy scalar type such as ``np.float32``.

    Returns:
        The numpy scalar type for a known name; otherwise the given
        ``np.dtype`` or numpy scalar type, unchanged.

    Raises:
        ValueError: if a string is given that is not a known name.
        TypeError: if the argument is neither a string nor a numpy data type.
    """
    if isinstance(maybe_str_dtype, str):
        name_to_dtype = {
            "fp16": np.float16,
            "float16": np.float16,
            "fp32": np.float32,
            "float32": np.float32,
        }
        try:
            return name_to_dtype[maybe_str_dtype]
        except KeyError:
            raise ValueError(
                f"Cannot convert to numpy data type, got {maybe_str_dtype}"
            ) from None

    if isinstance(maybe_str_dtype, np.dtype):
        return maybe_str_dtype

    # numpy scalar types (e.g. np.float32) are valid dtype specifiers too.
    if isinstance(maybe_str_dtype, type) and issubclass(
        maybe_str_dtype, np.generic
    ):
        return maybe_str_dtype

    # Explicit raise instead of assert so the check survives ``python -O``.
    raise TypeError(
        "Expected a str or numpy data type, got "
        f"{type(maybe_str_dtype).__name__}"
    )
def to_torch_dtype(maybe_str_dtype: Union[str, torch.dtype]):
    """Resolve a data type name to a ``torch.dtype``.

    A ``torch.dtype`` is passed through unchanged. Otherwise the argument
    must be one of ``"bf16"``, ``"bfloat16"``, ``"fp16"``, ``"float16"``,
    ``"fp32"`` or ``"float32"``.

    Raises:
        ValueError: if the name is not one of the known aliases.
    """
    if isinstance(maybe_str_dtype, torch.dtype):
        return maybe_str_dtype

    aliases = dict(
        bf16=torch.bfloat16,
        fp16=torch.float16,
        fp32=torch.float32,
        bfloat16=torch.bfloat16,
        float16=torch.float16,
        float32=torch.float32,
    )
    if maybe_str_dtype not in aliases:
        raise ValueError(
            f"Cannot convert to torch data type, got {maybe_str_dtype}"
        )
    return aliases[maybe_str_dtype]
def load_weight_from_bin(
    checkpoint_path: PathLike,
    shape: List[int],
    weight_dtype: Union[str, np.dtype],
):
    """Read one weight from a raw binary file into a torch tensor.

    # Args.
        checkpoint_path: str or Path,
            a checkpoint file path of an FT's layer weight.
        shape: list of int, the shape of weight tensor.
            NOTE: this function does not apply it; the returned tensor is
            one-dimensional and the caller is expected to reshape.
        weight_dtype: str or np.dtype, the data type of the stored weight.

    # Returns
        A 1-D tensor holding the file content interpreted as weight_dtype.
    """
    numpy_dtype = to_numpy_dtype(weight_dtype)
    flat_values = np.fromfile(checkpoint_path, dtype=numpy_dtype)
    return torch.from_numpy(flat_values)
# Accepted values for the ``layernorm_type`` argument of the decoder modules
# in this file.
LayernormType = Literal["pre_layernorm", "post_layernorm"]
class GptLayerWeights:
    """Holder for the transformer-layer weights of a FasterTransformer GPT.

    Weights are kept in ``self.weights`` as a flat list grouped by weight
    kind: all local layers' input-layernorm weights first, then all
    input-layernorm biases, and so on. ``self.expected_weight_shapes``
    lists the expected shape of every entry in the same order. With
    ``int8_mode == 1`` the quantized matrices and their scales live in
    ``self.int8_weights`` / ``self.int8_scales`` and the matching entry of
    ``self.weights`` is an empty placeholder tensor.

    # Args.
        num_heads: int, total number of attention heads; must be a multiple
            of tensor_para_size.
        size_per_head: int, hidden size of one attention head.
        inter_size: int, feed-forward inner size (before tensor splitting).
        num_layers: int, total number of transformer layers.
        tensor_para_size: int, tensor parallel size.
        pipeline_para_size: int, pipeline parallel size.
        has_adapters: bool, whether adapter weights are part of the model.
        adapter_inter_size: int, adapter inner size (before tensor splitting).
        int8_mode: int, 0 (no quantization) or 1 (int8 weight quantization).
    """

    # Weight files whose name contains one of these fragments are quantized
    # when int8_mode is enabled.
    _QUANT_SUB_NAMES = (
        "attention.query_key_value.weight",
        "attention.dense.weight",
        "dense_h_to_4h.weight",
        "dense_4h_to_h.weight",
    )

    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        inter_size: int,
        num_layers: int,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
    ):
        assert num_heads % tensor_para_size == 0, (
            f"num_heads ({num_heads}) is not multiple of "
            f"tensor para size ({tensor_para_size})"
        )

        self.num_heads = num_heads
        self.size_per_head = size_per_head
        self.hidden_units = num_heads * size_per_head
        self.num_layers = num_layers

        self.tensor_para_size = tensor_para_size
        self.tensor_para_rank = comm.get_tensor_para_rank()
        self.pipeline_para_size = pipeline_para_size
        self.pipeline_para_rank = comm.get_pipeline_para_rank()

        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size

        # Sizes of the slice owned by this tensor/pipeline parallel rank.
        self.local_num_layers = num_layers // pipeline_para_size
        self.local_num_heads = num_heads // tensor_para_size
        self.local_hidden_units = self.local_num_heads * size_per_head
        self.local_inter_size = inter_size // tensor_para_size
        self.local_adapter_inter_size = (
            self.adapter_inter_size // tensor_para_size
        )

        self.weight_transpose_calibrate_quantize = None
        assert int8_mode in [0, 1], "Invalid int8 mode for GPT. Must be 0 or 1"
        self.int8_mode = int8_mode
        if self.int8_mode == 1:
            quant = (
                torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix  # noqa: E501
            )
            self.weight_transpose_calibrate_quantize = lambda x: quant(
                x, torch.int8
            )

        self.weights = None
        self.int8_weights = None
        self.int8_scales = None

        self.expected_weight_shapes = self._build_expected_weight_shapes()

    def _build_expected_weight_shapes(self):
        """Return the expected shape of every weight, in loading order."""
        hidden = self.hidden_units
        local_hidden = self.local_hidden_units
        local_inter = self.local_inter_size

        # One entry per weight kind; each is repeated for every local layer.
        shapes_per_kind = [
            (hidden,),  # input layernorm weight
            (hidden,),  # input layernorm bias
            (hidden, local_hidden * 3),  # attention qkv weight
            (local_hidden * 3,),  # attention qkv bias
            (local_hidden, hidden),  # attention dense weight
            (hidden,),  # attention dense bias
            (hidden,),  # post attention layernorm weight
            (hidden,),  # post attention layernorm bias
            (hidden, local_inter),  # ffn_kernel1
            (local_inter,),  # ffn_bias1
            (local_inter, hidden),  # ffn_kernel2
            (hidden,),  # ffn_bias2
        ]
        if self.has_adapters:
            local_adapter = self.local_adapter_inter_size
            # Two adapters (after attention, after ffn), identical layout:
            # kernel1, bias1, kernel2, bias2.
            shapes_per_kind += [
                (hidden, local_adapter),
                (local_adapter,),
                (local_adapter, hidden),
                (hidden,),
            ] * 2

        return [
            shape
            for shape in shapes_per_kind
            for _ in range(self.local_num_layers)
        ]

    def _weight_file_suffixes(self):
        """Return the per-layer file name suffixes, in loading order.

        The order matches ``expected_weight_shapes``. Files of weights that
        are split across tensor-parallel ranks carry the rank in their name.
        """
        rank = self.tensor_para_rank
        suffixes = [
            "input_layernorm.weight.bin",
            "input_layernorm.bias.bin",
            f"attention.query_key_value.weight.{rank}.bin",
            f"attention.query_key_value.bias.{rank}.bin",
            f"attention.dense.weight.{rank}.bin",
            "attention.dense.bias.bin",
            "post_attention_layernorm.weight.bin",
            "post_attention_layernorm.bias.bin",
            f"mlp.dense_h_to_4h.weight.{rank}.bin",
            f"mlp.dense_h_to_4h.bias.{rank}.bin",
            f"mlp.dense_4h_to_h.weight.{rank}.bin",
            "mlp.dense_4h_to_h.bias.bin",
        ]
        if self.has_adapters:
            for adapter in ("after_attention_adapter", "after_ffn_adapter"):
                suffixes += [
                    f"{adapter}.dense_h_to_4h.weight.{rank}.bin",
                    f"{adapter}.dense_h_to_4h.bias.{rank}.bin",
                    f"{adapter}.dense_4h_to_h.weight.{rank}.bin",
                    f"{adapter}.dense_4h_to_h.bias.bin",
                ]
        return suffixes

    @classmethod
    def from_config(cls, config: "GptInitModelParameters"):
        """Build an instance from a GptInitModelParameters object.

        The feed-forward inner size is fixed to four times the hidden size.
        """
        return cls(
            num_heads=config.head_num,
            size_per_head=config.size_per_head,
            inter_size=4 * config.head_num * config.size_per_head,
            num_layers=config.layer_num,
            tensor_para_size=config.tensor_para_size,
            pipeline_para_size=config.pipeline_para_size,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
        )

    @property
    def dtype(self):
        """Data type of the first loaded weight."""
        return self.weights[0].dtype

    @property
    def device(self):
        """Device of the first loaded weight."""
        return self.weights[0].device

    @staticmethod
    def _apply_in_place(entries, func):
        """Replace every tensor in ``entries`` (one nesting level) by
        ``func(tensor)``."""
        for i, entry in enumerate(entries):
            if isinstance(entry, list):
                for j, item in enumerate(entry):
                    entry[j] = func(item)
            else:
                entries[i] = func(entry)

    def _map(self, func):
        """Apply ``func`` in place to every tensor in ``self.weights``."""
        self._apply_in_place(self.weights, func)

    def _map_int8(self, func):
        """Apply ``func`` in place to the int8 weights and their scales."""
        self._apply_in_place(self.int8_weights, func)
        self._apply_in_place(self.int8_scales, func)

    def float(self):
        """Cast the weights to float32 (no-op if they already are)."""
        if self.dtype == torch.float32:
            return
        self._map(lambda x: x.float())

    def half(self):
        """Cast the weights to float16 (no-op if they already are)."""
        if self.dtype == torch.float16:
            return
        self._map(lambda x: x.half())
        if self.int8_mode == 1:
            self._map_int8(lambda w: w.half())

    def bfloat16(self):
        """Cast the weights to bfloat16 (no-op if they already are)."""
        if self.dtype == torch.bfloat16:
            return
        self._map(lambda x: x.bfloat16())
        if self.int8_mode == 1:
            self._map_int8(lambda w: w.bfloat16())

    def cuda(self, device=None):
        """Move all weights to the given CUDA device."""
        self._map(lambda x: x.cuda(device))
        if self.int8_mode == 1:
            self._map_int8(lambda x: x.cuda(device))

    def to(self, device=None):
        """Move all weights to ``device``."""
        self._map(lambda x: x.to(device))
        if self.int8_mode == 1:
            self._map_int8(lambda x: x.to(device))

    def is_valid_pp_group(self, layer, pp_rank):
        """Tell whether global layer index ``layer`` belongs to pipeline
        rank ``pp_rank``, given ``local_num_layers`` layers per rank."""
        return layer // self.local_num_layers == pp_rank

    def _load_from_file(
        self, checkpoint_path, fname, weight_dtype, compute_dtype, device
    ):
        """Load one weight file and append it to the weight lists.

        The target shape is looked up by position: the entry of
        ``expected_weight_shapes`` at the current length of ``self.weights``.
        """
        _weight = torch.from_numpy(
            np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
        )
        _weight = _weight.to(compute_dtype)
        expected_shape = self.expected_weight_shapes[len(self.weights)]
        try:
            # Empty files are kept as empty tensors.
            if _weight.nelement() > 0:
                _weight = _weight.reshape(expected_shape)
        except Exception as err:
            raise ValueError(
                f"num_heads, size_per_head, vocab_size, and max_seq_len must be the same "  # noqa: E501
                f"as the ones during training (weight: {fname} expected shape: {expected_shape}, "  # noqa: E501
                f"got shape: {_weight.shape})."
            ) from err

        should_quantize = any(
            sub_name in fname for sub_name in self._QUANT_SUB_NAMES
        )
        if self.int8_mode != 0 and should_quantize:
            calibrate = self.weight_transpose_calibrate_quantize
            int8_weight, int8_scales = calibrate(_weight)

            # int8 weights should appear in same order as FP weights.
            # An empty placeholder keeps the positions in self.weights
            # aligned with expected_weight_shapes.
            dummy_weight = torch.empty(0, dtype=compute_dtype)
            if device is not None:
                int8_weight = int8_weight.to(device)
                int8_scales = int8_scales.to(device)
                dummy_weight = dummy_weight.to(device)

            self.int8_weights.append(int8_weight)
            self.int8_scales.append(int8_scales)
            self.weights.append(dummy_weight)
        else:
            if device is not None:
                _weight = _weight.to(device)
            self.weights.append(_weight)

    def load(
        self,
        checkpoint_path: Union[str, Path],
        compute_dtype: torch.dtype,
        weight_dtype: Optional[Union[str, np.dtype]] = None,
        device: Optional[Union[int, str, torch.device]] = None,
    ):
        """Load checkpoint weights.

        Any previously loaded weights are discarded.

        # Args.
            checkpoint_path: str or Path,
                a checkpoint directory where FT checkpoint files locate.
            compute_dtype: torch.dtype, the data type the weights are cast
                to after reading.
            weight_dtype: str or np.dtype, the data type of stored weights.
                It is resolved with to_numpy_dtype, so it must be provided
                despite the None default.
            device: target device of the loaded weights; they are left where
                they were created if None.

        # Raises
            FileNotFoundError: if checkpoint_path does not exist.
            ValueError: if a stored weight cannot take its expected shape.
        """
        checkpoint_path = Path(checkpoint_path)
        if not checkpoint_path.exists():
            raise FileNotFoundError(
                f"Could not find checkpoint {str(checkpoint_path)}"
            )

        weight_dtype = to_numpy_dtype(weight_dtype)
        print(
            f"Load weights from {str(checkpoint_path)} (data type: {weight_dtype})"  # noqa: E501
        )

        self.weights = list()
        self.int8_weights = list()
        self.int8_scales = list()
        torch.cuda.empty_cache()

        # This pipeline rank owns the layers
        # [layer_offset, layer_offset + local_num_layers).
        layer_offset = self.local_num_layers * self.pipeline_para_rank
        for suffix in self._weight_file_suffixes():
            for i in range(self.local_num_layers):
                self._load_from_file(
                    checkpoint_path,
                    f"model.layers.{layer_offset + i}.{suffix}",
                    weight_dtype,
                    compute_dtype,
                    device,
                )

        assert len(self.weights) == len(
            self.expected_weight_shapes
        ), "Incorrect number of weights loaded"
class FtModuleBase:
    """Shared plumbing for the FasterTransformer module wrappers.

    Subclasses provide ``from_config``, ``_initialize_model`` and
    ``forward``. This base class owns the ``weight`` attribute and the
    dtype/device helpers that act on it.
    """

    def __init__(self):
        self.weight = None

    @classmethod
    @abstractmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        """Create the module from a config object (subclass hook)."""
        raise NotImplementedError

    @abstractmethod
    def _initialize_model(self, force_init=False):
        """(Re)build the underlying FT op (subclass hook)."""
        raise NotImplementedError

    @abstractmethod
    def forward(self, *args, **kwargs):
        """Run the module (subclass hook)."""
        raise NotImplementedError

    def set_weight(self, weight: GptLayerWeights):
        """Attach ``weight`` and rebuild the model when the dtype changed.

        The model is also rebuilt when no weight (or no weight dtype) was
        set before.
        """
        # Read the previous dtype before the old weight is replaced.
        previous_dtype = None if self.weight is None else self.weight.dtype
        self.weight = weight
        if previous_dtype is not None and previous_dtype == self.weight.dtype:
            return
        self._initialize_model(force_init=True)

    @property
    def dtype(self):
        """Data type of the attached weight."""
        assert self.weight is not None
        return self.weight.dtype

    @property
    def device(self):
        """Device of the attached weight."""
        assert self.weight is not None
        return self.weight.device

    def cuda(self, device=None):
        """Move the weights to a CUDA device and return ``self``."""
        assert torch.cuda.is_available()
        self.weight.cuda(device)
        return self

    def to(self, device=None):
        """Move the weights to ``device`` and return ``self``."""
        self.weight.to(device)
        return self

    def _recast(self, caster_name):
        """Call the weight's cast method ``caster_name``, then force a
        rebuild of the model and return ``self``."""
        getattr(self.weight, caster_name)()
        self._initialize_model(force_init=True)
        return self

    def float(self):
        """Cast the weights via ``weight.float()`` and rebuild the model."""
        return self._recast("float")

    def half(self):
        """Cast the weights via ``weight.half()`` and rebuild the model."""
        return self._recast("half")

    def bfloat16(self):
        """Cast the weights via ``weight.bfloat16()`` and rebuild the model."""
        return self._recast("bfloat16")
class GptContextDecoder(FtModuleBase):
    """Wrapper around the FasterTransformer ``ParallelGptContextDecoderOp``.

    The constructor only records the hyper-parameters; the FT op itself is
    created lazily by ``_initialize_model`` (on the first ``forward`` or when
    weights are set or recast).

    # Args.
        num_heads: int, number of attention heads.
        size_per_head: int, hidden size of one attention head.
        inter_size: int, feed-forward inner size.
        num_layers: int, number of transformer layers.
        tensor_para_size: int, tensor parallel size.
        pipeline_para_size: int, pipeline parallel size.
        remove_padding: bool, forwarded to the FT op.
        shared_contexts_ratio: float, stored on the instance; it is not
            passed to the FT op by this class.
        layernorm_eps: float, layernorm epsilon.
        layernorm_type: "pre_layernorm" or "post_layernorm".
        activation_type: str, activation name forwarded to the FT op.
        has_adapters: bool, whether the model has adapter layers.
        adapter_inter_size: int, adapter inner size.
        int8_mode: int, 0 or 1.
    """

    # Attributes reported by __repr__, in display order.
    _REPR_FIELDS = (
        "num_heads",
        "size_per_head",
        "hidden_size",
        "inter_size",
        "num_layers",
        "tensor_para_size",
        "pipeline_para_size",
        "remove_padding",
        "shared_contexts_ratio",
        "layernorm_eps",
        "layernorm_type",
        "activation_type",
        "has_adapters",
        "adapter_inter_size",
        "int8_mode",
    )

    def __init__(
        self,
        num_heads: int,
        size_per_head: int,
        inter_size: int,
        num_layers: int,
        tensor_para_size: int = 1,
        pipeline_para_size: int = 1,
        remove_padding: bool = True,
        shared_contexts_ratio: float = 1.0,
        layernorm_eps: float = 1e-6,
        layernorm_type: LayernormType = "pre_layernorm",
        activation_type: str = "gelu",
        has_adapters: bool = False,
        adapter_inter_size: int = 0,
        int8_mode: int = 0,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.size_per_head = size_per_head
        self.hidden_size = self.num_heads * self.size_per_head
        self.inter_size = inter_size
        self.num_layers = num_layers

        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.remove_padding = remove_padding
        self.shared_contexts_ratio = shared_contexts_ratio

        self.layernorm_eps = layernorm_eps
        self.layernorm_type = layernorm_type
        self.activation_type = activation_type
        self.has_adapters = has_adapters
        self.adapter_inter_size = adapter_inter_size

        assert int8_mode in [0, 1]
        self.int8_mode = int8_mode

        self.ft_op = None
        self.weight = None

    def __repr__(self):
        args_str = ",\n ".join(
            [f"{name}: {getattr(self, name)}" for name in self._REPR_FIELDS]
        )
        # Every entry, including the first one, gets the same indent.
        return f"{self.__class__.__name__}[\n {args_str}\n]"

    @classmethod
    def from_config(cls, config: GptInitModelParameters, **kwargs):
        """Build the module from a GptInitModelParameters object.

        ``remove_padding`` (default True) and ``shared_contexts_ratio``
        (default 1.0) are read from ``kwargs``. The feed-forward inner size
        is fixed to four times the hidden size.
        """
        return cls(
            num_heads=config.head_num,
            size_per_head=config.size_per_head,
            inter_size=4 * config.head_num * config.size_per_head,
            num_layers=config.layer_num,
            tensor_para_size=config.tensor_para_size,
            pipeline_para_size=config.pipeline_para_size,
            remove_padding=kwargs.get("remove_padding", True),
            shared_contexts_ratio=kwargs.get("shared_contexts_ratio", 1.0),
            layernorm_eps=config.layernorm_eps,
            layernorm_type=config.layernorm_type,
            activation_type=config.activation_type,
            has_adapters=config.has_adapters,
            adapter_inter_size=config.adapter_inter_size,
            int8_mode=config.int8_mode,
        )

    def _initialize_model(self, force_init=False):
        """Create the FT op if needed.

        An existing op is reused unless ``force_init`` is True. A fresh,
        unloaded GptLayerWeights is created when no weight is attached yet.
        """
        if self.weight is None:
            self.weight = GptLayerWeights(
                num_heads=self.num_heads,
                size_per_head=self.size_per_head,
                inter_size=self.inter_size,
                num_layers=self.num_layers,
                tensor_para_size=self.tensor_para_size,
                pipeline_para_size=self.pipeline_para_size,
                has_adapters=self.has_adapters,
                adapter_inter_size=self.adapter_inter_size,
                int8_mode=self.int8_mode,
            )
        if not force_init and self.ft_op is not None:
            return
        if self.ft_op is not None:
            # Drop the previous op before the replacement is constructed.
            del self.ft_op
        self.ft_op = (
            torch.classes.FasterTransformer.ParallelGptContextDecoderOp(
                self.num_heads,
                self.size_per_head,
                self.inter_size,
                self.num_layers,
                self.tensor_para_size,
                self.pipeline_para_size,
                self.layernorm_eps,
                self.layernorm_type,
                self.activation_type,
                self.has_adapters,
                self.adapter_inter_size,
                self.int8_mode,
                self.weight.weights,
                self.weight.int8_weights,
                self.weight.int8_scales,
                self.remove_padding,
            )
        )

    def forward(
        self,
        input_embeds: torch.Tensor,
        attention_mask: torch.Tensor,
        input_lengths: torch.IntTensor,
        memory_length: Optional[int] = None,
        compact_index: Optional[torch.IntTensor] = None,
        batch_to_compact_index: Optional[torch.IntTensor] = None,
        linear_bias_slopes: Optional[torch.Tensor] = None,
    ):
        """
        # Args.
            input_embeds: Tensor, (batch * beam, max_input_length, hidden_dim),
                input hidden states.
            attention_mask: Tensor, (batch * beam, max_input_length, max_input_length),
                input attention mask.
            input_lengths: (batch * beam,), input sequence lengths.
            memory_length: int, the length of memory to keep key/cache values.
            compact_index: IntTensor, (compact_batch_size,)
                The index of input sequences of a compact batch. If None, the FT op
                doesn't apply the shared context feature and as result the inference
                time may increase.
            batch_to_compact_index: IntTensor, (batch * beam,)
                The index map from the original input batch to the compact batch.
                This must be provided if compact_index is not None.
            linear_bias_slopes: (num_heads,)
                The slope per head of linear attention bias - ALiBi. If None, a base
                self attention will be performed.
        # Returns
            hidden_states: Tensor, (batch * beam, max_input_length, hidden_dim),
                decoder outputs.
            key_cache: Tensor, (num_layers, batch * beam, local_num_heads, size_per_head / x, memory_length, x),  # noqa: E501
                key cache of attention of inputs.
                x = 16 / sizeof(T), memory_length = max_input_length or max_input_length + gen_length  # noqa: E501
            value_cache: Tensor, (num_layers, batch * beam, local_num_heads, memory_length, hidden_dim)  # noqa: E501
                value cache of attention
            last_token_hidden_states: Tensor, (batch * beam, hidden_dim)
                hidden states of the last input token.
        """
        self._initialize_model()
        # outputs: output hidden states
        (
            decoder_output,
            key_cache,
            value_cache,
            last_token_hidden_states,
        ) = self.ft_op.forward(
            input_embeds,
            attention_mask,
            input_lengths,
            memory_length,
            compact_index,
            batch_to_compact_index,
            linear_bias_slopes,
        )
        return decoder_output, key_cache, value_cache, last_token_hidden_states
class GptDecoder(FtModuleBase):
def __init__(
self,
num_heads: int,
size_per_head: int,
inter_size: int,
num_layers: int,
tensor_para_size: int = 1,
pipeline_para_size: int = 1,
layernorm_eps: float = 1e-6,
layernorm_type: LayernormType = "pre_layernorm",
activation_type: str = "gelu",
has_adapters: bool = False,
adapter_inter_size: int = 0,
int8_mode: int = 0,
):
super().__init__()
self.num_heads = num_heads
self.size_per_head = size_per_head
self.hidden_size = self.num_heads * self.size_per_head
self.inter_size = inter_size
self.num_layers = num_layers
self.tensor_para_size = tensor_para_size
self.pipeline_para_size = pipeline_para_size
self.layernorm_eps = layernorm_eps
self.layernorm_type = layernorm_type
self.activation_type = activation_type
self.has_adapters = has_adapters
self.adapter_inter_size = adapter_inter_size
self.int8_mode = int8_mode
self.ft_op = None
self.weight = None
def __repr__(self):
args_dict = dict(
num_heads=self.num_heads,
size_per_head=self.size_per_head,
hidden_size=self.hidden_size,
inter_size=self.inter_size,
num_layers=self.num_layers,
tensor_para_size=self.tensor_para_size,
pipeline_para_size=self.pipeline_para_size,
layernorm_eps=self.layernorm_eps,
layernorm_type=self.layernorm_type,
activation_type=self.activation_type,
has_adapters=self.has_adapters,
adapter_inter_size=self.adapter_inter_size,
int8_mode=self.int8_mode,
)
args_str = ",\n ".join(
[f"{k}: {v}" for k, v in args_dict.items()]
) # noqa: E501
return f"{self.__class__.__name__}[\n {args_str}\n]"
@classmethod
def from_config(cls, config: GptInitModelParameters, **kwargs):
hidden_dim = config.head_num * config.size_per_head
return cls(
num_heads=config.head_num,
size_per_head=config.size_per_head,
inter_size=4 * hidden_dim,
num_layers=config.layer_num,
tensor_para_size=config.tensor_para_size,
pipeline_para_size=config.pipeline_para_size,
layernorm_eps=config.layernorm_eps,
layernorm_type=config.layernorm_type,
activation_type=config.activation_type,
has_adapters=config.has_adapters,
adapter_inter_size=config.adapter_inter_size,
int8_mode=config.int8_mode,
)
def _initialize_model(self, force_init=False):
if self.weight is None:
self.weight = GptLayerWeights(
num_heads=self.num_heads,
size_per_head=self.size_per_head,
inter_size=self.inter_size,
num_layers=self.num_layers,
tensor_para_size=self.tensor_para_size,
pipeline_para_size=self.pipeline_para_size,
has_adapters=self.has_adapters,
adapter_inter_size=self.adapter_inter_size,
int8_mode=self.int8_mode,
)
if not force_init and self.ft_op is not None:
return
if self.ft_op is not None:
del self.ft_op
self.ft_op = torch.classes.FasterTransformer.ParallelGptDecoderOp(
self.num_heads,
self.size_per_head,
self.inter_size,
self.num_layers,
self.tensor_para_size,
self.pipeline_para_size,
self.layernorm_eps,
self.layernorm_type,
self.activation_type,
self.has_adapters,
self.adapter_inter_size,
self.weight.int8_mode,
self.weight.weights,
self.weight.int8_weights,
self.weight.int8_scales,
)
def forward(
    self,
    max_input_length: int,
    step: int,
    ite: int,
    input_embeds: torch.Tensor,
    sequence_lengths: torch.IntTensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    finished: torch.BoolTensor,
    total_padding_tokens: torch.IntTensor,
    masked_tokens: torch.BoolTensor,
    cache_indirection: Optional[torch.IntTensor] = None,
    linear_bias_slopes: Optional[torch.Tensor] = None,
):
    """Run one decoding step through the FasterTransformer decoder op.

    The op is created on first use via ``_initialize_model``.

    # Args.
        max_input_length: int, maximum input context length.
        step: int, the current step index.
        ite: int, local batch iteration.
        input_embeds: Tensor, (local_batch * beam, hidden_dim),
            input hidden state to decoder.
        sequence_lengths: IntTensor, (local_batch * beam,),
            the current sequence lengths.
        key_cache: Tensor, key cache buffer.
        value_cache: Tensor, value cache buffer.
        finished: BoolTensor, (local_batch * beam,),
            whether to finish sentence generation.
        total_padding_tokens: IntTensor, (local_batch * beam,),
            the number of padded tokens.
        masked_tokens: BoolTensor, (local_batch * beam, memory_length),
            a mask tensor that indicates padded tokens.
        cache_indirection: IntTensor, optional,
            cache of beam positions, needed if beam > 1.
        linear_bias_slopes: Tensor, (num_heads,), optional,
            slopes head of linear position bias (ALiBi).
    # Returns
        The first element of the op's output. Presumably the decoder's
        output hidden states (callers assign it to ``hidden_states``);
        confirm against the op definition.
    """
    self._initialize_model()
    # Note the op's own argument order: the status tensors come before the
    # key/value caches, unlike in this method's signature.
    op_inputs = (
        max_input_length,
        step,
        ite,
        input_embeds,
        sequence_lengths,
        finished,
        total_padding_tokens,
        masked_tokens,
        key_cache,
        value_cache,
        cache_indirection,
        linear_bias_slopes,
    )
    return self.ft_op.forward(*op_inputs)[0]
class Gpt:
def __init__(
    self,
    num_heads: int,
    size_per_head: int,
    num_layers: int,
    vocab_size: int,
    start_id: int,
    end_id: int,
    lib_path: PathLike,
    tensor_para_size: int = 1,
    pipeline_para_size: int = 1,
    remove_padding: bool = True,
    shared_contexts_ratio: float = 1.0,
    layernorm_eps: float = 1e-6,
    layernorm_type: LayernormType = "pre_layernorm",
    activation_type: str = "gelu",
    has_positional_encoding: bool = True,
    max_seq_len: int = 0,
    has_pre_decoder_layernorm: bool = False,
    has_post_decoder_layernorm: bool = True,
    has_adapters: bool = False,
    adapter_inter_size: int = 0,
    int8_mode: int = 0,
    inference_data_type: Optional[str] = None,
    weights_data_type: str = "fp32",
    use_fp32_to_compute_logit: bool = False,
    **kwargs,
):
    """Assemble the GPT wrapper: config, native ops and torch sub-modules.

    The constructor stores the settings in a ``GptInitModelParameters``
    object, loads the native library at ``lib_path``, creates the dynamic
    decode op and the two decoder wrappers, and then creates the embedding,
    layernorm and LM-head modules that belong to this pipeline rank.
    Weights are not loaded here; ``self.weight`` stays ``None`` until
    ``load()`` is called.

    Args:
        num_heads: Number of attention heads.
        size_per_head: Size of each head; ``num_heads * size_per_head`` is
            used as the hidden dimension.
        num_layers: Number of layers (stored as ``layer_num``).
        vocab_size: Unpadded vocabulary size.
        start_id: Stored in the config as ``start_id``.
        end_id: Stored in the config as ``end_id``; used as the default
            EOS id by ``generate``.
        lib_path: Path of the library passed to
            ``torch.classes.load_library``.
        tensor_para_size: Tensor-parallel size; also used to pad the
            vocabulary.
        pipeline_para_size: Pipeline-parallel size.
        remove_padding: Forwarded to the context decoder.
        shared_contexts_ratio: Forwarded to the context decoder and kept
            on the instance for ``generate``.
        layernorm_eps: Epsilon for the layernorm modules and the config.
        layernorm_type: Stored in the config.
        activation_type: Stored in the config.
        has_positional_encoding: Whether a position embedding is created
            (first pipeline rank only).
        max_seq_len: Number of rows of the position embedding.
        has_pre_decoder_layernorm: Whether a layernorm is created before
            the decoder (first pipeline rank only).
        has_post_decoder_layernorm: Whether a layernorm is created after
            the decoder (last pipeline rank only).
        has_adapters: Stored in the config.
        adapter_inter_size: Stored in the config.
        int8_mode: Stored in the config.
        inference_data_type: Compute dtype name; falls back to
            ``weights_data_type`` when empty or ``None``.
        weights_data_type: Data type name of the stored weights.
        use_fp32_to_compute_logit: When true the LM head is created in
            ``torch.float32`` instead of the compute dtype.
        **kwargs: ``sparse`` is read for the config; the whole mapping is
            also forwarded to both decoder ``from_config`` calls.
    """
    super().__init__()
    # An unset inference dtype means "compute in the weights' dtype".
    inference_data_type = inference_data_type or weights_data_type
    self.config = GptInitModelParameters(
        head_num=num_heads,
        size_per_head=size_per_head,
        layer_num=num_layers,
        max_seq_len=max_seq_len,
        tensor_para_size=tensor_para_size,
        vocab_size=vocab_size,
        start_id=start_id,
        end_id=end_id,
        pipeline_para_size=pipeline_para_size,
        data_type=inference_data_type,
        weights_data_type=weights_data_type,
        layernorm_eps=layernorm_eps,
        layernorm_type=layernorm_type,
        activation_type=activation_type,
        has_positional_encoding=has_positional_encoding,
        has_pre_decoder_layernorm=has_pre_decoder_layernorm,
        has_post_decoder_layernorm=has_post_decoder_layernorm,
        has_adapters=has_adapters,
        adapter_inter_size=adapter_inter_size,
        int8_mode=int8_mode,
        sparse=kwargs.get("sparse", False),
    )
    self.use_fp32_to_compute_logit = use_fp32_to_compute_logit
    # Filled in by load(); the dtype/device properties assert on it.
    self.weight = None
    self.shared_contexts_ratio = shared_contexts_ratio
    # The native library must be loaded before any torch.classes lookup.
    torch.classes.load_library(os.path.abspath(lib_path))
    # Embeddings to encode or decode tokens.
    hidden_dim = num_heads * size_per_head
    # Pad vocab size for FT.
    # First split the vocabulary evenly (rounding up) across tensor ranks.
    local_vocab_size = math.ceil(
        self.config.vocab_size / self.config.tensor_para_size
    )
    if self.config.data_type == "fp16":
        # For fp16 the per-rank size is rounded up to a multiple of 8.
        local_vocab_size = math.ceil(local_vocab_size / 8) * 8
    self.vocab_size_padded = (
        local_vocab_size * self.config.tensor_para_size
    )
    self.vocab_size = self.config.vocab_size
    self.decode_op = torch.classes.FasterTransformer.DynamicDecodeOp(
        self.vocab_size,
        self.vocab_size_padded,
        self.config.tensor_para_size,
        self.config.pipeline_para_size,
        torch.float,
    )
    # Registry of sub-modules that cuda()/to()/float()/half()/bfloat16()
    # iterate over.
    self._parameters = {}

    def register_param(name, p):
        """Record ``p`` in the registry and expose it as ``self.<name>``."""
        self._parameters[name] = p
        setattr(self, name, p)

    register_param(
        "context_decoder",
        GptContextDecoder.from_config(
            self.config,
            remove_padding=remove_padding,
            shared_contexts_ratio=shared_contexts_ratio,
            **kwargs,
        ),
    )
    register_param(
        "decoder", GptDecoder.from_config(self.config, **kwargs)
    )
    compute_dtype = to_torch_dtype(inference_data_type)
    # Input-side modules are created only on the first pipeline rank; on
    # other ranks these attributes are never set.
    if comm.is_pipeline_group_first():
        register_param(
            "word_embedding",
            torch.nn.Embedding(
                self.vocab_size_padded, hidden_dim, dtype=compute_dtype
            ),
        )
        # Zero the embedding rows that exist only because of padding.
        self._mask_padded_vocab_weights(self.word_embedding.weight)
        if self.config.has_positional_encoding:
            register_param(
                "position_encoding",
                torch.nn.Embedding(
                    self.config.max_seq_len,
                    hidden_dim,
                    dtype=compute_dtype,
                ),
            )
        else:
            self.position_encoding = None
        if self.config.has_pre_decoder_layernorm:
            register_param(
                "pre_decoder_layernorm",
                torch.nn.LayerNorm(
                    hidden_dim, eps=layernorm_eps, dtype=compute_dtype
                ),
            )
        else:
            self.pre_decoder_layernorm = None
    # Output-side modules are created only on the last pipeline rank.
    if comm.is_pipeline_group_last():
        if has_post_decoder_layernorm:
            register_param(
                "post_decoder_layernorm",
                torch.nn.LayerNorm(
                    hidden_dim, eps=layernorm_eps, dtype=compute_dtype
                ),
            )
        else:
            self.post_decoder_layernorm = None
        # dtype of the LM head: float32 when requested, else compute dtype.
        self.lm_head_ctype = (
            compute_dtype
            if not self.use_fp32_to_compute_logit
            else torch.float32
        )
        register_param(
            "lm_head",
            torch.nn.Linear(
                hidden_dim,
                self.vocab_size_padded,
                bias=False,
                dtype=self.lm_head_ctype,
            ),
        )
        # Zero the LM-head rows that exist only because of padding.
        self._mask_padded_vocab_weights(self.lm_head.weight)
@classmethod
def from_config(cls, config: GptInitModelParameters, **kwargs):
return cls(
num_heads=config.head_num,
size_per_head=config.size_per_head,
num_layers=config.layer_num,
max_seq_len=config.max_seq_len,
tensor_para_size=config.tensor_para_size,
vocab_size=config.vocab_size,
start_id=config.start_id,
end_id=config.end_id,
pipeline_para_size=config.pipeline_para_size,
inference_data_type=config.data_type,
weights_data_type=config.weights_data_type,
layernorm_eps=config.layernorm_eps,
layernorm_type=config.layernorm_type,
activation_type=config.activation_type,
has_positional_encoding=config.has_positional_encoding,
has_pre_decoder_layernorm=config.has_pre_decoder_layernorm,
has_post_decoder_layernorm=config.has_post_decoder_layernorm,
has_adapters=config.has_adapters,
adapter_inter_size=config.adapter_inter_size,
int8_mode=config.int8_mode,
**kwargs,
)
def load(
    self,
    checkpoint_path: PathLike,
    inference_data_type: Optional[Union[str, torch.dtype]] = None,
    config: Optional[GptInitModelParameters] = None,
    device: Optional[Union[str, int, torch.device]] = None,
):
    """Load layer weights and embedding/layernorm/LM-head tensors from disk.

    Args:
        checkpoint_path: Directory containing the ``model.*.bin`` files.
        inference_data_type: Compute dtype. When omitted, the dtype of the
            already loaded weights is used if there are any, otherwise
            ``config.data_type``.
        config: Model parameters; defaults to ``self.config``.
        device: Target device; defaults to ``comm.get_device()``.

    Raises:
        FileNotFoundError: If a required position-encoding or layernorm
            file is missing. A missing embedding / LM-head file is not an
            error: a message is printed and the tensor is re-initialised
            from a normal distribution.
    """
    checkpoint_path = Path(checkpoint_path)
    device = device or comm.get_device()
    config = config or self.config

    # `self.dtype` asserts that weights are already loaded, so it cannot be
    # the fallback on the very first load; use the configured dtype then.
    if not inference_data_type:
        inference_data_type = (
            self.weight.dtype
            if self.weight is not None
            else config.data_type
        )
    compute_dtype = to_torch_dtype(inference_data_type)

    self.weight = GptLayerWeights.from_config(config)
    self.weight.load(
        checkpoint_path, compute_dtype, config.weights_data_type, device
    )
    self.context_decoder.set_weight(self.weight)
    self.decoder.set_weight(self.weight)
    weight_dtype = to_numpy_dtype(config.weights_data_type)

    def _safe_load_from_bin(param: torch.nn.Parameter, fname):
        """Fill ``param`` from ``fname``; the file must exist."""
        if (checkpoint_path / fname).exists():
            # np_w is 1-D array since a bin file doesn't have shape info.
            w_ = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
            param.data = (
                torch.from_numpy(w_)
                .reshape(param.data.shape)
                .to(compute_dtype)
            )
        else:
            raise FileNotFoundError(f"Faile to load {fname}")

    def _safe_load_lm_head_from_bin(param, fname, ctype):
        """Fill the first ``vocab_size`` rows of ``param`` from ``fname``.

        When the file is missing the tensor is randomly initialised
        instead. In both cases ``param`` ends up in ``ctype`` and its
        padded vocabulary rows are zeroed.
        """
        if (checkpoint_path / fname).exists():
            shape = (
                self.vocab_size,
                self.config.head_num * self.config.size_per_head,
            )
            # np_w is 1-D array since a bin file doesn't have shape info.
            w_ = np.fromfile(checkpoint_path / fname, dtype=weight_dtype)
            param.data = param.data.to(ctype)
            param.data[: self.vocab_size, :] = (
                torch.from_numpy(w_).reshape(shape).to(ctype)
            )
        else:
            print(f"Faile to load {fname}")
            torch.nn.init.normal_(param)
            # Tensor.to() is not in-place: assign the result so the
            # fallback really has the requested dtype, like the branch
            # above.
            param.data = param.data.to(ctype)
        self._mask_padded_vocab_weights(param)

    # pylint:disable=line-too-long
    if comm.is_pipeline_group_first():
        _safe_load_lm_head_from_bin(
            self.word_embedding.weight, "model.wte.bin", compute_dtype
        )
        self._mask_padded_vocab_weights(self.word_embedding.weight)
        if self.position_encoding is not None:
            _safe_load_from_bin(
                self.position_encoding.weight, "model.wpe.bin"
            )
        if self.pre_decoder_layernorm is not None:
            _safe_load_from_bin(
                self.pre_decoder_layernorm.weight,
                "model.pre_decoder_layernorm.weight.bin",
            )
            _safe_load_from_bin(
                self.pre_decoder_layernorm.bias,
                "model.pre_decoder_layernorm.bias.bin",
            )
    if comm.is_pipeline_group_last():
        if self.post_decoder_layernorm is not None:
            _safe_load_from_bin(
                self.post_decoder_layernorm.weight,
                "model.final_layernorm.weight.bin",
            )
            _safe_load_from_bin(
                self.post_decoder_layernorm.bias,
                "model.final_layernorm.bias.bin",
            )
        if (checkpoint_path / "model.lm_head.weight.bin").exists():
            _safe_load_lm_head_from_bin(
                self.lm_head.weight,
                "model.lm_head.weight.bin",
                self.lm_head_ctype,
            )
        else:
            # No dedicated LM-head file: reuse the word embedding weights.
            if self.use_fp32_to_compute_logit:
                _safe_load_lm_head_from_bin(
                    self.lm_head.weight, "model.wte.bin", torch.float32
                )
            else:
                # In this branch we can share the pre and post
                # decoder embeddings, but ONLY pipeline size is 1.
                # When pipeline size > 1, these two weights will end up on
                # different GPUs, so we must load the
                # post decoder weight again (else case).
                if comm.get_pipeline_para_size() == 1:
                    self.lm_head.weight = self.word_embedding.weight
                else:
                    _safe_load_lm_head_from_bin(
                        self.lm_head.weight, "model.wte.bin", compute_dtype
                    )
    self.to(device)
@property
def dtype(self):
assert self.weight is not None
return self.weight.dtype
@property
def device(self):
assert self.weight is not None
return self.weight.device
def cuda(self, device=None):
assert torch.cuda.is_available()
for name, param in self._parameters.items():
setattr(self, name, param.cuda(device))
return self
def to(self, device=None):
for name, param in self._parameters.items():
setattr(self, name, param.to(device))
return self
def float(self):
for name, param in self._parameters.items():
setattr(self, name, param.float())
return self
def half(self):
for name, param in self._parameters.items():
setattr(self, name, param.half())
return self
def bfloat16(self):
for name, param in self._parameters.items():
setattr(self, name, param.bfloat16())
return self
def _mask_padded_vocab_weights(self, weight: torch.Tensor):
assert self.vocab_size_padded >= self.vocab_size
if self.vocab_size_padded > self.vocab_size:
weight.data[self.vocab_size :, ...] = 0 # noqa: E203
def generate_pad_mask(self, input_lengths, memory_length, init_step=0):
"""Generate a pad mask tensor.
# Args.
input_lengths: (batch_size * beam_width,), input lengths
memory_length: the length of key/value cache memory.
init_step: int, initial step.
# Return
masked_tokens: BoolTensor,
(batch_size * beam_width, memory_length),
True if init_step + input_length[i] <= j <
init_step + max_input_length,
where i is a batch-beam index and j is a time step
modulo by memory_length.
"""
max_input_length = input_lengths.max()
input_lengths = input_lengths.unsqueeze(1)
shift = init_step % memory_length
step_indices = torch.arange(
init_step, init_step + memory_length, device=input_lengths.device
)
step_indices = (
step_indices.roll(shift)
.unsqueeze(0)
.tile(input_lengths.shape[0], 1)
)
masked_tokens = torch.logical_and(
step_indices >= input_lengths,
step_indices < init_step + max_input_length,
)
return masked_tokens
def get_local_batch_size(self, batch_size):
"""Get a local batch size by the same way that FT Gpt does."""
local_batch_size = batch_size
pp_size = self.decoder.pipeline_para_size
if pp_size > 1:
if local_batch_size % pp_size == 0:
local_batch_size //= pp_size
while local_batch_size > 1024 and local_batch_size % 2 == 0:
local_batch_size //= 2
return local_batch_size
@torch.no_grad()
def generate(
    self,
    input_token_ids: torch.IntTensor,
    input_lengths: torch.IntTensor,
    gen_length: int,
    eos_token_id: Optional[int] = None,
    local_batch_size: Optional[int] = None,
    beam_width: int = 1,
    top_k: Optional[torch.IntTensor] = None,
    top_p: Optional[torch.FloatTensor] = None,
    top_p_decay: Optional[torch.FloatTensor] = None,
    top_p_min: Optional[torch.FloatTensor] = None,
    top_p_reset_ids: Optional[torch.IntTensor] = None,
    temperature: Optional[torch.FloatTensor] = None,
    repetition_penalty: Optional[torch.FloatTensor] = None,
    presence_penalty: Optional[torch.FloatTensor] = None,
    min_length: Optional[torch.IntTensor] = None,
    len_penalty: Optional[torch.FloatTensor] = None,
    beam_search_diversity_rate: Optional[torch.FloatTensor] = None,
    stop_words_list: Optional[torch.IntTensor] = None,
    bad_words_list: Optional[torch.IntTensor] = None,
    sequence_limit_lengths: Optional[torch.IntTensor] = None,
    random_seed: Optional[torch.LongTensor] = None,
    memory_length: Optional[int] = None,
    return_output_length: bool = False,
    return_log_probs: bool = False,
):
    """Generate up to ``gen_length`` tokens after the given input contexts.

    The input contexts are first run through the context decoder; then one
    token per step is produced by the decoder and the dynamic decode op
    until all steps are done or every sequence is reported finished.
    Runs under ``torch.no_grad()``. ``load()`` must have been called.

    # Args.
        input_token_ids: IntTensor, (batch_size, max_input_length),
            input token ids of the context sequences.
        input_lengths: IntTensor, (batch_size),
            the lengths of input context sequences.
        gen_length: int, the number of tokens to generate.
        eos_token_id: int, eos token id. Defaults to the ``end_id`` of
            the model config.
        local_batch_size: int, optional, a batch size of
            local iteration. (disabled: must be None or equal to the
            batch size)
        beam_width: int, number of beams for beam search.
            If 1, sampling decode will be used.
        top_k: IntTensor, (batch_size,) top-k sampling.
            The number of most probable tokens to keep
            for sampling per sentence in a batch.
        top_p: FloatTensor, (batch_size,), top-p sampling.
            The cumulative probability
            of to filter the set of most probable tokens.
        top_p_decay: FloatTensor, (batch_size,)
            The decay of top-p value for top_p sampling.
        top_p_min: FloatTensor, (batch_size,)
            The minimum top p values in top-p decaying.
        top_p_reset_ids: IntTensor, (batch_size,)
            reset ids for resetting top_p values for top p sampling
        temperature: FloatTensor, (batch_size,),
            The temperature value for smoothing the logit distribution.
        repetition_penalty: FloatTensor, (batch_size,),
            The repetition penalty.
        presence_penalty: FloatTensor, (batch_size,),
            The presence penalty, which is exclusive with
            repetition_penalty.
            Only one of repetition and presence penalties is allowed.
        min_length: IntTensor, (batch_size,),
            Minimum length for each sentences. EOS is masked if length is
            below min.
        len_penalty: FloatTensor, (batch_size,)
            The exponent of the length penalty of beam scores.
        beam_search_diversity_rate: FloatTensor, (batch_size,),
            The diversity rate of beam search.
        stop_words_list: IntTensor, (batch_size, 2, stop_words_length)
            When FT generates words in this list, it will stop the
            generation. An extension of stop id.
        bad_words_list: IntTensor, (batch_size, 2, bad_words_length)
            The words in the list will never be sampled.
        sequence_limit_lengths: IntTensor, (batch_size,), The maximum
            length of a generated sequence.
        random_seed: LongTensor, optional, passed to the decode op setup.
        memory_length: int, the length of cache memory. If None, it will
            be max_input_length + gen_length.
        return_output_length: bool, add ``output_lengths`` to the result.
        return_log_probs: bool, add ``cum_log_probs`` and
            ``output_log_probs`` to the result.
    # Returns
        dict with
            ``output_token_ids``: IntTensor,
                (batch_size, beam_width, max_seq_length) output token ids;
            ``output_lengths``: the sequence lengths, only if
                ``return_output_length``;
            ``cum_log_probs`` / ``output_log_probs``: only if
                ``return_log_probs``.
    """
    assert (
        self.weight is not None
    ), "Please call load() first to initialize weights."
    input_token_ids = input_token_ids.type(torch.int32).to(self.device)
    input_lengths = input_lengths.type(torch.int32).to(self.device)
    batch_size = len(input_token_ids)
    max_input_length = input_token_ids.shape[-1]
    max_seq_length = max_input_length + gen_length
    memory_length = memory_length or max_seq_length
    # TODO: Enable local batch later. We currently disable local batching due to # noqa: E501
    # an input mismatch issue of FT's decode_op: FT's decode_op requires logits # noqa: E501
    # of shape (batch_size, ...) but we have logits of shape (local_batch_size, ...) # noqa: E501
    # After fixing FT's side, we will enable local batch.
    # local_batch_size = local_batch_size or self.get_local_batch_size(batch_size) # noqa: E501
    # num_local_batches, last_chunk = divmod(batch_size, local_batch_size)
    # if last_chunk > 0:
    #     num_local_batches += 1
    assert local_batch_size is None or local_batch_size == batch_size
    # With local batching disabled the whole batch is one local batch.
    local_batch_size = batch_size
    num_local_batches = 1
    device = self.device
    eos_token_id = (
        eos_token_id if eos_token_id is not None else self.config.end_id
    )
    assert (
        eos_token_id is not None
    ), "eos_token-id must be specified in generation."
    # One EOS id per batch entry, as expected by the decode op call below.
    eos_token_ids = eos_token_id * torch.ones(
        batch_size, dtype=torch.int32, device=device
    )
    assert repetition_penalty is None or presence_penalty is None, (
        "Found ambiguous parameters repetition_penalty and "
        "presence_penalty which are mutually exclusive. "
        "Please provide one of repetition_penalty and presence_penalty."
    )
    # Setup decoder_op prior to calling the forward function.
    self.decode_op.setup(
        batch_size,
        beam_width,
        top_k,
        top_p,
        temperature,
        repetition_penalty,
        presence_penalty,
        min_length,
        len_penalty,
        beam_search_diversity_rate,
        random_seed,
        top_p_decay,
        top_p_min,
        top_p_reset_ids,
    )
    # Prepare input and output arguments.
    if beam_width > 1:
        # Tiling for beam search.
        input_token_ids = input_token_ids.repeat(1, beam_width).view(
            batch_size * beam_width, -1
        )
        input_lengths = (
            input_lengths.view(-1, 1).repeat(1, beam_width).view(-1)
        )
        if sequence_limit_lengths is not None:
            sequence_limit_lengths = (
                sequence_limit_lengths.view(-1, 1)
                .repeat(1, beam_width)
                .view(-1)
            )
        # src/tgt cache indirections.
        # Index 0/1 of the first axis alternate between source and target
        # roles from step to step (see src_indir_idx / tgt_indir_idx).
        cache_indirection = torch.zeros(
            (2, batch_size, beam_width, memory_length),
            dtype=torch.int32,
            device=device,
        )
        parent_ids = torch.zeros(
            max_seq_length,
            batch_size * beam_width,
            dtype=torch.int32,
            device=device,
        )
    else:
        cache_indirection = None
        src_cache_indirection = None
        tgt_cache_indirection = None
        parent_ids = None
    # Per-sequence number of padded positions in the input context.
    pad_lengths = max_input_length - input_lengths
    # Since tril() doesn't support bf16 dtype,
    # we create of bool type and then cast it to dtype.
    attention_mask = (
        torch.ones(
            (max_input_length, max_input_length),
            dtype=torch.bool,
            device=device,
        )
        .tril()
        .unsqueeze(0)
        .tile(input_token_ids.shape[0], 1, 1)
        .to(self.dtype)
    )
    # Clear the mask rows that lie beyond each sequence's own length.
    for b, input_length in enumerate(input_lengths):
        attention_mask[b, input_length:, ...] = 0
    masked_tokens = self.generate_pad_mask(input_lengths, memory_length)
    finished = torch.zeros_like(input_lengths).bool()
    sequence_lengths = (max_input_length - 1) * torch.ones_like(
        input_lengths
    )
    # Log-prob buffers are allocated for beam search even when the caller
    # did not ask for them to be returned.
    if return_log_probs or beam_width > 1:
        cum_log_probs = torch.zeros(batch_size * beam_width, device=device)
        output_log_probs = torch.zeros(
            (gen_length, batch_size * beam_width), device=device
        )
    else:
        cum_log_probs = None
        output_log_probs = None
    # Contiguous buffer for each decode_op step,
    # it will be transposed tensor for the final output.
    output_token_ids = torch.zeros(
        (max_seq_length, batch_size * beam_width),
        dtype=torch.int32,
        device=device,
    )
    # The first max_input_length rows hold the input context itself.
    output_token_ids[:max_input_length, ...] = input_token_ids.T
    if comm.is_pipeline_group_first():
        # Prepare input tensors of decoder.
        input_embeds = self.word_embedding(input_token_ids)
        if self.position_encoding is not None:
            position_ids = torch.arange(
                0, max_input_length, dtype=torch.int, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(
                -1, max_input_length
            )
            input_embeds += self.position_encoding(position_ids)
        if self.pre_decoder_layernorm is not None:
            input_embeds = self.pre_decoder_layernorm(input_embeds)
    else:
        # Dummy input_embeds
        # Only shape/dtype/device matter: embedding modules exist on the
        # first pipeline rank only.
        input_embeds = torch.empty(
            size=(
                batch_size * beam_width,
                max_input_length,
                self.context_decoder.hidden_size,
            ),
            dtype=self.context_decoder.dtype,
            device=device,
        )
    # Context sharing is attempted only when enabled, with a non-empty
    # context and more than one batch entry.
    use_shared_contexts = (
        (self.shared_contexts_ratio > 0.0)
        and (max_input_length >= 1)
        and (batch_size > 1)
    )
    batch_to_compact, compact_to_batch = None, None
    if use_shared_contexts:
        find_context_duplications = (
            torch.ops.fastertransformer.find_context_duplications
        )
        batch_to_compact, compact_to_batch = find_context_duplications(
            input_token_ids
        )
        # Keep sharing only if the compacted batch is small enough
        # relative to the configured ratio.
        use_shared_contexts = (
            compact_to_batch.shape[0]
            <= self.shared_contexts_ratio * batch_size
        )
        if not use_shared_contexts:
            batch_to_compact, compact_to_batch = None, None
    profiler.start("ft-context-decoder")
    (
        _,
        k_cache,
        v_cache,
        last_token_hidden_states,
    ) = self.context_decoder.forward(
        input_embeds=input_embeds,
        attention_mask=attention_mask,
        input_lengths=input_lengths,
        memory_length=memory_length,
        batch_to_compact_index=batch_to_compact,
        compact_index=compact_to_batch,
    )
    profiler.stop("ft-context-decoder")
    # One iteration per generated position.
    for step in range(max_input_length, max_seq_length):
        # Swap the source/target halves of cache_indirection every step.
        src_indir_idx = (step - max_input_length) % 2
        tgt_indir_idx = 1 - src_indir_idx
        is_generation_done = torch.tensor(
            [True], dtype=torch.bool, device=device
        )
        for ite in range(num_local_batches):
            # The indices of the current local batch-beam.
            bbidx = range(
                ite * local_batch_size * beam_width,
                min(
                    (ite + 1) * local_batch_size * beam_width,
                    batch_size * beam_width,
                ),
            )
            if cache_indirection is not None:
                bidx = range(
                    ite * local_batch_size,
                    min((ite + 1) * local_batch_size, batch_size),
                )
                src_cache_indirection = cache_indirection[
                    src_indir_idx, bidx, ...
                ]
                tgt_cache_indirection = cache_indirection[
                    tgt_indir_idx, bidx, ...
                ]
            if step == max_input_length:
                # First generated position: reuse the context decoder's
                # last-token hidden states instead of running the decoder.
                hidden_states = last_token_hidden_states[bbidx, ...]
            else:
                if comm.is_pipeline_group_first():
                    # Embed the token produced at the previous step.
                    input_embeds = self.word_embedding(
                        output_token_ids[step - 1, bbidx]
                    )
                    if self.position_encoding is not None:
                        position_ids = (step - 1) * torch.ones_like(
                            pad_lengths[bbidx]
                        )
                        input_embeds += self.position_encoding(
                            position_ids
                        )
                    if self.pre_decoder_layernorm is not None:
                        input_embeds = self.pre_decoder_layernorm(
                            input_embeds
                        )
                else:
                    # Dummy input_embeds
                    input_embeds = torch.empty(
                        size=(len(bbidx), self.decoder.hidden_size),
                        dtype=self.decoder.dtype,
                        device=device,
                    )
                profiler.start("ft-decoder")
                hidden_states = self.decoder.forward(
                    max_input_length=max_input_length,
                    step=step,
                    ite=ite,
                    input_embeds=input_embeds,
                    sequence_lengths=sequence_lengths[bbidx],
                    key_cache=k_cache,
                    value_cache=v_cache,
                    finished=finished[bbidx],
                    total_padding_tokens=pad_lengths[bbidx],
                    cache_indirection=src_cache_indirection,
                    masked_tokens=masked_tokens[bbidx, ...],
                )
                profiler.stop("ft-decoder")
            # Logits and token selection happen on the last pipeline rank.
            if comm.is_pipeline_group_last():
                if self.post_decoder_layernorm is not None:
                    hidden_states = self.post_decoder_layernorm(
                        hidden_states
                    )
                # We use logits of fp32 type to avoid overflow issue.
                if self.use_fp32_to_compute_logit:
                    # The FT GPT op internally uses FP32 compute type
                    # for matrix multiplication.
                    # This will produce the same result with the
                    # end-to-end FT's GPT op.
                    logits = torch.nn.functional.linear(
                        hidden_states.float(), self.lm_head.weight
                    )
                else:
                    logits = self.lm_head(hidden_states).float()
                profiler.start("ft-decode")
                should_stop = self.decode_op.forward(
                    logits.view(batch_size, beam_width, -1),
                    step,
                    max_input_length,
                    ite,
                    local_batch_size,
                    eos_token_ids,
                    top_k,
                    top_p,
                    temperature,
                    repetition_penalty,
                    presence_penalty,
                    min_length,
                    len_penalty,
                    beam_search_diversity_rate,
                    top_p_decay,
                    top_p_min,
                    top_p_reset_ids,
                    None,
                    input_lengths,
                    sequence_limit_lengths,
                    stop_words_list,
                    bad_words_list,
                    src_cache_indirection,
                    output_token_ids.view(-1, batch_size, beam_width),
                    finished,
                    sequence_lengths,
                    cum_log_probs,
                    output_log_probs,
                    parent_ids,
                    tgt_cache_indirection,
                )
                profiler.stop("ft-decode")
                is_generation_done &= should_stop
        # Broadcast from the last pipeline node if needed.
        profiler.start("ft-bcast")
        tensors_to_bcast = [
            output_token_ids[step, ...],
            finished,
            sequence_lengths,
            is_generation_done,
        ]
        if beam_width > 1:
            tensors_to_bcast.append(tgt_cache_indirection)
        self.decode_op.broadcast_from_last_pipeline(tensors_to_bcast)
        profiler.stop("ft-bcast")
        # Stop early once the decode op or the finished flags say so.
        if is_generation_done or finished.all():
            break
    # Transpose (L, batch, beam) -> (batch, beam, L)
    output_token_ids = output_token_ids.view(
        -1, batch_size, beam_width
    ).permute(1, 2, 0)
    # Increase sequence_length by 1 because the sequence length of time step t is t - 1. # noqa: E501
    sequence_lengths += 1
    # Outputs
    output_dict = dict(output_token_ids=output_token_ids)
    if return_output_length:
        output_dict["output_lengths"] = sequence_lengths
    if return_log_probs:
        output_dict["cum_log_probs"] = cum_log_probs
        output_dict["output_log_probs"] = output_log_probs
    return output_dict
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/faster_transformer/gpt/utils/huggingface_gpt_convert.py
================================================
# Based on https://github.com/NVIDIA/FasterTransformer/blob/4402759e48f2340220638675f464b6ba1f79ac3c/examples/pytorch/gpt/utils/huggingface_gpt_convert.py # noqa: E501
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert huggingface GPT model. Use https://huggingface.co/gpt2 as demo.
"""
import argparse
import configparser
import os
import sys
from loguru import logger
import numpy as np
from transformers import GPT2Model # transformers-4.10.0-py3
from nebullvm.optional_modules.torch import torch
# Import-time side effect: resolve the directory holding this script
# (symlinks followed) and append both the directory four levels above it
# and the script directory itself to the module search path.
dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(dir_path + "/../../../..")
sys.path.append(dir_path)
def get_weight_data_type(data_type):
    """Map a weight data-type name to the matching NumPy scalar type.

    Args:
        data_type: ``"fp32"`` or ``"fp16"``.

    Returns:
        ``np.float32`` or ``np.float16``.

    Raises:
        AssertionError: If ``data_type`` is any other value. The error is
            raised explicitly rather than through an ``assert`` statement,
            so it still fires when Python runs with ``-O`` (where the
            statement would be stripped and ``None`` returned silently).
    """
    if data_type == "fp32":
        return np.float32
    if data_type == "fp16":
        return np.float16
    raise AssertionError(f"Invalid weight data type {data_type}")
def split_and_convert_process(i, saved_dir, factor, key, args, val):
    """Write one converted weight array to ``.bin`` file(s) in ``saved_dir``.

    How the array is written depends on which name fragment ``key``
    contains:

    * layernorm weights/biases and the ``attention.dense`` /
      ``mlp.dense_4h_to_h`` biases are written whole, once, by rank 0 only;
    * ``attention.dense.weight`` and ``mlp.dense_4h_to_h.weight`` are split
      into ``factor`` shards along the first axis;
    * ``mlp.dense_h_to_4h`` weight/bias are split along the last axis;
    * the fused ``query_key_value`` bias/weight are first reshaped so the
      factor-of-3 axis is explicit, then split along the last axis.

    Any other key only produces a warning.

    Args:
        i: Rank index; shard files are numbered ``i * factor + j``.
        saved_dir: Output directory (string).
        factor: Number of shards each split array is cut into.
        key: Target parameter name used in the file name.
        args: Unused.
        val: NumPy array holding the parameter values.
    """

    def key_has(*fragments):
        # True when any of the given fragments occurs in the key.
        return any(fragment in key for fragment in fragments)

    def write_shards(array, axis):
        # Cut `array` into `factor` pieces and write each to its own file.
        for j, shard in enumerate(np.split(array, factor, axis=axis)):
            shard.tofile(f"{saved_dir}/model.{key}.{i * factor + j}.bin")

    if key_has(
        "input_layernorm.weight",
        "input_layernorm.bias",
        "attention.dense.bias",
        "post_attention_layernorm.weight",
        "post_attention_layernorm.bias",
        "mlp.dense_4h_to_h.bias",
        "final_layernorm.weight",
        "final_layernorm.bias",
    ):
        # shared weights, only need to convert the weights of rank 0
        if i == 0:
            val.tofile(f"{saved_dir}/model.{key}.bin")
        return

    if key_has("attention.dense.weight", "mlp.dense_4h_to_h.weight"):
        write_shards(val, 0)
        return

    if key_has("mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias"):
        write_shards(val, -1)
        return

    if "attention.query_key_value.bias" in key:
        per_part = int(val.shape[-1] / 3)
        write_shards(val.reshape(3, per_part), -1)
        return

    if "attention.query_key_value.weight" in key:
        rows = val.shape[0]
        per_part = int(val.shape[-1] / 3)
        write_shards(val.reshape(rows, 3, per_part), -1)
        return

    logger.warning("[ERROR] cannot find key '{}'".format(key))
def split_and_convert(args):
    """Load the checkpoint named by ``args.in_file`` and convert it.

    The model is placed on CUDA when available, otherwise on the CPU, and
    handed to ``main`` together with the remaining command-line options.

    Args:
        args: Parsed command-line namespace providing ``in_file``,
            ``saved_dir``, ``trained_gpu_num``, ``infer_gpu_num``,
            ``processes`` and ``weight_data_type``.
    """
    if torch.cuda.is_available():
        torch_device = "cuda"
    else:
        torch_device = "cpu"
    hf_model = GPT2Model.from_pretrained(args.in_file).to(torch_device)
    main(
        saved_dir=args.saved_dir,
        model=hf_model,
        trained_gpu_num=args.trained_gpu_num,
        infer_gpu_num=args.infer_gpu_num,
        processes=args.processes,
        weight_data_type=args.weight_data_type,
    )
def main(
    saved_dir,
    model: GPT2Model,
    trained_gpu_num=1,
    infer_gpu_num=1,
    processes=1,
    weight_data_type="fp32",
):
    """Convert a Hugging Face GPT-2 model into FasterTransformer ``.bin`` files.

    Files are written to ``<saved_dir>/<infer_gpu_num>-gpu/``. A
    ``config.ini`` describing the model is written on a best-effort basis:
    a failure there is logged and conversion continues.

    Args:
        saved_dir: Base output directory (string).
        model: The ``GPT2Model`` instance to convert.
        trained_gpu_num: Number of GPUs the checkpoint was trained with.
        infer_gpu_num: Number of GPUs used for inference; must be a
            multiple of ``trained_gpu_num``.
        processes: Unused; kept for interface compatibility.
        weight_data_type: ``"fp32"`` or ``"fp16"``, the dtype the weights
            are written in.

    Raises:
        AssertionError: If ``model`` is not a ``GPT2Model``, if
            ``infer_gpu_num`` is not a multiple of ``trained_gpu_num``, or
            if ``weight_data_type`` is not supported.
    """
    assert isinstance(model, GPT2Model), "model must be GPT2Model"
    # Placeholder forwarded to split_and_convert_process, which ignores it.
    args = None
    saved_dir = saved_dir + "/%d-gpu/" % infer_gpu_num
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(saved_dir, exist_ok=True)

    t_gpu_num = trained_gpu_num
    i_gpu_num = infer_gpu_num
    assert i_gpu_num % t_gpu_num == 0
    # Exact integer division: divisibility was asserted just above.
    factor = i_gpu_num // t_gpu_num

    hf_config = vars(model.config)
    # NOTE: save parameters to config files (loaded by triton backends)
    config = configparser.ConfigParser()
    config["gpt"] = {}
    try:
        config["gpt"]["model_name"] = (
            "gpt"
            if hf_config["_name_or_path"] == ""
            else hf_config["_name_or_path"]
        )
        config["gpt"]["head_num"] = str(hf_config["n_head"])
        n_embd = hf_config["n_embd"]
        config["gpt"]["size_per_head"] = str(n_embd // hf_config["n_head"])
        config["gpt"]["inter_size"] = str(n_embd * 4)
        config["gpt"]["max_pos_seq_len"] = str(hf_config["n_positions"])
        config["gpt"]["num_layer"] = str(hf_config["n_layer"])
        config["gpt"]["vocab_size"] = str(hf_config["vocab_size"])
        config["gpt"]["start_id"] = str(hf_config["bos_token_id"])
        config["gpt"]["end_id"] = str(hf_config["eos_token_id"])
        config["gpt"]["weight_data_type"] = weight_data_type
        with open(saved_dir + "/config.ini", "w") as configfile:
            config.write(configfile)
    except Exception as err:
        # Best effort: keep converting, but report why the config failed.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        logger.warning("Fail to save the config in config.ini: {}", err)

    np_weight_data_type = get_weight_data_type(weight_data_type)

    # Parameters stored once, under a fixed file name.
    top_level_files = {
        "wpe.weight": "model.wpe.bin",
        "wte.weight": "model.wte.bin",
        "ln_f.bias": "model.final_layernorm.bias.bin",
        "ln_f.weight": "model.final_layernorm.weight.bin",
        "lm_head.weight": "model.lm_head.weight.bin",
    }
    # (Hugging Face name fragment, FasterTransformer name fragment) pairs
    # for the per-layer parameters.
    layer_patterns = [
        ("ln_1.bias", "input_layernorm.bias"),
        ("ln_1.weight", "input_layernorm.weight"),
        ("attn.c_attn.bias", "attention.query_key_value.bias"),
        ("attn.c_attn.weight", "attention.query_key_value.weight"),
        ("attn.c_proj.bias", "attention.dense.bias"),
        ("attn.c_proj.weight", "attention.dense.weight"),
        ("ln_2.bias", "post_attention_layernorm.bias"),
        ("ln_2.weight", "post_attention_layernorm.weight"),
        ("mlp.c_fc.bias", "mlp.dense_h_to_4h.bias"),
        ("mlp.c_fc.weight", "mlp.dense_h_to_4h.weight"),
        ("mlp.c_proj.bias", "mlp.dense_4h_to_h.bias"),
        ("mlp.c_proj.weight", "mlp.dense_4h_to_h.weight"),
    ]

    for name, param in model.named_parameters():
        if "weight" not in name and "bias" not in name:
            continue
        np_param = param.detach().cpu().numpy().astype(np_weight_data_type)
        if name in top_level_files:
            # saved_dir already ends with a slash (see above).
            np_param.tofile(saved_dir + top_level_files[name])
            continue
        for hf_pattern, ft_pattern in layer_patterns:
            if hf_pattern in name:
                new_name = name.replace("h.", "layers.").replace(
                    hf_pattern, ft_pattern
                )
                split_and_convert_process(
                    0, saved_dir, factor, new_name, args, np_param
                )
# Command-line entry point: parse the conversion options, log them, and run
# the conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "-saved_dir",
        "-o",
        type=str,
        help="file name of output file",
        required=True,
    )
    parser.add_argument(
        "-in_file",
        "-i",
        type=str,
        help="file name of input checkpoint file",
        required=True,
    )
    parser.add_argument(
        "-trained_gpu_num",
        "-t_g",
        type=int,
        # Previously this help text wrongly said "for inference".
        help="How many gpus for training",
        default=1,
    )
    parser.add_argument(
        "-infer_gpu_num",
        "-i_g",
        type=int,
        help="How many gpus for inference",
        required=True,
    )
    parser.add_argument(
        "-processes",
        "-p",
        type=int,
        help="How many processes to spawn for conversion (default: 4)",
        default=4,
    )
    parser.add_argument(
        "-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]
    )
    args = parser.parse_args()
    # Echo the effective arguments before starting the conversion.
    logger.info("\n=============== Argument ===============")
    for key in vars(args):
        logger.info("{}: {}".format(key, vars(args)[key]))
    logger.info("========================================")
    split_and_convert(args)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/intel_neural_compressor.py
================================================
from pathlib import Path
from typing import Union
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.intel_neural_compressor import ( # noqa: E501
quantize_neural_compressor,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.torch import Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class IntelNeuralCompressorCompiler(Compiler):
    """Compiler that quantizes a PyTorch model with Intel Neural Compressor.

    Only static and dynamic quantization on CPU are listed as supported;
    any other combination makes ``execute`` produce no compiled model.
    """

    supported_ops = {
        "cpu": [
            QuantizationType.STATIC,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()
        # Reference to the model received by the last ``execute`` call.
        self.model_orig = None

    def execute(
        self,
        model: Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using IntelNeuralCompressor library.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when the requested quantization type is not supported on
        the current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Dynamic quantization does not use calibration data, so a missing
        # DataManager must not crash on ``None.get_split``.
        train_input_data = (
            input_data.get_split("train") if input_data is not None else None
        )

        self.model_orig = model

        # Default to the unmodified model so the value handed to
        # ``_compile_model`` is always bound.
        quantized_model = model
        if quantization_type is not None:
            quantized_model = self._quantize_model(
                model, quantization_type, input_tfms, train_input_data
            )

        self.compiled_model = self._compile_model(quantized_model)

    def _compile_model(self, model: Union[str, Path]):
        """Return ``model`` unchanged; no extra compilation step is run."""
        return model

    @staticmethod
    def _quantize_model(
        model: Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data: DataManager,
    ):
        """Delegate the quantization to ``quantize_neural_compressor``."""
        return quantize_neural_compressor(
            model, quantization_type, input_tfms, input_data
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/onnxruntime.py
================================================
from pathlib import Path
from typing import Union, List, Tuple
import numpy as np
from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.onnx import (
quantize_onnx,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class ONNXCompiler(Compiler):
    """Compiler for ONNX models, optionally quantizing them first.

    ``supported_ops`` lists, per device, the quantization types (``None``
    meaning no quantization) that ``execute`` accepts.
    """

    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    def execute(
        self,
        model: str,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using ONNX Runtime Compiler.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when the requested quantization type is not supported on
        the current device.

        Args:
            model (str): The onnx model path.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Only static quantization requires data (checked above); every
        # other supported mode must keep working when no DataManager is
        # given instead of failing on ``None.get_split``.
        train_input_data = None
        if input_data is not None:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        if quantization_type is not None:
            model = self._quantize_model(
                model, train_input_data, quantization_type, input_tfms
            )

        self.compiled_model = self._compile_model(model)

    def _compile_model(self, model: Union[str, Path]):
        """Return ``model`` unchanged; no extra compilation step is run."""
        return model

    def _quantize_model(
        self,
        model_path: str,
        input_data: List[Tuple[np.ndarray, ...]],
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
    ):
        """Delegate the quantization to ``quantize_onnx`` for this device."""
        return quantize_onnx(
            model_path, input_data, quantization_type, self.device, input_tfms
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/openvino.py
================================================
import subprocess
from pathlib import Path
from typing import Tuple, List, Union
import numpy as np
from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType, ModelParams
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.openvino import ( # noqa: E501
quantize_openvino,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.openvino import (
Core,
CompiledModel,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.transformations import MultiStageTransformation
class OpenVINOCompiler(Compiler):
    """Compiler that converts an ONNX model with the ``mo`` command line
    tool and optionally quantizes the converted model.
    """

    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
        ],
        "gpu": [],
    }

    def __init__(self):
        super().__init__()

    def execute(
        self,
        model: Union[str, Path],
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using OpenVINO library.

        On success ``self.compiled_model`` holds the path of the converted
        model without its file extension; it is set to ``None`` when the
        requested quantization type is not supported on the current device.

        Args:
            model (str): The onnx model path.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                ``input_data``.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Only static quantization requires data (checked above); the
        # other supported modes must not fail on ``None.get_split``.
        train_input_data = None
        if input_data is not None:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        # Read the ONNX input names once; they are needed both for the
        # conversion command and for the calibration step.
        input_names = get_input_names(str(model))

        cmd = [
            "mo",
            "--input_model",
            str(model),
            "--output_dir",
            str(Path(model).parent),
            "--input",
            ",".join(input_names),
            "--input_shape",
            ",".join([f"{list(shape)}" for shape in model_params.input_sizes]),
        ]

        if quantization_type is QuantizationType.DYNAMIC:
            return None

        if quantization_type is QuantizationType.HALF:
            cmd = cmd + ["--compress_to_fp16"]

        process = subprocess.Popen(cmd)
        process.wait()

        # The converted files are looked up next to the ONNX model, using
        # the same stem with .xml / .bin suffixes.
        base_path = Path(model).parent
        openvino_model_path = base_path / f"{Path(model).stem}.xml"
        openvino_model_weights = base_path / f"{Path(model).stem}.bin"

        if quantization_type not in [QuantizationType.HALF, None]:
            openvino_model_path, openvino_model_weights = self._quantize_model(
                model_topology=str(openvino_model_path),
                model_weights=str(openvino_model_weights),
                input_names=input_names,
                input_data=train_input_data,
            )

        self.compiled_model = str(
            Path(openvino_model_path).parent / Path(openvino_model_path).stem
        )

    def _compile_model(
        self,
        model_name: str,
        model_weights: str,
        network_parameters: ModelParams,
    ) -> CompiledModel:
        """Read the model files and compile them for the CPU device.

        The model is reshaped first when ``self._get_dynamic_shape``
        returns a shape.
        """
        core = Core()
        model = core.read_model(model=model_name, weights=model_weights)

        dynamic_shape = self._get_dynamic_shape(model, network_parameters)

        if dynamic_shape is not None:
            model.reshape(dynamic_shape)

        return core.compile_model(model=model, device_name="CPU")

    @staticmethod
    def _quantize_model(
        model_topology: str,
        model_weights: str,
        input_data: List[Tuple[np.ndarray, ...]],
        input_names: List[str],
    ) -> Tuple[str, str]:
        """Delegate the quantization to ``quantize_openvino``."""
        return quantize_openvino(
            model_topology, model_weights, input_data, input_names
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/intel_neural_compressor.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
import yaml
from nebullvm.core.models import QuantizationType
from nebullvm.optional_modules.neural_compressor import (
MixedPrecision,
Quantization,
)
from nebullvm.optional_modules.torch import DataLoader, Module, GraphModule
from nebullvm.tools.data import DataManager, PytorchDataset
from nebullvm.tools.transformations import (
MultiStageTransformation,
HalfPrecisionTransformation,
)
def _prepare_quantization_config(model: Any, tmp_dir: str, approach: str):
    """Write the quantization configuration file inside ``tmp_dir``.

    Args:
        model: Object whose class name is stored as the model name.
        tmp_dir: Directory in which ``temp_qt.yaml`` is created.
        approach: Value stored under ``quantization.approach``.

    Returns:
        Path of the YAML file that has been written.
    """
    model_section = {
        "name": model.__class__.__name__,
        "framework": "pytorch_fx",
    }
    tuning_section = {"accuracy_criterion": {"relative": 0.01}}
    settings = {
        "model": model_section,
        "quantization": {"approach": approach},
        "evaluation": {"accuracy": {"metric": {"topk": 1}}},
        "tuning": tuning_section,
    }

    target = Path(tmp_dir) / "temp_qt.yaml"
    with open(target, "w") as stream:
        yaml.dump(settings, stream)
    return target
def _prepare_mixed_precision_config(model: Any, tmp_dir: str):
    """Write the mixed precision configuration file inside ``tmp_dir``.

    Args:
        model: Object whose class name is stored as the model name.
        tmp_dir: Directory in which ``temp_mp.yaml`` is created.

    Returns:
        Path of the YAML file that has been written.
    """
    model_section = {
        "name": model.__class__.__name__,
        "framework": "pytorch_fx",
    }
    tuning_section = {"accuracy_criterion": {"relative": 0.01}}
    settings = {
        "model": model_section,
        "mixed_precision": {"precisions": "bf16"},
        "evaluation": {"accuracy": {"metric": {"topk": 1}}},
        "tuning": tuning_section,
    }

    target = Path(tmp_dir) / "temp_mp.yaml"
    with open(target, "w") as stream:
        yaml.dump(settings, stream)
    return target
def _get_dataloader(input_data: DataManager):
    """Wrap ``input_data`` in a labelled dataset and return a DataLoader.

    The batch size is taken from the first dimension of
    ``input_data[0][0][0]``.
    """
    first_tensor = input_data[0][0][0]
    batch_size = first_tensor.shape[0]
    dataset = PytorchDataset(input_data, has_labels=True)
    return DataLoader(dataset, batch_size)
def _quantize_static(model: Module, input_data: DataManager) -> GraphModule:
    """Run post-training static quantization on ``model``.

    ``input_data`` feeds both the calibration and the evaluation
    dataloaders of the quantizer.
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_quantization_config(
            model, workdir, "post_training_static_quant"
        )
        quantizer = Quantization(str(config_path))
        quantizer.model = model
        quantizer.calib_dataloader = _get_dataloader(input_data)
        quantizer.eval_dataloader = _get_dataloader(input_data)
        return quantizer()
def _quantize_dynamic(model: Module) -> GraphModule:
    """Run post-training dynamic quantization on ``model``."""
    with TemporaryDirectory() as workdir:
        config_path = _prepare_quantization_config(
            model, workdir, "post_training_dynamic_quant"
        )
        quantizer = Quantization(str(config_path))
        quantizer.model = model
        return quantizer()
def _mixed_precision(
    model: Module, input_tfms: MultiStageTransformation
) -> GraphModule:
    """Convert ``model`` with the MixedPrecision converter.

    After the conversion a ``HalfPrecisionTransformation`` is appended to
    ``input_tfms`` (mutated in place).
    """
    with TemporaryDirectory() as workdir:
        config_path = _prepare_mixed_precision_config(model, workdir)
        converter = MixedPrecision(str(config_path))
        converter.model = model
        converted = converter()
        input_tfms.append(HalfPrecisionTransformation())
    return converted
def quantize_neural_compressor(
    model: Module,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data: DataManager,
) -> GraphModule:
    """Dispatch to the helper matching ``quantization_type``.

    Raises:
        ValueError: If ``quantization_type`` is not STATIC, DYNAMIC or HALF.
    """
    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(model, input_data)
    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(model)
    if quantization_type is QuantizationType.HALF:
        return _mixed_precision(model, input_tfms)
    raise ValueError(
        f"Quantization type {quantization_type} is not "
        f"supported by Intel Neural Compressor"
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/onnx.py
================================================
from pathlib import Path
from typing import Union, Iterable, Tuple, List
import cpuinfo
import numpy as np
from nebullvm.core.models import QuantizationType, Device, DeviceType
from nebullvm.optional_modules.onnx import (
onnx,
convert_float_to_float16_model_path,
)
from nebullvm.optional_modules.onnxruntime import (
CalibrationDataReader,
QuantType,
quantize_dynamic,
quantize_static,
)
from nebullvm.optional_modules.torch import DataLoader
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.transformations import (
MultiStageTransformation,
HalfPrecisionTransformation,
)
class _IterableCalibrationDataReader(CalibrationDataReader):
    """Calibration reader that serves one feed dictionary per sample."""

    def __init__(
        self,
        iterable_dataset: Union[Iterable[Tuple], List[Tuple]],
        input_names: List[str],
    ):
        """Pair every sample of ``iterable_dataset`` with ``input_names``.

        Args:
            iterable_dataset: Samples, each one a tuple holding one value
                per model input.
            input_names: Names of the model inputs, in the same order as
                the values inside each sample.
        """
        # Build ONE dictionary PER sample. The previous nested
        # comprehension put the loop over the samples inside a single
        # dict, so each key was overwritten and only the last sample was
        # ever returned for calibration.
        self.iterable_dataset = iter(
            [dict(zip(input_names, inputs)) for inputs in iterable_dataset]
        )

    def get_next(self) -> dict:
        """Return the next feed dictionary, or ``None`` when exhausted."""
        return next(self.iterable_dataset, None)

    @classmethod
    def from_dataloader(
        cls, dl: DataLoader, input_names: List[str], contains_y: bool = True
    ):
        """Build a reader from a dataloader.

        When ``contains_y`` is true the last element of every batch is
        dropped before pairing the values with ``input_names``.
        """
        iterable_ds = iter(
            inputs[:-1] if contains_y else inputs for inputs in dl
        )
        return cls(iterable_ds, input_names)
def _quantize_dynamic(model_path: str) -> str:
    """Dynamically quantize the ONNX model stored at ``model_path``.

    The output is written to ``<grandparent dir>/int8_dynamic`` as
    ``<stem>.quant.onnx`` and its path is returned as a string.
    """
    source = Path(model_path)
    output_dir = source.parent.parent / "int8_dynamic"
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / (source.stem + ".quant.onnx")
    quantize_dynamic(
        source,
        target,
        weight_type=QuantType.QUInt8,
        optimize_model=False,
    )
    return str(target)
def _get_quantization_type_for_static(use_gpu) -> Tuple[QuantType, QuantType]:
    """Returns the quantization types for activations and weights,
    depending on the underlying hardware

    Args:
        use_gpu: Whether the model will run on GPU.

    Returns:
        A tuple ``(activation_type, weight_type)``.
    """
    # On GPU the CPU details are irrelevant, so skip the CPU probe.
    if use_gpu:
        return QuantType.QInt8, QuantType.QInt8

    # Query the CPU information a single time and reuse it below instead
    # of probing the hardware once per field.
    cpu_info = cpuinfo.get_cpu_info()
    if "x86" in cpu_info["arch"].lower():
        cpu_raw_data = cpu_info["brand_raw"].lower()
        if "intel" in cpu_raw_data and "xeon" in cpu_raw_data:
            return QuantType.QUInt8, QuantType.QInt8

    # Every other CPU uses unsigned 8-bit for activations and weights.
    return QuantType.QUInt8, QuantType.QUInt8
def _quantize_static(
    model_path: str, input_data: List[Tuple[np.ndarray, ...]], use_gpu: bool
) -> str:
    """Statically quantize the ONNX model stored at ``model_path``.

    ``input_data`` is used as calibration data. The output is written to
    ``<grandparent dir>/int8_static`` as ``<stem>.quant.onnx`` and its
    path is returned as a string.
    """
    source = Path(model_path)
    output_dir = source.parent.parent / "int8_static"
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / (source.stem + ".quant.onnx")

    reader = _IterableCalibrationDataReader(
        input_names=get_input_names(str(source)),
        iterable_dataset=input_data,
    )
    activation_type, weight_type = _get_quantization_type_for_static(use_gpu)
    quantize_static(
        source,
        target,
        reader,
        activation_type=activation_type,
        weight_type=weight_type,
        optimize_model=False,
    )
    return str(target)
def _convert_to_half_precision(
    model_path: str, input_tfms: MultiStageTransformation
) -> str:
    """Convert the ONNX model at ``model_path`` to float16 and save it.

    The output is written to ``<grandparent dir>/fp16`` as
    ``<stem>_fp16.onnx``. A ``HalfPrecisionTransformation`` is appended to
    ``input_tfms`` (mutated in place).

    Returns:
        Path of the saved model, as a string.
    """
    model_path = Path(model_path)
    model_quant = model_path.parent.parent / "fp16"
    # exist_ok=True matches the int8 helpers in this module and avoids a
    # FileExistsError when the output directory is already there.
    model_quant.mkdir(parents=True, exist_ok=True)
    model_quant = model_quant / (model_path.stem + "_fp16.onnx")
    new_onnx_model = convert_float_to_float16_model_path(str(model_path))
    input_tfms.append(HalfPrecisionTransformation())
    try:
        onnx.save(new_onnx_model, str(model_quant))
    except ValueError:
        # Model larger than 2GB must be saved as external data
        onnx.save(
            new_onnx_model,
            str(model_quant),
            save_as_external_data=True,
            all_tensors_to_one_file=False,
            convert_attribute=True,
        )
    return str(model_quant)
def quantize_onnx(
    model_path: str,
    input_data: List[Tuple[np.ndarray, ...]],
    quantization_type: QuantizationType,
    device: Device,
    input_tfms: MultiStageTransformation,
) -> str:
    """Quantize an ONNX model with the requested technique.

    Returns:
        Path of the quantized model, as returned by the selected helper.

    Raises:
        ValueError: If ``quantization_type`` is not DYNAMIC, STATIC or HALF.
    """
    if quantization_type == QuantizationType.DYNAMIC:
        return _quantize_dynamic(model_path)

    if quantization_type == QuantizationType.STATIC:
        use_gpu = device.type is DeviceType.GPU
        return _quantize_static(model_path, input_data, use_gpu)

    if quantization_type == QuantizationType.HALF:
        return _convert_to_half_precision(model_path, input_tfms)

    raise ValueError(
        f"Quantization type {quantization_type} not supported"
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/openvino.py
================================================
from typing import List, Tuple, Any
import numpy as np
from nebullvm.optional_modules.openvino import (
DataLoader,
load_model,
IEEngine,
create_pipeline,
compress_model_weights,
save_model,
)
class _CalibrationDataLoader(DataLoader):
    """Data loader exposing calibration samples as named input mappings."""

    def __init__(
        self, input_data: List[Tuple[Any, ...]], input_names: List[str]
    ):
        self._input_data = input_data
        self._input_names = input_names

    def __len__(self):
        """Return the number of stored samples."""
        return len(self._input_data)

    def __getitem__(self, item):
        """Return ``((item, None), inputs)`` for the sample at ``item``.

        ``inputs`` maps every input name to the matching value of the
        sample.
        """
        sample = self._input_data[item]
        named_inputs = dict(zip(self._input_names, sample))
        annotation = (item, None)
        return annotation, named_inputs
def quantize_openvino(
    model_topology: str,
    model_weights: str,
    input_data: List[Tuple[np.ndarray, ...]],
    input_names: List[str],
) -> Tuple[str, str]:
    """Quantize a model with the ``DefaultQuantization`` algorithm.

    Args:
        model_topology: Path passed as the ``model`` entry of the model
            configuration.
        model_weights: Path passed as the ``weights`` entry of the model
            configuration.
        input_data: Calibration samples; all of them are used
            (``stat_subset_size`` equals their number).
        input_names: Names associated with the values of each sample.

    Returns:
        The ``model`` and ``weights`` paths of the first saved model. The
        model is saved under the relative ``quantized_model`` directory.
    """
    quantization_params = {
        "target_device": "ANY",
        "preset": "performance",
        "stat_subset_size": len(input_data),
    }
    algorithms = [
        {"name": "DefaultQuantization", "params": quantization_params}
    ]

    calibration_loader = _CalibrationDataLoader(
        input_data=input_data, input_names=input_names
    )
    model = load_model(
        model_config={
            "model_name": "model",
            "model": model_topology,
            "weights": model_weights,
        }
    )
    engine = IEEngine(config={"device": "CPU"}, data_loader=calibration_loader)
    pipeline = create_pipeline(algorithms, engine)

    compressed_model = pipeline.run(model=model)
    compress_model_weights(compressed_model)
    saved = save_model(
        model=compressed_model,
        save_path="quantized_model",
        model_name="quantized_model",
    )
    first_saved = saved[0]
    return first_saved["model"], first_saved["weights"]
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/pytorch.py
================================================
import copy
from typing import List, Tuple, Union
from loguru import logger
from nebullvm.core.models import DeviceType, Device, QuantizationType
from nebullvm.optional_modules.torch import (
torch,
Module,
symbolic_trace,
QuantStub,
DeQuantStub,
GraphModule,
default_dynamic_qconfig,
prepare_fx,
convert_fx,
ScriptModule,
)
from nebullvm.tools.transformations import (
MultiStageTransformation,
HalfPrecisionTransformation,
)
from nebullvm.tools.utils import check_module_version
class _QuantWrapper(Module):
def __init__(self, model: Module):
super(_QuantWrapper, self).__init__()
qconfig = model.qconfig if hasattr(model, "qconfig") else None
self.quant = QuantStub(qconfig)
self.model = model
self.dequant = DeQuantStub()
def forward(self, *inputs: torch.Tensor):
inputs = (self.quant(x) for x in inputs)
outputs = self.model(*inputs)
return tuple(self.dequant(x) for x in outputs)
def _quantize_dynamic_torch(model: Module):
layer_types = {
type(layer)
for layer in model.children()
if len(list(layer.parameters())) > 0
}
return torch.quantization.quantize_dynamic(
model=model, qconfig_spec=layer_types, dtype=torch.qint8
)
def _quantize_dynamic_torch_fx(
    model: GraphModule,
    input_data: List[Tuple[torch.Tensor, ...]],
):
    """Dynamically quantize a traced model with the fx prepare/convert API.

    With torch >= 1.13.0 the first sample of ``input_data`` is passed to
    ``prepare_fx`` as ``example_inputs``.
    """
    prepare_kwargs = {}
    if check_module_version(torch, min_version="1.13.0"):
        prepare_kwargs["example_inputs"] = input_data[0]

    prepared = prepare_fx(
        model, {"": default_dynamic_qconfig}, **prepare_kwargs
    )
    return convert_fx(prepared)
def _quantize_static_torch(
    model: Module,
    input_data: List[Tuple[torch.Tensor, ...]],
    backend: str,
):
    """Statically quantize ``model`` in eager mode.

    The model is wrapped with quant/dequant stubs, prepared with the
    default qconfig of ``backend``, run over ``input_data`` without
    gradients and finally converted.
    """
    wrapped = _QuantWrapper(model)
    wrapped.qconfig = torch.quantization.get_default_qconfig(backend)
    # TODO: change line below, it's wrong
    # model = torch.quantization.fuse_modules(model, [["conv", "relu"]])
    prepared = torch.quantization.prepare(wrapped)
    with torch.no_grad():
        for sample in input_data:
            prepared(*sample)
    return torch.quantization.convert(prepared)
def _quantize_static_torch_fx(
    model: GraphModule,
    input_data: List[Tuple[torch.Tensor, ...]],
    backend: str,
):
    """Statically quantize a traced model with the fx prepare/convert API.

    The prepared model is run over ``input_data`` without gradients
    before the conversion. With torch >= 1.13.0 the first sample is also
    passed to ``prepare_fx`` as ``example_inputs``.
    """
    prepare_kwargs = {}
    if check_module_version(torch, min_version="1.13.0"):
        prepare_kwargs["example_inputs"] = input_data[0]

    qconfig_mapping = {"": torch.quantization.get_default_qconfig(backend)}
    prepared = prepare_fx(model, qconfig_mapping, **prepare_kwargs)
    with torch.no_grad():
        for sample in input_data:
            prepared(*sample)
    return convert_fx(prepared)
def _quantize_static(
    model: Union[Module, GraphModule],
    input_data: List[Tuple[torch.Tensor, ...]],
    device: Device,
):
    """Statically quantize ``model`` on CPU.

    The quantized engine is set to ``fbgemm`` when available, otherwise to
    ``qnnpack``. Traced models (``GraphModule``) use the fx workflow, any
    other module uses the eager one.

    Raises:
        AssertionError: If ``device`` is a GPU device.
    """
    # ``device`` is a Device object: the check must look at its ``type``.
    # Comparing the object itself with the enum member is always unequal,
    # which made the original guard a no-op.
    assert (
        device.type is not DeviceType.GPU
    ), "Quantization for torch is only available on CPU"

    backend = (
        "fbgemm"
        if "fbgemm" in torch.backends.quantized.supported_engines
        else "qnnpack"
    )
    torch.backends.quantized.engine = backend

    if isinstance(model, GraphModule):
        return _quantize_static_torch_fx(model, input_data, backend)
    else:
        return _quantize_static_torch(model, input_data, backend)
def _quantize_dynamic(
    model: Union[Module, GraphModule],
    input_data: List[Tuple[torch.Tensor, ...]],
    device: Device,
):
    """Dynamically quantize ``model`` on CPU.

    The quantized engine is set to ``fbgemm`` when available, otherwise to
    ``qnnpack``. Traced models (``GraphModule``) use the fx workflow, any
    other module uses the eager one.

    Raises:
        AssertionError: If ``device`` is a GPU device.
    """
    # ``device`` is a Device object: the check must look at its ``type``.
    # Comparing the object itself with the enum member is always unequal,
    # which made the original guard a no-op.
    assert (
        device.type is not DeviceType.GPU
    ), "Quantization for torch is only available on CPU"

    backend = (
        "fbgemm"
        if "fbgemm" in torch.backends.quantized.supported_engines
        else "qnnpack"
    )
    torch.backends.quantized.engine = backend

    if isinstance(model, GraphModule):
        return _quantize_dynamic_torch_fx(model, input_data)
    else:
        return _quantize_dynamic_torch(model)
def _half_precision(model: Module):
return model.half()
def quantize_pytorch(
    model: Module,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data_torch: List[Tuple[torch.Tensor, ...]],
    device: Device,
) -> Union[torch.nn.Module, ScriptModule, GraphModule]:
    """Quantize a deep copy of ``model`` with the requested technique.

    The copy is put in eval mode and traced with torch.fx when possible;
    if tracing fails a warning is logged and the untraced copy is used.
    For HALF, a ``HalfPrecisionTransformation`` is appended to
    ``input_tfms``.

    Raises:
        NotImplementedError: If ``quantization_type`` is not HALF, STATIC
            or DYNAMIC.
    """
    working_model = copy.deepcopy(model).eval()

    try:
        working_model = symbolic_trace(working_model)
    except Exception:
        logger.warning("Unable to trace model with torch.fx")

    if quantization_type is QuantizationType.HALF:
        input_tfms.append(HalfPrecisionTransformation())
        return _half_precision(working_model)

    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(working_model, input_data_torch, device)

    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(working_model, input_data_torch, device)

    raise NotImplementedError(
        f"No quantization implemented for quantization "
        f"type {quantization_type}"
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tensor_rt.py
================================================
from typing import List, Tuple
import numpy as np
from nebullvm.core.models import QuantizationType, ModelParams
from nebullvm.optional_modules.tensor_rt import (
tensorrt as trt,
IInt8EntropyCalibrator2,
polygraphy,
)
from nebullvm.tools.transformations import (
MultiStageTransformation,
)
def quantize_tensorrt(
    quantization_type: QuantizationType,
    model_params: ModelParams,
    config,
    input_tfms: MultiStageTransformation,
    input_data: List[Tuple[np.ndarray, ...]] = None,
):
    """Set the precision flags on a TensorRT builder ``config``.

    HALF sets the FP16 flag. STATIC sets the INT8 flag and attaches a
    calibrator built from ``input_data``. Any other type leaves ``config``
    untouched. The same ``config`` object is returned.
    """
    if quantization_type is QuantizationType.HALF:
        # Tensor RT does not need to transform input data
        # to fp16 because it expects always fp32
        config.set_flag(trt.BuilderFlag.FP16)
        return config

    if quantization_type is QuantizationType.STATIC:
        assert input_data is not None, (
            "You need to specify the calibration data for "
            "performing static quantization."
        )
        int8_calibrator = TensorRTCalibrator(
            batch_size=model_params.batch_size,
            input_data=input_data,
        )
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = int8_calibrator

    return config
class TensorRTCalibrator(IInt8EntropyCalibrator2):
    """Calibrator that serves the given samples one batch per call."""

    def __init__(
        self, batch_size: int, input_data: List[Tuple[np.ndarray, ...]]
    ):
        super().__init__()
        self._bs = batch_size
        # One-shot iterator: each sample is handed out exactly once.
        self.batches = iter(input_data)

    def get_batch(self, names):
        """Copy the next sample to device arrays and return their pointers.

        Returns ``None`` once all samples have been consumed.
        """
        stream = polygraphy.Stream()
        try:
            host_tensors = next(self.batches)
            device_arrays = []
            for host_tensor in host_tensors:
                device_array = polygraphy.DeviceArray(
                    shape=host_tensor.shape, dtype=host_tensor.dtype
                )
                device_array.copy_from(
                    host_buffer=host_tensor, stream=stream
                )
                device_arrays.append(device_array)
            return [device_array.ptr for device_array in device_arrays]
        except StopIteration:
            return None

    def get_batch_size(self):
        """Return the batch size given at construction."""
        return self._bs

    def read_calibration_cache(self):
        """No calibration cache is used: always return ``None``."""
        return None

    def write_calibration_cache(self, cache):
        """Ignore ``cache``: nothing is stored."""
        return None
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tensorflow.py
================================================
from typing import List, Tuple
from nebullvm.core.models import QuantizationType
from nebullvm.optional_modules.tensorflow import tensorflow as tf
def _quantize_dynamic(model: tf.Module):
    """Convert ``model`` with the TFLite converter using the default
    optimizations and return the converted model.
    """
    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    return tflite_converter.convert()
def _quantize_static(model: tf.Module, dataset: List[Tuple[tf.Tensor, ...]]):
    """Convert ``model`` with the TFLite converter using the default
    optimizations and ``dataset`` as representative dataset.
    """

    def _calibration_samples():
        # The converter receives every sample as a list of tensors.
        for sample in dataset:
            yield list(sample)

    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.representative_dataset = _calibration_samples
    return tflite_converter.convert()
def _half_precision(model: tf.Module):
    """Convert ``model`` with the TFLite converter using the default
    optimizations and float16 as supported target type.
    """
    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.target_spec.supported_types = [tf.float16]
    return tflite_converter.convert()
def quantize_tensorflow(
    model: tf.Module,
    quantization_type: QuantizationType,
    input_data_tensorflow: List[Tuple[tf.Tensor, ...]],
):
    """Dispatch to the TFLite helper matching ``quantization_type``.

    Raises:
        NotImplementedError: If ``quantization_type`` is not DYNAMIC,
            STATIC or HALF.
    """
    if quantization_type is QuantizationType.DYNAMIC:
        return _quantize_dynamic(model)
    if quantization_type is QuantizationType.STATIC:
        return _quantize_static(model, input_data_tensorflow)
    if quantization_type is QuantizationType.HALF:
        return _half_precision(model)
    raise NotImplementedError(
        f"Quantization not supported for type {quantization_type}"
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/tvm.py
================================================
from typing import List, Sequence, Any
from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType
from nebullvm.optional_modules.tvm import (
relay,
ToMixedPrecision,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import (
MultiStageTransformation,
HalfPrecisionTransformation,
)
class TVMCalibrator(DataManager):
    """DataManager whose items are dictionaries keyed by input name."""

    def __init__(self, data_reader: Sequence, input_names: List[str]):
        super().__init__(data_reader=data_reader)
        self._input_names = input_names

    def __getitem__(self, item: int):
        """Return the sample at ``item`` as ``{input_name: value}``."""
        sample = self._data_reader[item]
        return dict(zip(self._input_names, sample))
def quantize_apache_tvm(
    model: Any,
    quantization_type: QuantizationType,
    input_tfms: MultiStageTransformation,
    input_data: DataManager,
    params: Any,
):
    """Quantize a TVM model with the requested technique.

    Args:
        model: The model passed to the TVM quantization passes.
        quantization_type: HALF converts to float16 mixed precision and
            appends a ``HalfPrecisionTransformation`` to ``input_tfms``;
            DYNAMIC quantizes with a global scale; STATIC calibrates on
            the "train" split of ``input_data``.
        input_tfms: Input transformations, mutated in place for HALF.
        input_data: Data used for calibration (STATIC only).
        params: Parameters forwarded to ``relay.quantize.quantize``.

    Returns:
        The quantized model, or ``None`` when ``quantization_type`` is
        ``None`` or not one of the handled types.
    """
    # Guard clause: without it a ``None`` type could reach the final
    # return with the result variable never assigned.
    if quantization_type is None:
        return None

    if quantization_type is QuantizationType.HALF:
        quantized_model = ToMixedPrecision(mixed_precision_type="float16")(
            model
        )
        input_tfms.append(HalfPrecisionTransformation())
        return quantized_model

    if quantization_type is QuantizationType.DYNAMIC:
        # No calibration data: quantize with a fixed global scale.
        with relay.quantize.qconfig(
            calibrate_mode="global_scale", global_scale=8.0
        ):
            return relay.quantize.quantize(model, params)

    if quantization_type is QuantizationType.STATIC:
        samples = input_data.get_split("train").get_numpy_list(
            QUANTIZATION_DATA_NUM
        )
        # Inputs are named positionally: input_0, input_1, ...
        input_names = [f"input_{n}" for n in range(len(samples[0]))]
        calibration_set = TVMCalibrator(samples, input_names)
        with relay.quantize.qconfig(
            calibrate_mode="kl_divergence", weight_scale="max"
        ):
            return relay.quantize.quantize(
                model, params, dataset=calibration_set
            )

    # Any other quantization type is not handled.
    return None
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/quantizations/utils.py
================================================
from loguru import logger
from nebullvm.core.models import QuantizationType
def check_quantization(
    quantization_type: QuantizationType, perf_loss_ths: float
):
    """Log a warning when a quantization type is given without a
    threshold; do nothing otherwise.
    """
    if quantization_type is None or perf_loss_ths is not None:
        return
    logger.warning(
        "Got a valid quantization type without any given quantization "
        "threshold. The quantization step will be ignored."
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/tensor_rt.py
================================================
import abc
import copy
import os
import subprocess
from pathlib import Path
from typing import List, Any, Tuple
import numpy as np
from nebullvm.config import QUANTIZATION_DATA_NUM, TORCH_TENSORRT_PRECISIONS
from nebullvm.core.models import QuantizationType, ModelParams
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.tensor_rt import ( # noqa: E501
quantize_tensorrt,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.tensor_rt import tensorrt as trt
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.optional_modules.torch_tensorrt import (
torch_tensorrt,
DataLoaderCalibrator,
)
from nebullvm.tools.data import DataManager, PytorchDataset
from nebullvm.tools.diffusers import UNet
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.transformations import (
MultiStageTransformation,
HalfPrecisionTransformation,
)
class TensorRTCompiler(Compiler, abc.ABC):
    """Abstract base for the TensorRT compilers (GPU only)."""

    supported_ops = {
        "cpu": [],
        "gpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
        ],
    }

    def __init__(self):
        super().__init__()
        self.model_orig = None

    @staticmethod
    def _extract_dynamic_shape_ranges(model_params: ModelParams):
        """Build one shape description per model input.

        Without dynamic info every entry is ``{"shape": static_shape}``.
        Otherwise every entry holds ``min_shape``, ``opt_shape`` and
        ``max_shape`` lists, where the dimensions listed in the dynamic
        info take their min/opt/max values and the others keep the static
        size.
        """
        inputs_shapes = []
        for index, info in enumerate(model_params.input_infos):
            static_shape = info.size

            if model_params.dynamic_info is None:
                inputs_shapes.append({"shape": static_shape})
                continue

            input_dict = model_params.dynamic_info.inputs[index]
            assert all(
                key in dim
                for dim in input_dict.values()
                for key in ["min_val", "opt_val", "max_val"]
            ), (
                "Missing min/opt/max ranges, TensorRT needs them to "
                "enable dynamic shape properly"
            )

            def _resolve(bound_key):
                # Dynamic dimensions use the requested bound, the other
                # ones keep their static size.
                return [
                    input_dict[axis][bound_key]
                    if axis in input_dict
                    else static_shape[axis]
                    for axis in range(len(static_shape))
                ]

            inputs_shapes.append(
                {
                    "min_shape": _resolve("min_val"),
                    "opt_shape": _resolve("opt_val"),
                    "max_shape": _resolve("max_val"),
                }
            )

        return inputs_shapes

    @abc.abstractmethod
    def execute(self, *args, **kwargs):
        """Compile the model; to be implemented by the subclasses."""
        pass
class PyTorchTensorRTCompiler(TensorRTCompiler):
    """TensorRT compiler driven through the ``torch_tensorrt`` interface."""

    def execute(
        self,
        model: Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using TensorRT Compiler from the
        PyTorch interface.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` whenever the requested configuration is skipped.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. A ``HalfPrecisionTransformation`` is
                appended to it when half precision is requested.
                Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                input data.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Only the static quantization path builds a calibrator.
        calibrator = None
        if quantization_type is QuantizationType.HALF:
            dtype = torch.half
            input_tfms.append(HalfPrecisionTransformation())
        elif quantization_type is QuantizationType.STATIC:
            if model_params.dynamic_info is not None:
                self.logger.warning(
                    "Static quantization is not available when "
                    "using dynamic shape"
                )
                # Drop any result left from a previous call so that the
                # skipped configuration is not mistaken for a success.
                self.compiled_model = None
                return

            dtype = torch.int8

            dataset = PytorchDataset(input_data.get_split("train"))
            dataloader = torch.utils.data.DataLoader(
                dataset,
                batch_size=dataset.batch_size,
                shuffle=False,
                num_workers=0,
            )

            calibrator = torch_tensorrt.ptq.DataLoaderCalibrator(
                dataloader,
                use_cache=False,
                algo_type=torch_tensorrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,  # noqa E501
                device=torch.device(self.device.to_torch_format()),
            )
        else:
            dtype = torch.float32

        # Convert int64 to int32 for transformers inputs
        input_tensors = [
            tensor.to(self.device.to_torch_format())
            if tensor.dtype != torch.int64
            else tensor.to(torch.int32).to(self.device.to_torch_format())
            for tensor in input_data.get_list(1)[0]
        ]

        self.compiled_model = self._compile_model(
            model=model,
            model_params=model_params,
            input_tensors=input_tensors,
            dtype=dtype,
            calibrator=calibrator,
            quantization_type=quantization_type,
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: Module,
        model_params: ModelParams,
        input_tensors: List[torch.Tensor],
        dtype: torch.dtype,
        calibrator: DataLoaderCalibrator,
        quantization_type: QuantizationType,
    ):
        """Convert the model to TorchScript and compile it with TensorRT.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            input_tensors (List[torch.Tensor]): One sample of the model
                inputs, used for tracing and to derive the input dtypes.
            dtype (torch.dtype): Target precision, used to select the
                enabled TensorRT precisions.
            calibrator (DataLoaderCalibrator): Calibrator, only used for
                static quantization.
            quantization_type (QuantizationType): The quantization type.

        Returns:
            The module returned by ``torch_tensorrt.compile``.
        """
        use_half = quantization_type is QuantizationType.HALF
        model.to(self.device.to_torch_format()).eval()

        # Scripting is tried first; tracing is the fallback. For half
        # precision a half copy is used so the original model is untouched.
        try:
            if use_half:
                ts_model = torch.jit.script(copy.deepcopy(model).half()).half()
            else:
                ts_model = torch.jit.script(model)
        except Exception:
            if use_half:
                ts_model = torch.jit.trace(
                    copy.deepcopy(model).half(),
                    [t.half() for t in input_tensors],
                ).half()
            else:
                ts_model = torch.jit.trace(model, input_tensors)

        try:
            with torch_tensorrt.logging.errors():
                inputs_shapes = self._extract_dynamic_shape_ranges(
                    model_params
                )
                trt_model = torch_tensorrt.compile(
                    ts_model,
                    inputs=[
                        torch_tensorrt.Input(
                            **inputs_shapes[i],
                            # Integer inputs keep their own dtype even
                            # when compiling in half precision.
                            dtype=torch.half
                            if (
                                dtype == torch.half
                                and tensor.dtype
                                not in [torch.int8, torch.int32]
                            )
                            else tensor.dtype,
                        )
                        for i, tensor in enumerate(input_tensors)
                    ],
                    enabled_precisions=TORCH_TENSORRT_PRECISIONS[str(dtype)],
                    calibrator=calibrator
                    if quantization_type is QuantizationType.STATIC
                    else None,
                    workspace_size=self.device.get_free_memory(),
                    device={
                        "device_type": torch_tensorrt.DeviceType.GPU,
                        "gpu_id": self.device.idx,
                        "dla_core": 0,
                        "allow_gpu_fallback": False,
                        "disable_tf32": False,
                    },
                    truncate_long_and_double=True,
                )
        finally:
            # Delete calibration cache, also when the compilation failed
            if os.path.exists("calibration.cache"):
                os.remove("calibration.cache")

        return trt_model

    @staticmethod
    def _quantize_model(**kwargs) -> Any:
        """Not used: quantization is handled inside ``_compile_model``."""
        raise NotImplementedError
class ONNXTensorRTCompiler(TensorRTCompiler):
    """TensorRT compiler building a serialized engine from an ONNX file."""

    def __init__(self):
        super().__init__()
        self.model_orig = None
        # Path of the ONNX file handed to the TensorRT parser.
        self.onnx_model_path = None
        # Whether a simplification/optimization pass should still be tried.
        self.simplify_model = True

    def execute(
        self,
        model: str,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        is_diffusion: bool = False,
        **kwargs,
    ):
        """Compile the input model using TensorRT Compiler from the
        ONNX interface.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` whenever the requested configuration is skipped.

        Args:
            model (str): The path to the onnx model.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Only read for
                static quantization. Default: None
            is_diffusion (bool): Whether the model is a diffusion model.
                Default: False.

        Raises:
            ValueError: If static quantization is requested without
                input data, or if the ONNX file cannot be parsed.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        # Calibration samples are needed by static quantization only.
        train_input_data = None
        if quantization_type is QuantizationType.STATIC:
            train_input_data = input_data.get_split("train").get_numpy_list(
                QUANTIZATION_DATA_NUM
            )

        if self.simplify_model and not is_diffusion:
            try:
                import onnxsim  # noqa: F401

                # Simplify model, otherwise tensor RT won't work
                # on gpt2 and some other models.
                simplified_model = str(model) + "_simplified"
                if not Path(simplified_model).is_file():
                    cmd = [
                        "onnxsim",
                        str(model),
                        simplified_model,
                    ]
                    subprocess.run(cmd, stdout=subprocess.DEVNULL)

                # Explicit check (not an assert, which is removed under
                # ``python -O``) so the fallback below always triggers.
                if not os.path.isfile(simplified_model):
                    raise FileNotFoundError(simplified_model)

                # First try with simplified model
                self.onnx_model_path = simplified_model
            except Exception:
                # Use original model
                self.logger.warning(
                    "Unable to simplify model with ONNX Simplifier. "
                    "Original ONNX model will be used to build "
                    "TensorRT engine"
                )
                self.onnx_model_path = str(model)
                self.simplify_model = False
        elif self.onnx_model_path is None:
            self.onnx_model_path = str(model)

        if is_diffusion:
            if quantization_type is None:
                self.logger.warning(
                    "Skipping float32 precision for Stable Diffusion, "
                    "half precision will be used instead."
                )
                # Do not expose a result left from a previous call.
                self.compiled_model = None
                return

            if quantization_type is QuantizationType.STATIC:
                self.logger.warning(
                    "Skipping static quantization for Stable Diffusion "
                    "because for now it's not supported."
                )
                # Do not expose a result left from a previous call.
                self.compiled_model = None
                return

        if self.simplify_model and is_diffusion:
            optimized_model = str(Path(model).parent / "model_opt.onnx")
            unet = UNet(hf_token=None)
            opt_graph = unet.optimize(onnx.load(str(model)))
            try:
                onnx.save(opt_graph, optimized_model)
            except Exception:
                # Retry storing the weights as external data.
                onnx.save(
                    opt_graph, optimized_model, save_as_external_data=True
                )
            self.onnx_model_path = optimized_model
            self.simplify_model = False
        elif self.onnx_model_path is None:
            self.onnx_model_path = str(model)

        # -- Build phase --
        nvidia_logger = trt.Logger(trt.Logger.ERROR)
        builder = trt.Builder(nvidia_logger)
        # create network definition
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        )

        # build the engine
        # TODO: setup config value for the class in a config file
        config = builder.create_builder_config()
        try:
            config.set_memory_pool_limit(
                trt.MemoryPoolType.WORKSPACE, self.device.get_free_memory()
            )
        except AttributeError:
            # The method set_memory_pool_limit is not available
            # until TensorRT Release 8.4.1
            self.logger.warning(
                "Cannot call method set_memory_pool_limit for TensorRT. "
                "because your version is lower than 8.4.1. "
                "Please update TensorRT version."
            )

        if quantization_type is not None:
            config = self._quantize_model(
                quantization_type,
                model_params,
                config,
                input_tfms,
                train_input_data,
            )

        self.compiled_model = self._compile_model(
            onnx_model_path=str(self.onnx_model_path),
            model_params=model_params,
            config=config,
            network=network,
            builder=builder,
            nvidia_logger=nvidia_logger,
        )
        self.model_orig = self.onnx_model_path

    def _compile_model(
        self,
        onnx_model_path: str,
        model_params: ModelParams,
        config,
        network,
        builder,
        nvidia_logger,
    ):
        """Parse the ONNX file into ``network`` and build the engine.

        Args:
            onnx_model_path (str): Path of the ONNX file to parse.
            model_params (ModelParams): The model parameters; when it
                carries dynamic info an optimization profile is added.
            config: The TensorRT builder config.
            network: The TensorRT network definition to fill.
            builder: The TensorRT builder.
            nvidia_logger: The TensorRT logger given to the parser.

        Returns:
            The result of ``builder.build_serialized_network``.

        Raises:
            ValueError: If the ONNX parser reports a failure.
        """
        parser = trt.OnnxParser(network, nvidia_logger)
        success = parser.parse_from_file(onnx_model_path)

        if not success:
            for idx in range(parser.num_errors):
                self.logger.debug(parser.get_error(idx))
            raise ValueError(
                f"Errors occurred while processing the "
                f"ONNX file at {onnx_model_path}"
            )

        if model_params.dynamic_info is not None:
            inputs_shapes = self._extract_dynamic_shape_ranges(model_params)
            profile = builder.create_optimization_profile()
            for i, input_name in enumerate(get_input_names(onnx_model_path)):
                profile.set_shape(
                    input_name,
                    inputs_shapes[i]["min_shape"],
                    inputs_shapes[i]["opt_shape"],
                    inputs_shapes[i]["max_shape"],
                )
            config.add_optimization_profile(profile)
        return builder.build_serialized_network(network, config)

    @staticmethod
    def _quantize_model(
        quantization_type: QuantizationType,
        model_params: ModelParams,
        config,
        input_tfms: MultiStageTransformation,
        input_data: List[Tuple[np.ndarray, ...]] = None,
    ):
        """Delegate to ``quantize_tensorrt`` and return its result."""
        return quantize_tensorrt(
            quantization_type,
            model_params,
            config,
            input_tfms,
            input_data,
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/tensorflow.py
================================================
from typing import List, Tuple
from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.tensorflow import ( # noqa: E501
quantize_tensorflow,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class TensorflowBackendCompiler(Compiler):
    """Compiler for the plain TensorFlow backend.

    No transformation is applied: the input model itself becomes the
    compiled model. Only the un-quantized configuration is accepted.
    """

    supported_ops = {"cpu": [None], "gpu": [None]}

    def execute(
        self,
        model: tf.Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Optimize the input model using tensorflow built-in techniques.

        Args:
            model (tf.Module): The tensorflow model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None.
        """
        accepted_types = self.supported_ops[self.device.type.value]
        if quantization_type not in accepted_types:
            self.compiled_model = None
            return

        static_requested = quantization_type is QuantizationType.STATIC
        if static_requested and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )
        check_quantization(quantization_type, metric_drop_ths)

        # The model is used as-is.
        self.compiled_model = model

    def _compile_model(self):
        """Nothing to compile for this backend."""
        return None

    @staticmethod
    def _quantize_model(**kwargs):
        """Quantization is not implemented for this backend."""
        raise NotImplementedError()
class TFLiteBackendCompiler(Compiler):
    """Compiler converting a TensorFlow model to TFLite (CPU only)."""

    supported_ops = {
        "cpu": [
            None,
            QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [],
    }

    def execute(
        self,
        model: tf.Module,
        input_tfms: MultiStageTransformation,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Convert the input model to TFLite, optionally quantizing it.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (tf.Module): The tensorflow model.
            input_tfms (MultiStageTransformation): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Only read when a
                quantization type is requested. Default: None

        Raises:
            ValueError: If static quantization is requested without
                input data.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        if quantization_type is None:
            # Plain conversion does not need any data, so the default
            # ``input_data=None`` must not be dereferenced here.
            self.compiled_model = self._compile_model(model)
            return

        train_input_data = input_data.get_split("train").get_list(
            QUANTIZATION_DATA_NUM
        )
        self.compiled_model = self._quantize_model(
            model, quantization_type, train_input_data
        )

    def _compile_model(
        self,
        model: tf.Module,
    ):
        """Convert ``model`` with ``TFLiteConverter.from_keras_model``.

        Returns:
            The converted model as returned by ``converter.convert()``.
        """
        converter = tf.lite.TFLiteConverter.from_keras_model(model)
        tflite_model = converter.convert()
        return tflite_model

    @staticmethod
    def _quantize_model(
        model: tf.Module,
        quantization_type: QuantizationType,
        input_data_tensorflow: List[Tuple[tf.Tensor, ...]],
    ):
        """Delegate to ``quantize_tensorflow`` and return its result."""
        return quantize_tensorflow(
            model, quantization_type, input_data_tensorflow
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_dynamo.py
================================================
from typing import Union, Any
from nebullvm.core.models import ModelParams, QuantizationType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.optional_modules.torch import (
torch,
Module,
GraphModule,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class TorchDynamoCompiler(Compiler):
    """Compiler wrapping the model with ``torch.compile``.

    Only the un-quantized configuration is accepted, on CPU and GPU.
    """

    supported_ops = {"cpu": [None], "gpu": [None]}

    def execute(
        self,
        model: Module,
        model_params: ModelParams,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Optimize the input model using pytorch built-in techniques.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None.
        """
        accepted_types = self.supported_ops[self.device.type.value]
        if quantization_type not in accepted_types:
            self.compiled_model = None
            return

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        compiled = self._compile_model(model, model_params)
        self.compiled_model = compiled

    @torch.no_grad()
    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        network_parameters: ModelParams,
    ) -> Any:
        """Return ``torch.compile(model)``.

        ``dynamic`` is enabled exactly when the model parameters carry
        dynamic shape information.
        """
        has_dynamic_shapes = network_parameters.dynamic_info is not None
        return torch.compile(model, dynamic=has_dynamic_shapes)

    def _quantize_model(self, **kwargs) -> Any:
        """Quantization is not implemented for this compiler."""
        raise NotImplementedError
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_neuron.py
================================================
from typing import List, Tuple
from nebullvm.core.models import QuantizationType, ModelParams, DeviceType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.torch import (
torch,
symbolic_trace,
)
from nebullvm.optional_modules.torch_neuron import torch_neuron
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class TorchNeuronCompiler(Compiler):
    """Compiler tracing PyTorch models with ``torch_neuron``."""

    supported_ops = {
        "cpu": [],
        "gpu": [],
        "neuron": [None, QuantizationType.HALF],
    }

    @staticmethod
    def _check_dynamic_shape(network_parameters: ModelParams) -> bool:
        """Handles case when model inputs have dynamic shapes.

        For now TorchNeuron only supports dynamic shape for the
        batch dimension.

        Args:
            network_parameters (ModelParams): The model parameters.

        Returns:
            bool: True if the model has dynamic batch size, False otherwise.

        Raises:
            ValueError: If an input declares a dynamic axis other than
                axis 0.
        """
        if network_parameters.dynamic_info is None:
            return False

        for i, input_shape in enumerate(
            network_parameters.dynamic_info.inputs
        ):
            # Reject several dynamic axes, or a single one that is not
            # axis 0.
            if len(input_shape) > 1 or (
                len(input_shape) == 1 and input_shape.get(0) is None
            ):
                raise ValueError(
                    f"TorchNeuronCompiler only supports dynamic shapes for "
                    f"batch dimension. Provided dynamic info for input {i} "
                    f"is: {input_shape}. Please use padding for the other "
                    f"dimensions."
                )

        return True

    def execute(
        self,
        model: torch.nn.Module,
        model_params: ModelParams,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model with torch_neuron.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            model_params (ModelParams): The model parameters.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None.
            **kwargs: Extra arguments, ignored by this method.

        Raises:
            ValueError: If static quantization is requested without input
                data, or if unsupported dynamic axes are declared.
            RuntimeError: If the model cannot be traced.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        dynamic_batch_size = self._check_dynamic_shape(model_params)

        self.compiled_model = self._compile_model(
            model,
            input_data,
            quantization_type,
            dynamic_batch_size=dynamic_batch_size,
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: torch.nn.Module,
        input_data: DataManager,
        quantization_type: QuantizationType,
        dynamic_batch_size: bool,
    ) -> torch.jit.ScriptModule:
        """Trace the model with ``torch_neuron.trace``.

        The ``torch.fx`` symbolic trace of the model is tried first; if
        that fails the model itself is traced.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_data (DataManager): Data from which one input sample is
                taken for tracing.
            quantization_type (QuantizationType): The quantization type.
                When it is None the compiler args
                ``["--fast-math", "none"]`` are passed to the tracer.
            dynamic_batch_size (bool): Forwarded to ``torch_neuron.trace``.

        Returns:
            The traced model.

        Raises:
            RuntimeError: If both tracing attempts fail; the last error is
                attached as the cause.
        """
        input_sample = input_data.get_list(1)[0]
        if self.device.type is DeviceType.GPU:
            if quantization_type is QuantizationType.HALF:
                input_sample = [
                    t.to(self.device.to_torch_format()).half()
                    if torch.is_floating_point(t)
                    else t.to(self.device.to_torch_format())
                    for t in input_sample
                ]
            else:
                input_sample = [
                    t.to(self.device.to_torch_format()) for t in input_sample
                ]
            model.to(self.device.to_torch_format())

        model.eval()

        def _neuron_trace(target):
            # A fresh argument list is built for every attempt.
            return torch_neuron.trace(
                target,
                input_sample,
                dynamic_batch_size=dynamic_batch_size,
                compiler_args=["--fast-math", "none"]
                if quantization_type is None
                else None,
            )

        try:
            return _neuron_trace(symbolic_trace(model))
        except Exception:
            # Deliberate best-effort: fall back to tracing the raw model.
            pass

        try:
            return _neuron_trace(model)
        except Exception as err:
            raise RuntimeError(
                "Unable to trace model with torch_neuron."
            ) from err

    @torch.no_grad()
    def _quantize_model(
        self,
        model: torch.nn.Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data_torch: List[Tuple[torch.Tensor, ...]],
    ):
        """Quantization is not implemented for this compiler."""
        raise NotImplementedError()
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torch_xla.py
================================================
from nebullvm.core.models import QuantizationType
from nebullvm.operations.optimizations.compilers.torchscript import (
TorchScriptCompiler,
)
from nebullvm.optional_modules.torch import (
torch,
)
from nebullvm.tools.data import DataManager
class TorchXLACompiler(TorchScriptCompiler):
    """TorchScript-derived compiler targeting the ``tpu`` device type.

    The compilation step only moves the model to the target device.
    """

    supported_ops = {
        "cpu": [],
        "gpu": [],
        "tpu": [None, QuantizationType.HALF],
    }

    @torch.no_grad()
    def _compile_model(
        self,
        model: torch.nn.Module,
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> torch.nn.Module:
        """Return the model moved to ``self.device``.

        ``input_data`` and ``quantization_type`` are accepted for
        signature compatibility and are not used.
        """
        target_device = self.device.to_torch_format()
        return model.to(target_device)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/torchscript.py
================================================
from typing import Union, List, Tuple
from nebullvm.config import QUANTIZATION_DATA_NUM
from nebullvm.core.models import QuantizationType, DeviceType
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.pytorch import (
quantize_pytorch,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.torch import (
torch,
Module,
ScriptModule,
GraphModule,
symbolic_trace,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
class TorchScriptCompiler(Compiler):
    """Compiler turning PyTorch models into TorchScript modules."""

    supported_ops = {
        "cpu": [None, QuantizationType.STATIC, QuantizationType.DYNAMIC],
        "gpu": [
            None,
            QuantizationType.HALF,
        ],
    }

    def execute(
        self,
        model: Module,
        input_tfms: MultiStageTransformation = None,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Optimize the input model using pytorch built-in techniques.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (torch.nn.Module): The pytorch model.
            input_tfms (MultiStageTransformation, optional): Transformations
                to be performed to the model's input tensors in order to
                get the prediction. Default: None.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None.

        Raises:
            ValueError: If static quantization is requested without
                input data.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        if quantization_type is not None:
            # The quantization samples are only materialised when they
            # are actually consumed.
            train_input_data = input_data.get_split("train").get_list(
                QUANTIZATION_DATA_NUM
            )
            model = self._quantize_model(
                model, quantization_type, input_tfms, train_input_data
            )

        self.compiled_model = self._compile_model(
            model, input_data, quantization_type
        )

    @torch.no_grad()
    def _compile_model(
        self,
        model: Union[Module, GraphModule],
        input_data: DataManager,
        quantization_type: QuantizationType,
    ) -> ScriptModule:
        """Convert the model to TorchScript.

        For a regular module the ``torch.fx`` symbolic trace is scripted
        first; on failure the module itself is scripted and, as a last
        resort, traced with one input sample. A ``GraphModule`` is
        scripted directly.

        Args:
            model (Union[Module, GraphModule]): The (possibly quantized)
                model.
            input_data (DataManager): Data from which one input sample is
                taken.
            quantization_type (QuantizationType): The quantization type.

        Returns:
            ScriptModule: The scripted or traced model.
        """
        input_sample = input_data.get_list(1)[0]
        if self.device.type is DeviceType.GPU:
            if quantization_type is QuantizationType.HALF:
                # Only floating point inputs are cast to half precision.
                input_sample = [
                    t.to(self.device.to_torch_format()).half()
                    if torch.is_floating_point(t)
                    else t.to(self.device.to_torch_format())
                    for t in input_sample
                ]
            else:
                input_sample = [
                    t.to(self.device.to_torch_format()) for t in input_sample
                ]
            model.to(self.device.to_torch_format())

        if not isinstance(model, torch.fx.GraphModule):
            model.eval()
            try:
                model_scripted = symbolic_trace(model)
                model_scripted = torch.jit.script(model_scripted)
            except Exception:
                if quantization_type is None:
                    self.logger.warning("Unable to trace model with torch.fx")
                try:
                    model_scripted = torch.jit.script(model)
                except Exception:
                    model_scripted = torch.jit.trace(model, input_sample)
        else:
            model_scripted = torch.jit.script(model)

        return model_scripted

    @torch.no_grad()
    def _quantize_model(
        self,
        model: Module,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data_torch: List[Tuple[torch.Tensor, ...]],
    ):
        """Delegate to ``quantize_pytorch`` and return its result."""
        return quantize_pytorch(
            model, quantization_type, input_tfms, input_data_torch, self.device
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/tvm.py
================================================
import abc
import os
import uuid
from abc import ABC
from typing import Any, Tuple, Dict, Union
from nebullvm.config import (
AUTO_TVM_PARAMS,
AUTO_TVM_TUNING_OPTION,
)
from nebullvm.core.models import (
QuantizationType,
ModelParams,
DeviceType,
Device,
)
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.quantizations.tvm import (
TVMCalibrator,
quantize_apache_tvm,
)
from nebullvm.operations.optimizations.compilers.quantizations.utils import (
check_quantization,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.torch import Module, torch
from nebullvm.optional_modules.tvm import (
tvm,
IRModule,
NDArray,
XGBTuner,
autotvm,
relay,
ExecutorFactoryModule,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import get_input_names
from nebullvm.tools.pytorch import create_model_inputs_torch
from nebullvm.tools.transformations import MultiStageTransformation
class ApacheTVMCompiler(Compiler, ABC):
    """Base class of the Apache TVM compilers.

    Subclasses provide ``_build_tvm_model``, which turns the framework
    model into a TVM module and its parameters.
    """

    supported_ops = {
        "cpu": [
            None,
            # QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
        "gpu": [
            None,
            # QuantizationType.STATIC,
            QuantizationType.HALF,
            QuantizationType.DYNAMIC,
        ],
    }

    def __init__(self):
        super().__init__()
        self.model_orig = None

    def execute(
        self,
        model: Union[Module, str],
        input_tfms: MultiStageTransformation,
        model_params: ModelParams,
        metric_drop_ths: float = None,
        quantization_type: QuantizationType = None,
        input_data: DataManager = None,
        **kwargs,
    ):
        """Compile the input model using Apache TVM compiler.

        The result is stored in ``self.compiled_model``; it is set to
        ``None`` when the quantization type is not supported on the
        current device.

        Args:
            model (Union[Module, str]): The input model. Can be a torch model
                or a path to an onnx model.
            input_tfms (MultiStageTransformation): Transformations
                to be performed to the model's input tensors in order to
                get the prediction.
            model_params (ModelParams): Model parameters.
            metric_drop_ths (float, optional): Threshold for the accepted drop
                in terms of precision. Any optimized model with a higher drop
                will be ignored. Default: None.
            quantization_type (QuantizationType, optional): The desired
                quantization algorithm to be used. Default: None.
            input_data (DataManager): User defined data. Default: None

        Raises:
            ValueError: If static quantization is requested without
                input data.
        """
        if quantization_type not in self.supported_ops[self.device.type.value]:
            self.compiled_model = None
            return

        if quantization_type is QuantizationType.STATIC and input_data is None:
            raise ValueError("Input data is required for static quantization.")

        self.logger.info(
            f"Optimizing with {self.__class__.__name__} and "
            f"q_type: {quantization_type}."
        )

        check_quantization(quantization_type, metric_drop_ths)

        mod, params = self._build_tvm_model(model, model_params)

        if quantization_type is not None:
            mod = self._quantize_model(
                mod, quantization_type, input_tfms, input_data, params
            )

        self.compiled_model = self._compile_model(mod, params)

    @abc.abstractmethod
    def _build_tvm_model(self, model: Any, model_params: ModelParams):
        """Return the ``(mod, params)`` pair for ``model``."""
        raise NotImplementedError()

    @staticmethod
    def _build_tvm_model_from_torch(
        torch_model: Module, model_params: ModelParams, device: Device
    ) -> Tuple[IRModule, Dict[str, NDArray]]:
        """Trace a torch model and import it with the relay frontend.

        Inputs are named ``input_<i>`` and created from
        ``model_params.input_infos``; model and inputs are moved to the GPU
        when ``device`` is a GPU, to the CPU otherwise.

        Returns:
            Tuple[IRModule, Dict[str, NDArray]]: The TVM module and its
                parameters.
        """
        shape_dict = {
            f"input_{i}": input_size
            for i, input_size in enumerate(model_params.input_sizes)
        }
        inputs = tuple(create_model_inputs_torch(model_params.input_infos))
        if device.type is not DeviceType.GPU:
            inputs = tuple(input_.cpu() for input_ in inputs)
            torch_model.cpu()
        else:
            inputs = tuple(
                input_.to(device.to_torch_format()) for input_ in inputs
            )
            torch_model.to(device.to_torch_format())
        with torch.no_grad():
            # One forward pass is run before tracing.
            _ = torch_model(*inputs)
            model_trace = torch.jit.trace(torch_model, inputs)
            model_trace.eval()
        mod, params = relay.frontend.from_pytorch(
            model_trace, list(shape_dict.items())
        )
        return mod, params

    @staticmethod
    def _build_tvm_model_from_onnx(
        onnx_model_path: str, model_params: ModelParams
    ) -> Tuple[IRModule, Dict[str, NDArray]]:
        """Load an ONNX file and import it with the relay frontend.

        Input names are read from the ONNX file and paired, in order,
        with ``model_params.input_sizes``.

        Returns:
            Tuple[IRModule, Dict[str, NDArray]]: The TVM module and its
                parameters.
        """
        shape_dict = dict(
            zip(get_input_names(onnx_model_path), model_params.input_sizes)
        )
        onnx_model = onnx.load(onnx_model_path)
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
        return mod, params

    @staticmethod
    def _quantize(
        mod: IRModule,
        params: Dict[str, NDArray],
        input_data: TVMCalibrator = None,
    ) -> IRModule:
        """Quantize ``mod`` with ``relay.quantize``.

        With calibration data the ``kl_divergence`` calibration mode is
        used, otherwise a ``global_scale`` of 8.0.
        """
        if input_data is not None:
            with relay.quantize.qconfig(
                calibrate_mode="kl_divergence", weight_scale="max"
            ):
                mod = relay.quantize.quantize(mod, params, dataset=input_data)
        else:
            with relay.quantize.qconfig(
                calibrate_mode="global_scale", global_scale=8.0
            ):
                mod = relay.quantize.quantize(mod, params)
        return mod

    @staticmethod
    def _get_target(device) -> str:
        """Return the TVM target string for ``device``."""
        if device.type is DeviceType.GPU:
            return str(tvm.target.cuda())
        else:
            return "llvm"  # run on CPU

    @staticmethod
    def _tune_tvm_model(
        target: str, mod: IRModule, params: Dict[str, NDArray]
    ) -> str:
        """Tune the model using AutoTVM.

        Returns:
            str: Name of the file the tuning records are logged to. The
                name is relative (no directory component is added here)
                and the caller is responsible for deleting the file.
        """
        # TODO: add support to Ansor
        tuning_records = f"{uuid.uuid4()}_model_records.json"
        # create a TVM runner
        runner = autotvm.LocalRunner(
            number=AUTO_TVM_PARAMS["number"],
            repeat=AUTO_TVM_PARAMS["repeat"],
            timeout=AUTO_TVM_PARAMS["timeout"],
            min_repeat_ms=AUTO_TVM_PARAMS["min_repeat_ms"],
            # TODO modify min_repeat_ms for GPU usage
            enable_cpu_cache_flush=True,
        )
        # begin by extracting the tasks from the model
        tasks = autotvm.task.extract_from_program(
            mod["main"], target=target, params=params
        )

        # Tune the extracted tasks sequentially.
        for task in tasks:
            tuner_obj = XGBTuner(task, loss_type="rank")
            tuner_obj.tune(
                n_trial=min(
                    AUTO_TVM_TUNING_OPTION["trials"], len(task.config_space)
                ),
                early_stopping=AUTO_TVM_TUNING_OPTION["early_stopping"],
                measure_option=autotvm.measure_option(
                    builder=autotvm.LocalBuilder(build_func="default"),
                    runner=runner,
                ),
                callbacks=[
                    autotvm.callback.log_to_file(tuning_records),
                ],
            )
        return tuning_records

    def _compile_model(self, model: Any, params: Any) -> ExecutorFactoryModule:
        """Tune the module and build it with ``relay.build``.

        Args:
            model (Any): The TVM module to build.
            params (Any): The module parameters.

        Returns:
            ExecutorFactoryModule: The built library.
        """
        target = self._get_target(self.device)
        tuning_records = self._tune_tvm_model(target, model, params)
        try:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=3, config={}):
                    lib = relay.build(model, target=target, params=params)
        finally:
            # Remove temporary file created by tvm, also when the build
            # fails; the file may be absent if nothing was logged.
            if os.path.exists(tuning_records):
                os.remove(tuning_records)

        return lib

    @staticmethod
    def _quantize_model(
        model: Any,
        quantization_type: QuantizationType,
        input_tfms: MultiStageTransformation,
        input_data: DataManager,
        params,
    ):
        """Delegate to ``quantize_apache_tvm`` and return its result."""
        return quantize_apache_tvm(
            model, quantization_type, input_tfms, input_data, params
        )
class PyTorchApacheTVMCompiler(ApacheTVMCompiler):
    """Apache TVM compiler taking a PyTorch model as input."""

    def _build_tvm_model(self, model: Any, model_params: ModelParams):
        """Import the torch model into TVM on the compiler's device."""
        return self._build_tvm_model_from_torch(
            torch_model=model,
            model_params=model_params,
            device=self.device,
        )
class ONNXApacheTVMCompiler(ApacheTVMCompiler):
    """Apache TVM compiler taking the path of an ONNX model as input."""

    def _build_tvm_model(self, model: Any, model_params: ModelParams):
        """Remember the ONNX model in ``model_orig`` and import it."""
        self.model_orig = model
        return self._build_tvm_model_from_onnx(
            onnx_model_path=model, model_params=model_params
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compilers/utils.py
================================================
from pathlib import Path
import nebullvm
from nebullvm.core.models import Device, ModelCompiler, DeviceType
def onnxruntime_is_available() -> bool:
    """Return True when the ``onnxruntime`` package can be imported."""
    try:
        import onnxruntime  # noqa F401
    except ImportError:
        return False
    else:
        return True
def tvm_is_available() -> bool:
    """Return True when ``tvm`` and ``tvm.runtime.Module`` can be imported."""
    try:
        import tvm  # noqa F401
        from tvm.runtime import Module  # noqa F401
    except ImportError:
        return False
    else:
        return True
def bladedisc_is_available() -> bool:
    """Return True when the ``torch_blade`` package can be imported."""
    try:
        import torch_blade  # noqa F401
    except ImportError:
        return False
    else:
        return True
def tensorrt_is_available() -> bool:
    """Return True when both ``polygraphy`` and ``tensorrt`` can be imported."""
    try:
        import polygraphy  # noqa F401
        import tensorrt  # noqa F401
    except ImportError:
        return False
    else:
        return True
def torch_tensorrt_is_available() -> bool:
    """Return True when the ``torch_tensorrt`` package can be imported."""
    try:
        import torch_tensorrt  # noqa F401
    except ImportError:
        return False
    else:
        return True
def openvino_is_available() -> bool:
    """Return True when ``openvino.runtime.Core`` can be imported."""
    try:
        from openvino.runtime import Core  # noqa F401

        return True
    except ImportError:
        return False
def deepsparse_is_available() -> bool:
    """Return True when the ``deepsparse`` package can be imported."""
    try:
        import deepsparse  # noqa F401

        return True
    except ImportError:
        return False
def intel_neural_compressor_is_available() -> bool:
    """Return True when the ``neural_compressor`` package can be imported."""
    try:
        import neural_compressor  # noqa F401

        return True
    except ImportError:
        return False
def torch_xla_is_available():
    """Return True when the ``torch_xla`` package can be imported."""
    try:
        import torch_xla  # noqa F401
    except ImportError:
        return False
    else:
        return True
def torch_neuron_is_available():
    """Return True when the ``torch_neuron`` package can be imported."""
    try:
        import torch_neuron  # noqa F401
    except ImportError:
        return False
    else:
        return True
def get_faster_transformer_repo_path() -> Path:
    """Return the ``FasterTransformer`` path inside the nebullvm package dir."""
    package_dir = Path(nebullvm.__file__).parent
    return package_dir / "FasterTransformer"
def faster_transformer_is_available() -> bool:
    """Return True when the FasterTransformer build marker file exists.

    The marker ``FasterTransformer_build_success`` is looked up next to the
    FasterTransformer repository directory.
    """
    repo_path = get_faster_transformer_repo_path()
    build_marker = repo_path.parent / "FasterTransformer_build_success"
    return build_marker.exists()
def select_compilers_from_hardware_onnx(device: Device):
    """List the compilers usable for ONNX models on ``device``.

    Returns an empty list when ONNX is not available. Otherwise each
    compiler is included when its backing package is importable; TensorRT
    is only considered on GPU devices and OpenVINO only on CPU devices.
    """
    from nebullvm.optional_modules.utils import onnx_is_available

    if not onnx_is_available():
        return []

    # (compiler, availability check) pairs, evaluated lazily and in order so
    # the device-specific probes only run on the matching device type.
    candidates = (
        (ModelCompiler.ONNX_RUNTIME, onnxruntime_is_available),
        (ModelCompiler.APACHE_TVM, tvm_is_available),
        (
            ModelCompiler.TENSOR_RT,
            lambda: device.type is DeviceType.GPU and tensorrt_is_available(),
        ),
        (
            ModelCompiler.OPENVINO,
            lambda: device.type is DeviceType.CPU and openvino_is_available(),
        ),
    )
    return [compiler for compiler, is_usable in candidates if is_usable()]
def select_compilers_from_hardware_torch(device: Device):
    """List the compilers usable for PyTorch models on ``device``.

    Args:
        device: Device the model will run on; its ``type`` selects the
            CPU-only or GPU-only compilers.

    Returns:
        A list of ``ModelCompiler`` members. It is empty when torch is not
        available; otherwise it always contains TorchScript plus every
        compiler whose backing package can be imported.
    """
    from nebullvm.optional_modules.utils import torch_is_available

    compilers = []
    if not torch_is_available():
        return compilers

    compilers.append(ModelCompiler.TORCHSCRIPT)
    if tvm_is_available():
        compilers.append(ModelCompiler.APACHE_TVM)
    if bladedisc_is_available():
        compilers.append(ModelCompiler.BLADEDISC)
    if torch_neuron_is_available():
        compilers.append(ModelCompiler.TORCH_NEURON)

    if device.type is DeviceType.CPU:
        if deepsparse_is_available():
            compilers.append(ModelCompiler.DEEPSPARSE)
        if intel_neural_compressor_is_available():
            compilers.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)
    elif device.type is DeviceType.GPU:
        # The probe must be *called*: the bare function object is always
        # truthy, which selected TensorRT even without torch_tensorrt.
        if torch_tensorrt_is_available():
            compilers.append(ModelCompiler.TENSOR_RT)
    return compilers
def select_compilers_from_hardware_tensorflow():
    """List the compilers usable for TensorFlow models.

    Returns XLA and TFLite when TensorFlow is available, otherwise an
    empty list.
    """
    from nebullvm.optional_modules.utils import tensorflow_is_available

    if not tensorflow_is_available():
        return []
    return [ModelCompiler.XLA, ModelCompiler.TFLITE]
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/base.py
================================================
from abc import ABC, abstractmethod
from typing import Any, Optional, Dict, Callable, Tuple
import yaml
from nebullvm.operations.base import Operation
from nebullvm.tools.data import DataManager
class Compressor(Operation, ABC):
    """Base operation for model compressors configured through YAML.

    Subclasses supply the default configuration, the name of their section
    in the user configuration file and the ``execute`` implementation.
    """

    def __init__(self, config_file: str = None):
        """Build the compressor configuration.

        Args:
            config_file: Optional path to a YAML file. The section named
                after ``config_key`` overrides the default configuration.
        """
        super().__init__()
        self._config = self._read_config(config_file)
        self.compressed_model = None
        self.new_metric_ths = None

    @abstractmethod
    def execute(
        self,
        model: Any,
        train_input_data: DataManager,
        eval_input_data: DataManager,
        metric_drop_ths: float,
        metric: Callable,
    ) -> Tuple[Any, Optional[float]]:
        """Run the compression; must be implemented by subclasses."""
        raise NotImplementedError()

    def _read_config(self, config_file: Optional[str]) -> Dict:
        """Return the default config updated with the user's YAML section.

        Args:
            config_file: Path to the YAML file, or None to use defaults only.

        Returns:
            The default configuration, shallow-updated with the content of
            the ``config_key`` section when the file provides one.
        """
        config = self._get_default_config()
        if config_file is not None:
            # CLoader only exists when PyYAML was built with LibYAML; fall
            # back to the pure-Python loader instead of raising
            # AttributeError.
            # NOTE(review): both are full loaders and can construct
            # arbitrary Python objects, so the file must come from a
            # trusted source. Consider a safe loader if tags are not needed.
            loader = getattr(yaml, "CLoader", yaml.Loader)
            with open(config_file, "r") as f:
                data = yaml.load(f, Loader=loader)
            # An empty document parses to None, and a section declared
            # without content is None as well: treat both as "no overrides".
            section = (data or {}).get(self.config_key) or {}
            config.update(section)
        return config

    @staticmethod
    @abstractmethod
    def _get_default_config() -> Dict:
        """Return the default configuration dictionary of the compressor."""
        raise NotImplementedError

    @property
    @abstractmethod
    def config_key(self) -> str:
        """Name of this compressor's section in the YAML config file."""
        raise NotImplementedError()

    def get_result(self) -> Tuple[Any, Optional[float]]:
        """Return the compressed model and the updated metric threshold."""
        return self.compressed_model, self.new_metric_ths
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/intel.py
================================================
import copy
import re
from abc import ABC, abstractmethod
from pathlib import Path
from tempfile import mkdtemp
from typing import Dict, Any, Callable
import numpy as np
import yaml
from nebullvm.operations.optimizations.compressors.base import Compressor
from nebullvm.optional_modules.neural_compressor import Pruning
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import DataLoader, Dataset, Module
from nebullvm.tools.data import DataManager
def _get_model_framework(model: Any) -> str:
if isinstance(model, Module):
return "torch"
elif isinstance(model, tf.Module) and model is not None:
return "tensorflow"
else:
return "numpy"
class IntelPruningCompressor(Compressor, ABC):
    """Pruning compressor driven by Intel Neural Compressor's ``Pruning``.

    Subclasses provide the framework-specific dataloader and the error
    computation between the original and the pruned model.
    """

    def __init__(self, config_file: str = None):
        """Read the configuration and create a temporary working directory."""
        super().__init__(config_file)
        self._temp_dir = mkdtemp()

    @property
    def config_key(self) -> str:
        """Section of the user config file read by this compressor."""
        return "intel_pruning"

    @staticmethod
    def _get_default_config() -> Dict:
        """Return the default pruning configuration."""
        # see https://github.com/intel/neural-compressor/blob/master/neural_compressor/conf/config.py # noqa
        # for further details
        train_settings = {
            "optimizer": {
                "SGD": {"learning_rate": 0.001},
            },
            "criterion": {
                "CrossEntropyLoss": {
                    "reduction": "mean",
                    "from_logits": False,
                },
            },
            "epoch": 10,
            "start_epoch": 0,
            "end_epoch": 10,
            "iteration": 30,
            "execution_mode": "eager",  # either eager or graph
            # "hostfile": None, # str for multinode training support
        }
        weight_compression = {
            "initial_sparsity": 0.0,
            "target_sparsity": 0.60,
            "start_epoch": 0,
            "end_epoch": 8,
            "pruners": [
                {
                    "start_epoch": 0,
                    "end_epoch": 8,
                    "prune_type": "basic_magnitude",
                },
            ],
        }
        return {
            "train": train_settings,
            "approach": {"weight_compression": weight_compression},
        }

    def _prepare_pruning_config(self, model: Any):
        """Write the YAML config consumed by ``Pruning`` and return its path.

        The file is created as ``temp.yaml`` inside the temporary directory
        and then patched so the pruner entry carries the ``!Pruner`` tag.
        """
        pruning_config = copy.deepcopy(self._config)
        framework = _get_model_framework(model)
        framework_name = "pytorch" if framework == "torch" else framework
        config = {
            "model": {
                "name": model.__class__.__name__,
                "framework": framework_name,
            },
            "evaluation": {"accuracy": {"metric": {"topk": 1}}},
            "device": "cpu",
            "tuning": {
                "random_seed": 1978,
                "tensorboard": False,
                "workspace": {"path": self._temp_dir},
            },
            "pruning": pruning_config,
        }
        path_file = Path(self._temp_dir) / "temp.yaml"
        with open(path_file, "w") as yaml_out:
            yaml.dump(config, yaml_out)
        # Re-open the dumped file and rewrite it from the start with the
        # tagged pruner entry; the patched text is never shorter than the
        # original, so no stale tail is left behind.
        with open(path_file, "r+") as yaml_file:
            dumped = yaml_file.read()
            patched = re.sub(
                "pruners:\n - end_epoch:",
                "pruners:\n - !Pruner\n end_epoch:",
                dumped,
            )
            yaml_file.seek(0)
            yaml_file.write(patched)
        return path_file

    def execute(
        self,
        model: Any,
        train_input_data: DataManager,
        eval_input_data: DataManager,
        metric_drop_ths: float,
        metric: Callable,
    ):
        """Prune ``model`` and keep the result if the error is acceptable.

        The pruned model is stored in ``compressed_model``; it is reset to
        None when the measured error exceeds ``metric_drop_ths``, otherwise
        the remaining margin is stored in ``new_metric_ths``.
        """
        config_file_pr = self._prepare_pruning_config(model)
        pruner = Pruning(str(config_file_pr))
        pruner.model = model
        pruner.train_dataloader = self._get_dataloader(train_input_data)
        pruner.eval_dataloader = self._get_dataloader(eval_input_data)
        self.compressed_model = pruner.fit()
        if self.compressed_model is None:
            return
        error = self._compute_error(
            model, self.compressed_model, eval_input_data, metric
        )
        if error > metric_drop_ths:
            self.compressed_model = None
        else:
            self.new_metric_ths = metric_drop_ths - error

    @abstractmethod
    def _compute_error(
        self,
        model: Any,
        compressed_model: Any,
        eval_input_data: DataManager,
        metric: Callable,
    ):
        """Compute the error of the compressed model; subclass hook."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def _get_dataloader(input_data: DataManager):
        """Wrap ``input_data`` into a framework dataloader; subclass hook."""
        raise NotImplementedError
class INCDataset(Dataset):
    """Item-level dataset view over batched ``(inputs, label)`` data.

    The batch size is read from the first input of the first batch and is
    used to map a flat item index onto a (batch, position) pair.
    """

    def __init__(self, input_data: DataManager):
        self.data = input_data
        self.batch_size = input_data[0][0][0].shape[0]

    def __len__(self):
        """Total number of items, summed over the batches."""
        total_items = 0
        for batch_inputs, _ in self.data:
            total_items += batch_inputs[0].shape[0]
        return total_items

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for the item at flat index ``idx``."""
        batch_idx = int(idx / self.batch_size)
        item_idx = idx % self.batch_size
        batch = self.data[batch_idx]
        item_inputs = tuple([tensor[item_idx] for tensor in batch[0]])
        return item_inputs, batch[1][item_idx]
class TorchIntelPruningCompressor(IntelPruningCompressor):
    """Intel pruning compressor specialised for torch modules."""

    @staticmethod
    def _get_dataloader(input_data: DataManager):
        """Wrap ``input_data`` in a ``DataLoader`` over an ``INCDataset``.

        The loader batch size is taken from the first input of the first
        batch of ``input_data``.
        """
        batch_size = input_data[0][0][0].shape[0]
        return DataLoader(INCDataset(input_data), batch_size)

    def _compute_error(
        self,
        model: Module,
        compressed_model: Module,
        eval_input_data: DataManager,
        metric: Callable,
    ):
        """Average ``metric`` between both models over the evaluation data.

        Returns ``np.inf`` when there is no evaluation data.
        """
        if len(eval_input_data) == 0:
            return np.inf
        accumulated = 0
        for inputs, y in eval_input_data:
            reference_pred = model(*inputs)
            compressed_pred = compressed_model(*inputs)
            accumulated += metric(reference_pred, compressed_pred, y)
        return accumulated / len(eval_input_data)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/scripts/__init__.py
================================================
import json
import logging
import os.path
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Tuple, List, Any, Dict
import torch
from sparseml.onnx.optim import ModelAnalyzer, pruning_loss_sens_magnitude
from sparseml.pytorch.optim import (
ScheduledModifierManager,
)
from sparseml.pytorch.sparsification import (
EpochRangeModifier,
GMPruningModifier,
)
from sparseml.pytorch.utils import ModuleExporter
from sparsify.blueprints.utils import (
default_epochs_distribution,
PruningModelEvaluator,
default_pruning_settings,
)
from sparsify.schemas import ProjectModelAnalysisSchema
from torch.nn import CrossEntropyLoss, MSELoss
from torch.optim import SGD
from tqdm.auto import tqdm
# Loss functions that can be selected by name; PruningTrainer looks the
# configured ``loss_fn`` string up in this mapping.
CRITERION_FNS = {
    "CrossEntropy": CrossEntropyLoss(),
    "MSE": MSELoss(),
}
# Configure the logging output format at import time, then create the
# logger used by this script and let INFO messages through.
logging.basicConfig(
    format=" %(asctime)s [%(levelname)s] %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
)
logger = logging.getLogger("nebullvm_logger")
logger.setLevel(logging.INFO)
def _export_model_onnx(
    model: torch.nn.Module,
    save_path: Path,
    model_name: str,
    input_batch: Tuple,
):
    """Export ``model`` to ONNX in ``save_path`` and return the file path.

    When CUDA is available the model and the sample inputs are moved to the
    GPU first. The example outputs handed to the exporter are computed
    without gradient tracking.
    """
    sample_inputs = input_batch
    if torch.cuda.is_available():
        sample_inputs = tuple(tensor.cuda() for tensor in input_batch)
        model.cuda()

    exporter = ModuleExporter(model, output_dir=save_path)
    with torch.no_grad():
        sample_outputs = model(*sample_inputs)
    exporter.export_onnx(
        sample_inputs, name=model_name, example_outputs=sample_outputs
    )
    return save_path / model_name
class RecipeBuilder:
    """Build a SparseML pruning recipe for an exported ONNX model."""

    def __init__(self, model_path):
        """Store the path of the ONNX model the recipe is built for."""
        self.model_path = model_path
        # Filled in by ``_make_analysis`` / ``_compute_loss_sensitivity``.
        self.analysis = None
        self.final_analysis = None

    def _make_analysis(self):
        """Analyze the model and store the result in ``self.analysis``."""
        analyzer = ModelAnalyzer(self.model_path)
        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())

    def _compute_loss_sensitivity(self):
        """Evaluate pruning sensitivities into ``self.final_analysis``.

        Requires ``_make_analysis`` to have been called first.
        """
        loss_analysis = pruning_loss_sens_magnitude(self.model_path)
        results_model = loss_analysis.results_model
        model_measurements = {
            "baseline_measurement_key": (
                str(results_model.baseline_measurement_key)
            ),
            "measurements": {
                str(key): val for key, val in results_model.averages.items()
            },
        }
        ops = [
            {
                "id": res.id_,
                "name": res.name,
                "index": res.index,
                "baseline_measurement_key": (
                    str(res.baseline_measurement_key)
                ),
                "measurements": {
                    str(key): val for key, val in res.averages.items()
                },
            }
            for res in loss_analysis.results
        ]
        loss = {
            "baseline": {},
            "pruning": {"model": model_measurements, "ops": ops},
        }
        evaluator = PruningModelEvaluator(
            self.analysis,
            None,
            loss,
        )
        evaluator.eval_baseline(default_pruning_settings().sparsity)
        evaluator.eval_pruning(default_pruning_settings())
        self.final_analysis = evaluator.to_dict_values()

    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):
        """Return a ``ScheduledModifierManager`` implementing the recipe.

        Args:
            epochs_pruning_window: Optional mapping overriding fields of the
                default epochs distribution.
            training_epochs: Number of epochs the default distribution is
                computed for.
        """
        self._make_analysis()
        self._compute_loss_sensitivity()

        epochs = default_epochs_distribution(training_epochs)
        if epochs_pruning_window is not None:
            # TODO: set custom parameters
            epochs_dict = epochs._asdict()
            epochs_dict.update(epochs_pruning_window)
            epochs = epochs.__class__(**epochs_dict)

        mods = [
            EpochRangeModifier(
                start_epoch=epochs.start_epoch,
                end_epoch=epochs.end_epoch,
            )
        ]

        node_weight_name_lookup = {
            node["id"]: node["weight_name"]
            for node in self.analysis["nodes"]
            if node["prunable"]
        }

        # Group the weights by target sparsity so that one modifier is
        # created per distinct sparsity level.
        sparsity_to_params = {}
        for node in self.final_analysis[0]:
            sparsity = node["sparsity"]
            if sparsity is None:
                continue
            weight_name = node_weight_name_lookup[node["node_id"]]
            sparsity_to_params.setdefault(sparsity, []).append(weight_name)

        for sparsity, params in sparsity_to_params.items():
            mods.append(
                GMPruningModifier(
                    init_sparsity=0.05,
                    final_sparsity=sparsity,
                    start_epoch=epochs.pruning_start_epoch,
                    end_epoch=epochs.pruning_end_epoch,
                    update_frequency=epochs.pruning_update_frequency,
                    params=params,
                )
            )

        return ScheduledModifierManager(mods)
class PruningTrainer:
    """Train a model while a SparseML manager applies its pruning schedule."""

    def __init__(self, model, bs):
        """Store the model and the batch size.

        Args:
            model: The torch module to be pruned.
            bs: Batch size of the training data.
        """
        self.data_loader = None
        self.optimizer = None
        self.model = model
        self.batch_size = bs
        # Set by ``train`` / ``_setup_training``; declared here so the
        # attributes exist on every instance.
        self.train_data_loader = None
        self.val_data_loader = None
        self.criterion = None
        self.device = None

    def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9):
        """Select the device, the loss function and create the optimizer.

        Args:
            loss_fn: Key of ``CRITERION_FNS``; None or an unknown key falls
                back to cross entropy.
            lr: Learning rate of the SGD optimizer.
            momentum: Momentum of the SGD optimizer.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        if loss_fn is None:
            loss_fn = CrossEntropyLoss()
        else:
            loss_fn = CRITERION_FNS.get(loss_fn, CrossEntropyLoss())
        self.criterion = loss_fn
        self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum)

    def _run_model_one_epoch(self, train=False):
        """Run one pass over the data and return the average loss.

        Args:
            train: When True the training data is used and the weights are
                updated; otherwise the validation data is only evaluated.
        """
        if train:
            self.model.train()
            data_loader = self.train_data_loader
        else:
            self.model.eval()
            data_loader = self.val_data_loader

        running_loss = 0.0
        for inputs, labels in tqdm(data_loader, total=len(data_loader)):
            inputs = tuple(t.to(self.device) for t in inputs)
            if not isinstance(labels, torch.Tensor):
                labels = torch.tensor(labels)
            if len(labels.shape) == 0:
                # The criterion is fed batched targets: promote scalars.
                labels = labels.unsqueeze(0)
            labels = labels.to(self.device)

            if train:
                self.optimizer.zero_grad()

            # Track gradients only while training: validation needs no
            # autograd graph, which saves memory and time.
            with torch.set_grad_enabled(train):
                outputs = self.model(*inputs)
                loss = self.criterion(outputs, labels)

            if train:
                loss.backward()
                self.optimizer.step()

            running_loss += loss.item()

        # The small epsilon avoids a division by zero on empty loaders.
        return running_loss / (len(data_loader) + 1e-5)

    def train(
        self, manager, train_data_loader, val_data_loader, **train_kwargs
    ):
        """Run the pruning schedule of ``manager`` and return the model.

        Args:
            manager: SparseML manager wrapping the optimizer and defining
                the epoch range.
            train_data_loader: Batches used for training.
            val_data_loader: Batches used for validation.
            **train_kwargs: Forwarded to ``_setup_training``.
        """
        self.train_data_loader = train_data_loader
        self.val_data_loader = val_data_loader
        self._setup_training(**train_kwargs)
        self.optimizer = manager.modify(
            self.model,
            self.optimizer,
            steps_per_epoch=len(self.train_data_loader),
        )
        self.model.train()

        # Run model pruning
        epoch = manager.min_epochs
        while epoch < manager.max_epochs:
            epoch_name = "{}/{}".format(epoch + 1, manager.max_epochs)

            # run training loop
            logger.info("Running Training Epoch {}".format(epoch_name))
            train_loss = self._run_model_one_epoch(train=True)
            logger.info(
                ("Training Epoch: {}\nTraining Loss: {}\n").format(
                    epoch_name, train_loss
                )
            )

            # run validation loop
            logger.info("Running Validation Epoch {}".format(epoch_name))
            val_loss = self._run_model_one_epoch()
            logger.info(
                "Validation Epoch: {}\nVal Loss: {}\n".format(
                    epoch_name, val_loss
                )
            )

            epoch += 1

        manager.finalize(self.model)
        return self.model
def _load_config(config_file: str):
with open(config_file, "r") as f:
config = json.load(f)
return config
def _load_data(data_dir: str):
data_dir = Path(data_dir)
return [torch.load(input_path) for input_path in data_dir.glob("*.pt")]
def _load_model(model_file: str):
if os.path.isdir(model_file):
path = Path(model_file)
module_file = path / "module.py"
with open(module_file, "r") as f:
module_str = f.read()
exec(module_str, globals())
model = eval("NebullvmFxModule")()
model.load_state_dict(torch.load(path / "state_dict.pt"))
else:
model = torch.load(model_file)
return model
def _train_model(
    model: torch.nn.Module,
    train_data: List[Tuple[Tuple, Any]],
    eval_data: List[Tuple[Tuple, Any]],
    epochs_pruning_window: Dict = None,
    training_epochs: int = 10,
    lr: float = 1e-3,
    momentum: float = 0.9,
    loss_fn: str = "CrossEntropy",
):
    """Build a pruning recipe for ``model`` and train it accordingly.

    Args:
        model: Model to prune.
        train_data: Training batches as ``(inputs, label)`` tuples; the
            first batch is also used as sample input for the ONNX export.
        eval_data: Validation batches, same layout as ``train_data``.
        epochs_pruning_window: Optional overrides of the default epochs
            distribution used by the recipe.
        training_epochs: Number of epochs the recipe is built for.
        lr: Learning rate of the optimizer.
        momentum: Momentum of the optimizer.
        loss_fn: Name of the loss function, a key of ``CRITERION_FNS``.

    Returns:
        The pruned model.
    """
    batch_size = train_data[0][0][0].shape[0]
    # The ONNX export is only needed while the recipe is being built, so it
    # lives in a temporary directory.
    with TemporaryDirectory() as tmp_dir:
        onnx_path = _export_model_onnx(
            model, Path(tmp_dir), "model.onnx", train_data[0][0]
        )
        onnx_path = onnx_path.as_posix()

        recipe = RecipeBuilder(onnx_path)
        # TODO: implement custom parameters support
        manager = recipe.build_recipe(
            epochs_pruning_window=epochs_pruning_window,
            training_epochs=training_epochs,
        )
    trainer = PruningTrainer(model, batch_size)
    # Forward the configured loss as well: it used to be dropped here, so
    # the trainer always fell back to cross entropy.
    pruned_model = trainer.train(
        manager,
        train_data,
        eval_data,
        loss_fn=loss_fn,
        lr=lr,
        momentum=momentum,
    )
    return pruned_model
def _save_model(model: torch.nn.Module, path: str):
if path.endswith(".pt"):
torch.save(model, path)
else:
torch.save(model.state_dict(), Path(path) / "pruned_state_dict.pt")
def main(
    model_file: str,
    train_data_dir: str,
    eval_data_dir: str,
    config_file: str,
    out_file: str,
):
    """Load model, data and config, prune the model and save the result.

    The entries of the JSON config are passed to ``_train_model`` as
    keyword arguments.
    """
    training_options = _load_config(config_file)
    network = _load_model(model_file)
    datasets = [_load_data(folder) for folder in (train_data_dir, eval_data_dir)]
    _save_model(_train_model(network, *datasets, **training_options), out_file)
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command line interface: every option is a plain string argument.
    cli = ArgumentParser()
    for flag, description in (
        ("--model", "The model to be pruned."),
        ("--train_dir", "The directory contained the pickled training data."),
        ("--eval_dir", "The directory contained the pickled test data."),
        ("--config", "The config file."),
        ("--pruned_model", "Path where storing the pruned model."),
    ):
        cli.add_argument(flag, help=description)
    options = cli.parse_args()
    main(
        model_file=options.model,
        train_data_dir=options.train_dir,
        eval_data_dir=options.eval_dir,
        config_file=options.config,
        out_file=options.pruned_model,
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/scripts/neural_magic_training.py
================================================
import json
import logging
import os.path
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Tuple, List, Any, Dict
import torch
from sparseml.onnx.optim import ModelAnalyzer, pruning_loss_sens_magnitude
from sparseml.pytorch.optim import (
ScheduledModifierManager,
)
from sparseml.pytorch.sparsification import (
EpochRangeModifier,
GMPruningModifier,
)
from sparseml.pytorch.utils import ModuleExporter
from sparsify.blueprints.utils import (
default_epochs_distribution,
PruningModelEvaluator,
default_pruning_settings,
)
from sparsify.schemas import ProjectModelAnalysisSchema
from torch.nn import CrossEntropyLoss, MSELoss
from torch.optim import SGD
from tqdm.auto import tqdm
# Loss functions that can be selected by name; PruningTrainer looks the
# configured ``loss_fn`` string up in this mapping.
CRITERION_FNS = {
    "CrossEntropy": CrossEntropyLoss(),
    "MSE": MSELoss(),
}
# Configure the logging output format at import time, then create the
# logger used by this script and let INFO messages through.
logging.basicConfig(
    format=" %(asctime)s [%(levelname)s] %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
)
logger = logging.getLogger("nebullvm_logger")
logger.setLevel(logging.INFO)
def _export_model_onnx(
    model: torch.nn.Module,
    save_path: Path,
    model_name: str,
    input_batch: Tuple,
):
    """Export ``model`` to ONNX in ``save_path`` and return the file path.

    When CUDA is available the model and the sample inputs are moved to the
    GPU first. The example outputs handed to the exporter are computed
    without gradient tracking.
    """
    sample_inputs = input_batch
    if torch.cuda.is_available():
        sample_inputs = tuple(tensor.cuda() for tensor in input_batch)
        model.cuda()

    exporter = ModuleExporter(model, output_dir=save_path)
    with torch.no_grad():
        sample_outputs = model(*sample_inputs)
    exporter.export_onnx(
        sample_inputs, name=model_name, example_outputs=sample_outputs
    )
    return save_path / model_name
class RecipeBuilder:
    """Build a SparseML pruning recipe for an exported ONNX model."""

    def __init__(self, model_path):
        """Store the path of the ONNX model the recipe is built for."""
        self.model_path = model_path
        # Filled in by ``_make_analysis`` / ``_compute_loss_sensitivity``.
        self.analysis = None
        self.final_analysis = None

    def _make_analysis(self):
        """Analyze the model and store the result in ``self.analysis``."""
        analyzer = ModelAnalyzer(self.model_path)
        self.analysis = ProjectModelAnalysisSchema().load(analyzer.dict())

    def _compute_loss_sensitivity(self):
        """Evaluate pruning sensitivities into ``self.final_analysis``.

        Requires ``_make_analysis`` to have been called first.
        """
        loss_analysis = pruning_loss_sens_magnitude(self.model_path)
        results_model = loss_analysis.results_model
        model_measurements = {
            "baseline_measurement_key": (
                str(results_model.baseline_measurement_key)
            ),
            "measurements": {
                str(key): val for key, val in results_model.averages.items()
            },
        }
        ops = [
            {
                "id": res.id_,
                "name": res.name,
                "index": res.index,
                "baseline_measurement_key": (
                    str(res.baseline_measurement_key)
                ),
                "measurements": {
                    str(key): val for key, val in res.averages.items()
                },
            }
            for res in loss_analysis.results
        ]
        loss = {
            "baseline": {},
            "pruning": {"model": model_measurements, "ops": ops},
        }
        evaluator = PruningModelEvaluator(
            self.analysis,
            None,
            loss,
        )
        evaluator.eval_baseline(default_pruning_settings().sparsity)
        evaluator.eval_pruning(default_pruning_settings())
        self.final_analysis = evaluator.to_dict_values()

    def build_recipe(self, epochs_pruning_window=None, training_epochs=10):
        """Return a ``ScheduledModifierManager`` implementing the recipe.

        Args:
            epochs_pruning_window: Optional mapping overriding fields of the
                default epochs distribution.
            training_epochs: Number of epochs the default distribution is
                computed for.
        """
        self._make_analysis()
        self._compute_loss_sensitivity()

        epochs = default_epochs_distribution(training_epochs)
        if epochs_pruning_window is not None:
            # TODO: set custom parameters
            epochs_dict = epochs._asdict()
            epochs_dict.update(epochs_pruning_window)
            epochs = epochs.__class__(**epochs_dict)

        mods = [
            EpochRangeModifier(
                start_epoch=epochs.start_epoch,
                end_epoch=epochs.end_epoch,
            )
        ]

        node_weight_name_lookup = {
            node["id"]: node["weight_name"]
            for node in self.analysis["nodes"]
            if node["prunable"]
        }

        # Group the weights by target sparsity so that one modifier is
        # created per distinct sparsity level.
        sparsity_to_params = {}
        for node in self.final_analysis[0]:
            sparsity = node["sparsity"]
            if sparsity is None:
                continue
            weight_name = node_weight_name_lookup[node["node_id"]]
            sparsity_to_params.setdefault(sparsity, []).append(weight_name)

        for sparsity, params in sparsity_to_params.items():
            mods.append(
                GMPruningModifier(
                    init_sparsity=0.05,
                    final_sparsity=sparsity,
                    start_epoch=epochs.pruning_start_epoch,
                    end_epoch=epochs.pruning_end_epoch,
                    update_frequency=epochs.pruning_update_frequency,
                    params=params,
                )
            )

        return ScheduledModifierManager(mods)
class PruningTrainer:
    """Train a model while a SparseML manager applies its pruning schedule."""

    def __init__(self, model, bs):
        """Store the model and the batch size.

        Args:
            model: The torch module to be pruned.
            bs: Batch size of the training data.
        """
        self.data_loader = None
        self.optimizer = None
        self.model = model
        self.batch_size = bs
        # Set by ``train`` / ``_setup_training``; declared here so the
        # attributes exist on every instance.
        self.train_data_loader = None
        self.val_data_loader = None
        self.criterion = None
        self.device = None

    def _setup_training(self, loss_fn=None, lr=1e-3, momentum=0.9):
        """Select the device, the loss function and create the optimizer.

        Args:
            loss_fn: Key of ``CRITERION_FNS``; None or an unknown key falls
                back to cross entropy.
            lr: Learning rate of the SGD optimizer.
            momentum: Momentum of the SGD optimizer.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        if loss_fn is None:
            loss_fn = CrossEntropyLoss()
        else:
            loss_fn = CRITERION_FNS.get(loss_fn, CrossEntropyLoss())
        self.criterion = loss_fn
        self.optimizer = SGD(self.model.parameters(), lr=lr, momentum=momentum)

    def _run_model_one_epoch(self, train=False):
        """Run one pass over the data and return the average loss.

        Args:
            train: When True the training data is used and the weights are
                updated; otherwise the validation data is only evaluated.
        """
        if train:
            self.model.train()
            data_loader = self.train_data_loader
        else:
            self.model.eval()
            data_loader = self.val_data_loader

        running_loss = 0.0
        for inputs, labels in tqdm(data_loader, total=len(data_loader)):
            inputs = tuple(t.to(self.device) for t in inputs)
            if not isinstance(labels, torch.Tensor):
                labels = torch.tensor(labels)
            if len(labels.shape) == 0:
                # The criterion is fed batched targets: promote scalars.
                labels = labels.unsqueeze(0)
            labels = labels.to(self.device)

            if train:
                self.optimizer.zero_grad()

            # Track gradients only while training: validation needs no
            # autograd graph, which saves memory and time.
            with torch.set_grad_enabled(train):
                outputs = self.model(*inputs)
                loss = self.criterion(outputs, labels)

            if train:
                loss.backward()
                self.optimizer.step()

            running_loss += loss.item()

        # The small epsilon avoids a division by zero on empty loaders.
        return running_loss / (len(data_loader) + 1e-5)

    def train(
        self, manager, train_data_loader, val_data_loader, **train_kwargs
    ):
        """Run the pruning schedule of ``manager`` and return the model.

        Args:
            manager: SparseML manager wrapping the optimizer and defining
                the epoch range.
            train_data_loader: Batches used for training.
            val_data_loader: Batches used for validation.
            **train_kwargs: Forwarded to ``_setup_training``.
        """
        self.train_data_loader = train_data_loader
        self.val_data_loader = val_data_loader
        self._setup_training(**train_kwargs)
        self.optimizer = manager.modify(
            self.model,
            self.optimizer,
            steps_per_epoch=len(self.train_data_loader),
        )
        self.model.train()

        # Run model pruning
        epoch = manager.min_epochs
        while epoch < manager.max_epochs:
            epoch_name = "{}/{}".format(epoch + 1, manager.max_epochs)

            # run training loop
            logger.info("Running Training Epoch {}".format(epoch_name))
            train_loss = self._run_model_one_epoch(train=True)
            logger.info(
                ("Training Epoch: {}\nTraining Loss: {}\n").format(
                    epoch_name, train_loss
                )
            )

            # run validation loop
            logger.info("Running Validation Epoch {}".format(epoch_name))
            val_loss = self._run_model_one_epoch()
            logger.info(
                "Validation Epoch: {}\nVal Loss: {}\n".format(
                    epoch_name, val_loss
                )
            )

            epoch += 1

        manager.finalize(self.model)
        return self.model
def _load_config(config_file: str):
with open(config_file, "r") as f:
config = json.load(f)
return config
def _load_data(data_dir: str):
data_dir = Path(data_dir)
return [torch.load(input_path) for input_path in data_dir.glob("*.pt")]
def _load_model(model_file: str):
if os.path.isdir(model_file):
path = Path(model_file)
module_file = path / "module.py"
with open(module_file, "r") as f:
module_str = f.read()
exec(module_str, globals())
model = eval("NebullvmFxModule")()
model.load_state_dict(torch.load(path / "state_dict.pt"))
else:
model = torch.load(model_file)
return model
def _train_model(
    model: torch.nn.Module,
    train_data: List[Tuple[Tuple, Any]],
    eval_data: List[Tuple[Tuple, Any]],
    epochs_pruning_window: Dict = None,
    training_epochs: int = 10,
    lr: float = 1e-3,
    momentum: float = 0.9,
    loss_fn: str = "CrossEntropy",
):
    """Build a pruning recipe for ``model`` and train it accordingly.

    Args:
        model: Model to prune.
        train_data: Training batches as ``(inputs, label)`` tuples; the
            first batch is also used as sample input for the ONNX export.
        eval_data: Validation batches, same layout as ``train_data``.
        epochs_pruning_window: Optional overrides of the default epochs
            distribution used by the recipe.
        training_epochs: Number of epochs the recipe is built for.
        lr: Learning rate of the optimizer.
        momentum: Momentum of the optimizer.
        loss_fn: Name of the loss function, a key of ``CRITERION_FNS``.

    Returns:
        The pruned model.
    """
    batch_size = train_data[0][0][0].shape[0]
    # The ONNX export is only needed while the recipe is being built, so it
    # lives in a temporary directory.
    with TemporaryDirectory() as tmp_dir:
        onnx_path = _export_model_onnx(
            model, Path(tmp_dir), "model.onnx", train_data[0][0]
        )
        onnx_path = onnx_path.as_posix()

        recipe = RecipeBuilder(onnx_path)
        # TODO: implement custom parameters support
        manager = recipe.build_recipe(
            epochs_pruning_window=epochs_pruning_window,
            training_epochs=training_epochs,
        )
    trainer = PruningTrainer(model, batch_size)
    # Forward the configured loss as well: it used to be dropped here, so
    # the trainer always fell back to cross entropy.
    pruned_model = trainer.train(
        manager,
        train_data,
        eval_data,
        loss_fn=loss_fn,
        lr=lr,
        momentum=momentum,
    )
    return pruned_model
def _save_model(model: torch.nn.Module, path: str):
if path.endswith(".pt"):
torch.save(model, path)
else:
torch.save(model.state_dict(), Path(path) / "pruned_state_dict.pt")
def main(
    model_file: str,
    train_data_dir: str,
    eval_data_dir: str,
    config_file: str,
    out_file: str,
):
    """Load model, data and config, prune the model and save the result.

    The entries of the JSON config are passed to ``_train_model`` as
    keyword arguments.
    """
    training_options = _load_config(config_file)
    network = _load_model(model_file)
    datasets = [_load_data(folder) for folder in (train_data_dir, eval_data_dir)]
    _save_model(_train_model(network, *datasets, **training_options), out_file)
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command line interface: every option is a plain string argument.
    cli = ArgumentParser()
    for flag, description in (
        ("--model", "The model to be pruned."),
        ("--train_dir", "The directory contained the pickled training data."),
        ("--eval_dir", "The directory contained the pickled test data."),
        ("--config", "The config file."),
        ("--pruned_model", "Path where storing the pruned model."),
    ):
        cli.add_argument(flag, help=description)
    options = cli.parse_args()
    main(
        model_file=options.model,
        train_data_dir=options.train_dir,
        eval_data_dir=options.eval_dir,
        config_file=options.config,
        out_file=options.pruned_model,
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/compressors/sparseml.py
================================================
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Dict
import numpy as np
from loguru import logger
from nebullvm.operations.optimizations.compressors.base import Compressor
from nebullvm.optional_modules.torch import torch, Module
from nebullvm.tools.data import DataManager
from nebullvm.tools.pytorch import save_with_torch_fx, load_with_torch_fx
from nebullvm.tools.venv import run_in_different_venv
def _save_model(model: Module, path: Path):
    """Save ``model`` under ``path`` and return where it was stored.

    The TorchFX export is tried first and ``path`` itself is returned on
    success. On any error a warning is logged, the model is pickled to
    ``model.pt`` inside ``path`` and that file path is returned instead.
    """
    try:
        save_with_torch_fx(model, path)
    except Exception as ex:
        logger.warning(
            f"Got an error while exporting with TorchFX. The model will be "
            f"saved using the standard PyTorch save pickling method. Error "
            f"got: {ex}"
        )
        fallback_file = path / "model.pt"
        torch.save(model, fallback_file)
        return fallback_file
    return path
def _load_model(path: Path):
    """Load a model from ``path``.

    A file is read with ``torch.load``; anything else is handed to
    ``load_with_torch_fx``.
    """
    return torch.load(path) if path.is_file() else load_with_torch_fx(path)
def _save_dataset(input_data: DataManager, path: Path):
path.mkdir(exist_ok=True)
for i, x in enumerate(input_data):
torch.save(x, path / f"input_{i}.pt")
def _save_json(dictionary: Dict, path: Path):
with open(path, "w") as f:
json.dump(dictionary, f)
def _write_requirements_file(path: Path):
requirements = "sparseml\nsparsify\ntqdm"
with open(path, "w") as f:
f.write(requirements)
class SparseMLCompressor(Compressor):
    """Compressor pruning torch models with a SparseML training script.

    The training runs in a separate virtual environment; model, data and
    configuration are exchanged through a temporary directory.
    """

    def execute(
        self,
        model: Module,
        train_input_data: DataManager,
        eval_input_data: DataManager,
        metric_drop_ths: float,
        metric: Callable,
    ):
        """Prune ``model`` and keep the result if the error is acceptable.

        Args:
            model: Model to prune.
            train_input_data: Data used to train during pruning.
            eval_input_data: Data used for validation and to measure the
                error of the pruned model.
            metric_drop_ths: Maximum accepted error.
            metric: Callable receiving the original prediction, the pruned
                prediction and the label.

        The pruned model is stored in ``compressed_model``; it is reset to
        None when the error exceeds ``metric_drop_ths``, otherwise the
        remaining margin is stored in ``new_metric_ths``.
        """
        script_path = (
            Path(__file__).parent / "scripts/neural_magic_training.py"
        )
        with TemporaryDirectory(dir="") as tmp_dir:
            tmp_dir = Path(tmp_dir)
            requirements_file = tmp_dir / "requirements.txt"
            model_path = _save_model(model, tmp_dir)
            training_data_dir = tmp_dir / "train"
            eval_data_dir = tmp_dir / "eval"
            config_file = tmp_dir / "config.json"
            # Mirror the input format: a single file when the model was
            # pickled, the directory itself otherwise.
            pruned_model_path = (
                tmp_dir / "pruned_model.pt"
                if model_path.is_file()
                else tmp_dir
            )
            _write_requirements_file(requirements_file)
            _save_dataset(train_input_data, training_data_dir)
            _save_dataset(eval_input_data, eval_data_dir)
            _save_json(self._config, config_file)
            run_in_different_venv(
                str(requirements_file),
                str(script_path),
                torch.cuda.is_available(),
                "--model",
                f"{model_path}",
                "--train_dir",
                f"{training_data_dir}",
                "--eval_dir",
                f"{eval_data_dir}",
                "--config",
                f"{config_file}",
                "--pruned_model",
                f"{pruned_model_path}",
            )
            # Load before the temporary directory is removed.
            self.compressed_model = _load_model(pruned_model_path)

        if self.compressed_model is not None:
            error = self._compute_error(
                model, self.compressed_model, eval_input_data, metric
            )
            if error > metric_drop_ths:
                self.compressed_model = None
            else:
                self.new_metric_ths = metric_drop_ths - error

    @staticmethod
    @torch.no_grad()
    def _compute_error(
        model: Module,
        pruned_model: Module,
        eval_input_data: DataManager,
        metric: Callable,
    ) -> float:
        """Average ``metric`` between both models over the evaluation data.

        Returns ``np.inf`` when there is no evaluation data. Both models are
        switched to eval mode and, when CUDA is available, moved to the GPU
        together with the inputs.
        """
        if len(eval_input_data) == 0:
            return np.inf

        metric_val = 0.0
        model.eval()
        pruned_model.eval()

        # The device choice and the model transfers do not depend on the
        # batch: do them once instead of on every iteration.
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            pruned_model.cuda()
            model.cuda()

        for inputs, y in eval_input_data:
            if use_cuda:
                inputs = tuple(data.cuda() for data in inputs)
            model_pred = model(*inputs)
            pruned_pred = pruned_model(*inputs)
            metric_val += metric(model_pred, pruned_pred, y)
        return metric_val / len(eval_input_data)

    @staticmethod
    def _get_default_config() -> Dict:
        """Return the default training configuration of the pruning script."""
        return {
            "training_epochs": 10,
            "epochs_pruning_window": {"start_epoch": 0, "end_epoch": 10},
            "loss_fn": "CrossEntropy",
            "lr": 1e-3,
            "momentum": 0.9,
        }

    @property
    def config_key(self) -> str:
        """Section of the user config file read by this compressor."""
        return "sparseml"
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimize_inference.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Iterable, Callable, List, Union, Dict, Optional
from nebullvm.config import TRAIN_TEST_SPLIT_RATIO
from nebullvm.core import types
from nebullvm.core.models import (
OptimizeInferenceResult,
OriginalModel,
OptimizedModel,
BenchmarkOriginalModelResult,
ModelCompiler,
ModelCompressor,
OptimizationTime,
ModelParams,
DeepLearningFramework,
)
from nebullvm.operations.base import Operation
from nebullvm.operations.conversions.utils import get_conversion_op
from nebullvm.operations.measures.measures import LatencyOriginalModelMeasure
from nebullvm.operations.measures.utils import QUANTIZATION_METRIC_MAP
from nebullvm.operations.optimizations.optimizers.optimizers import (
PytorchOptimizer,
TensorflowOptimizer,
ONNXOptimizer,
)
from nebullvm.operations.optimizations.utils import (
map_compilers_and_compressors,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import DataLoader as TorchDataLoader
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.utils import (
check_dependencies,
)
from nebullvm.tools.adapters import (
ModelAdapter,
DiffusionAdapter,
HuggingFaceAdapter,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.diffusers import (
is_diffusion_model_pipe,
is_diffusion_model,
)
from nebullvm.tools.hardware_utils import get_hw_setup
from nebullvm.tools.utils import (
is_huggingface_data,
check_input_data,
is_data_subscriptable,
get_dl_framework,
extract_info_from_data,
get_model_name,
get_model_size_mb,
get_throughput,
)
class OptimizeInferenceOp(Operation):
    """Benchmark a model and search for its lowest-latency optimized version.

    The operation wraps the input data in a ``DataManager``, measures the
    original model, converts it with the conversion operation of its
    framework and runs the PyTorch, TensorFlow or ONNX optimization pipeline
    on every converted model.
    """

    def __init__(self):
        super().__init__()
        # One optimizer per pipeline; ``_optimize`` picks the one matching
        # the type of each converted model.
        self.torch_optimization_op = PytorchOptimizer()
        self.onnx_optimization_op = ONNXOptimizer()
        self.tensorflow_optimization_op = TensorflowOptimizer()

    @staticmethod
    def _as_data_manager(data) -> DataManager:
        """Wrap ``data`` in a ``DataManager`` unless it already is one.

        Subscriptable data is passed to the ``DataManager`` constructor,
        any other data goes through ``DataManager.from_iterable``.

        Raises:
            ValueError: If ``check_input_data`` returns ``False`` for
                ``data``.
        """
        if isinstance(data, DataManager):
            return data
        if check_input_data(data) is False:
            raise ValueError(
                "The provided data does not match the expected "
                "format.\n"
                "Speedster supports data in the following formats: \n"
                "- PyTorch DataLoader\n"
                "- TensorFlow Dataset\n"
                "- List of tuples: [((input_0, ... ), label), ...] \n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            )
        if is_data_subscriptable(data):
            return DataManager(data)
        return DataManager.from_iterable(data)

    @staticmethod
    def _check_inputs(model: Any, input_data: types.InputData):
        """Validate the arguments of ``execute``.

        Raises:
            ValueError: If ``model`` is ``None`` or ``input_data`` has
                length zero.
        """
        if model is None:
            raise ValueError("Input model cannot be None")
        if len(input_data) == 0:
            raise ValueError("Input data cannot be empty")

    def execute(
        self,
        model: Any,
        input_data: types.InputData,
        metric_drop_ths: float = None,
        metric: Union[str, Callable] = None,
        optimization_time: str = "constrained",
        dynamic_info: Dict = None,
        config_file: str = None,
        ignore_compilers: List[str] = None,
        ignore_compressors: List[str] = None,
        store_latencies: bool = False,
        **kwargs,
    ) -> OptimizeInferenceResult:
        """Optimize ``model`` and return the original and the fastest model.

        Args:
            model: Model to optimize. Must not be ``None``.
            input_data: Non-empty input data. Dataloaders are converted with
                ``DataManager.from_dataloader``, everything else with
                ``_as_data_manager``.
            metric_drop_ths: Maximum accepted metric drop. Values ``<= 0``
                are treated like ``None``.
            metric: Metric function, or the name of an entry of
                ``QUANTIZATION_METRIC_MAP``. Defaults to
                ``"numeric_precision"`` when a positive ``metric_drop_ths``
                is given without a metric.
            optimization_time: Value used to build an ``OptimizationTime``.
            dynamic_info: Dynamic shape information forwarded to
                ``extract_info_from_data``.
            config_file: Not read by this method.
            ignore_compilers: Compilers to skip, mapped through
                ``map_compilers_and_compressors``.
            ignore_compressors: Compressors to skip, mapped through
                ``map_compilers_and_compressors``.
            store_latencies: Not read by this method.
            **kwargs: Forwarded to ``HuggingFaceAdapter`` when the data is
                detected as HuggingFace data.

        Returns:
            An ``OptimizeInferenceResult`` holding the original model, the
            lowest-latency optimized model (``None`` if no inference
            learner was produced) and the hardware setup.

        Raises:
            ValueError: If the inputs are invalid or the data cannot be
                wrapped in a ``DataManager``.
        """
        self._check_inputs(model, input_data)
        check_dependencies(self.device)

        ignore_compilers = map_compilers_and_compressors(
            ignore_compilers, ModelCompiler
        )
        ignore_compressors = map_compilers_and_compressors(
            ignore_compressors, ModelCompressor
        )
        optimization_time = OptimizationTime(optimization_time)

        data = input_data
        if isinstance(data, (TorchDataLoader, tf.data.Dataset)):
            try:
                data = DataManager.from_dataloader(data)
            except Exception as ex:
                # Chain the original error so its traceback is reported as
                # the direct cause of the ValueError.
                raise ValueError(
                    "The provided dataloader does not match the expected "
                    "format.\n"
                    "Speedster supports dataloaders that return tuples in "
                    "the\n"
                    "following formats: \n"
                    "Single input: (input, label)\n"
                    "Multiple inputs: ((input1, input2, ...), label) or "
                    "(input1, input2, ..., label)\n"
                    "Inputs and labels should be either tensors or numpy "
                    "arrays,\n"
                    "depending on the framework used.\n"
                ) from ex

        # Setup adapters
        model_adapter: Optional[ModelAdapter] = None
        if is_diffusion_model_pipe(model):
            self.logger.info(
                "The provided model is a diffusion model. "
                "Speedster will optimize the UNet part of the model."
            )
            model_adapter = DiffusionAdapter(model, data, self.device)
        elif is_huggingface_data(data[0]):
            model_adapter = HuggingFaceAdapter(
                model, data, self.device, **kwargs
            )
            if dynamic_info is None:
                self.logger.warning(
                    "Dynamic shape info has not been provided for the "
                    "HuggingFace model. The resulting optimized model "
                    "will be usable only with a fixed input shape. "
                    "To optimize the model for dynamic shapes, please "
                    "look here: https://nebuly.gitbook.io/nebuly/modules/"
                    "speedster/how-to-guides"
                    "#using-dynamic-shape."
                )

        # Adapt data and model
        if model_adapter is not None:
            data = model_adapter.adapted_data
            model = model_adapter.adapted_model

        data = self._as_data_manager(data)
        dl_framework = get_dl_framework(model)
        # The model is not rebound after this point, so the check is done
        # once and reused.
        is_diffusion = is_diffusion_model(model)

        if metric_drop_ths is not None and metric_drop_ths <= 0:
            metric_drop_ths = None
        elif metric_drop_ths is not None and metric is None:
            metric = "numeric_precision"
        if isinstance(metric, str):
            metric_name = metric
            metric = QUANTIZATION_METRIC_MAP.get(metric_name)
            if metric is None:
                # Make a mistyped metric name visible instead of silently
                # continuing without a metric function.
                self.logger.warning(
                    f"Unknown metric '{metric_name}'. Available metrics: "
                    f"{list(QUANTIZATION_METRIC_MAP)}."
                )

        model_params: ModelParams = extract_info_from_data(
            model=model,
            input_data=data,
            dl_framework=dl_framework,
            dynamic_info=dynamic_info,
            device=self.device,
            is_diffusion=is_diffusion,
        )

        data.split(TRAIN_TEST_SPLIT_RATIO)

        # -------- Benchmark original model --------
        original_latency_op = LatencyOriginalModelMeasure().to(self.device)
        orig_model_benchmark: BenchmarkOriginalModelResult = (
            original_latency_op.execute(
                model=model,
                input_data=data.get_split("test"),
                dl_framework=dl_framework,
            )
        )
        # Normal models have batch size B, diffusion models have batch
        # size 2B
        if is_diffusion:
            throughput_batch_size = model_params.batch_size / 2
        else:
            throughput_batch_size = model_params.batch_size
        original_model = OriginalModel(
            model=model,
            latency_seconds=orig_model_benchmark.latency_seconds,
            name=get_model_name(model),
            size_mb=get_model_size_mb(model),
            framework=dl_framework,
            throughput=get_throughput(
                latency=orig_model_benchmark.latency_seconds,
                batch_size=throughput_batch_size,
            ),
        )
        # ------------------------------------------

        with TemporaryDirectory() as tmp_root:
            save_dir = Path(tmp_root) / "fp32"
            save_dir.mkdir(parents=True, exist_ok=True)

            # Convert model to all available frameworks
            conversion_op = get_conversion_op(dl_framework)
            conversion_op.to(self.device).set_state(model, data).execute(
                save_path=save_dir,
                model_params=model_params,
            )

            # Optimize models. The loop variable has its own name so the
            # ``model`` argument is not shadowed, and the conversion result
            # is fetched only once.
            converted_models = conversion_op.get_result()
            n_pipelines = len(converted_models)
            optimized_models: List[OptimizedModel] = []
            for idx, converted_model in enumerate(converted_models, start=1):
                optimized_models += self._optimize(
                    model=converted_model,
                    input_data=data,
                    model_outputs=orig_model_benchmark.model_outputs,
                    optimization_time=optimization_time,
                    metric_drop_ths=metric_drop_ths,
                    metric=metric,
                    model_params=model_params,
                    ignore_compilers=ignore_compilers,
                    ignore_compressors=ignore_compressors,
                    source_dl_framework=dl_framework,
                    pipeline_idx=idx,
                    len_pipelines=n_pipelines,
                    is_diffusion=is_diffusion,
                )

        optimized_models.sort(key=lambda x: x.latency_seconds, reverse=False)

        # Check if at least one optimized model has been created
        no_optimized_models = len(optimized_models) < 1
        no_inference_learners = all(
            o.inference_learner is None for o in optimized_models
        )
        if no_optimized_models or no_inference_learners:
            self.logger.warning(
                "No optimized model has been created. This is likely "
                "due to a bug during optimization. Please open an issue "
                "and report in details your use case."
            )

        # Extract lowest-latency model
        lowest_latency = self._extract_lowest_latency_model(optimized_models)

        if model_adapter is not None:
            original_model = model_adapter.adapt_original_model(original_model)
            # NOTE(review): ``lowest_latency`` may be None here; confirm
            # that the adapters accept it.
            lowest_latency = model_adapter.adapt_inference_learner(
                lowest_latency
            )

        return OptimizeInferenceResult(
            original_model=original_model,
            optimized_model=lowest_latency,
            hardware_setup=get_hw_setup(),
        )

    def _optimize(
        self,
        model: Any,
        model_outputs: Iterable,
        input_data: types.InputData,
        optimization_time: OptimizationTime,
        metric_drop_ths: float,
        metric: Callable,
        model_params: ModelParams,
        ignore_compilers: List[ModelCompiler],
        ignore_compressors: List[ModelCompressor],
        source_dl_framework: DeepLearningFramework,
        pipeline_idx: int,
        len_pipelines: int,
        is_diffusion: bool,
    ) -> List[OptimizedModel]:
        """Run the optimization pipeline that matches the type of ``model``.

        ``torch.nn.Module`` instances go to the PyTorch optimizer,
        ``tf.Module`` instances to the TensorFlow optimizer and anything
        else to the ONNX optimizer. ``pipeline_idx`` and ``len_pipelines``
        are only used in the progress log message.

        Returns:
            The list returned by the selected optimizer's ``execute``.
        """
        is_torch_model = isinstance(model, torch.nn.Module)
        if is_torch_model:
            optimization_op = self.torch_optimization_op
            pipeline_name = "PyTorch"
        elif isinstance(model, tf.Module):
            optimization_op = self.tensorflow_optimization_op
            pipeline_name = "TensorFlow"
        else:
            optimization_op = self.onnx_optimization_op
            pipeline_name = "ONNX"
        self.logger.info(
            f"[{pipeline_idx}/{len_pipelines}] Running {pipeline_name} "
            f"Optimization Pipeline"
        )

        # Run optimization
        optimized_models = optimization_op.to(self.device).execute(
            model=model,
            input_data=input_data,
            optimization_time=optimization_time,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            model_params=model_params,
            model_outputs=model_outputs,
            ignore_compilers=ignore_compilers,
            ignore_compressors=ignore_compressors,
            source_dl_framework=source_dl_framework,
            is_diffusion=is_diffusion,
        )

        if is_torch_model:
            optimization_op.free_model_gpu(model)

        return optimized_models

    @staticmethod
    def _extract_lowest_latency_model(
        models: List[OptimizedModel],
    ) -> Optional[OptimizedModel]:
        """Return the model with the smallest ``latency_seconds``.

        Only models whose ``inference_learner`` is not ``None`` are
        considered; ``None`` is returned when there is no such model.
        """
        candidates = [m for m in models if m.inference_learner is not None]
        return min(
            candidates, key=lambda m: m.latency_seconds, default=None
        )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/base.py
================================================
import abc
from tempfile import TemporaryDirectory
from typing import Any, Callable, Dict, List, Tuple, Type, Union
from nebullvm.config import ACTIVATION_METRIC_DROP_THS
from nebullvm.core.models import (
OptimizedModel,
OptimizationTime,
ModelParams,
ModelCompiler,
ModelCompressor,
DeepLearningFramework,
DeviceType,
QuantizationType,
)
from nebullvm.operations.base import Operation
from nebullvm.operations.inference_learners.base import (
BuildInferenceLearner,
)
from nebullvm.operations.inference_learners.builders import (
DeepSparseBuildInferenceLearner,
FasterTransformerBuildInferenceLearner,
IntelNeuralCompressorBuildInferenceLearner,
ONNXApacheTVMBuildInferenceLearner,
ONNXBuildInferenceLearner,
ONNXTensorRTBuildInferenceLearner,
OpenVINOBuildInferenceLearner,
PyTorchApacheTVMBuildInferenceLearner,
PyTorchTensorRTBuildInferenceLearner,
TensorflowBuildInferenceLearner,
TFLiteBuildInferenceLearner,
TorchNeuronBuildInferenceLearner,
TorchXLABuildInferenceLearner,
TorchDynamoBuildInferenceLearner,
TorchScriptBuildInferenceLearner,
)
from nebullvm.operations.measures.measures import MetricDropMeasure
from nebullvm.operations.measures.utils import (
compute_optimized_running_time,
compute_relative_difference,
)
from nebullvm.operations.optimizations.compilers.base import Compiler
from nebullvm.operations.optimizations.compilers.deepsparse import (
DeepSparseCompiler,
)
from nebullvm.operations.optimizations.compilers.faster_transformer import (
FasterTransformerCompiler,
)
from nebullvm.operations.optimizations.compilers.intel_neural_compressor import ( # noqa: E501
IntelNeuralCompressorCompiler,
)
from nebullvm.operations.optimizations.compilers.onnxruntime import (
ONNXCompiler,
)
from nebullvm.operations.optimizations.compilers.openvino import (
OpenVINOCompiler,
)
from nebullvm.operations.optimizations.compilers.tensor_rt import (
ONNXTensorRTCompiler,
PyTorchTensorRTCompiler,
)
from nebullvm.operations.optimizations.compilers.tensorflow import (
TensorflowBackendCompiler,
TFLiteBackendCompiler,
)
from nebullvm.operations.optimizations.compilers.torch_dynamo import (
TorchDynamoCompiler,
)
from nebullvm.operations.optimizations.compilers.torch_neuron import (
TorchNeuronCompiler,
)
from nebullvm.operations.optimizations.compilers.torch_xla import (
TorchXLACompiler,
)
from nebullvm.operations.optimizations.compilers.torchscript import (
TorchScriptCompiler,
)
from nebullvm.operations.optimizations.compilers.tvm import (
ONNXApacheTVMCompiler,
PyTorchApacheTVMCompiler,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
from nebullvm.tools.utils import get_throughput
class Optimizer(Operation, abc.ABC):
    """Base class of the optimization pipelines.

    Subclasses choose the candidate compilers in
    ``_select_compilers_from_hardware``. ``execute`` then runs every
    selected compiler, builds an inference learner from each compiled
    model, validates it with a ``MetricDropMeasure`` and collects the valid
    ones as ``OptimizedModel`` objects.
    """

    def __init__(self):
        super().__init__()
        self.optimized_models = []
        self.source_dl_framework = None
        # Left as None here; the pipeline is expected to assign it before
        # ``_optimize`` reads ``pipeline_dl_framework.value``.
        self.pipeline_dl_framework = None
        # Both dictionaries are keyed by ModelCompiler and filled by
        # ``execute`` through ``_load_compilers``.
        self.compiler_ops = {}
        self.build_inference_learner_ops = {}
        self.validity_check_op = MetricDropMeasure()

    def execute(
        self,
        model: Any,
        input_data: DataManager,
        optimization_time: OptimizationTime,
        metric_drop_ths: float,
        metric: Callable,
        model_params: ModelParams,
        model_outputs: List[Tuple[Any, ...]],
        ignore_compilers: List[ModelCompiler],
        ignore_compressors: List[ModelCompressor],
        source_dl_framework: DeepLearningFramework,
        is_diffusion: bool = False,
    ) -> List[OptimizedModel]:
        """Run every selected compiler on ``model``.

        Args:
            model: Model handed to each compiler.
            input_data: Data handed to the compilers and used for
                validation and latency measurement.
            optimization_time: Optimization mode, forwarded to
                ``_optimize``.
            metric_drop_ths: Maximum accepted metric drop, or ``None``.
            metric: Metric function used to validate quantized models.
            model_params: Parameters of the model.
            model_outputs: Outputs of the original model, used as reference
                by the validity check.
            ignore_compilers: Compilers to skip. The list is modified in
                place: keys of ``MULTI_FRAMEWORK_COMPILERS`` are replaced
                by their framework-specific entries, and ``_optimize`` may
                append further compilers.
            ignore_compressors: Not read by this method.
            source_dl_framework: Framework of the original model.
            is_diffusion: Forwarded to the compilers; also halves the batch
                size used for the throughput.

        Returns:
            The optimized models produced by this call.
        """
        self.source_dl_framework = source_dl_framework
        # Start from an empty list so that a second call on the same
        # instance does not return the models of a previous run.
        self.optimized_models = []

        # TODO: implement and select compressors from hardware
        compilers = self._select_compilers_from_hardware()

        # Expand the multi-framework entries, keeping the caller's list
        # object (it is shared with the other pipelines).
        kept_compilers = []
        expanded_compilers = []
        for compiler in ignore_compilers:
            if compiler in MULTI_FRAMEWORK_COMPILERS:
                expanded_compilers += MULTI_FRAMEWORK_COMPILERS[compiler]
            else:
                kept_compilers.append(compiler)
        ignore_compilers[:] = kept_compilers + expanded_compilers

        (
            self.compiler_ops,
            self.build_inference_learner_ops,
        ) = self._load_compilers(
            ignore_compilers=ignore_compilers,
            compilers=compilers,
        )
        self._optimize(
            model=model,
            input_data=input_data,
            optimization_time=optimization_time,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            model_params=model_params,
            model_outputs=model_outputs,
            ignore_compilers=ignore_compilers,
            is_diffusion=is_diffusion,
        )

        return self.optimized_models

    @abc.abstractmethod
    def _select_compilers_from_hardware(self):
        """Return the list of compilers the pipeline should try."""
        raise NotImplementedError()

    @staticmethod
    def _load_compilers(
        ignore_compilers: List[ModelCompiler],
        compilers: List[ModelCompiler],
    ):
        """Instantiate the compiler and inference-learner builder ops.

        A compiler is selected when it is not ignored and is a key of both
        ``COMPILER_TO_OPTIMIZER_MAP`` and
        ``COMPILER_TO_INFERENCE_LEARNER_MAP``. Requiring both keeps the two
        returned dictionaries aligned, which ``_optimize`` relies on when
        it zips them.

        Returns:
            A ``(compiler_ops, build_inference_learner_ops)`` tuple of
            dictionaries sharing the same keys in the same order.
        """
        selected = [
            compiler
            for compiler in compilers
            if compiler not in ignore_compilers
            and compiler in COMPILER_TO_OPTIMIZER_MAP
            and compiler in COMPILER_TO_INFERENCE_LEARNER_MAP
        ]
        compiler_ops = {
            compiler: COMPILER_TO_OPTIMIZER_MAP[compiler]()
            for compiler in selected
        }
        build_inference_learner_ops = {
            compiler: COMPILER_TO_INFERENCE_LEARNER_MAP[compiler]()
            for compiler in selected
        }
        return compiler_ops, build_inference_learner_ops

    def free_model_gpu(self, model: Any):
        """Best-effort release of GPU memory; does nothing on other devices.

        Calls ``model.cpu()`` and empties the CUDA cache of the current
        device. Both steps ignore any exception, so the method never
        raises because of them.
        """
        if self.device.type is not DeviceType.GPU:
            return
        try:
            model.cpu()
        except Exception:
            # Deliberately ignored: the model may not provide ``cpu()``.
            pass
        try:
            with torch.cuda.device(self.device.to_torch_format()):
                torch.cuda.empty_cache()
        except Exception:
            # Deliberately ignored: cleanup must not stop the pipeline.
            pass

    def _optimize(
        self,
        model: Union[torch.nn.Module, tf.Module, str],
        input_data: DataManager,
        optimization_time: OptimizationTime,
        metric_drop_ths: float,
        metric: Callable,
        model_params: ModelParams,
        model_outputs: List[Tuple[Any, ...]],
        ignore_compilers: List[ModelCompiler],
        is_diffusion: bool = False,
    ):
        """Try every loaded compiler with every allowed quantization type.

        For each combination the model is compiled, an inference learner
        is built and checked with ``self.validity_check_op``; valid
        learners are appended to ``self.optimized_models``. A failure of
        one combination is logged and does not stop the others. A summary
        of all attempts is sent to ``self.feedback_collector`` when one is
        set.
        """
        # ``None`` stands for "no quantization"; the other types are
        # enabled by the metric drop threshold.
        q_types = [None]
        if metric_drop_ths is not None:
            if metric_drop_ths > 0:
                q_types.append(QuantizationType.HALF)
            if metric_drop_ths > ACTIVATION_METRIC_DROP_THS:
                q_types.append(QuantizationType.DYNAMIC)
                if input_data is not None:
                    q_types.append(QuantizationType.STATIC)

        optimization_info = []
        for (compiler, compiler_op), build_inference_learner_op in zip(
            self.compiler_ops.items(),
            self.build_inference_learner_ops.values(),
        ):
            for q_type in q_types:
                input_tfms = MultiStageTransformation([])
                technique = q_type.value if q_type else "none"

                self.free_model_gpu(model)

                with TemporaryDirectory() as tmp_dir:
                    try:
                        compiler_op.to(self.device).execute(
                            model=model,
                            input_data=input_data,
                            model_params=model_params,
                            metric_drop_ths=metric_drop_ths
                            if q_type is not None
                            else None,
                            quantization_type=q_type,
                            input_tfms=input_tfms,
                            onnx_output_path=tmp_dir,
                            is_diffusion=is_diffusion,
                        )
                        compiled_model = compiler_op.get_result()
                        if compiled_model is None:
                            continue

                        build_inference_learner_op.to(self.device).execute(
                            model=compiled_model,
                            model_orig=compiler_op.model_orig
                            if hasattr(compiler_op, "model_orig")
                            else None,
                            model_params=model_params,
                            input_tfms=input_tfms,
                            source_dl_framework=self.source_dl_framework,
                            quantization_type=q_type,
                        )
                        inference_learner = (
                            build_inference_learner_op.get_result()
                        )
                        if inference_learner is None:
                            continue

                        test_input_data, ys = input_data.get_split(
                            "test"
                        ).get_list(with_ys=True)
                        self.validity_check_op.execute(
                            inference_learner,
                            test_input_data,
                            model_outputs,
                            metric_drop_ths,
                            metric_func=metric
                            if q_type is not None
                            else compute_relative_difference,
                            ys=ys,
                        )

                        if self.validity_check_op.valid:
                            latency = compute_optimized_running_time(
                                inference_learner, input_data
                            )
                            self.logger.info(
                                f"Optimized model latency: {latency} "
                                f"sec/iter"
                            )

                            if (
                                compiler not in ignore_compilers
                                and optimization_time
                                is OptimizationTime.CONSTRAINED
                            ):
                                ignore_compilers.append(compiler)

                            # Normal models have batch size B, diffusion
                            # models have batch size 2B
                            if is_diffusion:
                                batch_size = model_params.batch_size / 2
                            else:
                                batch_size = model_params.batch_size
                            metric_drop = (
                                self.validity_check_op.measure_result
                            )
                            self.optimized_models.append(
                                OptimizedModel(
                                    inference_learner=inference_learner,
                                    metric_drop=metric_drop,
                                    compiler=compiler,
                                    technique=q_type.name
                                    if q_type is not None
                                    else "None",
                                    latency_seconds=latency,
                                    throughput=get_throughput(
                                        latency, batch_size
                                    ),
                                    size_mb=inference_learner.get_size()
                                    / 1e6,
                                )
                            )

                            opt_info_dict = {
                                "compiler": f"{self.pipeline_dl_framework.value}_{compiler.value}",  # noqa: E501
                                "technique": technique,
                                "latency": latency,
                            }
                            if (
                                metric_drop_ths is not None
                                and q_type is not None
                            ):
                                opt_info_dict["metric_loss"] = metric_drop
                                # Not every callable has ``__name__`` (e.g.
                                # functools.partial); fall back to its repr
                                # so an already stored model is not
                                # reported as a failure.
                                opt_info_dict["metric"] = getattr(
                                    metric, "__name__", repr(metric)
                                )
                            optimization_info.append(opt_info_dict)
                        else:
                            self.logger.warning(
                                "The optimized model will be "
                                "discarded due to poor results "
                                "obtained with the given metric."
                            )

                        if self.device.type in [
                            DeviceType.GPU,
                            DeviceType.TPU,
                        ]:
                            inference_learner.free_gpu_memory()
                    except Exception as ex:
                        self.logger.warning(
                            f"Optimization failed with "
                            f"{self.pipeline_dl_framework} "
                            f"interface of {compiler}. Got error {ex}. "
                            f"If possible the compilation will be re-scheduled"
                            f" with another interface. Please consult the "
                            f"documentation for further info or open an issue "
                            f"on GitHub for receiving assistance."
                        )
                        # Failed attempts are recorded with latency -1.
                        optimization_info.append(
                            {
                                "compiler": compiler.value,
                                "technique": technique,
                                "latency": -1,
                            }
                        )

        if self.feedback_collector is not None:
            self.feedback_collector.store_info(
                key="optimizations",
                value=optimization_info,
            )
# Compilers that exist in one variant per framework: each generic entry maps
# to its framework-specific variants. Optimizer.execute uses this table to
# expand the generic entries found in ``ignore_compilers``.
MULTI_FRAMEWORK_COMPILERS = {
ModelCompiler.TENSOR_RT: [
ModelCompiler.TENSOR_RT_TORCH,
ModelCompiler.TENSOR_RT_ONNX,
],
ModelCompiler.APACHE_TVM: [
ModelCompiler.APACHE_TVM_TORCH,
ModelCompiler.APACHE_TVM_ONNX,
],
}
# Compiler operation class instantiated for each ModelCompiler. Compilers
# missing from this table are skipped by Optimizer._load_compilers.
COMPILER_TO_OPTIMIZER_MAP: Dict[ModelCompiler, Type[Compiler]] = {
ModelCompiler.TORCHSCRIPT: TorchScriptCompiler,
ModelCompiler.DEEPSPARSE: DeepSparseCompiler,
ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorCompiler,
ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTCompiler,
ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTCompiler,
ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMCompiler,
ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMCompiler,
ModelCompiler.ONNX_RUNTIME: ONNXCompiler,
ModelCompiler.OPENVINO: OpenVINOCompiler,
ModelCompiler.TFLITE: TFLiteBackendCompiler,
ModelCompiler.XLA: TensorflowBackendCompiler,
ModelCompiler.TORCH_NEURON: TorchNeuronCompiler,
ModelCompiler.TORCH_XLA: TorchXLACompiler,
ModelCompiler.TORCH_DYNAMO: TorchDynamoCompiler,
ModelCompiler.FASTER_TRANSFORMER: FasterTransformerCompiler,
}
# Inference-learner builder class instantiated for each ModelCompiler. It
# has the same keys as COMPILER_TO_OPTIMIZER_MAP.
COMPILER_TO_INFERENCE_LEARNER_MAP: Dict[
ModelCompiler, Type[BuildInferenceLearner]
] = {
ModelCompiler.TORCHSCRIPT: TorchScriptBuildInferenceLearner,
ModelCompiler.DEEPSPARSE: DeepSparseBuildInferenceLearner,
ModelCompiler.INTEL_NEURAL_COMPRESSOR: IntelNeuralCompressorBuildInferenceLearner, # noqa: E501
ModelCompiler.TENSOR_RT_TORCH: PyTorchTensorRTBuildInferenceLearner,
ModelCompiler.TENSOR_RT_ONNX: ONNXTensorRTBuildInferenceLearner,
ModelCompiler.APACHE_TVM_TORCH: PyTorchApacheTVMBuildInferenceLearner,
ModelCompiler.APACHE_TVM_ONNX: ONNXApacheTVMBuildInferenceLearner,
ModelCompiler.ONNX_RUNTIME: ONNXBuildInferenceLearner,
ModelCompiler.OPENVINO: OpenVINOBuildInferenceLearner,
ModelCompiler.TFLITE: TFLiteBuildInferenceLearner,
ModelCompiler.XLA: TensorflowBuildInferenceLearner,
ModelCompiler.TORCH_NEURON: TorchNeuronBuildInferenceLearner,
ModelCompiler.TORCH_XLA: TorchXLABuildInferenceLearner,
ModelCompiler.TORCH_DYNAMO: TorchDynamoBuildInferenceLearner,
ModelCompiler.FASTER_TRANSFORMER: FasterTransformerBuildInferenceLearner,
}
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/optimizers/optimizers.py
================================================
import platform
from nebullvm.core.models import (
DeepLearningFramework,
DeviceType,
ModelCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import Optimizer
from nebullvm.operations.optimizations.compilers.utils import (
tvm_is_available,
bladedisc_is_available,
deepsparse_is_available,
intel_neural_compressor_is_available,
torch_tensorrt_is_available,
onnxruntime_is_available,
tensorrt_is_available,
openvino_is_available,
torch_neuron_is_available,
torch_xla_is_available,
faster_transformer_is_available,
)
from nebullvm.optional_modules.torch import torch
from nebullvm.optional_modules.utils import (
torch_is_available,
tensorflow_is_available,
onnx_is_available,
)
from nebullvm.tools.utils import check_module_version
class PytorchOptimizer(Optimizer):
    """Optimizer whose pipeline framework is PyTorch."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.PYTORCH

    def _select_compilers_from_hardware(self):
        """List the compilers usable with torch on the current device.

        Returns an empty list when torch is not available. TPU and Neuron
        devices get exactly one dedicated compiler; every other device
        gets TorchScript plus whichever optional compilers are installed.

        Raises:
            RuntimeError: If the device is a TPU or a Neuron device and
                the matching torch extension is not available.
        """
        selected = []
        if not torch_is_available():
            return selected

        if self.device.type is DeviceType.TPU:
            if not torch_xla_is_available():
                raise RuntimeError(
                    "Torch XLA is not available on your platform. "
                    "Please install torch-xla the readme at this "
                    "link: https://github.com/pytorch/xla"
                )
            selected.append(ModelCompiler.TORCH_XLA)
            return selected

        if self.device.type is DeviceType.NEURON:
            if not torch_neuron_is_available():
                raise RuntimeError(
                    "Torch Neuron is not available on your platform. "
                    "Please install torch-neuron by following "
                    "this guide: https://awsdocs-neuron"
                    ".readthedocs-hosted.com/en/latest/general/"
                    "quick-start/torch-neuron.html."
                )
            selected.append(ModelCompiler.TORCH_NEURON)
            return selected

        selected.append(ModelCompiler.TORCHSCRIPT)

        # Deactivated (trailing ``and False``) because save and load
        # methods are not implemented
        dynamo_enabled = (
            check_module_version(torch, min_version="2.0.0")
            and platform.system() != "Windows"
            and False
        )
        if dynamo_enabled:
            selected.append(ModelCompiler.TORCH_DYNAMO)

        if tvm_is_available():
            selected.append(ModelCompiler.APACHE_TVM_TORCH)
        if bladedisc_is_available():
            selected.append(ModelCompiler.BLADEDISC)

        if self.device.type is DeviceType.CPU:
            if deepsparse_is_available():
                selected.append(ModelCompiler.DEEPSPARSE)
            if intel_neural_compressor_is_available():
                selected.append(ModelCompiler.INTEL_NEURAL_COMPRESSOR)
        elif self.device.type is DeviceType.GPU:
            if torch_tensorrt_is_available():
                selected.append(ModelCompiler.TENSOR_RT_TORCH)
            if faster_transformer_is_available():
                selected.append(ModelCompiler.FASTER_TRANSFORMER)

        return selected
class TensorflowOptimizer(Optimizer):
    """Optimizer whose pipeline framework is TensorFlow."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.TENSORFLOW

    def _select_compilers_from_hardware(self):
        """Return XLA and TFLite when TensorFlow is available, else ``[]``."""
        if not tensorflow_is_available():
            return []
        return [ModelCompiler.XLA, ModelCompiler.TFLITE]
class ONNXOptimizer(Optimizer):
    """Optimizer for the ONNX pipeline (framework ``NUMPY``)."""

    def __init__(self):
        super().__init__()
        self.pipeline_dl_framework = DeepLearningFramework.NUMPY

    def _select_compilers_from_hardware(self):
        """List the ONNX compilers usable on the current device.

        Returns an empty list when onnx is not available. TensorRT is
        only considered on GPU devices and OpenVINO only on CPU devices.
        """
        selected = []
        if not onnx_is_available():
            return selected

        if onnxruntime_is_available():
            selected.append(ModelCompiler.ONNX_RUNTIME)
        if tvm_is_available():
            selected.append(ModelCompiler.APACHE_TVM_ONNX)

        on_gpu = self.device.type is DeviceType.GPU
        if on_gpu and tensorrt_is_available():
            selected.append(ModelCompiler.TENSOR_RT_ONNX)

        on_cpu = self.device.type is DeviceType.CPU
        if on_cpu and openvino_is_available():
            selected.append(ModelCompiler.OPENVINO)

        return selected
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_deepsparse.py
================================================
from tempfile import TemporaryDirectory
import pytest
import torch
from nebullvm.config import CONSTRAINED_METRIC_DROP_THS
from nebullvm.core.models import (
Device,
DeviceType,
DeepLearningFramework,
ModelCompiler,
)
from nebullvm.operations.inference_learners.deepsparse import (
DEEPSPARSE_INFERENCE_LEARNERS,
)
from nebullvm.operations.measures.measures import MetricDropMeasure
from nebullvm.operations.measures.utils import compute_relative_difference
from nebullvm.operations.optimizations.compilers.deepsparse import (
DeepSparseCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import (
deepsparse_is_available,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import initialize_model
from nebullvm.operations.inference_learners.utils import load_model
# CPU device handed to initialize_model and to every operation of the test
# below.
device = Device(DeviceType.CPU)
@pytest.mark.parametrize(
    ("output_library", "dynamic"),
    [
        # (DeepLearningFramework.PYTORCH, True),
        (DeepLearningFramework.PYTORCH, False),
    ],
)
@pytest.mark.skipif(
    not deepsparse_is_available(),
    reason="Can't test deepsparse if it's not installed.",
)
def test_deepsparse(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type=None,
):
    """Compile with DeepSparse, then check save/load and output validity."""
    with TemporaryDirectory() as tmp_dir:
        setup = initialize_model(dynamic, None, output_library, device)
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = setup

        # Compile the model
        compiler_op = DeepSparseCompiler()
        compiler_op.to(device).execute(
            model=model,
            onnx_output_path=tmp_dir,
            model_params=model_params,
            quantization_type=None,
            input_data=input_data,
        )

        # Build the inference learner from the compiled model
        builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.DEEPSPARSE
        ]
        build_inference_learner_op = builder_cls()
        build_inference_learner_op.to(device).execute(
            model=compiler_op.get_result(),
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()

        expected_cls = DEEPSPARSE_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, expected_cls)
        assert isinstance(optimized_model.get_size(), int)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, expected_cls)

        # The reloaded learner must reproduce the outputs of the saved one
        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None
        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        )

        # Test validity of the model
        test_split = input_data.get_split("test")
        test_input_data, ys = test_split.get_list(with_ys=True)
        if quantization_type is not None:
            metric_func = metric
        else:
            metric_func = compute_relative_difference
        validity_check_op = MetricDropMeasure()
        validity_check_op.execute(
            optimized_model,
            test_input_data,
            model_outputs,
            CONSTRAINED_METRIC_DROP_THS,
            metric_func=metric_func,
            ys=ys,
        )

        # Check validity of the optimized model
        assert validity_check_op.get_result()

        # Dynamic batch size is currently not supported from deepsparse
        # if dynamic:
        #     inputs_example = [
        #         input_[: len(input_) // 2] for input_ in inputs_example
        #     ]
        #     res = model(*inputs_example)
        #     assert res is not None
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_intel_neural_compressor.py
================================================
from tempfile import TemporaryDirectory
import pytest
import torch
from nebullvm.core.models import (
DeviceType,
Device,
QuantizationType,
DeepLearningFramework,
ModelCompiler,
)
from nebullvm.operations.inference_learners.neural_compressor import (
NEURAL_COMPRESSOR_INFERENCE_LEARNERS,
)
from nebullvm.operations.optimizations.compilers.intel_neural_compressor import ( # noqa: E501
IntelNeuralCompressorCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import (
intel_neural_compressor_is_available,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
# CPU device handed to initialize_model and to every operation of the test
# below.
device = Device(DeviceType.CPU)
@pytest.mark.parametrize(
    ("output_library", "dynamic", "metric_drop_ths", "quantization_type"),
    [
        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.DYNAMIC),
        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.DYNAMIC),
        (DeepLearningFramework.PYTORCH, True, 2, QuantizationType.STATIC),
        (DeepLearningFramework.PYTORCH, False, 2, QuantizationType.STATIC),
    ],
)
@pytest.mark.skipif(
    not intel_neural_compressor_is_available(),
    reason="Can't test neural compressor if it's not installed.",
)
def test_neural_compressor(
    output_library: DeepLearningFramework,
    dynamic: bool,
    metric_drop_ths: float,
    quantization_type: QuantizationType,
):
    """Quantize with Intel Neural Compressor and validate the result.

    Checks the type of the inference learner, its save/load round trip,
    its validity with ``check_model_validity`` and, for dynamic models,
    that it also runs on a batch half the size of the example inputs.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, None, output_library, device)

        compiler_op = IntelNeuralCompressorCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        build_inference_learner_op = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.INTEL_NEURAL_COMPRESSOR
        ]()
        build_inference_learner_op.to(device).execute(
            model=compiled_model,
            model_orig=compiler_op.model_orig
            if hasattr(compiler_op, "model_orig")
            else None,
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = build_inference_learner_op.get_result()
        assert isinstance(
            optimized_model,
            NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library],
        )

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(
            loaded_model, NEURAL_COMPRESSOR_INFERENCE_LEARNERS[output_library]
        )

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = optimized_model.get_inputs_example()
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        assert all(
            torch.allclose(res_tensor, res_loaded_tensor)
            for (res_tensor, res_loaded_tensor) in zip(res, res_loaded)
        )

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if dynamic:
            # Halve the batch to exercise the dynamic shape support.
            inputs_example = [
                input_[: len(input_) // 2] for input_ in inputs_example
            ]
            # Run the optimized model: calling ``model`` here would compare
            # the original model with itself and verify nothing.
            res = optimized_model(*inputs_example)
            assert res is not None

            res_orig = tuple(model(*inputs_example))
            # NOTE(review): confirm that rtol=1e-01 is loose enough for the
            # quantized outputs.
            assert all(
                torch.allclose(res_tensor, res_orig_tensor, rtol=1e-01)
                for (res_tensor, res_orig_tensor) in zip(res, res_orig)
            )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_onnxruntime.py
================================================
import sys
from pathlib import Path
from tempfile import TemporaryDirectory
import onnx
import pytest
import torch
from nebullvm.core.models import (
Device,
DeviceType,
DeepLearningFramework,
QuantizationType,
ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.onnx import ONNX_INFERENCE_LEARNERS
from nebullvm.operations.optimizations.compilers.onnxruntime import (
ONNXCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available
# Run the tests on GPU when one is available, otherwise fall back to CPU.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None, True),
        (DeepLearningFramework.PYTORCH, True, None, None, None, False),
        (DeepLearningFramework.PYTORCH, False, None, None, None, False),
    ],
)
def test_onnxruntime(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Compile a converted model with ``ONNXCompiler`` (no quantization).

    The model returned by ``initialize_model`` is converted with
    ``PytorchConverter``, compiled, and wrapped by the inference-learner
    builder registered for ``ModelCompiler.ONNX_RUNTIME``. The test then
    checks the learner type, the save/load round trip, that the loaded
    learner reproduces the outputs, and the result of
    ``check_model_validity``. When ``dynamic`` is true, a half-size batch
    is compared against the original model with ``rtol=2e-01``.

    When ``external_data_format`` is true the converted file is re-saved
    with ``save_as_external_data=True`` before compilation.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        fp32_dir = Path(tmp_dir) / "fp32"
        fp32_dir.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            fp32_dir, model_params
        )
        converted_models = converter_op.get_result()
        assert len(converted_models) > 1
        # The compiler is fed the first converted result that is a Path.
        path_results = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(path_results[0])

        # Test onnx external data format (large models)
        if external_data_format:
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler_op = ONNXCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        optimized_model = learner_builder.get_result()
        learner_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        half_batch = [
            input_[: len(input_) // 2].to(target) for input_ in inputs_example
        ]
        res_small = optimized_model(*half_batch)
        assert res_small is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        close_to_orig = [
            torch.allclose(out, ref, rtol=2e-01)
            for out, ref in zip(res_small, res_orig)
        ]
        assert all(close_to_orig)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
            False,
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
            False,
        ),
    ],
)
@pytest.mark.skipif(
    torch.cuda.is_available(),
    reason="onnxruntime with int8 precision is very slow on GPU",
)
def test_onnxruntime_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Compile a converted model with ``ONNXCompiler`` using quantization.

    Covers the ``DYNAMIC`` and ``STATIC`` quantization types and is skipped
    whenever CUDA is available. The flow is: convert with
    ``PytorchConverter``, compile, build the inference learner registered
    for ``ModelCompiler.ONNX_RUNTIME``, then check the learner type, the
    save/load round trip, output agreement between the saved and loaded
    learner, and ``check_model_validity``. When ``dynamic`` is true, a
    half-size batch is compared against the original model with
    ``rtol=2e-01``.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        fp32_dir = Path(tmp_dir) / "fp32"
        fp32_dir.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            fp32_dir, model_params
        )
        converted_models = converter_op.get_result()
        assert len(converted_models) > 1
        # The compiler is fed the first converted result that is a Path.
        path_results = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(path_results[0])

        # Test onnx external data format (large models)
        if external_data_format:
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler_op = ONNXCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        optimized_model = learner_builder.get_result()
        learner_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        half_batch = [
            input_[: len(input_) // 2].to(target) for input_ in inputs_example
        ]
        res_small = optimized_model(*half_batch)
        assert res_small is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        close_to_orig = [
            torch.allclose(out, ref, rtol=2e-01)
            for out, ref in zip(res_small, res_orig)
        ]
        assert all(close_to_orig)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
        "external_data_format",
    ),
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
            False,
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
            True,
        ),
    ],
)
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="onnxruntime with half precision on windows does not work",
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="onnxruntime with half precision is very slow on CPU",
)
def test_onnxruntime_half(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
    external_data_format: bool,
):
    """Compile a converted model with ``ONNXCompiler`` in half precision.

    Skipped on Windows and whenever CUDA is not available. The flow is:
    convert with ``PytorchConverter``, optionally re-save the file with
    external data, compile, build the inference learner registered for
    ``ModelCompiler.ONNX_RUNTIME``, then check the learner type, the
    save/load round trip (loading through the learner class itself),
    output agreement, and ``check_model_validity``. When ``dynamic`` is
    true, a half-size batch is compared against the original model after
    casting the optimized outputs to float, with ``rtol=1e-01``.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        fp32_dir = Path(tmp_dir) / "fp32"
        fp32_dir.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            fp32_dir, model_params
        )
        converted_models = converter_op.get_result()
        assert len(converted_models) > 1
        # The compiler is fed the first converted result that is a Path.
        path_results = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(path_results[0])

        # Test onnx external data format (large models)
        if external_data_format:
            onnx.save_model(
                onnx.load(model_path),
                model_path,
                save_as_external_data=True,
                all_tensors_to_one_file=False,
            )

        compiler_op = ONNXCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.ONNX_RUNTIME
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
            quantization_type=quantization_type,
        )
        optimized_model = learner_builder.get_result()
        learner_cls = ONNX_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions (load via the learner class here)
        optimized_model.save(tmp_dir)
        loaded_model = learner_cls.load(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        half_batch = [
            input_[: len(input_) // 2].to(target) for input_ in inputs_example
        ]
        res_small = optimized_model(*half_batch)
        assert res_small is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        # Optimized outputs are cast to float before the comparison.
        close_to_orig = [
            torch.allclose(out.float(), ref, rtol=1e-01)
            for out, ref in zip(res_small, res_orig)
        ]
        assert all(close_to_orig)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_openvino.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
import cpuinfo
import pytest
import torch
from nebullvm.core.models import (
DeepLearningFramework,
QuantizationType,
Device,
DeviceType,
ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.openvino import (
OPENVINO_INFERENCE_LEARNERS,
)
from nebullvm.operations.optimizations.compilers.openvino import (
OpenVINOCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    # This condition is evaluated when the module is imported. Use ``.get``
    # so that a platform where cpuinfo does not report "brand_raw" skips the
    # test instead of failing collection with a KeyError.
    "intel" not in cpuinfo.get_cpu_info().get("brand_raw", "").lower(),
    reason="Openvino is only available for intel processors.",
)
def test_openvino(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile a converted model with ``OpenVINOCompiler`` on CPU.

    Skipped unless the CPU brand string reported by ``cpuinfo`` contains
    "intel". The flow is: convert with ``PytorchConverter``, compile, build
    the inference learner registered for ``ModelCompiler.OPENVINO``, then
    check the learner type, the save/load round trip, output agreement
    between the saved and loaded learner, and ``check_model_validity``.
    When ``dynamic`` is true, a half-size batch is compared against the
    original model after casting the optimized outputs to float, with
    ``rtol=2e-01``.
    """
    device = Device(DeviceType.CPU)
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        fp32_dir = Path(tmp_dir) / "fp32"
        fp32_dir.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            fp32_dir, model_params
        )
        converted_models = converter_op.get_result()
        assert len(converted_models) > 1
        # The compiler is fed the first converted result that is a Path.
        path_results = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(path_results[0])

        compiler_op = OpenVINOCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.OPENVINO
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        learner_cls = OPENVINO_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if dynamic:  # Check also with a smaller batch size
            half_batch = [
                input_[: len(input_) // 2] for input_ in inputs_example
            ]
            res_small = optimized_model(*half_batch)
            assert res_small is not None

            res_orig = tuple(model(*half_batch))
            # Optimized outputs are cast to float before the comparison.
            close_to_orig = [
                torch.allclose(out.float(), ref, rtol=2e-01)
                for out, ref in zip(res_small, res_orig)
            ]
            assert all(close_to_orig)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tensor_rt.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
import pytest
import torch
from nebullvm.core.models import (
Device,
DeviceType,
DeepLearningFramework,
QuantizationType,
ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.tensor_rt import (
TENSOR_RT_INFERENCE_LEARNERS,
PytorchTensorRTInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.tensor_rt import (
ONNXTensorRTCompiler,
PyTorchTensorRTCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import check_module_version
# Both tests in this module are skipped when CUDA is unavailable, so the
# operations are always placed on the GPU device.
device = Device(DeviceType.GPU)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
def test_tensorrt_onnx(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile a converted model with ``ONNXTensorRTCompiler``.

    Skipped when CUDA is not available. The flow is: convert with
    ``PytorchConverter``, compile, build the inference learner registered
    for ``ModelCompiler.TENSOR_RT_ONNX``, then check the learner type, the
    save/load round trip, output agreement between the saved and loaded
    learner, and ``check_model_validity``. When ``dynamic`` is true, a
    half-size batch is compared against the original model after casting
    the optimized outputs to float, with ``rtol=1e-01``.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        fp32_dir = Path(tmp_dir) / "fp32"
        fp32_dir.mkdir(parents=True)

        converter_op = PytorchConverter()
        converter_op.to(device).set_state(model, input_data).execute(
            fp32_dir, model_params
        )
        converted_models = converter_op.get_result()
        assert len(converted_models) > 1
        # The compiler is fed the first converted result that is a Path.
        path_results = [
            item for item in converted_models if isinstance(item, Path)
        ]
        model_path = str(path_results[0])

        compiler_op = ONNXTensorRTCompiler()
        compiler_op.to(device).execute(
            model=model_path,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TENSOR_RT_ONNX
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        learner_cls = TENSOR_RT_INFERENCE_LEARNERS[output_library]
        assert isinstance(optimized_model, learner_cls)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, learner_cls)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = tuple(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        half_batch = [
            input_[: len(input_) // 2].to(target) for input_ in inputs_example
        ]
        res_small = optimized_model(*half_batch)
        assert res_small is not None

        with torch.inference_mode():
            res_orig = tuple(model(*half_batch))
        # Optimized outputs are cast to float before the comparison.
        close_to_orig = [
            torch.allclose(out.float(), ref, rtol=1e-01)
            for out, ref in zip(res_small, res_orig)
        ]
        assert all(close_to_orig)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            False,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
@pytest.mark.skipif(
    not check_module_version(torch, max_version="1.13.1+cu117"),
    reason="Skip because torch version is not supported.",
)
def test_tensorrt_torch(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the torch model directly with ``PyTorchTensorRTCompiler``.

    Skipped when CUDA is not available or when ``check_module_version``
    rejects the installed torch for ``max_version="1.13.1+cu117"``. No
    conversion step is involved: the model from ``initialize_model`` is
    passed straight to the compiler. The test then builds the inference
    learner registered for ``ModelCompiler.TENSOR_RT_TORCH`` and checks the
    learner type, the save/load round trip (loading through
    ``PytorchTensorRTInferenceLearner.load``), output agreement, and
    ``check_model_validity``. When ``dynamic`` is true, a half-size batch
    is compared against the original model after casting the optimized
    outputs to float, with ``rtol=1e-01``.
    """
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler_op = PyTorchTensorRTCompiler()
        compiler_op.to(device).execute(
            model=model,
            model_params=model_params,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TENSOR_RT_TORCH
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, PytorchTensorRTInferenceLearner)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = PytorchTensorRTInferenceLearner.load(tmp_dir)
        assert isinstance(loaded_model, PytorchTensorRTInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = tuple(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        res_loaded = loaded_model(*inputs_example)
        same_after_reload = [
            torch.allclose(out, out_loaded)
            for out, out_loaded in zip(res, res_loaded)
        ]
        assert all(same_after_reload)

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        half_batch = [
            input_[: len(input_) // 2].to(target) for input_ in inputs_example
        ]
        res_small = optimized_model(*half_batch)
        assert res_small is not None

        res_orig = tuple(model(*half_batch))
        # Optimized outputs are cast to float before the comparison.
        close_to_orig = [
            torch.allclose(out.float(), ref, rtol=1e-01)
            for out, ref in zip(res_small, res_orig)
        ]
        assert all(close_to_orig)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tensorflow.py
================================================
from tempfile import TemporaryDirectory
import pytest
from nebullvm.core.models import (
DeepLearningFramework,
QuantizationType,
Device,
DeviceType,
ModelCompiler,
)
from nebullvm.operations.inference_learners.tensorflow import (
TensorflowBackendInferenceLearner,
TFLiteBackendInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.tensorflow import (
TensorflowBackendCompiler,
TFLiteBackendCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.TENSORFLOW, False, None, None, None),
        (DeepLearningFramework.TENSORFLOW, True, None, None, None),
    ],
)
def test_tensorflow_backend(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile a TensorFlow model with ``TensorflowBackendCompiler``.

    Runs on GPU when ``gpu_is_available()`` is true, otherwise on CPU. The
    model from ``initialize_model`` is compiled, wrapped by the inference
    learner builder registered for ``ModelCompiler.XLA``, and the test
    checks the learner type, that a saved learner can be loaded back as
    the same type, that ``predict`` returns something, and
    ``check_model_validity``. When ``dynamic`` is true, ``predict`` is
    also called on a half-size batch.
    """
    device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler_op = TensorflowBackendCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.XLA
        ]()
        # NOTE(review): the other optimization tests pass the framework as
        # ``source_dl_framework``; confirm ``dl_framework`` is the keyword
        # this builder expects.
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, TensorflowBackendInferenceLearner)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, TensorflowBackendInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model.predict(*inputs_example)
        assert res is not None

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        half_batch = [input_[: len(input_) // 2] for input_ in inputs_example]
        res_small = optimized_model.predict(*half_batch)
        assert res_small is not None
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (
            DeepLearningFramework.TENSORFLOW,
            False,
            None,
            0.1,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.TENSORFLOW,
            True,
            None,
            0.1,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.TENSORFLOW,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.TENSORFLOW,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.TENSORFLOW,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
def test_tf_lite(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile a TensorFlow model with ``TFLiteBackendCompiler`` on CPU.

    The model from ``initialize_model`` is compiled, wrapped by the
    inference learner builder registered for ``ModelCompiler.TFLITE``, and
    the test checks the learner type, that a saved learner can be loaded
    back through ``TFLiteBackendInferenceLearner.load``, that ``predict``
    returns something, and ``check_model_validity``. When ``dynamic`` is
    true, ``predict`` is also called on a half-size batch.
    """
    device = Device(DeviceType.CPU)
    with TemporaryDirectory() as tmp_dir:
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler_op = TFLiteBackendCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TFLITE
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, TFLiteBackendInferenceLearner)

        # Test save and load functions
        optimized_model.save(tmp_dir)
        loaded_model = TFLiteBackendInferenceLearner.load(tmp_dir)
        assert isinstance(loaded_model, TFLiteBackendInferenceLearner)

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model.predict(*inputs_example)
        assert res is not None

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        half_batch = [input_[: len(input_) // 2] for input_ in inputs_example]
        res_small = optimized_model.predict(*half_batch)
        assert res_small is not None
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_torch_dynamo.py
================================================
import platform
from tempfile import TemporaryDirectory
import pytest
import torch
from nebullvm.core.models import (
DeviceType,
Device,
DeepLearningFramework,
QuantizationType,
ModelCompiler,
)
from nebullvm.operations.inference_learners.torch_dynamo import (
TorchDynamoInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.torch_dynamo import (
TorchDynamoCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.tools.utils import gpu_is_available, check_module_version
# Run the tests on GPU when one is available, otherwise fall back to CPU.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)
def run_test_torch_dynamo(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Shared body for the ``TorchDynamoCompiler`` tests.

    The model from ``initialize_model`` is compiled, wrapped by the
    inference learner builder registered for ``ModelCompiler.TORCH_DYNAMO``,
    and the helper checks the learner type, that ``get_size`` returns an
    int, that a forward call returns something, and
    ``check_model_validity``. When ``dynamic`` is true, a half-size batch
    is compared against the original model after casting the optimized
    outputs to float, with ``rtol=2e-01``.
    """
    with TemporaryDirectory() as tmp_dir:  # noqa: F841
        (
            model,
            input_data,
            model_params,
            input_tfms,
            model_outputs,
            metric,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler_op = TorchDynamoCompiler()
        compiler_op.to(device).execute(
            model=model,
            input_tfms=input_tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=input_data,
            model_params=model_params,
        )
        compiled_model = compiler_op.get_result()

        learner_builder = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TORCH_DYNAMO
        ]()
        learner_builder.to(device).execute(
            model=compiled_model,
            model_orig=getattr(compiler_op, "model_orig", None),
            model_params=model_params,
            input_tfms=input_tfms,
            source_dl_framework=output_library,
        )
        optimized_model = learner_builder.get_result()
        assert isinstance(optimized_model, TorchDynamoInferenceLearner)

        # TODO: the save/load round trip (save to tmp_dir, load it back,
        # check the loaded type and compare its outputs) is disabled for
        # this learner, which is why tmp_dir is unused.

        assert isinstance(optimized_model.get_size(), int)

        inputs_example = list(optimized_model.get_inputs_example())
        res = optimized_model(*inputs_example)
        assert res is not None

        # Test validity of the model
        valid = check_model_validity(
            optimized_model,
            input_data,
            model_outputs,
            metric_drop_ths,
            quantization_type,
            metric,
        )
        assert valid

        if not dynamic:
            return

        # Check also with a smaller batch size
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        half_batch = [
            input_[: len(input_) // 2].to(target) for input_ in inputs_example
        ]
        res_small = optimized_model(*half_batch)
        assert res_small is not None

        res_orig = tuple(model(*half_batch))
        # Optimized outputs are cast to float before the comparison.
        close_to_orig = [
            torch.allclose(out.float(), ref, rtol=2e-01)
            for out, ref in zip(res_small, res_orig)
        ]
        assert all(close_to_orig)
@pytest.mark.parametrize(
    (
        "output_library",
        "dynamic",
        "quantization_type",
        "metric_drop_ths",
        "metric",
    ),
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
    ],
)
@pytest.mark.skipif(
    not check_module_version(torch, min_version="2.0.0"),
    reason="Torch version is not supported",
)
@pytest.mark.skipif(
    platform.system() == "Windows",
    reason="Torch compile() is not currently supported on windows",
)
def test_torch_dynamo_fp32(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the shared torch dynamo test body without quantization.

    Skipped on Windows and when ``check_module_version`` rejects the
    installed torch for ``min_version="2.0.0"``. All parameters are
    forwarded unchanged to ``run_test_torch_dynamo``.
    """
    run_test_torch_dynamo(
        output_library=output_library,
        dynamic=dynamic,
        quantization_type=quantization_type,
        metric_drop_ths=metric_drop_ths,
        metric=metric,
    )
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_torchscript.py
================================================
from tempfile import TemporaryDirectory
import pytest
import torch
from nebullvm.core.models import (
DeviceType,
Device,
DeepLearningFramework,
QuantizationType,
ModelCompiler,
)
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.torchscript import (
TorchScriptCompiler,
)
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available
# Run the tests on GPU when one is available, otherwise fall back to CPU.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)
def run_test_torchscript(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the test model with TorchScript and check the result.

    The checks cover: the type of the built inference learner, a save/load
    round trip, the type returned by ``get_size``, agreement between the
    original and the reloaded learner, ``check_model_validity`` and, when
    ``dynamic`` is set, agreement with the source model on a half-size batch.
    """
    with TemporaryDirectory() as workdir:
        (
            source_model,
            data,
            params,
            tfms,
            reference_outputs,
            metric_fn,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler = TorchScriptCompiler()
        compiler.to(device).execute(
            model=source_model,
            input_tfms=tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=data,
        )
        compiled = compiler.get_result()

        learner_builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.TORCHSCRIPT
        ]
        learner_builder = learner_builder_cls()
        learner_builder.to(device).execute(
            model=compiled,
            # Not every compiler exposes the original model.
            model_orig=getattr(compiler, "model_orig", None),
            model_params=params,
            input_tfms=tfms,
            source_dl_framework=output_library,
        )
        learner = learner_builder.get_result()
        assert isinstance(learner, TorchScriptInferenceLearner)

        # Test save and load functions
        learner.save(workdir)
        reloaded = load_model(workdir)
        assert isinstance(reloaded, TorchScriptInferenceLearner)

        assert isinstance(learner.get_size(), int)

        example_inputs = list(learner.get_inputs_example())
        outputs = learner(*example_inputs)
        assert outputs is not None

        reloaded_outputs = reloaded(*example_inputs)
        round_trip_matches = [
            torch.allclose(out, out_reloaded)
            for (out, out_reloaded) in zip(outputs, reloaded_outputs)
        ]
        assert all(round_trip_matches)

        # Test validity of the model
        is_valid = check_model_validity(
            learner,
            data,
            reference_outputs,
            metric_drop_ths,
            quantization_type,
            metric_fn,
        )
        assert is_valid

        if dynamic:  # Check also with a smaller batch_size
            target = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )
            example_inputs = [
                tensor[: len(tensor) // 2].to(target)
                for tensor in example_inputs
            ]
            outputs = learner(*example_inputs)
            assert outputs is not None

            expected = tuple(source_model(*example_inputs))
            close_to_source = [
                torch.allclose(out.float(), out_expected, rtol=2e-01)
                for (out, out_expected) in zip(outputs, expected)
            ]
            assert all(close_to_source)
@pytest.mark.parametrize(
    "output_library, dynamic, quantization_type, metric_drop_ths, metric",
    [
        # Dynamic and static shapes, both without quantization.
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
    ],
)
def test_torchscript_no_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the shared TorchScript scenario with no quantization."""
    scenario = (
        output_library,
        dynamic,
        quantization_type,
        metric_drop_ths,
        metric,
    )
    run_test_torchscript(*scenario)
@pytest.mark.parametrize(
    "output_library, dynamic, quantization_type, metric_drop_ths, metric",
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        )
    ],
)
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Half quantization is not available on CPU",
)
def test_torchscript_half_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the shared TorchScript scenario with half quantization.

    Skipped when CUDA is not available.
    """
    scenario = (
        output_library,
        dynamic,
        quantization_type,
        metric_drop_ths,
        metric,
    )
    run_test_torchscript(*scenario)
@pytest.mark.parametrize(
    "output_library, dynamic, quantization_type, metric_drop_ths, metric",
    [
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.STATIC,
            2,
            "numeric_precision",
        ),
    ],
)
@pytest.mark.skipif(
    torch.cuda.is_available(),
    reason="INT8 quantization is not available on GPU",
)
def test_torchscript_int8_quantization(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Run the shared TorchScript scenario with dynamic/static quantization.

    Skipped when CUDA is available.
    """
    scenario = (
        output_library,
        dynamic,
        quantization_type,
        metric_drop_ths,
        metric,
    )
    run_test_torchscript(*scenario)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/test_tvm.py
================================================
from pathlib import Path
from tempfile import TemporaryDirectory
import pytest
import torch
from nebullvm.core.models import (
Device,
DeviceType,
DeepLearningFramework,
QuantizationType,
ModelCompiler,
)
from nebullvm.operations.conversions.converters import PytorchConverter
from nebullvm.operations.inference_learners.tvm import (
PytorchApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.tvm import (
ONNXApacheTVMCompiler,
PyTorchApacheTVMCompiler,
)
from nebullvm.operations.optimizations.compilers.utils import tvm_is_available
from nebullvm.operations.optimizations.optimizers.base import (
COMPILER_TO_INFERENCE_LEARNER_MAP,
)
from nebullvm.operations.optimizations.tests.utils import (
initialize_model,
check_model_validity,
)
from nebullvm.operations.inference_learners.utils import load_model
from nebullvm.tools.utils import gpu_is_available
# Device shared by every test in this module: GPU when one is detected,
# CPU otherwise.
device = Device(DeviceType.GPU if gpu_is_available() else DeviceType.CPU)
@pytest.mark.parametrize(
    "output_library, dynamic, quantization_type, metric_drop_ths, metric",
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        # (
        #     DeepLearningFramework.PYTORCH,
        #     True,
        #     QuantizationType.STATIC,
        #     2,
        #     "numeric_precision",
        # ),
    ],
)
@pytest.mark.skipif(
    not tvm_is_available(), reason="Apache TVM is not installed"
)
def test_tvm_onnx(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Convert the test model, compile it with the ONNX TVM compiler, check it.

    The checks cover: the type of the built inference learner, a save/load
    round trip, the type returned by ``get_size``, agreement between the
    original and the reloaded learner and, when ``dynamic`` is set,
    agreement with the source model on a half-size batch.
    """
    with TemporaryDirectory() as workdir:
        (
            source_model,
            data,
            params,
            tfms,
            _reference_outputs,
            _metric_fn,
        ) = initialize_model(dynamic, metric, output_library, device)

        export_dir = Path(workdir) / "fp32"
        export_dir.mkdir(parents=True)

        converter = PytorchConverter()
        converter.to(device).set_state(source_model, data).execute(
            export_dir, params
        )
        converted = converter.get_result()
        assert len(converted) > 1

        # The first Path entry of the conversion result is what gets
        # handed to the ONNX TVM compiler.
        path_entries = [item for item in converted if isinstance(item, Path)]
        onnx_path = str(path_entries[0])

        compiler = ONNXApacheTVMCompiler()
        compiler.to(device).execute(
            model=onnx_path,
            model_params=params,
            input_tfms=tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=data,
        )
        compiled = compiler.get_result()

        learner_builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.APACHE_TVM_ONNX
        ]
        learner_builder = learner_builder_cls()
        learner_builder.to(device).execute(
            model=compiled,
            # Not every compiler exposes the original model.
            model_orig=getattr(compiler, "model_orig", None),
            model_params=params,
            input_tfms=tfms,
            source_dl_framework=output_library,
        )
        learner = learner_builder.get_result()
        assert isinstance(learner, PytorchApacheTVMInferenceLearner)

        # Test save and load functions
        learner.save(workdir)
        reloaded = load_model(workdir)
        assert isinstance(reloaded, PytorchApacheTVMInferenceLearner)

        assert isinstance(learner.get_size(), int)

        example_inputs = learner.get_inputs_example()
        outputs = learner(*example_inputs)
        assert outputs is not None

        reloaded_outputs = reloaded(*example_inputs)
        round_trip_matches = [
            torch.allclose(out, out_reloaded)
            for (out, out_reloaded) in zip(outputs, reloaded_outputs)
        ]
        assert all(round_trip_matches)

        if dynamic:
            # Re-run with half of the batch.
            example_inputs = [
                tensor[: len(tensor) // 2] for tensor in example_inputs
            ]
            outputs = learner(*example_inputs)
            assert outputs is not None

            expected = tuple(source_model(*example_inputs))
            close_to_source = [
                torch.allclose(out.float(), out_expected, rtol=1e-01)
                for (out, out_expected) in zip(outputs, expected)
            ]
            assert all(close_to_source)
@pytest.mark.parametrize(
    "output_library, dynamic, quantization_type, metric_drop_ths, metric",
    [
        (DeepLearningFramework.PYTORCH, True, None, None, None),
        (DeepLearningFramework.PYTORCH, False, None, None, None),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.DYNAMIC,
            2,
            "numeric_precision",
        ),
        (
            DeepLearningFramework.PYTORCH,
            True,
            QuantizationType.HALF,
            2,
            "numeric_precision",
        ),
        # (
        #     DeepLearningFramework.PYTORCH,
        #     True,
        #     QuantizationType.STATIC,
        #     2,
        #     "numeric_precision",
        # ),
    ],
)
@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_tvm_torch(
    output_library: DeepLearningFramework,
    dynamic: bool,
    quantization_type: QuantizationType,
    metric_drop_ths: int,
    metric: str,
):
    """Compile the test model with the PyTorch TVM compiler and check it.

    The checks cover: the type of the built inference learner, a save/load
    round trip, the type returned by ``get_size``, agreement between the
    original and the reloaded learner, ``check_model_validity`` and, when
    ``dynamic`` is set, agreement with the source model on a half-size batch.
    """
    with TemporaryDirectory() as workdir:
        (
            source_model,
            data,
            params,
            tfms,
            reference_outputs,
            metric_fn,
        ) = initialize_model(dynamic, metric, output_library, device)

        compiler = PyTorchApacheTVMCompiler()
        compiler.to(device).execute(
            model=source_model,
            model_params=params,
            input_tfms=tfms,
            metric_drop_ths=metric_drop_ths,
            quantization_type=quantization_type,
            input_data=data,
        )
        compiled = compiler.get_result()

        learner_builder_cls = COMPILER_TO_INFERENCE_LEARNER_MAP[
            ModelCompiler.APACHE_TVM_TORCH
        ]
        learner_builder = learner_builder_cls()
        learner_builder.to(device).execute(
            model=compiled,
            # Not every compiler exposes the original model.
            model_orig=getattr(compiler, "model_orig", None),
            model_params=params,
            input_tfms=tfms,
            source_dl_framework=output_library,
        )
        learner = learner_builder.get_result()
        assert isinstance(learner, PytorchApacheTVMInferenceLearner)

        # Test save and load functions
        learner.save(workdir)
        reloaded = PytorchApacheTVMInferenceLearner.load(workdir)
        assert isinstance(reloaded, PytorchApacheTVMInferenceLearner)

        assert isinstance(learner.get_size(), int)

        example_inputs = learner.get_inputs_example()
        outputs = learner(*example_inputs)
        assert outputs is not None

        reloaded_outputs = reloaded(*example_inputs)
        round_trip_matches = [
            torch.allclose(out, out_reloaded)
            for (out, out_reloaded) in zip(outputs, reloaded_outputs)
        ]
        assert all(round_trip_matches)

        # Test validity of the model
        is_valid = check_model_validity(
            learner,
            data,
            reference_outputs,
            metric_drop_ths,
            quantization_type,
            metric_fn,
        )
        assert is_valid

        if dynamic:
            # Re-run with half of the batch.
            example_inputs = [
                tensor[: len(tensor) // 2] for tensor in example_inputs
            ]
            outputs = learner(*example_inputs)
            assert outputs is not None

            expected = tuple(source_model(*example_inputs))
            close_to_source = [
                torch.allclose(out.float(), out_expected, rtol=1e-01)
                for (out, out_expected) in zip(outputs, expected)
            ]
            assert all(close_to_source)
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/tests/utils.py
================================================
import os
from pathlib import Path
from typing import Any, Callable, Optional, Tuple
import tensorflow as tf
import tensorflow.keras as keras
import torch
from tensorflow.keras import Model, layers
from transformers import AlbertModel, AlbertTokenizer
from nebullvm.config import TRAIN_TEST_SPLIT_RATIO, CONSTRAINED_METRIC_DROP_THS
from nebullvm.core.models import (
DeepLearningFramework,
ModelParams,
DataType,
DeviceType,
Device,
QuantizationType,
)
from nebullvm.operations.conversions.huggingface import convert_hf_model
from nebullvm.operations.conversions.pytorch import convert_torch_to_onnx
from nebullvm.operations.measures.measures import (
LatencyOriginalModelMeasure,
MetricDropMeasure,
)
from nebullvm.operations.measures.utils import compute_relative_difference
from nebullvm.tools.data import DataManager
from nebullvm.tools.transformations import MultiStageTransformation
from nebullvm.tools.utils import gpu_is_available, extract_info_from_data
# Per-sample input shape used for the PyTorch test inputs (channels first);
# initialize_model reorders it to channels-last for the TensorFlow inputs.
INPUT_SHAPE = (3, 256, 256)
# Per-sample output shape declared in the ModelParams of the test models.
OUTPUT_SHAPE = (2,)
# Batch size used when the test model is built with static shapes.
STATIC_BATCH_SIZE = 1
# Batch size used when the test model is built with a dynamic batch axis.
DYNAMIC_BATCH_SIZE = 2
class TestModel(torch.nn.Module):
    """Small two-input convolutional network used by the optimizer tests.

    Both inputs go through the same conv/ReLU stack, the results are summed,
    averaged over the last two dimensions and projected to two outputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(
            in_channels=3, out_channels=64, kernel_size=3
        )
        self.relu1 = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv2d(
            in_channels=64, out_channels=32, kernel_size=3
        )
        self.relu2 = torch.nn.ReLU()
        self.fcn = torch.nn.Linear(32, 2)

    def forward(self, input_tensor_0, input_tensor_1):
        # First input through the shared convolutional stack.
        features_0 = self.relu1(self.conv1(input_tensor_0))
        features_0 = self.relu2(self.conv2(features_0))
        # Second input through the very same layers (shared weights).
        features_1 = self.relu1(self.conv1(input_tensor_1))
        features_1 = self.relu2(self.conv2(features_1))
        merged = features_0 + features_1
        # Average over the two trailing dimensions, one row per sample.
        pooled = merged.mean(dim=(-2, -1)).view(-1, 32)
        return self.fcn(pooled)
def tensorflow_model():
    """Build the two-input Keras model used by the TensorFlow tests.

    Each input gets its own pair of Conv2D layers; the two branches are
    summed and passed through a softmax Dense layer with two units.
    """
    first_input = keras.Input(shape=(256, 256, 3))
    second_input = keras.Input(shape=(256, 256, 3))

    # Layers are created in the same order as before: both 64-filter
    # convolutions first, then both 32-filter convolutions.
    branch_0 = layers.Conv2D(64, kernel_size=(3, 3), activation="relu")(
        first_input
    )
    branch_1 = layers.Conv2D(64, kernel_size=(3, 3), activation="relu")(
        second_input
    )
    branch_0 = layers.Conv2D(32, kernel_size=(3, 3), activation="relu")(
        branch_0
    )
    branch_1 = layers.Conv2D(32, kernel_size=(3, 3), activation="relu")(
        branch_1
    )

    merged = branch_0 + branch_1
    prediction = layers.Dense(2, activation="softmax")(merged)
    return Model(inputs=[first_input, second_input], outputs=prediction)
def _build_static_model(
    framework: DeepLearningFramework = DeepLearningFramework.PYTORCH,
) -> Tuple[torch.nn.Module, ModelParams]:
    """Return the test model for ``framework`` with static-shape params.

    Raises:
        NotImplementedError: if ``framework`` is neither PyTorch nor
            TensorFlow.
    """
    input_size = (STATIC_BATCH_SIZE, *INPUT_SHAPE)
    model_params = ModelParams(
        batch_size=STATIC_BATCH_SIZE,
        # One independent description per model input.
        input_infos=[
            {"size": input_size, "dtype": "float32"} for _ in range(2)
        ],
        output_sizes=[(STATIC_BATCH_SIZE, *OUTPUT_SHAPE)],
        output_types=[DataType.FLOAT32],
    )

    if framework == DeepLearningFramework.PYTORCH:
        return TestModel(), model_params
    if framework == DeepLearningFramework.TENSORFLOW:
        return tensorflow_model(), model_params
    raise NotImplementedError
def _build_dynamic_model(
    framework: DeepLearningFramework,
) -> Tuple[torch.nn.Module, ModelParams]:
    """Return the test model for ``framework`` with a dynamic batch axis.

    Raises:
        NotImplementedError: if ``framework`` is neither PyTorch nor
            TensorFlow.
    """
    input_size = (DYNAMIC_BATCH_SIZE, *INPUT_SHAPE)
    # Axis 0 of every input is declared dynamic, with values from 1 to 2.
    dynamic_inputs = [
        {
            0: {
                "name": "batch",
                "min_val": 1,
                "opt_val": 1,
                "max_val": 2,
            }
        }
        for _ in range(2)
    ]
    params_kwargs = {
        "batch_size": DYNAMIC_BATCH_SIZE,
        "input_infos": [
            {"size": input_size, "dtype": "float32"} for _ in range(2)
        ],
        "output_sizes": [(DYNAMIC_BATCH_SIZE, *OUTPUT_SHAPE)],
        "output_types": [DataType.FLOAT32],
        "dynamic_info": {
            "inputs": dynamic_inputs,
            "outputs": [{0: "batch"}],
        },
    }

    if framework == DeepLearningFramework.PYTORCH:
        built_model = TestModel()
    elif framework == DeepLearningFramework.TENSORFLOW:
        built_model = tensorflow_model()
    else:
        raise NotImplementedError()
    return built_model, ModelParams(**params_kwargs)
def get_torch_model(dynamic: bool = False):
    """Return ``(model, model_params)`` for the PyTorch test model.

    ``dynamic`` selects the dynamic-batch variant over the static one.
    """
    builder = _build_dynamic_model if dynamic else _build_static_model
    model, model_params = builder(DeepLearningFramework.PYTORCH)
    return model, model_params
def get_tensorflow_model(dynamic: bool = False):
    """Return ``(model, model_params)`` for the TensorFlow test model.

    ``dynamic`` selects the dynamic-batch variant over the static one.
    """
    builder = _build_dynamic_model if dynamic else _build_static_model
    model, model_params = builder(DeepLearningFramework.TENSORFLOW)
    return model, model_params
def get_huggingface_model(temp_dir: str, dl_framework: DeepLearningFramework):
    """Prepare an ALBERT model exported to ONNX plus its test fixtures.

    Loads ``albert-base-v1``, converts it with ``convert_hf_model``,
    benchmarks the converted model on the test split and exports it to
    ``<temp_dir>/test_model.onnx``.

    Returns:
        A tuple ``(model_path, model_params, output_structure, input_names,
        output_type, input_data, model_outputs)``.
    """
    checkpoint = "albert-base-v1"
    tokenizer = AlbertTokenizer.from_pretrained(checkpoint)
    hf_model = AlbertModel.from_pretrained(checkpoint)
    encoded_input = tokenizer(
        "Short text you wish to process", return_tensors="pt"
    )

    run_device = Device(
        DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    )
    (
        converted_model,
        raw_inputs,
        input_names,
        output_structure,
        output_type,
    ) = convert_hf_model(hf_model, [encoded_input], device=run_device)

    input_data = DataManager(raw_inputs)
    input_data.split(TRAIN_TEST_SPLIT_RATIO)

    # Benchmark original model
    benchmark_op = LatencyOriginalModelMeasure()
    benchmark_op.to(run_device).execute(
        model=converted_model,
        input_data=input_data.get_split("test"),
        dl_framework=dl_framework,
    )
    model_outputs = benchmark_op.get_result()[0]

    model_path = os.path.join(temp_dir, "test_model.onnx")
    model_params = extract_info_from_data(
        converted_model, input_data, dl_framework, None, run_device
    )

    # NOTE(review): the export receives a bare DeviceType here, while the
    # calls above receive a Device instance - confirm this is intended.
    export_device = DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    convert_torch_to_onnx(
        converted_model,
        input_data,
        model_params,
        Path(model_path),
        export_device,
    )

    return (
        model_path,
        model_params,
        output_structure,
        input_names,
        output_type,
        input_data,
        model_outputs,
    )
def initialize_model(
    dynamic: bool,
    metric: Optional[str],
    output_library: DeepLearningFramework,
    device: Device,
):
    """Build the test model, its input data and its reference outputs.

    Args:
        dynamic: whether to build the dynamic-batch variant of the model.
        metric: any non-None value selects ``compute_relative_difference``
            as the returned metric; the value itself is not inspected.
        output_library: framework of the model to build (PyTorch or
            TensorFlow).
        device: device on which the original model is benchmarked; it also
            decides where the PyTorch input tensors are placed.

    Returns:
        A tuple ``(model, input_data, model_params, input_tfms,
        model_outputs, metric)``.

    Raises:
        NotImplementedError: if ``output_library`` is not supported.
    """
    torch_device = torch.device(
        "cuda" if device.type is DeviceType.GPU else "cpu"
    )
    batch_size = DYNAMIC_BATCH_SIZE if dynamic else STATIC_BATCH_SIZE

    if output_library == DeepLearningFramework.PYTORCH:
        model, model_params = get_torch_model(dynamic)

        input_data = DataManager(
            [
                (
                    (
                        torch.randn(batch_size, *INPUT_SHAPE).to(torch_device),
                        torch.randn(batch_size, *INPUT_SHAPE).to(torch_device),
                    ),
                    torch.zeros(batch_size, dtype=torch.long),
                )
            ]
        )
    elif output_library == DeepLearningFramework.TENSORFLOW:
        model, model_params = get_tensorflow_model(dynamic)

        # TensorFlow inputs are channels-last, so the shape is reordered.
        input_data = DataManager(
            [
                (
                    (
                        tf.random_normal_initializer()(
                            shape=(
                                batch_size,
                                *INPUT_SHAPE[1:],
                                INPUT_SHAPE[0],
                            )
                        ),
                        tf.random_normal_initializer()(
                            shape=(
                                batch_size,
                                *INPUT_SHAPE[1:],
                                INPUT_SHAPE[0],
                            )
                        ),
                    ),
                    [0 for _ in range(batch_size)],
                )
            ]
        )
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``input_data`` a few lines below.
        raise NotImplementedError(
            f"Unsupported framework for the test model: {output_library}"
        )

    input_data.split(TRAIN_TEST_SPLIT_RATIO)

    input_tfms = MultiStageTransformation([])

    # Benchmark original model
    benchmark_orig_model_op = LatencyOriginalModelMeasure()
    benchmark_res = benchmark_orig_model_op.to(device).execute(
        model=model,
        input_data=input_data.get_split("test"),
        dl_framework=output_library,
    )

    model_outputs = benchmark_res.model_outputs

    if metric is not None:
        metric = compute_relative_difference

    return model, input_data, model_params, input_tfms, model_outputs, metric
def check_model_validity(
    optimized_model: Any,
    input_data: DataManager,
    model_outputs: Any,
    metric_drop_ths: float,
    quantization_type: QuantizationType,
    metric: Callable,
) -> bool:
    """Run ``MetricDropMeasure`` on the test split and return its verdict.

    ``CONSTRAINED_METRIC_DROP_THS`` is used when ``metric_drop_ths`` is None,
    and ``compute_relative_difference`` replaces ``metric`` when no
    quantization is requested. The second element of the measure result is
    printed; the first one is returned.
    """
    test_inputs, labels = input_data.get_split("test").get_list(with_ys=True)

    if metric_drop_ths is None:
        threshold = CONSTRAINED_METRIC_DROP_THS
    else:
        threshold = metric_drop_ths

    if quantization_type is None:
        metric_func = compute_relative_difference
    else:
        metric_func = metric

    measure_op = MetricDropMeasure()
    measure_op.execute(
        optimized_model,
        test_inputs,
        model_outputs,
        threshold,
        metric_func=metric_func,
        ys=labels,
    )
    print(measure_op.get_result()[1])
    return measure_op.get_result()[0]
================================================
FILE: optimization/nebullvm/nebullvm/operations/optimizations/utils.py
================================================
from typing import Callable, List
def map_compilers_and_compressors(ignore_list: List, enum_class: Callable):
    """Convert every entry of ``ignore_list`` with ``enum_class``.

    Returns an empty list when ``ignore_list`` is None, otherwise a new list
    holding ``enum_class(element)`` for each element, in order.
    """
    if ignore_list is None:
        return []
    return [enum_class(item) for item in ignore_list]
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/blade_disc.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: bind ``torch_blade`` to the real package when it can
# be imported, otherwise to the DummyClass placeholder.
try:
    import torch_blade
except ImportError:
    torch_blade = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/deepsparse.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: expose ``compile_model`` and ``cpu`` from deepsparse
# when it can be imported, otherwise bind both to the DummyClass placeholder.
try:
    from deepsparse import compile_model, cpu
except ImportError:
    compile_model = cpu = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/diffusers.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export the diffusers names used by the package.
# If any of these imports fails, every name is bound to DummyClass.
try:
    import diffusers  # noqa F401
    from diffusers import (
        StableDiffusionPipeline,
        DiffusionPipeline,
    )  # noqa F401
    from diffusers.models import (
        AutoencoderKL,
        UNet2DConditionModel,
    )  # noqa F401
    from diffusers.models.unet_2d import UNet2DOutput  # noqa F401
except ImportError:
    diffusers = DummyClass
    StableDiffusionPipeline = DummyClass
    DiffusionPipeline = DummyClass
    UNet2DConditionModel = DummyClass
    AutoencoderKL = DummyClass
    UNet2DOutput = DummyClass

# onnx_graphsurgeon is handled separately, so it can be missing (or
# present) independently of diffusers.
try:
    import onnx_graphsurgeon  # noqa F401
except ImportError:
    onnx_graphsurgeon = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/dummy.py
================================================
class DummyClass:
    """Empty placeholder bound to names whose optional import failed."""
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/huggingface.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export the transformers names used by the package.
# If any of these imports fails, every name is bound to DummyClass.
try:
    from transformers import PreTrainedModel, CLIPTextModel, CLIPTokenizer
    from transformers.tokenization_utils import PreTrainedTokenizer
    from transformers.models.bert.modeling_bert import (
        BertModel,
        BertEmbeddings,
        BertEncoder,
        BertPooler,
        BertPreTrainedModel,
    )
    from transformers import BertConfig, GPT2Tokenizer, GPT2LMHeadModel
except ImportError:
    # add placeholders for function definition
    PreTrainedModel = DummyClass
    CLIPTextModel = DummyClass
    CLIPTokenizer = DummyClass
    PreTrainedTokenizer = DummyClass
    BertModel = DummyClass
    BertEmbeddings = DummyClass
    BertEncoder = DummyClass
    BertPooler = DummyClass
    BertPreTrainedModel = DummyClass
    BertConfig = DummyClass
    GPT2Tokenizer = DummyClass
    GPT2LMHeadModel = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/neural_compressor.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export the neural_compressor names used by the
# package, falling back to placeholders when the import fails.
try:
    import neural_compressor  # noqa F401
    from neural_compressor.adaptor.pytorch import (
        _cfg_to_qconfig as cfg_to_qconfig,
        _cfgs_to_fx_cfgs as cfgs_to_fx_cfgs,
    )
    from neural_compressor.experimental import (
        MixedPrecision,
        Quantization,
        Pruning,
    )
except (ImportError, ValueError):
    # ImportError: package not installed. ValueError: MacOS.
    # Both cases get the same placeholders.
    cfg_to_qconfig = None
    cfgs_to_fx_cfgs = None
    MixedPrecision = DummyClass
    Quantization = DummyClass
    Pruning = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/onnx.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: bind ``onnx`` to the real package when it can be
# imported, otherwise to the DummyClass placeholder.
try:
    import onnx  # noqa F401
except ImportError:
    onnx = DummyClass

# onnxmltools is optional on its own; only the float16 converter gets a
# placeholder when the import fails.
try:
    import onnxmltools  # noqa F401
    from onnxmltools.utils.float16_converter import (  # noqa F401
        convert_float_to_float16_model_path,
    )
except ImportError:
    convert_float_to_float16_model_path = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/onnxruntime.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export onnxruntime and its quantization helpers,
# falling back to placeholders when the import fails.
try:
    import onnxruntime  # noqa F401
    from onnxruntime.quantization import (
        QuantType,
        quantize_static,
        quantize_dynamic,
        CalibrationDataReader,
    )
except ImportError:
    onnxruntime = DummyClass
    # NOTE(review): ``onnxruntime`` is DummyClass itself at this point, so
    # this sets ``SessionOptions`` on the placeholder class shared by every
    # optional module - confirm this is intended.
    setattr(onnxruntime, "SessionOptions", None)
    QuantType = quantize_static = quantize_dynamic = None
    CalibrationDataReader = DummyClass
except FileNotFoundError:
    # Solves a colab issue
    # NOTE(review): ``onnxruntime`` is not rebound in this branch, so it
    # stays defined only if ``import onnxruntime`` above succeeded.
    QuantType = quantize_static = quantize_dynamic = None
    CalibrationDataReader = DummyClass

try:
    # They require torch
    from onnxruntime.transformers import optimizer
    from onnxruntime.transformers.optimizer import MODEL_TYPES
except ImportError:
    MODEL_TYPES = DummyClass
    optimizer = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/onnxsim.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: bind ``onnxsim`` to the real package when it can be
# imported, otherwise to the DummyClass placeholder.
try:
    import onnxsim
except ImportError:
    onnxsim = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/openvino.py
================================================
import logging
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export the OpenVINO runtime and POT names used by
# the package, falling back to placeholders when the import fails.
try:
    from openvino.runtime import Core, Model, CompiledModel, InferRequest
    from openvino.tools.pot import DataLoader
    from openvino.tools.pot import IEEngine
    from openvino.tools.pot import load_model, save_model
    from openvino.tools.pot import compress_model_weights
    from openvino.tools.pot import create_pipeline
except ImportError:
    # Classes get the DummyClass placeholder, functions get None.
    Model = CompiledModel = InferRequest = Core = DummyClass
    DataLoader = IEEngine = DummyClass
    load_model = save_model = compress_model_weights = create_pipeline = None

# Fix openvino issue with logging
# It adds a second handler to the root logger that cause issues
# Only the last handler is removed, and only when the root logger has more
# than one handler.
if len(logging.getLogger().handlers) > 1:
    logging.getLogger().removeHandler(logging.getLogger().handlers[-1])
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/tensor_rt.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export tensorrt and its INT8 entropy calibrator,
# falling back to the DummyClass placeholder when the import fails.
try:
    import tensorrt
    from tensorrt import IInt8EntropyCalibrator2
except ImportError:
    tensorrt = DummyClass
    IInt8EntropyCalibrator2 = DummyClass

# polygraphy is optional on its own. ``polygraphy`` is bound to the
# ``polygraphy.cuda`` submodule, not to the top-level package.
try:
    import polygraphy.cuda as polygraphy
    from polygraphy.logger import G_LOGGER

    # Set the polygraphy logger severity as soon as it is imported.
    G_LOGGER.module_severity = 40
    from polygraphy.backend.onnx.loader import fold_constants
except ImportError:
    polygraphy = DummyClass
    fold_constants = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/tensorflow.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Best effort: lower absl logging verbosity to ERROR. Any failure (absl not
# installed or anything else) is deliberately ignored.
try:
    import absl.logging

    absl.logging.set_verbosity(absl.logging.ERROR)
except Exception:
    pass
# Placeholder exposing a ``Model`` attribute bound to DummyClass; an
# instance is stored as ``Tensorflow.keras``.
class Keras:
    Model = DummyClass
# Placeholder exposing a ``Dataset`` attribute bound to DummyClass; stored
# as ``Tensorflow.data``.
class data:
    Dataset = DummyClass
# Placeholder exposing a ``DType`` attribute bound to DummyClass; stored as
# ``Tensorflow.dtypes``.
class dtypes:
    DType = DummyClass
# Placeholder that the ``tensorflow`` name is bound to when the real
# package cannot be imported. It exposes only the attributes listed here.
class Tensorflow:
    Module = DummyClass
    Tensor = DummyClass
    keras = Keras()
    data = data
    dtypes = dtypes
    float16 = float32 = int32 = int64 = DummyClass

    @staticmethod
    def function(**kwargs):
        # Accepts any keyword arguments and returns an identity callable,
        # so ``@tensorflow.function(...)`` leaves the decorated object
        # untouched.
        return lambda x: x
# Optional dependency: import tensorflow, enable memory growth on every
# visible GPU and quiet its loggers. On failure ``tensorflow`` is bound to
# the ``Tensorflow`` placeholder class.
try:
    import tensorflow  # noqa F401

    for gpu in tensorflow.config.experimental.list_physical_devices("GPU"):
        tensorflow.config.experimental.set_memory_growth(gpu, True)

    tensorflow.get_logger().setLevel("ERROR")
    tensorflow.autograph.set_verbosity(0)
except (ImportError, AttributeError):
    tensorflow = Tensorflow

# tf2onnx is optional on its own; its loggers are set to ERROR when present.
try:
    import tf2onnx  # noqa F401

    for set_verbosity in (
        tf2onnx.logging.set_level,
        tf2onnx.logging.set_tf_verbosity,
    ):
        set_verbosity("ERROR")
except ImportError:
    tf2onnx = object
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export the torch names used by the package. When
# the import fails, minimal placeholders with the same names are defined.
try:
    import torch  # noqa F401
    from torch.nn import Module  # noqa F401
    from torch.jit import ScriptModule  # noqa F401
    from torch.fx import GraphModule
    from torch.utils.data import DataLoader, Dataset  # noqa F401
    from torch.quantization.quantize_fx import (  # noqa F401
        prepare_fx,
        convert_fx,
    )
    from torch.ao.quantization.stubs import QuantStub, DeQuantStub
    from torch.fx import symbolic_trace
    from torch.quantization import default_dynamic_qconfig
    import torch.distributed as torch_distributed
except ImportError:

    # Stand-ins for the ``torch.nn``, ``torch.jit`` and ``torch.fx``
    # namespaces, each exposing a single placeholder attribute.
    class nn:
        Module = DummyClass

    class jit:
        ScriptModule = DummyClass

    class fx:
        GraphModule = DummyClass

    # Stand-in for the ``torch`` module itself.
    class torch:
        float = half = int8 = DummyClass
        float16 = float32 = int32 = int64 = DummyClass
        Tensor = DummyClass
        dtype = DummyClass
        nn = nn
        jit = jit
        Generator = DummyClass
        FloatTensor = DummyClass
        fx = fx

        # NOTE(review): both helpers return a callable that yields None, so
        # anything decorated with them is replaced by None - confirm that
        # this is intended.
        @staticmethod
        def no_grad():
            return lambda x: None

        @staticmethod
        def inference_mode():
            return lambda x: None

    # Module-level fallbacks for the names imported in the ``try`` branch:
    # classes get DummyClass, functions and modules get None.
    Dataset = DummyClass
    Module = DummyClass
    ScriptModule = DummyClass
    GraphModule = DummyClass
    DataLoader = DummyClass
    symbolic_trace = None
    QuantStub = DeQuantStub = DummyClass
    default_dynamic_qconfig = prepare_fx = convert_fx = None
    Generator = DummyClass
    FloatTensor = DummyClass
    torch_distributed = None
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch_neuron.py
================================================
import logging
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: try torch_neuron first, then torch_neuronx. The
# "Neuron" logger is set to WARNING as soon as either one imports.
try:
    import torch_neuron  # noqa F401

    logging.getLogger("Neuron").setLevel(logging.WARNING)
except ImportError:
    try:
        # NOTE(review): when this import succeeds, the name ``torch_neuron``
        # is left unbound by this block - verify against the importers.
        import torch_neuronx  # noqa F401

        logging.getLogger("Neuron").setLevel(logging.WARNING)
    except ImportError:
        torch_neuron = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch_tensorrt.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export torch_tensorrt and its DataLoaderCalibrator.
# On failure the module name gets DummyClass and the calibrator gets None.
try:
    import torch_tensorrt
    from torch_tensorrt.ptq import DataLoaderCalibrator  # noqa F401
except ImportError:
    torch_tensorrt = DummyClass
    DataLoaderCalibrator = None
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/torch_xla.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export torch_xla and its ``xla_model`` submodule
# (as ``xm``), falling back to the DummyClass placeholder for both.
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
except ImportError:
    torch_xla = DummyClass
    xm = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/tvm.py
================================================
from nebullvm.optional_modules.dummy import DummyClass
# Optional dependency: re-export the Apache TVM names used by the package.
try:
    import tvm
    from tvm import IRModule
    from tvm.runtime.ndarray import NDArray
    from tvm.autotvm.tuner import XGBTuner
    from tvm import autotvm
    import tvm.relay as relay
    from tvm.relay.transform import ToMixedPrecision
    from tvm.contrib.graph_executor import GraphModule
    from tvm.runtime import Module
    from tvm.relay.backend.executor_factory import ExecutorFactoryModule
except ImportError:
    # Every exported name falls back to the same placeholder class.
    tvm = DummyClass
    IRModule = DummyClass
    NDArray = DummyClass
    XGBTuner = DummyClass
    ExecutorFactoryModule = DummyClass
    autotvm = DummyClass
    relay = DummyClass
    ToMixedPrecision = DummyClass
    GraphModule = DummyClass
    Module = DummyClass
================================================
FILE: optimization/nebullvm/nebullvm/optional_modules/utils.py
================================================
import cpuinfo
from loguru import logger
from nebullvm.core.models import Device, DeviceType
from nebullvm.operations.optimizations.compilers.utils import (
bladedisc_is_available,
deepsparse_is_available,
faster_transformer_is_available,
intel_neural_compressor_is_available,
onnxruntime_is_available,
openvino_is_available,
tensorrt_is_available,
torch_tensorrt_is_available,
torch_neuron_is_available,
torch_xla_is_available,
tvm_is_available,
)
from nebullvm.tools.utils import gpu_is_available, check_module_version
def torch_is_available() -> bool:
    """Tell whether a supported torch (>= 1.10.0) can be imported.

    A warning is logged when a GPU is detected but torch has no CUDA
    support; that situation alone does not make the function return False.
    """
    try:
        import torch  # noqa F401

        cuda_unusable = not torch.cuda.is_available() and gpu_is_available()
        if cuda_unusable:
            logger.warning(
                "Installed PyTorch does not have cuda support. "
                "Please ensure that torch.cuda.is_available() "
                "returns True by installing the proper version "
                "of PyTorch. "
            )

        version_ok = check_module_version(torch, min_version="1.10.0")
        if not version_ok:
            logger.warning(
                "torch module version must be >= 1.10.0. "
                "Please update it if you want to use it."
            )
            return False
    except ImportError:
        return False
    return True
def tensorflow_is_available() -> bool:
    """Tell whether a supported tensorflow (>= 2.7.0) can be imported."""
    try:
        import tensorflow  # noqa F401

        version_ok = check_module_version(tensorflow, min_version="2.7.0")
        if not version_ok:
            logger.warning(
                "tensorflow module version must be >= 2.7.0. "
                "Please update it if you want to use it."
            )
            return False
    except ImportError:
        return False
    return True
def onnx_is_available() -> bool:
    """Tell whether a supported onnx (>= 1.10.0) can be imported."""
    try:
        import onnx  # noqa F401

        version_ok = check_module_version(onnx, min_version="1.10.0")
        if version_ok:
            return True
        logger.warning(
            "onnx module version must be >= 1.10.0. "
            "Please update it if you want to use it."
        )
        return False
    except ImportError:
        return False
def _onnxmltools_is_available():
    """Tell whether a supported onnxmltools (>= 1.11.0) can be imported."""
    try:
        import onnxmltools  # noqa F401

        version_ok = check_module_version(onnxmltools, min_version="1.11.0")
        if version_ok:
            return True
        logger.warning(
            "onnxmltools module version must be >= 1.11.0. "
            "Please update it if you want to use the ONNX API "
            "or the ONNX pipeline for PyTorch and Tensorflow."
        )
        return False
    except ImportError:
        return False
def _onnxsim_is_available():
try:
import onnxsim # noqa F401
return True
except ImportError:
return False
def _polygraphy_is_available():
    """Return True when the ``polygraphy.cuda`` module can be imported."""
    try:
        import polygraphy.cuda  # noqa F401
    except ImportError:
        importable = False
    else:
        importable = True
    return importable
def tf2onnx_is_available():
    """Return True when the ``tf2onnx`` package can be imported."""
    try:
        import tf2onnx  # noqa F401
    except ImportError:
        importable = False
    else:
        importable = True
    return importable
def check_dependencies(device: Device):
    """Log warnings about frameworks/compilers missing for ``device``.

    The function inspects which frameworks, suggested compilers and helper
    dependencies can be imported for the given device type and emits one
    warning per non-empty category. Missing *optional* compilers are
    collected but, as before, not reported.

    Args:
        device (Device): Target device; only ``device.type`` is read.
    """
    missing_frameworks = []
    missing_suggested_compilers = []
    missing_optional_compilers = []
    missing_dependencies = []

    # Not every platform reports "brand_raw"; fall back to an empty string
    # so that the Intel-specific suggestions are simply skipped.
    processor = cpuinfo.get_cpu_info().get("brand_raw", "").lower()
    is_intel = "intel" in processor

    if device.type is DeviceType.TPU:
        if not torch_is_available():
            missing_frameworks.append("torch")
        if not torch_xla_is_available():
            missing_dependencies.append("torch_xla")
    elif device.type is DeviceType.NEURON:
        if not torch_is_available():
            missing_frameworks.append("torch")
        if not torch_neuron_is_available():
            missing_dependencies.append("torch_neuron")
    else:
        if not onnx_is_available():
            missing_frameworks.append("onnx")
        if not tvm_is_available():
            missing_optional_compilers.append("tvm")
        if not onnxruntime_is_available():
            missing_suggested_compilers.append("onnxruntime")
        elif not _onnxmltools_is_available():
            missing_dependencies.append("onnxmltools")
        if not faster_transformer_is_available():
            missing_optional_compilers.append("faster_transformer")

        if device.type is DeviceType.GPU:
            if not tensorrt_is_available():
                missing_suggested_compilers.append("tensorrt")
            elif not _onnxsim_is_available():
                missing_dependencies.append("onnxsim")
            elif not _polygraphy_is_available():
                missing_dependencies.append("polygraphy")
        if device.type is DeviceType.CPU:
            if not openvino_is_available() and is_intel:
                missing_suggested_compilers.append("openvino")

        if torch_is_available():
            # "tvm" has already been recorded above when it is missing, so
            # it does not need to be checked a second time here.
            if not bladedisc_is_available():
                missing_optional_compilers.append("torch_blade")

            if device.type is DeviceType.CPU:
                if not deepsparse_is_available() and is_intel:
                    missing_suggested_compilers.append("deepsparse")
                if not intel_neural_compressor_is_available() and is_intel:
                    missing_suggested_compilers.append("neural_compressor")
            elif device.type is DeviceType.GPU:
                # The availability helper must be *called*: the bare
                # function object is always truthy, which used to hide a
                # missing torch_tensorrt installation.
                if not torch_tensorrt_is_available():
                    missing_suggested_compilers.append("torch_tensorrt")
        else:
            missing_frameworks.append("torch")

        if tensorflow_is_available():
            if not tf2onnx_is_available():
                missing_dependencies.append("tf2onnx")
        else:
            missing_frameworks.append("tensorflow")

    missing_frameworks = ", ".join(missing_frameworks)
    if len(missing_frameworks) > 0:
        logger.warning(
            f"Missing Frameworks: {missing_frameworks}.\n "
            f"Please install them "
            "to include them in the optimization pipeline."
        )

    missing_suggested_compilers = ", ".join(missing_suggested_compilers)
    if len(missing_suggested_compilers) > 0:
        logger.warning(
            f"Missing Compilers: {missing_suggested_compilers}.\n "
            f"Please install them "
            "to include them in the optimization pipeline."
        )

    missing_dependencies = ", ".join(missing_dependencies)
    if len(missing_dependencies) > 0:
        logger.warning(
            f"Missing Dependencies: {missing_dependencies}.\n "
            f"Without them, some compilers "
            f"may not work properly."
        )
================================================
FILE: optimization/nebullvm/nebullvm/tools/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/tools/adapters.py
================================================
import abc
import copy
from abc import abstractmethod
import time
from typing import List, Any, Union
from loguru import logger
from nebullvm.core.models import (
Device,
DeviceType,
OptimizedModel,
OriginalModel,
)
from nebullvm.operations.conversions.huggingface import convert_hf_model
from nebullvm.operations.inference_learners.base import (
BaseInferenceLearner,
)
from nebullvm.operations.inference_learners.huggingface import (
DiffusionInferenceLearner,
)
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.diffusers import (
get_unet_inputs,
preprocess_diffusers,
postprocess_diffusers,
)
from nebullvm.tools.pytorch import get_torch_model_size
from nebullvm.tools.utils import (
is_huggingface_data,
check_module_version,
get_throughput,
)
class ModelAdapter(abc.ABC):
    """Interface for objects adapting a user model and its data.

    Concrete adapters expose the adapted model/data through two properties
    and post-process the original and optimized model descriptors.
    """

    @property
    @abstractmethod
    def adapted_model(self):
        """Model in the form expected by the optimization pipeline."""

    @property
    @abstractmethod
    def adapted_data(self):
        """Input data in the form expected by the optimization pipeline."""

    @abstractmethod
    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Post-process the optimized model descriptor.

        Args:
            optimized_model (OptimizedModel): Descriptor produced by the
                optimization.

        Returns:
            OptimizedModel: The adapted descriptor. The annotation matches
                what the concrete adapters in this module declare and
                return.
        """

    @abstractmethod
    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Post-process the original model descriptor and return it."""
class DiffusionAdapter(ModelAdapter):
    """Adapter exposing the UNet of a diffusion pipeline for optimization.

    The pipeline received at construction time is deep-copied, so the
    caller's object is never modified. The adapted model/data are built
    lazily the first time one of the two properties is read.
    """

    def __init__(
        self,
        original_pipeline: StableDiffusionPipeline,
        data: List,
        device: Device,
    ):
        """Store a private copy of the pipeline together with data/device.

        Args:
            original_pipeline (StableDiffusionPipeline): Pipeline to adapt.
            data (List): Items passed one at a time as ``prompt`` when
                building the UNet inputs and to the pipeline when
                benchmarking.
            device (Device): Device the pipeline is moved to.
        """
        self.original_pipeline = copy.deepcopy(original_pipeline)
        self.original_data = data
        self.device = device
        self.__adapted = False
        self.__df_model = None
        self.__df_data = None

    @staticmethod
    def __extra_modules_size_mb(pipe) -> float:
        """Sum the sizes of every torch module of ``pipe`` except the UNet.

        The value returned by ``get_torch_model_size`` is divided by 1e6.
        """
        sizes = [
            get_torch_model_size(module)
            for name, module in pipe.__dict__.items()
            if isinstance(module, torch.nn.Module) and name != "unet"
        ]
        return sum(sizes) / 1e6

    @torch.no_grad()
    def __benchmark_pipeline(
        self,
        pipe: Union[StableDiffusionPipeline, BaseInferenceLearner],
        num_warmup_steps=2,
        num_steps=3,
    ):
        """Return the mean time in seconds of one pipeline call.

        Args:
            pipe: Callable pipeline whose result exposes ``images``.
            num_warmup_steps (int): Untimed calls executed first.
            num_steps (int): Timed calls the mean is computed on.
        """
        n_samples = len(self.original_data)

        # Warmup
        for i in range(num_warmup_steps):
            _ = pipe(self.original_data[i % n_samples]).images[0]

        # Benchmark: perf_counter is monotonic and has the highest
        # available resolution, unlike the wall clock.
        start = time.perf_counter()
        for i in range(num_steps):
            _ = pipe(self.original_data[i % n_samples]).images[0]
        took = time.perf_counter() - start

        return took / num_steps

    def __adapt(self):
        """Build the wrapped UNet and its input samples.

        Raises:
            ValueError: If the installed PyTorch does not satisfy the
                maximum version constraint.
        """
        if not check_module_version(torch, max_version="1.13.1+cu117"):
            raise ValueError(
                "Diffusion models are only supported in PyTorch "
                "versions <= 1.13.1. Please downgrade your PyTorch "
                "version and try again."
            )

        model = copy.deepcopy(self.original_pipeline)
        model.get_unet_inputs = get_unet_inputs
        model.to(self.device.to_torch_format())

        # One (inputs, label) sample per prompt: None entries are dropped
        # and 0-d tensors are reshaped to shape (1,); the label is None.
        self.__df_data = [
            (
                tuple(
                    d.reshape((1,)) if d.shape == torch.Size([]) else d
                    for d in model.get_unet_inputs(
                        model,
                        prompt=prompt,
                    )
                    if d is not None
                ),
                None,
            )
            for prompt in self.original_data
        ]
        self.__df_model = preprocess_diffusers(model)
        self.__adapted = True

    @property
    def adapted_model(self):
        """Wrapped UNet, built on first access."""
        if self.__adapted is False:
            self.__adapt()
        return self.__df_model

    @property
    def adapted_data(self):
        """UNet input samples, built on first access."""
        if self.__adapted is False:
            self.__adapt()
        return self.__df_data

    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Plug the optimized UNet into a pipeline copy and benchmark it.

        ``latency_seconds``, ``throughput``, ``inference_learner`` and
        ``size_mb`` of ``optimized_model`` are updated in place.

        Returns:
            OptimizedModel: The same object received as input.
        """
        pipe = copy.deepcopy(self.original_pipeline)
        pipe.to(self.device.to_torch_format())
        if self.device.type is DeviceType.GPU:
            try:
                pipe.enable_xformers_memory_efficient_attention()
            except Exception as exc:
                # Best effort only: keep going, but leave a trace of the
                # reason instead of discarding it silently.
                logger.debug(
                    "xformers memory efficient attention "
                    f"could not be enabled: {exc}"
                )

        pipe = postprocess_diffusers(
            optimized_model.inference_learner,
            pipe,
            self.device,
        )
        logger.info("Benchmarking optimized pipeline...")
        optimized_model.latency_seconds = self.__benchmark_pipeline(pipe)
        optimized_model.throughput = get_throughput(
            optimized_model.latency_seconds
        )
        optimized_model.inference_learner = DiffusionInferenceLearner(pipe)
        optimized_model.size_mb += self.__extra_modules_size_mb(pipe)
        return optimized_model

    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Benchmark a copy of the original pipeline.

        ``latency_seconds``, ``throughput`` and ``size_mb`` of
        ``original_model`` are updated in place.

        Returns:
            OriginalModel: The same object received as input.
        """
        pipe = copy.deepcopy(self.original_pipeline)
        pipe.to(self.device.to_torch_format())
        logger.info("Benchmarking original pipeline...")
        original_model.latency_seconds = self.__benchmark_pipeline(pipe)
        original_model.throughput = get_throughput(
            original_model.latency_seconds
        )
        original_model.size_mb += self.__extra_modules_size_mb(pipe)
        return original_model
class HuggingFaceAdapter(ModelAdapter):
    """Adapter converting a HuggingFace model and its data.

    The conversion is performed lazily, the first time ``adapted_model``
    or ``adapted_data`` is read.
    """

    def __init__(self, model: Any, data: List, device: Device, **kwargs):
        """Store the inputs; ``kwargs`` are forwarded to the conversion."""
        self.original_model = model
        self.original_data = data
        self.device = device
        self.tokenizer_params = kwargs
        self.__adapted = False
        self.__hf_model = None
        self.__hf_data = None
        self.__hf_input_names = None
        self.__hf_output_type = None
        self.__hf_output_structure = None

    def __adapt_model(self):
        """Run ``convert_hf_model`` and cache everything it returns.

        Raises:
            ValueError: If the first data sample is not HuggingFace data.
        """
        first_sample = self.original_data[0]
        if not is_huggingface_data(first_sample):
            raise ValueError("Cannot convert non-HuggingFace data")

        converted = convert_hf_model(
            self.original_model,
            self.original_data,
            self.device,
            **self.tokenizer_params,
        )
        # Order of the returned values: model, data, input names,
        # output structure, output type.
        (
            self.__hf_model,
            self.__hf_data,
            self.__hf_input_names,
            self.__hf_output_structure,
            self.__hf_output_type,
        ) = converted
        self.__adapted = True

    @property
    def adapted_model(self):
        """Converted model, computed on first access."""
        if self.__adapted is False:
            self.__adapt_model()
        return self.__hf_model

    @property
    def adapted_data(self):
        """Converted data, computed on first access."""
        if self.__adapted is False:
            self.__adapt_model()
        return self.__hf_data

    def adapt_inference_learner(
        self, optimized_model: OptimizedModel
    ) -> OptimizedModel:
        """Wrap the optimized learner in a ``HuggingFaceInferenceLearner``.

        The wrapper receives the input names and output structure/type
        cached by the conversion. ``optimized_model`` is modified in place
        and returned.
        """
        from nebullvm.operations.inference_learners.huggingface import (
            HuggingFaceInferenceLearner,
        )

        wrapped_learner = HuggingFaceInferenceLearner(
            core_inference_learner=optimized_model.inference_learner,
            output_structure=self.__hf_output_structure,
            input_names=self.__hf_input_names,
            output_type=self.__hf_output_type,
        )
        optimized_model.inference_learner = wrapped_learner
        return optimized_model

    def adapt_original_model(
        self, original_model: OriginalModel
    ) -> OriginalModel:
        """Return ``original_model`` untouched."""
        return original_model
================================================
FILE: optimization/nebullvm/nebullvm/tools/benchmark.py
================================================
import time
from abc import abstractmethod, ABC
from typing import Any, Dict, Type
import numpy as np
from loguru import logger
from tqdm import tqdm
from nebullvm.core.models import DeepLearningFramework, ModelParams, DeviceType
from nebullvm.operations.inference_learners.base import BaseInferenceLearner
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, DataLoader
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import create_model_inputs_onnx
from nebullvm.tools.pytorch import create_model_inputs_torch
from nebullvm.tools.tf import create_model_inputs_tf
from nebullvm.tools.utils import (
check_input_data,
extract_info_from_data,
is_data_subscriptable,
check_device,
)
def _get_dl_framework(model: Any):
    """Infer the deep learning framework a model belongs to.

    The decision is based on the model type or, failing that, on the
    prefix of ``str(model)``.

    Args:
        model (Any): Model object, inference learner or string.

    Returns:
        DeepLearningFramework: The detected framework.

    Raises:
        TypeError: If no framework can be associated with the model.
    """
    if isinstance(model, torch.nn.Module):
        return DeepLearningFramework.PYTORCH
    if str(model).startswith(("Pytorch", "Torch")):
        return DeepLearningFramework.PYTORCH

    # The explicit None guard is kept on purpose, see the original check.
    is_tf_module = isinstance(model, tf.Module) and model is not None
    if is_tf_module or str(model).startswith("Tensorflow"):
        return DeepLearningFramework.TENSORFLOW

    if isinstance(model, str) or str(model).startswith("Numpy"):
        return DeepLearningFramework.NUMPY

    raise TypeError(f"Model type {type(model)} not supported.")
def _create_model_inputs(
    dl_framework: DeepLearningFramework, model_params: ModelParams
):
    """Create input data for the given framework.

    Args:
        dl_framework (DeepLearningFramework): Framework the inputs are
            created for.
        model_params (ModelParams): Its ``input_infos`` are handed to the
            framework-specific factory.

    Returns:
        Whatever the framework-specific factory returns.

    Raises:
        TypeError: If the framework is not one of the supported ones.
    """
    input_infos_of = model_params
    if dl_framework == DeepLearningFramework.PYTORCH:
        return create_model_inputs_torch(input_infos_of.input_infos)
    if dl_framework == DeepLearningFramework.TENSORFLOW:
        return create_model_inputs_tf(input_infos_of.input_infos)
    if dl_framework == DeepLearningFramework.NUMPY:
        return create_model_inputs_onnx(input_infos_of.input_infos)
    raise TypeError(f"Unknown framework {dl_framework}")
class BaseBenchmark(ABC):
    """Base class of the framework-specific benchmarks.

    Attributes:
        model: Object being benchmarked.
        input_tensors: Inputs used for the benchmark.
        device: Device the benchmark is executed on.
        n_warmup: Number of warm up iterations.
        n_runs: Number of measured iterations.
    """

    def __init__(self, model, input_tensors, device, n_warmup=50, n_runs=1000):
        self.model, self.input_tensors = model, input_tensors
        self.device = device
        self.n_warmup, self.n_runs = n_warmup, n_runs

    @abstractmethod
    def benchmark(self):
        """Run the benchmark; concrete subclasses must override this."""
        raise NotImplementedError
class PytorchBenchmark(BaseBenchmark):
    """Benchmark for PyTorch models and PyTorch inference learners."""

    def benchmark(self):
        """Measure average throughput and latency of the model.

        The inputs are moved to the target device and cycled through for
        ``n_warmup`` untimed and ``n_runs`` timed calls; results are also
        printed.

        Returns:
            Tuple: ``(throughput, latency)`` where throughput is
                ``batch_size / mean_time`` and latency is
                ``mean_time / batch_size``.
        """
        torch_device = self.device.to_torch_format()
        on_gpu = self.device.type is DeviceType.GPU

        input_tensors = [
            [tensor.to(torch_device) for tensor in tensors]
            for tensors in self.input_tensors
        ]
        # The first dimension of the first input is used as batch size.
        batch_size = input_tensors[0][0].shape[0]
        n_inputs = len(input_tensors)

        if isinstance(self.model, torch.nn.Module):
            self.model.to(torch_device).eval()

        with torch.no_grad():
            for i in tqdm(
                range(self.n_warmup),
                desc=f"Performing warm up on {self.n_warmup} iterations",
            ):
                self.model(*input_tensors[i % min(self.n_warmup, n_inputs)])
        if on_gpu:
            torch.cuda.synchronize()

        timings = []
        with torch.no_grad():
            for i in tqdm(
                range(1, self.n_runs + 1),
                desc=f"Performing benchmark on {self.n_runs} iterations",
            ):
                # perf_counter is monotonic and high resolution, so the
                # measure is not affected by system clock adjustments.
                start_time = time.perf_counter()
                self.model(*input_tensors[i % min(self.n_runs, n_inputs)])
                if on_gpu:
                    # Wait for the queued CUDA work before stopping the
                    # clock.
                    torch.cuda.synchronize()
                timings.append(time.perf_counter() - start_time)

        mean_time = np.mean(timings)
        print(f"Batch size: {batch_size}")
        throughput = batch_size / mean_time
        latency = mean_time / batch_size
        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)
        return throughput, latency
class TensorflowBenchmark(BaseBenchmark):
    """Benchmark for TensorFlow models and TensorFlow inference learners."""

    def benchmark(self):
        """Measure average throughput and latency of the model.

        The inputs are cycled through for ``n_warmup`` untimed and
        ``n_runs`` timed calls, each executed inside the ``tf.device``
        context of the target device; results are also printed.

        Returns:
            Tuple: ``(throughput, latency)`` where throughput is
                ``batch_size / mean_time`` and latency is
                ``mean_time / batch_size``.
        """
        # The first dimension of the first input is used as batch size.
        batch_size = self.input_tensors[0][0].shape[0]
        n_inputs = len(self.input_tensors)

        for i in tqdm(
            range(self.n_warmup),
            desc=f"Performing warm up on {self.n_warmup} iterations",
        ):
            with tf.device(self.device.to_tf_format()):
                self.model(
                    *self.input_tensors[i % min(self.n_warmup, n_inputs)]
                )

        timings = []
        for i in tqdm(
            range(1, self.n_runs + 1),
            desc=f"Performing benchmark on {self.n_runs} iterations",
        ):
            # perf_counter is monotonic and high resolution, so the
            # measure is not affected by system clock adjustments.
            start_time = time.perf_counter()
            with tf.device(self.device.to_tf_format()):
                self.model(
                    *self.input_tensors[i % min(self.n_runs, n_inputs)]
                )
            timings.append(time.perf_counter() - start_time)

        mean_time = np.mean(timings)
        print(f"Batch size: {batch_size}")
        throughput = batch_size / mean_time
        latency = mean_time / batch_size
        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)
        return throughput, latency
class NumpyBenchmark(BaseBenchmark):
    """Benchmark for inference learners working with numpy inputs."""

    def benchmark(self):
        """Measure average throughput and latency of the model.

        The inputs are cycled through for ``n_warmup`` untimed and
        ``n_runs`` timed calls; results are also printed.

        Returns:
            Tuple: ``(throughput, latency)`` where throughput is
                ``batch_size / mean_time`` and latency is
                ``mean_time / batch_size``.

        Raises:
            NotImplementedError: If the model is not a
                ``BaseInferenceLearner``.
        """
        if not isinstance(self.model, BaseInferenceLearner):
            # TODO: Add support for original onnx models
            raise NotImplementedError(
                "Benchmark function doesn't support original onnx models."
            )

        # The first dimension of the first input is used as batch size.
        batch_size = self.input_tensors[0][0].shape[0]
        n_inputs = len(self.input_tensors)

        for i in tqdm(
            range(self.n_warmup),
            desc=f"Performing warm up on {self.n_warmup} iterations",
        ):
            self.model(*self.input_tensors[i % min(self.n_warmup, n_inputs)])

        timings = []
        for i in tqdm(
            range(1, self.n_runs + 1),
            desc=f"Performing benchmark on {self.n_runs} iterations",
        ):
            # perf_counter is monotonic and high resolution, so the
            # measure is not affected by system clock adjustments.
            start_time = time.perf_counter()
            self.model(*self.input_tensors[i % min(self.n_runs, n_inputs)])
            timings.append(time.perf_counter() - start_time)

        mean_time = np.mean(timings)
        print(f"Batch size: {batch_size}")
        throughput = batch_size / mean_time
        latency = mean_time / batch_size
        print("Average Throughput: %.2f data/second" % throughput)
        print("Average Latency: %.4f seconds/data" % latency)
        return throughput, latency
def benchmark(
    model, input_data, device=None, random=False, n_warmup=50, n_runs=1000
):
    """Benchmark the input model regardless of the framework used to
    implement it.

    Args:
        model (Any): The input model.
        input_data (Iterable or Sequence): Input data to be used for
            benchmarking the model. PyTorch, TensorFlow
            and Onnx respectively accept input tensor in `torch.Tensor`,
            `tf.Tensor` and `np.ndarray` formats. Note that each input
            sample must be a tuple containing a tuple as first element, the
            `inputs`, and the `label` as second element. The `inputs` needs
            to be passed as tuple even if a single input is needed by the
            model (in this case the `inputs` tuple will contain just an
            element).
        device (str): Device to be used for running the benchmark. It is
            resolved with `check_device`, except when `model` is an
            inference learner: in that case the device of the learner is
            used and this argument is ignored. Default: None.
        random (bool, optional): If set to true, the data used to benchmark
            the model will be computed randomly given the info extracted
            from the provided input_data.
        n_warmup (int, optional): Number of warmup iterations.
        n_runs (int, optional): Number of iterations performed to benchmark
            the model.

    Returns:
        Tuple: The `(throughput, latency)` pair computed by the
            framework-specific benchmark.

    Raises:
        ValueError: If `input_data` does not match any supported format.
        TypeError: If the framework of the model cannot be detected.
    """
    if isinstance(model, BaseInferenceLearner):
        # An inference learner is already bound to a device.
        device = model.device
    else:
        device = check_device(device)

    logger.info(f"Running benchmark on {device.type.name}")

    dl_framework = _get_dl_framework(model)

    if isinstance(input_data, (DataLoader, tf.data.Dataset)):
        try:
            input_data = DataManager.from_dataloader(input_data)
        except Exception as exc:
            raise ValueError(
                "The provided dataloader does not match the expected "
                "format.\n"
                "Speedster supports dataloaders that return tuples in "
                "the\n"
                "following formats: \n"
                "Single input: (input, label)\n"
                "Multiple inputs: ((input1, input2, ...), label) or "
                "(input1, input2, ..., label)\n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            ) from exc

    if not isinstance(input_data, DataManager):
        if not check_input_data(input_data):
            raise ValueError(
                "The provided data does not match the expected "
                "format.\n"
                "Speedster supports data in the following formats: \n"
                "- PyTorch DataLoader\n"
                "- TensorFlow Dataset\n"
                "- List of tuples: [((input_0, ... ), label), ...] \n"
                "Inputs and labels should be either tensors or numpy "
                "arrays,\n"
                "depending on the framework used.\n"
            )
        if is_data_subscriptable(input_data):
            input_data = DataManager(input_data)
        else:
            input_data = DataManager.from_iterable(input_data)

    if random:
        model_params = extract_info_from_data(
            model, input_data, dl_framework, None, device
        )
        input_data = _create_model_inputs(dl_framework, model_params)
    else:
        input_data = input_data.get_list()

    # Hand the measurements back to the caller instead of discarding them.
    return BENCHMARK_FUNCTIONS[dl_framework](
        model=model,
        input_tensors=input_data,
        device=device,
        n_warmup=n_warmup,
        n_runs=n_runs,
    ).benchmark()
# Registry mapping each supported framework to the benchmark class that
# the `benchmark` function instantiates for it.
BENCHMARK_FUNCTIONS: Dict[DeepLearningFramework, Type[BaseBenchmark]] = {
    DeepLearningFramework.PYTORCH: PytorchBenchmark,
    DeepLearningFramework.TENSORFLOW: TensorflowBenchmark,
    DeepLearningFramework.NUMPY: NumpyBenchmark,
}
================================================
FILE: optimization/nebullvm/nebullvm/tools/data.py
================================================
from typing import Sequence, List, Tuple, Any, Union, Iterable
import numpy as np
from loguru import logger
from nebullvm.config import MIN_DIM_INPUT_DATA
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, Dataset, DataLoader
from nebullvm.tools.onnx import convert_to_numpy
class DataManager:
    """Class for managing the user data in nebullvm.

    Attributes:
        train_idxs: Indexes of the samples of the train split; empty until
            `split` is called.
        test_idxs: Indexes of the samples of the test split; empty until
            `split` is called.

    Args:
        data_reader (Sequence): Object implementing the __getitem__ and
            the __len__ APIs. It should read the user data and return
            tuples of tensors for feeding the models.
    """

    def __init__(self, data_reader: Sequence):
        self._data_reader = data_reader
        self._pointer = 0
        self.train_idxs = []
        self.test_idxs = []

    def __getitem__(self, item):
        return self._data_reader[item]

    def __len__(self):
        return len(self._data_reader)

    def __iter__(self):
        # The manager is its own iterator: iterating restarts the pointer.
        self._pointer = 0
        return self

    def __next__(self):
        if self._pointer < len(self):
            data = self[self._pointer]
            self._pointer += 1
            return data
        else:
            raise StopIteration

    def get_numpy_list(
        self, n: int = None, shuffle: bool = False, with_ys: bool = False
    ) -> Union[
        List[Tuple[np.ndarray, ...]], Tuple[List[Tuple[np.ndarray, ...]], List]
    ]:
        """Same as `get_list`, with every input converted to numpy.

        Args:
            n (int, optional): Number of samples; defaults to all of them.
            shuffle (bool, optional): Whether samples are picked randomly.
            with_ys (bool, optional): Whether labels are returned as well.

        Returns:
            The list of input tuples or, if `with_ys` is True, a tuple
            `(inputs, labels)`. Labels are returned as they are.
        """
        if n is None:
            n = len(self)
        if not with_ys:
            return [
                tuple(convert_to_numpy(x) for x in tuple_)
                for tuple_ in self.get_list(n, shuffle)
            ]
        xs, ys = self.get_list(n, shuffle, with_ys=True)
        return [
            tuple(convert_to_numpy(x) for x in tuple_) for tuple_ in xs
        ], ys

    def get_list(
        self, n: int = None, shuffle: bool = False, with_ys: bool = False
    ) -> Union[List[Tuple[Any, ...]], Tuple[List[Tuple[Any, ...]], List]]:
        """Return `n` samples as a list.

        Args:
            n (int, optional): Number of samples; defaults to all of them.
                When it exceeds the available samples, the missing ones
                are drawn randomly with replacement.
            shuffle (bool, optional): Whether samples are picked randomly.
            with_ys (bool, optional): Whether labels are returned as well.

        Returns:
            The list of inputs (first element of each sample) or, if
            `with_ys` is True, a tuple `(inputs, labels)` where the label
            of a single-element sample is None.
        """
        if n is None:
            n = len(self)
        if shuffle:
            idx = np.random.choice(len(self), n, replace=n > len(self))
        else:
            idx = np.arange(0, min(n, len(self)))
            if n > len(self):
                # NOTE: this resets the global numpy seed, so the extra
                # samples picked here are always the same.
                np.random.seed(0)
                idx = np.concatenate(
                    [
                        idx,
                        np.random.choice(
                            len(self), n - len(self), replace=True
                        ),
                    ]
                )

        if not with_ys:
            return [self[i][0] for i in idx]

        ys, xs = [], []
        for i in idx:
            x, y = self[i] if len(self[i]) > 1 else (self[i][0], None)
            xs.append(x)
            ys.append(y)
        return xs, ys

    @classmethod
    def from_iterable(cls, iterable: Iterable, max_length: int = 500):
        """Build a manager from the first `max_length` items of an
        iterable. The iterable is consumed entirely."""
        return cls([x for i, x in enumerate(iterable) if i < max_length])

    @classmethod
    def from_dataloader(
        cls,
        dataloader: Union[DataLoader, tf.data.Dataset],
        max_length: int = 500,
    ):
        """Build a manager from a PyTorch DataLoader or a TF Dataset.

        Batches are read until `i * batch_size >= max_length` and each one
        is stored as an `(inputs_tuple, label)` sample.

        Args:
            dataloader: Source of the batches.
            max_length (int, optional): Bound on the amount of data read.

        Raises:
            ValueError: If the batch size exceeds `max_length` or a batch
                has an unsupported structure.
        """
        batch_size = (
            dataloader.batch_size
            if isinstance(dataloader, DataLoader)
            else dataloader._batch_size
        )

        if batch_size > max_length:
            # Use the local value: a tf.data.Dataset has no `batch_size`
            # attribute, so reading it here would raise AttributeError
            # instead of the intended ValueError.
            raise ValueError(
                f"Batch size ({batch_size}) is greater than "
                f"max_length ({max_length})."
            )

        data_manager = []
        warning_label = False
        for i, batch in enumerate(dataloader):
            if i * batch_size >= max_length:
                break

            if isinstance(batch, (list, tuple)):
                if len(batch) == 1:
                    data_manager.append((batch, None))
                elif len(batch) == 2:
                    if isinstance(batch[0], tuple):
                        data_manager.append((batch[0], batch[1]))
                    elif isinstance(batch[0], (torch.Tensor, tf.Tensor)):
                        warning_label = True
                        data_manager.append(((batch[0],), batch[1]))
                    else:
                        raise ValueError(
                            "The first element of the batch should be a "
                            "tuple or a torch.Tensor"
                        )
                else:
                    # More than two elements: the last one is the label.
                    warning_label = True
                    data_manager.append(
                        (tuple(t for t in batch[:-1]), batch[-1])
                    )
            elif isinstance(batch, (torch.Tensor, tf.Tensor)):
                data_manager.append(((batch,), None))
            else:
                raise ValueError(
                    "The batch should be a tuple, a list or a Tensor"
                )

        if warning_label:
            logger.warning(
                "The provided dataloader returns a tuple of tensors "
                "for each batch. The last tensor in the tuple will "
                "be considered as the label. "
                "To avoid this warning, the dataloader should return "
                "a tuple for each batch, where the first element is "
                "a tuple containing the inputs and the second element "
                "is a tensor containing the label."
            )

        return cls(data_manager)

    def get_split(self, split_type="train"):
        """Return a new manager with the samples of the requested split.

        Any value of `split_type` other than "train" selects the test
        split.
        """
        return (
            DataManager([self[i] for i in self.train_idxs])
            if split_type == "train"
            else DataManager([self[i] for i in self.test_idxs])
        )

    def split(self, split_pct: float, shuffle: bool = False):
        """Compute the train/test indexes.

        Args:
            split_pct (float): Fraction of the samples assigned to the
                train split.
            shuffle (bool, optional): Whether indexes are permuted before
                splitting.
        """
        if shuffle:
            idx = np.random.choice(len(self), len(self), replace=False)
        else:
            idx = np.arange(len(self))

        n = int(round(len(idx) * split_pct))

        if len(self) < MIN_DIM_INPUT_DATA:
            logger.warning(
                f"Not enough data for splitting the DataManager. "
                f"You should provide at least {MIN_DIM_INPUT_DATA} "
                f"data samples to allow a good split between train "
                f"and test sets. Compression, calibration and precision "
                f"checks will use the same data."
            )
            self.train_idxs = idx
            self.test_idxs = idx
        else:
            self.train_idxs = idx[:n]
            self.test_idxs = idx[n:]
class PytorchDataset(Dataset):
def __init__(self, input_data: DataManager, has_labels: bool = False):
self.data = input_data
self.has_labels = has_labels
self.batch_size = input_data[0][0][0].shape[0]
def __len__(self):
return sum([batch_inputs[0].shape[0] for batch_inputs, _ in self.data])
def __getitem__(self, idx):
batch_idx = int(idx / self.batch_size)
item_idx = idx % self.batch_size
data = tuple([data[item_idx] for data in self.data[batch_idx][0]])
if self.has_labels:
label = self.data[batch_idx][1]
if label is not None:
return data, self.data[batch_idx][1][item_idx]
else:
return data, torch.tensor([0])
else:
return data
================================================
FILE: optimization/nebullvm/nebullvm/tools/diffusers.py
================================================
# Based on https://github.com/NVIDIA/TensorRT/blob/main/demo/Diffusion/models.py
#
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, Union, List, Optional, Any, Tuple
from nebullvm.core.models import Device
from nebullvm.optional_modules.diffusers import (
DiffusionPipeline,
UNet2DConditionModel,
UNet2DOutput,
AutoencoderKL,
onnx_graphsurgeon as gs,
)
from nebullvm.optional_modules.diffusers import StableDiffusionPipeline
from nebullvm.optional_modules.huggingface import CLIPTextModel, CLIPTokenizer
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.tensor_rt import fold_constants
from nebullvm.optional_modules.torch import torch
@torch.no_grad()
def get_unet_inputs(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 1,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
    """Compute the inputs that would be fed to the UNet of a pipeline.

    The function is written as a method: ``self`` is the pipeline object,
    whose ``unet``, ``scheduler``, ``check_inputs``, ``_encode_prompt`` and
    ``prepare_latents`` members are used. It validates the inputs, encodes
    the prompt, sets the scheduler timesteps and prepares the latents.

    Returns:
        Tuple: ``(latent_model_input, t, prompt_embeds,
        cross_attention_kwargs)`` for a timestep ``t`` of the scheduler.
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # Classifier free guidance is enabled for guidance scales above 1.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    prompt_embeds = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # NOTE(review): the nesting of the final `return` could not be read
    # from the de-indented source; it is assumed to sit inside the loop,
    # i.e. the inputs of the first timestep are returned - TODO confirm.
    for i, t in enumerate(timesteps):
        # expand the latents if we are doing classifier free guidance
        latent_model_input = (
            torch.cat([latents] * 2)
            if do_classifier_free_guidance
            else latents
        )
        latent_model_input = self.scheduler.scale_model_input(
            latent_model_input, t
        )

        return latent_model_input, t, prompt_embeds, cross_attention_kwargs
class DiffusionUNetWrapper(torch.nn.Module):
    """Module calling a UNet with positional inputs only.

    The third positional input is forwarded as ``encoder_hidden_states``
    and the values of the mapping returned by the wrapped model are
    returned as a tuple.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *x, **kwargs):
        sample, timestep, hidden_states = x[0], x[1], x[2]
        outputs = self.model(
            sample, timestep, encoder_hidden_states=hidden_states
        )
        return tuple(outputs.values())
class OptimizedDiffusionWrapper(torch.nn.Module):
    """Module giving an optimized model the call signature of the UNet.

    The wrapped model is called with three positional arguments (the two
    positional inputs and the ``encoder_hidden_states`` keyword argument)
    and the first element of its result is wrapped in a ``UNet2DOutput``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *x, **kwargs):
        sample, timestep = x[0], x[1]
        # A 0-d timestep is turned into a tensor of shape (1,).
        if timestep.shape == torch.Size([]):
            timestep = timestep.reshape((1,))
        outputs = self.model(
            sample, timestep, kwargs["encoder_hidden_states"]
        )
        return UNet2DOutput(outputs[0])
def is_diffusion_model_pipe(model):
    """Return True when ``model`` is a ``DiffusionPipeline`` instance."""
    is_pipeline = isinstance(model, DiffusionPipeline)
    return is_pipeline
def get_default_dynamic_info(input_shape: List[Tuple[int, ...]]):
    """Build the default dynamic-axes description from the input shapes.

    Every described axis gets identical min/opt/max values, taken from the
    corresponding entry of ``input_shape``.

    Args:
        input_shape (List[Tuple[int, ...]]): Shapes of the inputs; the
            first one is read at indexes 0, 2 and 3, the third one at
            index 0.

    Returns:
        Dict: Dictionary with the "inputs" and "outputs" entries.
    """

    def _pinned(name, size):
        # Axis whose minimum, optimal and maximum sizes coincide.
        return {
            "name": name,
            "min_val": size,
            "opt_val": size,
            "max_val": size,
        }

    first_shape = input_shape[0]
    first_input_axes = {
        0: _pinned("2B", first_shape[0]),
        2: _pinned("H", first_shape[2]),
        3: _pinned("W", first_shape[3]),
    }
    third_input_axes = {0: _pinned("2B", input_shape[2][0])}

    return {
        "inputs": [first_input_axes, {}, third_input_axes],
        "outputs": [{0: "2B", 2: "H", 3: "W"}],
    }
def preprocess_diffusers(pipe: DiffusionPipeline) -> torch.nn.Module:
    """Wrap the UNet of ``pipe`` in a ``DiffusionUNetWrapper``.

    The wrapper makes the UNet compatible with the optimizations
    performed by nebullvm.
    """
    return DiffusionUNetWrapper(pipe.unet)
def postprocess_diffusers(
    optimized_model: Any,
    pipe: StableDiffusionPipeline,
    device: Device,
) -> StableDiffusionPipeline:
    """Put the optimized UNet back into the diffusion pipeline.

    The optimized model is wrapped in an ``OptimizedDiffusionWrapper`` and
    given the ``sample_size``, ``in_channels`` and ``config`` attributes
    of the UNet it replaces, plus a ``device`` attribute built from
    ``device``.

    Args:
        optimized_model (Any): Optimized replacement of the UNet.
        pipe (StableDiffusionPipeline): Pipeline whose ``unet`` attribute
            is replaced in place.
        device (Device): Device the optimized model runs on.

    Returns:
        StableDiffusionPipeline: The same ``pipe`` object.
    """
    original_unet = pipe.unet
    final_model = OptimizedDiffusionWrapper(optimized_model)

    # Each attribute is copied once (in_channels used to be set twice).
    final_model.sample_size = original_unet.sample_size
    final_model.in_channels = original_unet.in_channels
    final_model.device = torch.device(device.to_torch_format())
    final_model.config = original_unet.config

    pipe.unet = final_model
    return pipe
class Optimizer:
    """Helper applying graph-level transformations to an ONNX graph.

    The ONNX graph is imported with onnx_graphsurgeon and kept in
    ``self.graph``; every transformation can optionally return the
    resulting graph exported back to ONNX.
    """

    def __init__(self, onnx_graph, verbose=False):
        self.graph = gs.import_onnx(onnx_graph)
        self.verbose = verbose

    def info(self, prefix):
        """Print the size of the graph, but only in verbose mode."""
        if not self.verbose:
            return
        n_nodes = len(self.graph.nodes)
        n_tensors = len(self.graph.tensors().keys())
        n_inputs = len(self.graph.inputs)
        n_outputs = len(self.graph.outputs)
        print(
            f"{prefix} .. {n_nodes} nodes, {n_tensors} tensors, "
            f"{n_inputs} inputs, {n_outputs} outputs"
        )

    def cleanup(self, return_onnx=False):
        """Clean up and topologically sort the graph."""
        self.graph.cleanup().toposort()
        if not return_onnx:
            return None
        return gs.export_onnx(self.graph)

    def select_outputs(self, keep, names=None):
        """Keep only the outputs at indexes ``keep``, optionally renamed."""
        selected = [self.graph.outputs[index] for index in keep]
        self.graph.outputs = selected
        if names:
            for position, new_name in enumerate(names):
                self.graph.outputs[position].name = new_name

    def fold_constants(self, return_onnx=False):
        """Fold the constants of the graph and re-import the result."""
        folded = fold_constants(
            gs.export_onnx(self.graph),
            allow_onnxruntime_shape_inference=True,
        )
        self.graph = gs.import_onnx(folded)
        if not return_onnx:
            return None
        return folded

    def infer_shapes(self, return_onnx=False):
        """Run ONNX shape inference and re-import the result.

        Raises:
            TypeError: If the exported graph is larger than 2147483648
                bytes.
        """
        exported = gs.export_onnx(self.graph)
        if exported.ByteSize() > 2147483648:
            raise TypeError("ERROR: model size exceeds supported 2GB limit")

        inferred = onnx.shape_inference.infer_shapes(exported)
        self.graph = gs.import_onnx(inferred)
        if not return_onnx:
            return None
        return inferred
def get_path(version, inpaint=False):
    """Return the model identifier of a Stable Diffusion version.

    Args:
        version (str): One of "1.4", "1.5", "2.0-base", "2.0", "2.1" and
            "2.1-base".
        inpaint (bool, optional): Whether the inpainting variant is
            requested; it has no effect for the "2.1" versions.

    Raises:
        ValueError: If the version is not a known one.
    """
    # (version, regular model, inpainting model)
    known_models = (
        (
            "1.4",
            "CompVis/stable-diffusion-v1-4",
            "runwayml/stable-diffusion-inpainting",
        ),
        (
            "1.5",
            "runwayml/stable-diffusion-v1-5",
            "runwayml/stable-diffusion-inpainting",
        ),
        (
            "2.0-base",
            "stabilityai/stable-diffusion-2-base",
            "stabilityai/stable-diffusion-2-inpainting",
        ),
        (
            "2.0",
            "stabilityai/stable-diffusion-2",
            "stabilityai/stable-diffusion-2-inpainting",
        ),
        (
            "2.1",
            "stabilityai/stable-diffusion-2-1",
            "stabilityai/stable-diffusion-2-1",
        ),
        (
            "2.1-base",
            "stabilityai/stable-diffusion-2-1-base",
            "stabilityai/stable-diffusion-2-1-base",
        ),
    )
    for known_version, regular, inpainting in known_models:
        if version == known_version:
            return inpainting if inpaint else regular
    raise ValueError(f"Incorrect version {version}")
def get_embedding_dim(version):
    """Return the embedding dimension of a Stable Diffusion version.

    Raises:
        ValueError: If the version is not a known one.
    """
    dims_by_family = (
        (768, ("1.4", "1.5")),
        (1024, ("2.0", "2.0-base", "2.1", "2.1-base")),
    )
    for embedding_dim, versions in dims_by_family:
        if version in versions:
            return embedding_dim
    raise ValueError(f"Incorrect version {version}")
class BaseModel:
    """Base description of a Stable Diffusion sub-model.

    It stores the batch, image and latent size limits and provides the
    hooks (all returning None here) that concrete models override.
    """

    def __init__(
        self,
        hf_token,
        fp16=False,
        device="cuda",
        verbose=False,
        path="",
        max_batch_size=16,
        embedding_dim=768,
        text_maxlen=77,
    ):
        self.name = "SD Model"
        self.hf_token = hf_token
        self.fp16 = fp16
        self.device = device
        self.verbose = verbose
        self.path = path

        # Batch size limits
        self.min_batch, self.max_batch = 1, max_batch_size

        # Image resolution limits: from 256x256 up to 1024x1024
        self.min_image_shape, self.max_image_shape = 256, 1024

        # Latent sizes are 1/8 of the image sizes
        self.min_latent_shape = self.min_image_shape // 8
        self.max_latent_shape = self.max_image_shape // 8

        self.embedding_dim = embedding_dim
        self.text_maxlen = text_maxlen

    def get_model(self):
        """Hook for subclasses; returns None here."""
        return None

    def get_input_names(self):
        """Hook for subclasses; returns None here."""
        return None

    def get_output_names(self):
        """Hook for subclasses; returns None here."""
        return None

    def get_dynamic_axes(self):
        """Hook for subclasses; returns None here."""
        return None

    def get_sample_input(self, batch_size, image_height, image_width):
        """Hook for subclasses; returns None here."""
        return None

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Hook for subclasses; returns None here."""
        return None

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Hook for subclasses; returns None here."""
        return None

    def optimize(self, onnx_graph):
        """Clean up, constant-fold and shape-infer an ONNX graph.

        Returns:
            The optimized graph exported back to ONNX.
        """
        opt = Optimizer(onnx_graph, verbose=self.verbose)
        opt.info(self.name + ": original")
        for step, label in (
            (opt.cleanup, ": cleanup"),
            (opt.fold_constants, ": fold constants"),
            (opt.infer_shapes, ": shape inference"),
        ):
            step()
            opt.info(self.name + label)
        onnx_opt_graph = opt.cleanup(return_onnx=True)
        opt.info(self.name + ": finished")
        return onnx_opt_graph

    def check_dims(self, batch_size, image_height, image_width):
        """Validate the sizes and return ``(latent_height, latent_width)``.

        Raises:
            AssertionError: If a size is outside the supported limits.
        """
        assert self.min_batch <= batch_size <= self.max_batch
        # NOTE(review): this passes as soon as ONE dimension is a multiple
        # of 8; `and` may have been intended - behaviour left unchanged.
        assert image_height % 8 == 0 or image_width % 8 == 0
        latent_height, latent_width = image_height // 8, image_width // 8
        assert self.min_latent_shape <= latent_height <= self.max_latent_shape
        assert self.min_latent_shape <= latent_width <= self.max_latent_shape
        return (latent_height, latent_width)

    def get_minmax_dims(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the min/max batch, image and latent sizes.

        A static dimension has both bounds equal to the given value,
        otherwise the limits stored on the instance are used.

        Returns:
            Tuple: min/max pairs, in order, for batch, image height, image
                width, latent height and latent width.
        """

        def _bounds(fixed, lowest, highest, is_static):
            return (fixed, fixed) if is_static else (lowest, highest)

        image_limits = (self.min_image_shape, self.max_image_shape)
        latent_limits = (self.min_latent_shape, self.max_latent_shape)

        batch = _bounds(
            batch_size, self.min_batch, self.max_batch, static_batch
        )
        img_h = _bounds(image_height, *image_limits, static_shape)
        img_w = _bounds(image_width, *image_limits, static_shape)
        lat_h = _bounds(image_height // 8, *latent_limits, static_shape)
        lat_w = _bounds(image_width // 8, *latent_limits, static_shape)

        return (*batch, *img_h, *img_w, *lat_h, *lat_w)
class CLIP(BaseModel):
    """Export description of the CLIP text encoder.

    Supplies the model loader, tensor names, dynamic axes, shape profiles
    and the ONNX graph post-processing for the text encoder.
    """

    def __init__(
        self, hf_token, device, verbose, path, max_batch_size, embedding_dim
    ):
        super().__init__(
            hf_token,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
        )
        self.name = "CLIP"

    def get_model(self):
        """Load the ``text_encoder`` subfolder and move it to the device."""
        text_encoder = CLIPTextModel.from_pretrained(
            self.path, subfolder="text_encoder", use_auth_token=self.hf_token
        )
        return text_encoder.to(self.device)

    def get_input_names(self):
        """Return the single input name."""
        return ["input_ids"]

    def get_output_names(self):
        """Return the two output names produced at export time."""
        return ["text_embeddings", "pooler_output"]

    def get_dynamic_axes(self):
        """Mark the batch axis (0) of input and embeddings as dynamic."""
        batch_axis = {0: "B"}
        return {"input_ids": dict(batch_axis), "text_embeddings": dict(batch_axis)}

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the (min, opt, max) shapes for ``input_ids``."""
        self.check_dims(batch_size, image_height, image_width)
        limits = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )
        # Only the batch bounds matter for the token ids.
        min_batch, max_batch = limits[0], limits[1]
        return {
            "input_ids": [
                (batch, self.text_maxlen)
                for batch in (min_batch, batch_size, max_batch)
            ]
        }

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete shapes of ``input_ids`` and embeddings."""
        self.check_dims(batch_size, image_height, image_width)
        token_shape = (batch_size, self.text_maxlen)
        return {
            "input_ids": token_shape,
            "text_embeddings": (*token_shape, self.embedding_dim),
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return an all-zero int32 token tensor on the configured device."""
        self.check_dims(batch_size, image_height, image_width)
        return torch.zeros(
            batch_size,
            self.text_maxlen,
            dtype=torch.int32,
            device=self.device,
        )

    def optimize(self, onnx_graph):
        """Drop the second output, tidy the graph and rename the output.

        Returns the optimized ONNX graph whose only output is
        ``text_embeddings``.
        """
        opt = Optimizer(onnx_graph, verbose=self.verbose)
        opt.info(f"{self.name}: original")
        opt.select_outputs([0])  # delete graph output#1
        opt.cleanup()
        opt.info(f"{self.name}: remove output[1]")
        opt.fold_constants()
        opt.info(f"{self.name}: fold constants")
        opt.infer_shapes()
        opt.info(f"{self.name}: shape inference")
        # Re-select output 0 only to give it its final name.
        opt.select_outputs([0], names=["text_embeddings"])
        opt.info(f"{self.name}: remove output[0]")
        opt_onnx_graph = opt.cleanup(return_onnx=True)
        opt.info(f"{self.name}: finished")
        return opt_onnx_graph
def make_CLIP(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build a ``CLIP`` description for the given Stable Diffusion version.

    The model path and embedding width are resolved from ``version`` (and
    ``inpaint`` for the path); the remaining arguments are passed through.
    """
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    return CLIP(
        hf_token=hf_token,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
    )
class UNet(BaseModel):
    """Export description of the conditional UNet.

    All batch-like dimensions are ``2 * batch_size``; ``unet_dim`` is the
    number of channels of the ``sample`` input.
    """

    def __init__(
        self,
        hf_token,
        fp16=False,
        device="cuda",
        verbose=False,
        path="",
        max_batch_size=16,
        embedding_dim=768,
        text_maxlen=77,
        unet_dim=4,
    ):
        super().__init__(
            hf_token,
            fp16=fp16,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
            text_maxlen=text_maxlen,
        )
        self.unet_dim = unet_dim
        self.name = "UNet"

    def get_model(self):
        """Load the ``unet`` subfolder (fp16 revision when requested)."""
        model_opts = {}
        if self.fp16:
            model_opts = {"revision": "fp16", "torch_dtype": torch.float16}
        unet = UNet2DConditionModel.from_pretrained(
            self.path,
            subfolder="unet",
            use_auth_token=self.hf_token,
            **model_opts,
        )
        return unet.to(self.device)

    def get_input_names(self):
        """Return the three input names in call order."""
        return ["sample", "timestep", "encoder_hidden_states"]

    def get_output_names(self):
        """Return the single output name."""
        return ["latent"]

    def get_dynamic_axes(self):
        """Mark batch (and spatial axes where present) as dynamic."""
        spatial = {0: "2B", 2: "H", 3: "W"}
        return {
            "sample": dict(spatial),
            "encoder_hidden_states": {0: "2B"},
            "latent": dict(spatial),
        }

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return (min, opt, max) shapes for ``sample`` and the embeddings."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        (
            min_batch,
            max_batch,
            *_image_bounds,
            min_latent_height,
            max_latent_height,
            min_latent_width,
            max_latent_width,
        ) = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )
        sample_profile = [
            (
                2 * min_batch,
                self.unet_dim,
                min_latent_height,
                min_latent_width,
            ),
            (2 * batch_size, self.unet_dim, latent_height, latent_width),
            (
                2 * max_batch,
                self.unet_dim,
                max_latent_height,
                max_latent_width,
            ),
        ]
        hidden_profile = [
            (2 * batch, self.text_maxlen, self.embedding_dim)
            for batch in (min_batch, batch_size, max_batch)
        ]
        return {
            "sample": sample_profile,
            "encoder_hidden_states": hidden_profile,
        }

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete shapes of the inputs and the ``latent``."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        doubled = 2 * batch_size
        return {
            "sample": (doubled, self.unet_dim, latent_height, latent_width),
            "encoder_hidden_states": (
                doubled,
                self.text_maxlen,
                self.embedding_dim,
            ),
            # The output always has 4 channels, whatever unet_dim is.
            "latent": (doubled, 4, latent_height, latent_width),
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return random ``(sample, timestep, encoder_hidden_states)``.

        ``sample`` and ``timestep`` are float32; the hidden states are
        float16 when ``fp16`` is set, float32 otherwise.
        """
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        hidden_dtype = torch.float16 if self.fp16 else torch.float32
        sample = torch.randn(
            2 * batch_size,
            self.unet_dim,
            latent_height,
            latent_width,
            dtype=torch.float32,
            device=self.device,
        )
        timestep = torch.tensor(
            [1.0], dtype=torch.float32, device=self.device
        )
        hidden_states = torch.randn(
            2 * batch_size,
            self.text_maxlen,
            self.embedding_dim,
            dtype=hidden_dtype,
            device=self.device,
        )
        return (sample, timestep, hidden_states)
def make_UNet(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build a ``UNet`` description for the given Stable Diffusion version.

    The UNet is always configured with ``fp16=True``; inpainting models use
    9 input channels instead of 4.
    """
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    input_channels = 9 if inpaint else 4
    return UNet(
        hf_token=hf_token,
        fp16=True,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
        unet_dim=input_channels,
    )
class VAE(BaseModel):
    """Export description of the VAE decoder (latent -> images)."""

    def __init__(
        self, hf_token, device, verbose, path, max_batch_size, embedding_dim
    ):
        super().__init__(
            hf_token,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
        )
        self.name = "VAE decoder"

    def get_model(self):
        """Load the ``vae`` subfolder and route ``forward`` to ``decode``."""
        autoencoder = AutoencoderKL.from_pretrained(
            self.path, subfolder="vae", use_auth_token=self.hf_token
        )
        autoencoder = autoencoder.to(self.device)
        # Calling the model must run the decoder only.
        autoencoder.forward = autoencoder.decode
        return autoencoder

    def get_input_names(self):
        """Return the single input name."""
        return ["latent"]

    def get_output_names(self):
        """Return the single output name."""
        return ["images"]

    def get_dynamic_axes(self):
        """Mark batch and spatial axes of input and output as dynamic."""
        return {
            "latent": {0: "B", 2: "H", 3: "W"},
            "images": {0: "B", 2: "8H", 3: "8W"},
        }

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the (min, opt, max) shapes for ``latent``."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        (
            min_batch,
            max_batch,
            *_image_bounds,
            min_latent_height,
            max_latent_height,
            min_latent_width,
            max_latent_width,
        ) = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )
        smallest = (min_batch, 4, min_latent_height, min_latent_width)
        optimal = (batch_size, 4, latent_height, latent_width)
        largest = (max_batch, 4, max_latent_height, max_latent_width)
        return {"latent": [smallest, optimal, largest]}

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete shapes of ``latent`` and ``images``."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        return {
            "latent": (batch_size, 4, latent_height, latent_width),
            "images": (batch_size, 3, image_height, image_width),
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return a random float32 latent tensor on the configured device."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        latent_shape = (batch_size, 4, latent_height, latent_width)
        return torch.randn(
            *latent_shape, dtype=torch.float32, device=self.device
        )
def make_VAE(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build a ``VAE`` decoder description for the given version.

    The model path and embedding width are resolved from ``version`` (and
    ``inpaint`` for the path); the remaining arguments are passed through.
    """
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    return VAE(
        hf_token=hf_token,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
    )
class TorchVAEEncoder(torch.nn.Module):
    """Module whose forward pass is the VAE encode step plus sampling."""

    def __init__(self, token, device, path):
        """Load the ``vae`` subfolder of ``path`` and move it to ``device``."""
        super().__init__()
        self.path = path
        autoencoder = AutoencoderKL.from_pretrained(
            self.path, subfolder="vae", use_auth_token=token
        )
        self.vae_encoder = autoencoder.to(device)

    def forward(self, x):
        """Encode ``x`` and return a sample from the latent distribution."""
        encoded = self.vae_encoder.encode(x)
        return encoded.latent_dist.sample()
class VAEEncoder(BaseModel):
    """Export description of the VAE encoder (images -> latent)."""

    def __init__(
        self, hf_token, device, verbose, path, max_batch_size, embedding_dim
    ):
        super(VAEEncoder, self).__init__(
            hf_token,
            device=device,
            verbose=verbose,
            path=path,
            max_batch_size=max_batch_size,
            embedding_dim=embedding_dim,
        )
        self.name = "VAE encoder"

    def get_model(self):
        """Return a ``TorchVAEEncoder`` built from the stored settings."""
        vae_encoder = TorchVAEEncoder(self.hf_token, self.device, self.path)
        return vae_encoder

    def get_input_names(self):
        """Return the single input name."""
        return ["images"]

    def get_output_names(self):
        """Return the single output name."""
        return ["latent"]

    def get_dynamic_axes(self):
        """Mark batch and spatial axes of input and output as dynamic."""
        return {
            "images": {0: "B", 2: "8H", 3: "8W"},
            "latent": {0: "B", 2: "H", 3: "W"},
        }

    def get_input_profile(
        self, batch_size, image_height, image_width, static_batch, static_shape
    ):
        """Return the (min, opt, max) shapes for ``images``.

        Raises:
            AssertionError: If ``check_dims`` rejects the requested sizes.
        """
        # check_dims already validates the batch range, and get_minmax_dims
        # supplies the batch bounds, so no separate pre-computation is needed.
        self.check_dims(batch_size, image_height, image_width)
        (
            min_batch,
            max_batch,
            min_image_height,
            max_image_height,
            min_image_width,
            max_image_width,
            _,
            _,
            _,
            _,
        ) = self.get_minmax_dims(
            batch_size, image_height, image_width, static_batch, static_shape
        )
        return {
            "images": [
                (min_batch, 3, min_image_height, min_image_width),
                (batch_size, 3, image_height, image_width),
                (max_batch, 3, max_image_height, max_image_width),
            ],
        }

    def get_shape_dict(self, batch_size, image_height, image_width):
        """Return the concrete shapes of ``images`` and ``latent``."""
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width
        )
        return {
            "images": (batch_size, 3, image_height, image_width),
            "latent": (batch_size, 4, latent_height, latent_width),
        }

    def get_sample_input(self, batch_size, image_height, image_width):
        """Return a random float32 image tensor on the configured device."""
        self.check_dims(batch_size, image_height, image_width)
        return torch.randn(
            batch_size,
            3,
            image_height,
            image_width,
            dtype=torch.float32,
            device=self.device,
        )
def make_VAEEncoder(
    version, hf_token, device, verbose, max_batch_size, inpaint=False
):
    """Build a ``VAEEncoder`` description for the given version.

    The model path and embedding width are resolved from ``version`` (and
    ``inpaint`` for the path); the remaining arguments are passed through.
    """
    model_path = get_path(version, inpaint=inpaint)
    embedding_dim = get_embedding_dim(version)
    return VAEEncoder(
        hf_token=hf_token,
        device=device,
        verbose=verbose,
        path=model_path,
        max_batch_size=max_batch_size,
        embedding_dim=embedding_dim,
    )
def make_tokenizer(version, hf_token):
    """Load the CLIP tokenizer from the ``tokenizer`` subfolder.

    The repository is the non-inpainting checkpoint for ``version``.
    """
    repo_id = get_path(version)
    return CLIPTokenizer.from_pretrained(
        repo_id, subfolder="tokenizer", use_auth_token=hf_token
    )
def is_diffusion_model(model) -> bool:
    """Tell whether ``model`` is a diffusion pipeline or UNet.

    Returns ``False`` when ``diffusers`` cannot be imported. Otherwise the
    model qualifies if it is a diffusion pipeline, a
    ``UNet2DConditionModel``, a ``DiffusionUNetWrapper``, or an object whose
    ``model`` attribute is a ``UNet2DConditionModel``.
    """
    try:
        from diffusers import UNet2DConditionModel
    except ImportError:
        return False

    unet_types = (UNet2DConditionModel, DiffusionUNetWrapper)
    if is_diffusion_model_pipe(model) or isinstance(model, unet_types):
        return True
    # Fall back to inspecting a wrapped model, if there is one.
    return hasattr(model, "model") and isinstance(
        model.model, UNet2DConditionModel
    )
================================================
FILE: optimization/nebullvm/nebullvm/tools/feedback_collector.py
================================================
import json
import os
from pathlib import Path
from typing import Any
import requests
from nebullvm.config import VERSION
# JSON file under the user's home directory (~/.nebullvm/collect.json).
# NOTE(review): not referenced by FeedbackCollector below; presumably read by
# callers elsewhere — confirm before changing.
NEBULLVM_METADATA_PATH = Path.home() / ".nebullvm/collect.json"
class FeedbackCollector:
    """Accumulate usage metadata and POST it to a collection endpoint.

    Collection is active unless the environment variable named by
    ``disable_telemetry_environ_var`` is set to a non-zero integer.
    """

    def __init__(
        self, url: str, disable_telemetry_environ_var: str, app_version: str
    ):
        """Store the endpoint and seed the metadata with version fields.

        Args:
            url: Endpoint that receives the JSON payload.
            disable_telemetry_environ_var: Name of the opt-out variable.
            app_version: Version of the application using the collector.
        """
        self._disable_telemetry_environ_var = disable_telemetry_environ_var
        self._is_active = (
            int(os.getenv(disable_telemetry_environ_var, "0")) == 0
        )
        self._url = url
        self._metadata = {
            "nebullvm_version": VERSION,
            "app_version": app_version,
        }

    def _store_ip_address(self, timeout: int = 30):
        """Record the public IP address, or "Unknown" on any failure.

        Args:
            timeout: Seconds to wait for the lookup service; without it a
                stalled connection would block the caller indefinitely.
        """
        try:
            self._metadata["ip_address"] = requests.get(
                "https://api.ipify.org", timeout=timeout
            ).text
        except Exception:
            # Best effort: the lookup must never break the caller.
            self._metadata["ip_address"] = "Unknown"

    @property
    def is_active(self):
        """Whether feedback collection is enabled."""
        return self._is_active

    def _inform_user(self):
        """Print the telemetry notice, including the opt-out variable."""
        message = (
            f"Nebuly collects anonymous usage statistics to help improve the "
            f"product. You can opt-out by setting the environment variable "
            f"{self._disable_telemetry_environ_var}=1."
        )
        print(message)

    def store_info(self, key: str, value: Any):
        """Store ``value`` under ``key``.

        When ``key`` already exists and ``value`` is a list, the list is
        appended to the stored value; otherwise the value is replaced.
        """
        if key in self._metadata and isinstance(value, list):
            # Build a new object instead of `+=` so that a list previously
            # handed in by a caller is not extended behind its back.
            self._metadata[key] = self._metadata[key] + value
        else:
            self._metadata[key] = value

    def send_feedback(self, timeout: int = 30):
        """POST the collected metadata as JSON.

        Args:
            timeout: Seconds allowed for each network request.

        Returns:
            The ``requests`` response, or ``{}`` when collection is disabled.
        """
        if not self.is_active:
            return {}

        self._store_ip_address(timeout=timeout)
        request_body = self._metadata
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self._url,
            data=json.dumps(request_body),
            headers=headers,
            timeout=timeout,
        )
        return response

    def get(self, key: str, default: Any = None):
        """Return the stored value for ``key`` or ``default``."""
        return self._metadata.get(key, default)

    def reset(self, key: str):
        """Remove ``key`` from the metadata if present."""
        self._metadata.pop(key, None)
================================================
FILE: optimization/nebullvm/nebullvm/tools/hardware_utils.py
================================================
import os
import platform
import cpuinfo
import psutil
from nebullvm.core.models import HardwareSetup, Device, DeviceType
from nebullvm.optional_modules.torch_xla import xm
from nebullvm.optional_modules.utils import (
torch_is_available,
tensorflow_is_available,
)
from nebullvm.tools.pytorch import torch_get_device_name
from nebullvm.tools.tf import tensorflow_get_gpu_name
from nebullvm.tools.utils import (
gpu_is_available,
tpu_is_available,
neuron_is_available,
)
def get_hw_setup(device: Device = None) -> HardwareSetup:
    """Describe the current machine as a ``HardwareSetup``.

    The accelerator name is resolved for the first match among GPU, TPU and
    Neuron: a kind matches when ``device`` is of that kind or the matching
    availability check succeeds. It stays ``None`` when nothing matches.

    Args:
        device: Optional device whose type takes part in the selection.
    """
    requested = device.type if device is not None else None

    accelerator = None
    if requested is DeviceType.GPU or gpu_is_available():
        accelerator = _get_gpu_name()
    elif requested is DeviceType.TPU or tpu_is_available():
        accelerator = _get_tpu_device_name()
    elif requested is DeviceType.NEURON or neuron_is_available():
        accelerator = _get_neuron_device_name()

    cpu_name = cpuinfo.get_cpu_info()["brand_raw"]
    os_name = platform.system()
    # Total memory in bytes converted to (decimal) gigabytes.
    total_memory_gb = round(psutil.virtual_memory().total * 1e-9, 2)
    return HardwareSetup(
        cpu=cpu_name,
        operating_system=os_name,
        memory_gb=total_memory_gb,
        accelerator=accelerator,
    )
def _get_gpu_name() -> str:
    """Return the GPU name via torch, else TensorFlow, else "Unknown"."""
    if torch_is_available():
        return torch_get_device_name()
    if tensorflow_is_available():
        return tensorflow_get_gpu_name()
    return "Unknown"
def _get_neuron_device_name() -> str:
    """Return the Neuron device name parsed from ``lshw -businfo``.

    The last two space-separated tokens of the first suitable line that
    mentions "neuron" (case-insensitive) are used; "Unknown Neuron" is
    returned when no such line exists.
    """
    listing = os.popen("lshw -businfo").read()
    for entry in listing.splitlines():
        if "neuron" not in entry.lower():
            continue
        tokens = entry.split(" ")
        if len(tokens) > 2:
            return " ".join(tokens[-2:])
    return "Unknown Neuron"
def _get_tpu_device_name() -> str:
    """Return the hardware type reported by torch_xla for the XLA device."""
    xla_device = xm.xla_device()
    return xm.xla_device_hw(xla_device)
================================================
FILE: optimization/nebullvm/nebullvm/tools/huggingface.py
================================================
from collections import OrderedDict
from typing import (
Union,
Iterable,
List,
Dict,
Tuple,
Type,
Any,
)
import numpy as np
from nebullvm.core.models import Device, DeviceType
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch, Module
try:
from transformers import (
PreTrainedModel,
)
from transformers.tokenization_utils import PreTrainedTokenizer
except ImportError:
# add placeholders for function definition
PreTrainedModel = None
PreTrainedTokenizer = None
class PyTorchTransformerWrapper(Module):
    """Adapter giving a Transformers model a positional-tensor interface.

    ``forward`` receives positional tensors, maps them onto the keyword
    names recorded from ``encoded_input`` and returns the model outputs
    flattened into a tuple of tensors.
    """

    def __init__(
        self,
        core_model: Module,
        encoded_input: Dict[str, torch.Tensor],
    ):
        """Keep the model and remember each input name with its dtype."""
        super().__init__()
        self.core_model = core_model
        self.inputs_types = OrderedDict(
            (name, tensor.dtype) for name, tensor in encoded_input.items()
        )

    def forward(self, *args: torch.Tensor):
        """Call the wrapped model with named inputs; return a flat tuple."""
        # Positional arguments follow the order of the recorded input names.
        named_inputs = dict(zip(self.inputs_types.keys(), args))
        outputs = self.core_model(**named_inputs)
        if isinstance(outputs, dict):
            outputs = outputs.values()
        return tuple(flatten_outputs(outputs))
class TensorFlowTransformerWrapper(tf.keras.Model):
    """Keras adapter giving a Transformers model a positional interface.

    ``call`` takes the sequence of input tensors as its first positional
    argument, maps them onto the names recorded from ``encoded_input`` and
    returns the model outputs flattened into a tuple of tensors.
    """

    def __init__(
        self,
        core_model: tf.Module,
        encoded_input: Dict[str, tf.Tensor],
    ):
        """Keep the model and remember each input name with its dtype."""
        super().__init__()
        self.core_model = core_model
        self.inputs_types = OrderedDict()
        for name, tensor in encoded_input.items():
            self.inputs_types[name] = tensor.dtype

    def call(self, *args: tf.Tensor):
        """Call the wrapped model with named inputs; return a flat tuple."""
        # The tensors arrive packed together as the first positional argument.
        packed_inputs = args[0]
        named_inputs = dict(zip(self.inputs_types.keys(), packed_inputs))
        outputs = self.core_model(**named_inputs)
        if isinstance(outputs, dict):
            outputs = outputs.values()
        return tuple(flatten_outputs(list(outputs)))
def flatten_outputs(
    outputs: Union[torch.Tensor, tf.Tensor, Iterable]
) -> List[Union[torch.Tensor, tf.Tensor]]:
    """Flatten an arbitrarily nested iterable of tensors into a list.

    Tensors are kept in iteration order; every non-tensor element is
    treated as a nested iterable and flattened recursively.
    """
    flat = []
    for item in outputs:
        if not isinstance(item, (torch.Tensor, tf.Tensor)):
            flat += flatten_outputs(item)
            continue
        flat.append(item)
    return flat
def get_size_recursively(
    tensor_tuple: Union[torch.Tensor, tf.Tensor, Tuple]
) -> List[int]:
    """Return the nesting sizes of a nested sequence of tensors.

    Only the first element of each level is inspected, so every level is
    assumed to be uniform. The result lists the length of each level, from
    outermost to innermost, stopping at the level that holds tensors.
    """
    head = tensor_tuple[0]
    if isinstance(head, (torch.Tensor, tf.Tensor)):
        return [len(tensor_tuple)]
    return [len(tensor_tuple)] + get_size_recursively(head)
def get_output_structure_from_text(
    text: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    tokenizer_args: Dict,
    device: Device,
) -> Tuple[OrderedDict, Type]:
    """Run the model on ``text`` and describe the layout of its output.

    Args:
        text: Sample text, tokenized as a single-element batch.
        model: Model to run.
        tokenizer: Tokenizer applied to the text.
        tokenizer_args: Keyword arguments forwarded to the tokenizer.
        device: Device the encoded input is moved to for torch models.

    Returns:
        ``(structure, output_type)``. ``structure`` maps each output name
        (``output_<i>`` for tuple outputs) to ``None`` for a plain tensor or
        to its nesting sizes otherwise; ``output_type`` is the type of the
        raw model output.
    """
    encoded_input = tokenizer([text], **tokenizer_args)
    if isinstance(model, torch.nn.Module):
        encoded_input = encoded_input.to(device.to_torch_format())
    output = model(**encoded_input)

    if isinstance(output, tuple):
        named_values = (
            (f"output_{i}", value) for i, value in enumerate(output)
        )
    else:
        named_values = output.items()

    structure = OrderedDict()
    for name, value in named_values:
        is_tensor = isinstance(value, (torch.Tensor, tf.Tensor))
        structure[name] = None if is_tensor else get_size_recursively(value)
    return structure, type(output)
def get_output_structure_from_dict(
    input_example: Dict,
    model: PreTrainedModel,
    device: Device,
) -> Tuple[OrderedDict, Type]:
    """Run the model on ``input_example`` and describe its output layout.

    Args:
        input_example: Keyword inputs for the model.
        model: Model to run.
        device: Target device; torch models and inputs are moved to it
            unless it is a TPU.

    Returns:
        ``(structure, output_type)``. ``structure`` maps each output name
        (``output_<i>`` for tuple outputs) to ``None`` for a plain tensor or
        to its nesting sizes otherwise; ``output_type`` is the type of the
        raw model output.
    """
    is_torch_model = isinstance(model, torch.nn.Module)
    if is_torch_model and device.type is not DeviceType.TPU:
        model.to(device.to_torch_format())
        # NOTE(review): the return value is discarded, so this relies on
        # `.to` moving `input_example` in place — confirm for the input
        # types callers actually pass.
        input_example.to(device.to_torch_format())
    output = model(**input_example)

    if isinstance(output, tuple):
        named_values = (
            (f"output_{i}", value) for i, value in enumerate(output)
        )
    else:
        named_values = output.items()

    structure = OrderedDict()
    for name, value in named_values:
        is_tensor = isinstance(value, (torch.Tensor, tf.Tensor))
        structure[name] = None if is_tensor else get_size_recursively(value)
    return structure, type(output)
def restructure_output(
    output: Tuple[Union[torch.Tensor, tf.Tensor]],
    structure: OrderedDict,
    output_type: Any = None,
):
    """Rebuild a structured output from a flat tuple of tensors.

    Args:
        output: Flat tensors, in the order described by ``structure``.
        structure: Maps each output name to ``None`` (one tensor) or to the
            nesting sizes of the group of tensors it consumes.
        output_type: Optional callable invoked with the rebuilt entries as
            keyword arguments.

    Returns:
        ``output_type(**entries)`` when ``output_type`` is given, otherwise
        the dictionary of rebuilt entries.
    """
    rebuilt = {}
    cursor = 0
    for name, sizes in structure.items():
        if sizes is None:
            rebuilt[name] = output[cursor]
            cursor += 1
            continue

        first = output[cursor]
        # NOTE(review): dropping the leading axis means the reshape below
        # only matches the element count when that axis has size 1 — confirm
        # the expected batch size with callers.
        trailing_shape = first.shape[1:]
        if isinstance(first, torch.Tensor):
            stack_fn, reshape_fn = torch.stack, torch.reshape
        else:
            stack_fn, reshape_fn = tf.stack, tf.reshape

        count = np.prod(sizes)
        group = output[cursor : int(count) + cursor]  # noqa E203
        stacked = stack_fn(group)
        rebuilt[name] = list(
            reshape_fn(stacked, (*sizes, *trailing_shape))
        )
        cursor += count

    if output_type is None:
        return rebuilt
    return output_type(**rebuilt)
================================================
FILE: optimization/nebullvm/nebullvm/tools/logger.py
================================================
import logging
import os
import sys
import warnings
from typing import Any
from loguru import logger
# Maps the integer verbosity read from the NEBULLVM_LOG_LEVEL environment
# variable (see setup_logger) to the level name passed to the logger.
levels_map = {
    0: "ERROR",
    1: "WARNING",
    2: "INFO",
    3: "DEBUG",
}
def debug_mode_enabled():
    """Return True when the DEBUG_MODE environment variable is positive."""
    raw_flag = os.environ.get("DEBUG_MODE", "0")
    return int(raw_flag) > 0
def setup_logger():
    """Configure the loguru logger used by the package.

    Python warnings are silenced unless debug mode is enabled. All existing
    sinks are removed and a single stdout sink is added whose level comes
    from NEBULLVM_LOG_LEVEL (default 2) via ``levels_map``.
    """
    if not debug_mode_enabled():
        warnings.filterwarnings("ignore")

    level_index = int(os.environ.get("NEBULLVM_LOG_LEVEL", "2"))
    line_format = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}"

    logger.remove()
    logger.add(
        sys.stdout,
        colorize=True,
        format=line_format,
        level=levels_map[level_index],
    )
    # Clear the color associated with WARNING records.
    logger.level("WARNING", color="")
class LoggingContext(object):
    """Context manager that temporarily disables a logger or adds a handler.

    On entry the logger's ``disabled`` flag is set to ``disabled`` and the
    optional handler is attached. On exit the flag is reset to ``False``
    (only if it had been set to ``True``), the handler is detached and,
    when ``close`` is true, closed.
    """

    def __init__(
        self,
        logger: logging.Logger,
        disabled: bool = False,
        handler: Any = None,
        close: bool = True,
    ):
        self.logger = logger
        self.disabled = disabled
        self.handler = handler
        self.close = close

    def __enter__(self):
        self.logger.disabled = self.disabled
        if self.handler:
            self.logger.addHandler(self.handler)

    def __exit__(self, et: Any, ev: Any, tb: Any):
        if self.disabled is True:
            self.logger.disabled = False
        if self.handler:
            self.logger.removeHandler(self.handler)
            if self.close:
                self.handler.close()
        # Returning None lets any exception propagate.
================================================
FILE: optimization/nebullvm/nebullvm/tools/onnx.py
================================================
from typing import List, Tuple, Any, Optional, Dict
import numpy as np
from loguru import logger
from nebullvm.config import ONNX_PROVIDERS
from nebullvm.core.models import (
DeepLearningFramework,
Device,
DeviceType,
InputInfo,
DataType,
)
from nebullvm.optional_modules.onnx import onnx
from nebullvm.optional_modules.onnxruntime import onnxruntime as ort
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
def convert_to_numpy(tensor: Any):
    """Convert a supported value to a NumPy array.

    Torch tensors are moved to CPU and detached, TensorFlow tensors use
    ``.numpy()``, integers become one-element arrays and NumPy arrays are
    returned unchanged.

    Raises:
        TypeError: For any other input type.
    """
    if isinstance(tensor, torch.Tensor):
        return tensor.cpu().detach().numpy()
    if isinstance(tensor, tf.Tensor):
        return tensor.numpy()
    if isinstance(tensor, int):
        return np.array([tensor])
    if isinstance(tensor, np.ndarray):
        return tensor
    raise TypeError(f"Unsupported data type: {type(tensor)}")
def convert_to_target_framework(
    tensor: np.ndarray, framework: DeepLearningFramework
) -> Any:
    """Convert a NumPy array to the tensor type of ``framework``.

    PyTorch and TensorFlow get their native tensors; for any other
    framework the array is returned unchanged.
    """
    if framework is DeepLearningFramework.PYTORCH:
        return torch.from_numpy(tensor)
    if framework is DeepLearningFramework.TENSORFLOW:
        return tf.convert_to_tensor(tensor)
    return tensor
def get_input_names(onnx_model: str):
    """Load the ONNX file and return the names of its graph inputs."""
    graph = onnx.load(onnx_model).graph
    return [graph_input.name for graph_input in graph.input]
def get_output_names(onnx_model: str):
    """Load the ONNX file and return the names of its graph outputs."""
    graph = onnx.load(onnx_model).graph
    return [graph_output.name for graph_output in graph.output]
def run_onnx_model(
    onnx_model: str, input_tensors: List[np.ndarray], device: Device
) -> List[np.ndarray]:
    """Run an ONNX model with onnxruntime and return its outputs.

    Args:
        onnx_model: Path of the ONNX file.
        input_tensors: Arrays matched positionally to the graph inputs.
        device: Target device; GPU selects the CUDA provider list.

    Returns:
        The model outputs as a list.
    """
    from nebullvm.optional_modules.onnxruntime import onnxruntime as ort

    on_gpu = device.type is DeviceType.GPU
    if on_gpu and len(ONNX_PROVIDERS["cuda"]) == 3:
        # Pin the CUDA provider entry to the requested GPU index. This
        # updates the shared ONNX_PROVIDERS configuration in place.
        ONNX_PROVIDERS["cuda"][1] = (
            "CUDAExecutionProvider",
            {
                "device_id": device.idx,
            },
        )

    if on_gpu:
        # The first entry of the CUDA list is skipped here.
        providers = ONNX_PROVIDERS["cuda"][1:]
    else:
        providers = ONNX_PROVIDERS["cpu"]
    session = ort.InferenceSession(onnx_model, providers=providers)

    feed = dict(zip(get_input_names(onnx_model), input_tensors))
    results = session.run(
        output_names=get_output_names(onnx_model), input_feed=feed
    )
    return list(results)
def _extract_dynamic_axis(
    onnx_model: str,
    data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]],
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Detect dynamic axes by running the model on the first samples.

    Args:
        onnx_model: Path of the ONNX file.
        data: Samples whose first element is the tuple of input arrays.
        input_sizes: Reference size of each input.
        device: Device used to run the model.
        max_data: Maximum number of samples inspected.

    Returns:
        ``{"inputs": [...], "outputs": [...]}`` with one dict per tensor,
        or ``None`` when no dynamic axis was recorded.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # One independent dict per input: `[{}] * n` would repeat the SAME dict,
    # so an axis recorded for one input would show up on all of them.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(data):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(
            run_onnx_model(onnx_model, list(input_tensors), device)
        )
        if i == 0:
            # The first run fixes the number of outputs and their sizes.
            dynamic_axis["outputs"] = [{} for _ in outputs]
            output_sizes = [tuple(output.shape[1:]) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None
def extract_info_from_np_data(
    onnx_model: str,
    data: List[Tuple[Tuple[np.ndarray, ...], np.ndarray]],
    dynamic_axis: Dict,
    device: Device,
    **kwargs,
):
    """Derive batch size, input sizes, input types and dynamic axes.

    Only the first sample of ``data`` is used for sizes and types.

    Args:
        onnx_model: Path of the ONNX file.
        data: Samples whose first element is the tuple of input arrays.
        dynamic_axis: User-provided dynamic axes, or ``None`` to use the
            ones detected from the data.
        device: Device used when running the model for detection.
        **kwargs: Accepted for signature compatibility; unused.

    Returns:
        ``(batch_size, input_sizes, input_types, dynamic_axis)`` where
        ``input_types`` holds "int32", "int64", "float16" or "float32"
        (the fallback for every other dtype).
    """
    from nebullvm.tools.utils import ifnone

    input_row = data[0][0]
    batch_size = int(input_row[0].shape[0])
    if not all(input_row[0].shape[0] == x.shape[0] for x in input_row):
        logger.warning("Detected not consistent batch size in the inputs.")

    input_sizes = [tuple(x.shape) for x in input_row]

    # `x.dtype` is a dtype instance, so it has to be compared with `==`;
    # an identity check against the scalar classes never matches and would
    # label every input "float32".
    named_dtypes = (
        (np.int32, "int32"),
        (np.int64, "int64"),
        (np.float16, "float16"),
    )
    input_types = [
        next(
            (name for dtype, name in named_dtypes if x.dtype == dtype),
            "float32",
        )
        for x in input_row
    ]

    dynamic_axis = ifnone(
        dynamic_axis,
        _extract_dynamic_axis(onnx_model, data, input_sizes, device),
    )
    return batch_size, input_sizes, input_types, dynamic_axis
def get_output_info_onnx(
    onnx_model: str, input_tensors: List[np.ndarray], device
) -> List[Tuple[Tuple[int, ...], DataType]]:
    """Run the model once and return ``(shape, dtype)`` for each output."""
    outputs = run_onnx_model(onnx_model, input_tensors, device)
    output_infos = []
    for array in outputs:
        data_type = DataType.from_framework_format(array.dtype)
        output_infos.append((tuple(array.shape), data_type))
    return output_infos
def create_model_inputs_onnx(input_infos: List[InputInfo]) -> List[np.ndarray]:
    """Create one random NumPy array per input description.

    FLOAT32 inputs are drawn from a standard normal distribution; every
    other dtype gets random integers in ``[min_value or 0, max_value or
    100)``.
    """

    def _random_array(info):
        if info.dtype is DataType.FLOAT32:
            return np.random.randn(*info.size).astype(np.float32)
        return np.random.randint(
            size=info.size,
            low=info.min_value or 0,
            high=info.max_value or 100,
        )

    return [_random_array(info) for info in input_infos]
def onnx_is_gpu_available():
    """Return True when onnxruntime reports "GPU" as its device."""
    reported_device = ort.get_device()
    return reported_device == "GPU"
================================================
FILE: optimization/nebullvm/nebullvm/tools/pytorch.py
================================================
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Union, Sequence
from loguru import logger
from nebullvm.core.models import Device, DataType, DeviceType, InputInfo
from nebullvm.optional_modules.torch import torch, DataLoader
from nebullvm.tools.data import DataManager
from nebullvm.tools.diffusers import get_default_dynamic_info
# Class name given to the module written by save_with_torch_fx and looked up
# again by load_with_torch_fx.
FX_MODULE_NAME = "NebullvmFxModule"
def save_with_torch_fx(model: torch.nn.Module, path: Path):
    """Symbolically trace ``model`` and write it to the folder ``path``."""
    torch.fx.symbolic_trace(model).to_folder(path, FX_MODULE_NAME)
def load_with_torch_fx(
    path: Path, state_dict_name: str = "pruned_state_dict.pt"
):
    """Rebuild a model saved with ``save_with_torch_fx``.

    Args:
        path: Folder containing ``module.py`` and the state dict file.
        state_dict_name: File name of the state dict inside ``path``.

    Returns:
        The instantiated module with the state dict loaded.
    """
    with open(path / "module.py", "r") as source_file:
        source_code = source_file.read()
    # SECURITY NOTE: the file content is executed as Python code (and
    # torch.load unpickles the state dict); only load folders from trusted
    # sources.
    exec(source_code, globals())
    module_class = eval(FX_MODULE_NAME)
    model = module_class()
    state_dict = torch.load(path / state_dict_name)
    model.load_state_dict(state_dict)
    return model
def get_output_info_torch(
    torch_model: torch.nn.Module,
    input_tensors: List[torch.Tensor],
    device: Device,
) -> List[Tuple[Tuple[int, ...], DataType]]:
    """Run the model once and return ``(shape, dtype)`` for each output.

    On GPU devices the inputs and the model are moved to the device first.
    A single-tensor output yields a one-element list.
    """
    if device.type is DeviceType.GPU:
        input_tensors = [x.to(device.to_torch_format()) for x in input_tensors]
        torch_model.to(device.to_torch_format())

    with torch.no_grad():
        outputs = torch_model(*input_tensors)

    def _describe(tensor):
        return (
            tuple(tensor.size()),
            DataType.from_framework_format(tensor.dtype),
        )

    if isinstance(outputs, torch.Tensor):
        return [_describe(outputs)]
    return [_describe(output) for output in outputs]
def create_model_inputs_torch(
    input_infos: List[InputInfo],
) -> List[torch.Tensor]:
    """Create one random torch tensor per input description.

    FLOAT32 inputs are drawn from a standard normal distribution; every
    other dtype gets random integers in ``[min_value or 0, max_value or
    100)``.
    """

    def _random_tensor(info):
        if info.dtype is DataType.FLOAT32:
            return torch.randn(*info.size)
        return torch.randint(
            size=info.size,
            low=info.min_value or 0,
            high=info.max_value or 100,
        )

    return [_random_tensor(info) for info in input_infos]
def run_torch_model(
    torch_model: torch.nn.Module,
    input_tensors: List[torch.Tensor],
    device: Device,
    dtype: torch.dtype = torch.float,
) -> List[torch.Tensor]:
    """Run the model in eval mode and return its outputs on the CPU.

    Args:
        torch_model: Model to run; put in eval mode and, on GPU devices,
            moved to the device.
        input_tensors: Positional inputs of the model.
        device: Target device.
        dtype: When ``torch.half`` on GPU, float inputs are cast to half.

    Returns:
        The outputs as a list of CPU tensors (one element for a
        single-tensor output).
    """
    torch_model.eval()
    on_gpu = device.type is DeviceType.GPU
    if on_gpu:
        torch_model.to(device.to_torch_format())
    use_half = not (dtype != torch.half)

    def _prepare(tensor):
        moved = tensor.to(device.to_torch_format())
        if use_half and tensor.dtype == torch.float:
            return moved.half()
        return moved

    with torch.no_grad():
        # Inputs are moved inside the no_grad scope, right before the call.
        if on_gpu:
            model_inputs = [_prepare(t) for t in input_tensors]
        else:
            model_inputs = input_tensors
        pred = torch_model(*model_inputs)

    if isinstance(pred, torch.Tensor):
        return [pred.cpu()]
    return [p.cpu() for p in pred]
def _extract_dynamic_axis(
    torch_model: torch.nn.Module,
    dataloader: DataManager,
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Detect dynamic axes by running the model on the first batches.

    Args:
        torch_model: Model to run.
        dataloader: Batches whose first element is the tuple of inputs.
        input_sizes: Reference size of each input.
        device: Device used to run the model.
        max_data: Maximum number of batches inspected.

    Returns:
        ``{"inputs": [...], "outputs": [...]}`` with one dict per tensor,
        or ``None`` when no dynamic axis was recorded.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # One independent dict per input: `[{}] * n` would repeat the SAME dict,
    # so an axis recorded for one input would show up on all of them.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(dataloader):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(run_torch_model(torch_model, input_tensors, device))
        if i == 0:
            # The first run fixes the number of outputs and their sizes.
            dynamic_axis["outputs"] = [{} for _ in outputs]
            output_sizes = [tuple(output.shape) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None
def extract_info_from_torch_data(
    model: torch.nn.Module,
    dataloader: Union[DataLoader, Sequence],
    dynamic_axis: Dict,
    device: Device,
    is_diffusion: bool = False,
):
    """Derive batch size, input sizes, input types and dynamic axes.

    Only the first batch of ``dataloader`` is used for sizes and types.

    Args:
        model: Model run when dynamic axes are detected from the data.
        dataloader: Batches whose first element is the tuple of inputs.
        dynamic_axis: User-provided dynamic axes, or ``None``.
        device: Target device.
        is_diffusion: Whether the model is a diffusion model.

    Returns:
        ``(batch_size, input_sizes, input_types, dynamic_axis)`` where
        ``input_types`` holds "int64", "int32", "float16" or "float32"
        (the fallback for every other tensor type).
    """
    from nebullvm.tools.utils import ifnone

    if isinstance(dataloader, Sequence):
        first_batch = dataloader[0]
    else:
        first_batch = next(iter(dataloader))
    input_row = first_batch[0]

    batch_size = int(input_row[0].shape[0])
    if any(x.shape[0] != input_row[0].shape[0] for x in input_row):
        logger.warning("Detected not consistent batch size in the inputs.")

    input_sizes = [tuple(x.shape) for x in input_row]

    def _type_name(tensor):
        on_cpu = tensor.cpu()
        if isinstance(on_cpu, torch.LongTensor):
            return "int64"
        if isinstance(on_cpu, torch.IntTensor):
            return "int32"
        if isinstance(on_cpu, torch.HalfTensor):
            return "float16"
        return "float32"

    input_types = [_type_name(x) for x in input_row]

    # For the Stable Diffusion UNet we must provide dynamic axis
    # even when using static shapes, because otherwise the converted
    # onnx model will have size issues.
    if dynamic_axis is None and device.type is DeviceType.GPU and is_diffusion:
        dynamic_axis = get_default_dynamic_info(input_sizes)

    if dynamic_axis is not None:
        # Normalize the axis indices to integers.
        for section in ("inputs", "outputs"):
            dynamic_axis[section] = [
                {int(axis): label for (axis, label) in entry.items()}
                for entry in dynamic_axis[section]
            ]

    dynamic_axis = ifnone(
        dynamic_axis,
        _extract_dynamic_axis(model, dataloader, input_sizes, device),
    )
    return batch_size, input_sizes, input_types, dynamic_axis
def torch_is_gpu_available():
    """Return whether torch reports CUDA as available."""
    cuda_available = torch.cuda.is_available()
    return cuda_available
def torch_get_device_name():
    """Return the name torch reports for CUDA device 0."""
    first_device_index = 0
    return torch.cuda.get_device_name(first_device_index)
def get_torch_model_size(
    model: Union[torch.nn.Module, torch.jit.ScriptModule, torch.fx.GraphModule]
):
    """Return the size in bytes of the model's parameters plus buffers."""
    params_bytes = sum(
        p.nelement() * p.element_size() for p in model.parameters()
    )
    buffers_bytes = sum(
        b.nelement() * b.element_size() for b in model.buffers()
    )
    return params_bytes + buffers_bytes
================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/__init__.py
================================================
================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/test_data.py
================================================
import tensorflow as tf
import torch
from nebullvm.tools.data import DataManager
def test_custom_input_data():
    """A list of ``((inputs,), label)`` tuples is exposed unchanged."""
    samples = [
        ((torch.randn(2, 3, 10, 10),), torch.randn(2, 1)) for _ in range(4)
    ]
    manager = DataManager(samples)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 1
    assert inputs[0].shape == (2, 3, 10, 10)
    assert first[1].shape == (2, 1)
def test_torch_dataloader_single_input_with_label():
    """Torch loader yielding (input, label) becomes ((input,), label)."""
    features = torch.randn(8, 3, 10, 10)
    labels = torch.randn(8, 1)
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size=2
    )
    manager = DataManager.from_dataloader(loader)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 1
    assert inputs[0].shape == (2, 3, 10, 10)
    assert first[1].shape == (2, 1)
def test_torch_dataloader_two_inputs_with_label():
    """Torch loader with two inputs and a label groups the two inputs."""
    tensors = (
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 1),
    )
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*tensors), batch_size=2
    )
    manager = DataManager.from_dataloader(loader)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 2
    for tensor in inputs:
        assert tensor.shape == (2, 3, 10, 10)
    assert first[1].shape == (2, 1)
def test_torch_dataloader_three_inputs_with_label():
    """Torch loader with three inputs and a label groups the three inputs."""
    tensors = (
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 3, 10, 10),
        torch.randn(8, 1),
    )
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*tensors), batch_size=2
    )
    manager = DataManager.from_dataloader(loader)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 3
    for tensor in inputs:
        assert tensor.shape == (2, 3, 10, 10)
    assert first[1].shape == (2, 1)
def test_torch_dataloader_single_input_without_label():
    """Without a label, each item still has two fields and one input."""
    features = torch.randn(8, 3, 10, 10)
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features), batch_size=2
    )
    manager = DataManager.from_dataloader(loader)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 1
    assert inputs[0].shape == (2, 3, 10, 10)
def test_tensorflow_dataloader_single_input_with_label():
    """Batched tf dataset of (input, label) becomes ((input,), label)."""
    slices = (tf.random.normal([8, 10, 10, 3]), tf.random.normal([8, 1]))
    batched = tf.data.Dataset.from_tensor_slices(slices).batch(2)
    manager = DataManager.from_dataloader(batched)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 1
    assert inputs[0].shape == (2, 10, 10, 3)
    assert first[1].shape == (2, 1)
def test_tensorflow_dataloader_two_inputs_with_label():
    """Batched tf dataset with two inputs and a label groups the inputs."""
    slices = (
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 1]),
    )
    batched = tf.data.Dataset.from_tensor_slices(slices).batch(2)
    manager = DataManager.from_dataloader(batched)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 2
    for tensor in inputs:
        assert tensor.shape == (2, 10, 10, 3)
    assert first[1].shape == (2, 1)
def test_tensorflow_dataloader_three_inputs_with_label():
    """Batched tf dataset with three inputs and a label groups the inputs."""
    slices = (
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 10, 10, 3]),
        tf.random.normal([8, 1]),
    )
    batched = tf.data.Dataset.from_tensor_slices(slices).batch(2)
    manager = DataManager.from_dataloader(batched)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 3
    for tensor in inputs:
        assert tensor.shape == (2, 10, 10, 3)
    assert first[1].shape == (2, 1)
def test_tensorflow_dataloader_single_input_without_label():
    """Without a label, each item still has two fields and one input."""
    features = tf.random.normal([8, 10, 10, 3])
    batched = tf.data.Dataset.from_tensor_slices(features).batch(2)
    manager = DataManager.from_dataloader(batched)
    assert len(manager) == 4

    first = manager[0]
    assert len(first) == 2
    inputs = first[0]
    assert len(inputs) == 1
    assert inputs[0].shape == (2, 10, 10, 3)
================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/test_hardware_utils.py
================================================
import unittest
from unittest.mock import patch
from nebullvm.tools import hardware_utils
class TestGetHwSetup(unittest.TestCase):
    """Checks on ``hardware_utils.get_hw_setup`` with mocked accelerators."""

    def _assert_host_info_present(self, setup):
        # Fields expected to be filled in regardless of the accelerator.
        self.assertGreater(len(setup.cpu), 0)
        self.assertGreater(len(setup.operating_system), 0)
        self.assertGreater(setup.memory_gb, 0)

    @patch(
        "nebullvm.tools.hardware_utils.gpu_is_available", return_value=False
    )
    @patch(
        "nebullvm.tools.hardware_utils.tpu_is_available", return_value=False
    )
    @patch(
        "nebullvm.tools.hardware_utils.neuron_is_available", return_value=False
    )
    def test_hw_setup__gpu_not_available(self, *_):
        hw_setup = hardware_utils.get_hw_setup()
        self.assertIsNone(hw_setup.accelerator)
        self._assert_host_info_present(hw_setup)

    @patch("nebullvm.tools.hardware_utils.gpu_is_available", return_value=True)
    @patch(
        "nebullvm.tools.hardware_utils._get_gpu_name", return_value="mock-gpu"
    )
    def test_hw_setup__gpu_is_available(self, *_):
        hw_setup = hardware_utils.get_hw_setup()
        self.assertEqual("mock-gpu", hw_setup.accelerator)
        self._assert_host_info_present(hw_setup)
================================================
FILE: optimization/nebullvm/nebullvm/tools/tests/test_utils.py
================================================
import unittest
from unittest.mock import patch
from nebullvm.core.models import DeviceType
from nebullvm.tools import utils
class TestGetThroughput(unittest.TestCase):
    """Checks on ``utils.get_throughput``."""

    def test_latency_is_zero(self):
        # A zero latency is reported with the -1 sentinel.
        throughput = utils.get_throughput(0, 10)
        self.assertEqual(-1, throughput)
class TestCheckDevice(unittest.TestCase):
    """Checks on ``utils.check_device`` with mocked accelerator probes."""

    def _expect(self, expected_type, expected_idx, *spec):
        # `spec` is empty to exercise the default (auto-detect) call.
        device = utils.check_device(*spec)
        self.assertEqual(expected_type, device.type)
        self.assertEqual(device.idx, expected_idx)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_none_no_device_available(self, *_):
        self._expect(DeviceType.CPU, 0)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=True)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_none_gpu_is_available(self, *_):
        self._expect(DeviceType.GPU, 0)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=True)
    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_none_tpu_is_available(self, *_):
        self._expect(DeviceType.TPU, 0)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=True)
    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_none_neuron_is_available(self, *_):
        self._expect(DeviceType.NEURON, 0)

    def test_device_is_cpu(self):
        self._expect(DeviceType.CPU, 0, "cpu")

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=False)
    def test_device_is_gpu_no_gpu_available(self, _):
        for spec in ("gpu", "cuda", "cuda:1", "gpu:2"):
            self._expect(DeviceType.CPU, 0, spec)

    @patch("nebullvm.tools.utils.gpu_is_available", return_value=True)
    def test_device_is_gpu_gpu_is_available(self, _):
        cases = (("gpu", 0), ("cuda", 0), ("cuda:1", 1), ("gpu:2", 2))
        for spec, idx in cases:
            self._expect(DeviceType.GPU, idx, spec)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=False)
    def test_device_is_tpu_no_tpu_available(self, _):
        for spec in ("tpu", "tpu:1"):
            self._expect(DeviceType.CPU, 0, spec)

    @patch("nebullvm.tools.utils.tpu_is_available", return_value=True)
    def test_device_is_tpu_tpu_is_available(self, _):
        for spec, idx in (("tpu", 0), ("tpu:1", 1)):
            self._expect(DeviceType.TPU, idx, spec)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=False)
    def test_device_is_neuron_no_neuron_available(self, _):
        for spec in ("neuron", "neuron:1"):
            self._expect(DeviceType.CPU, 0, spec)

    @patch("nebullvm.tools.utils.neuron_is_available", return_value=True)
    def test_device_is_neuron_neuron_is_available(self, _):
        for spec, idx in (("neuron", 0), ("neuron:1", 1)):
            self._expect(DeviceType.NEURON, idx, spec)
================================================
FILE: optimization/nebullvm/nebullvm/tools/tf.py
================================================
from typing import Union, List, Tuple, Any, Optional, Dict
import numpy as np
from loguru import logger
from nebullvm.core.models import Device, DataType, InputInfo
from nebullvm.optional_modules.tensorflow import tensorflow as tf
def get_output_info_tf(
    tf_model: Union[tf.Module, tf.keras.Model],
    input_tensors: List[tf.Tensor],
    device: Device,
) -> List[Tuple[Tuple[int, ...], DataType]]:
    """Run the model once and return ``(shape, dtype)`` for each output.

    A single tensor output is reported as a one-element list.
    """
    with tf.device(device.to_tf_format()):
        outputs = tf_model(input_tensors)
    if isinstance(outputs, tf.Tensor) and outputs is not None:
        outputs = [outputs]
    output_infos = []
    for out in outputs:
        dtype = DataType.from_framework_format(out.dtype)
        output_infos.append((tuple(out.shape), dtype))
    return output_infos
def create_model_inputs_tf(input_infos: List[InputInfo]) -> List[tf.Tensor]:
    """Create one random tensor per input description.

    The size is reordered so that element 1 of ``size`` becomes the last
    axis. FLOAT32 inputs are drawn from a normal initializer; every other
    dtype is generated as uniform ``int32`` within ``min_value``/``max_value``
    (falling back to 0 and 100).
    """
    tensors = []
    for info in input_infos:
        size = info.size
        shape = (size[0], *size[2:], size[1])
        if info.dtype is DataType.FLOAT32:
            tensor = tf.random_normal_initializer()(shape=shape)
        else:
            tensor = tf.random.uniform(
                shape=shape,
                minval=info.min_value or 0,
                maxval=info.max_value or 100,
                dtype=tf.int32,
            )
        tensors.append(tensor)
    return tensors
def run_tf_model(
    model: tf.Module,
    input_tensors: Tuple[tf.Tensor],
    device: Device,
) -> Tuple[tf.Tensor]:
    """Call the model on the given device; wrap a lone tensor in a tuple."""
    with tf.device(device.to_tf_format()):
        prediction = model(input_tensors)
    return (prediction,) if isinstance(prediction, tf.Tensor) else prediction
def _extract_dynamic_axis(
    tf_model: tf.Module,
    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],
    input_sizes: List[Tuple[int, ...]],
    device: Device,
    max_data: int = 100,
) -> Optional[Dict]:
    """Detect axes whose size changes across the first ``max_data`` samples.

    Input shapes are compared against ``input_sizes``; output shapes are
    compared against the shapes produced for the first sample.

    Args:
        tf_model: Model run on every inspected sample.
        dataset: Iterable of ``(inputs, label)`` items.
        input_sizes: Reference shape (batch axis included) for each input.
        device: Device on which the model is run.
        max_data: Maximum number of samples to inspect.

    Returns:
        ``{"inputs": [...], "outputs": [...]}`` with one ``{axis: tag}`` dict
        per tensor, or ``None`` when no axis was found to vary.
    """
    from nebullvm.tools.utils import inspect_dynamic_size

    # One independent dict per tensor. `[{}] * n` would repeat the SAME dict,
    # so a dynamic axis found on one tensor would be recorded for all.
    dynamic_axis = {"inputs": [{} for _ in input_sizes], "outputs": []}
    output_sizes = []
    for i, input_data in enumerate(dataset):
        if i >= max_data:
            break
        input_tensors = input_data[0]
        inspect_dynamic_size(
            input_tensors, input_sizes, dynamic_axis["inputs"]
        )
        outputs = tuple(run_tf_model(tf_model, input_tensors, device))
        if i == 0:
            dynamic_axis["outputs"] = [{} for _ in outputs]
            # Keep the batch axis: inspect_dynamic_size zips these sizes with
            # the full `tensor.shape` and treats position 0 as the batch, so
            # dropping it would compare misaligned axes.
            output_sizes = [tuple(output.shape) for output in outputs]
        inspect_dynamic_size(outputs, output_sizes, dynamic_axis["outputs"])
    if any(
        len(x) > 0 for x in (dynamic_axis["inputs"] + dynamic_axis["outputs"])
    ):
        return dynamic_axis
    return None
def extract_info_from_tf_data(
    tf_model: tf.Module,
    dataset: List[Tuple[Tuple[tf.Tensor, ...], Any]],
    dynamic_axis: Dict,
    device: Device,
    **kwargs,
):
    """Derive batch size, input shapes/dtypes and dynamic axes from data.

    The inputs of ``dataset[0]`` are used as the reference sample.

    Args:
        tf_model: Model that is run on the data when the dynamic axes have
            to be discovered.
        dataset: Indexable collection of ``(inputs, label)`` items.
        dynamic_axis: Optional dynamic-axis dict; returned untouched when
            given.
        device: Device used when the model has to be run.
        **kwargs: Accepted and ignored.

    Returns:
        Tuple ``(batch_size, input_sizes, input_types, dynamic_axis)``.
    """
    input_row = dataset[0][0]
    reference_batch = input_row[0].shape[0]
    batch_size = int(reference_batch)
    if any(x.shape[0] != reference_batch for x in input_row):
        logger.warning("Detected not consistent batch size in the inputs.")
    input_sizes = [tuple(x.shape) for x in input_row]
    # Anything that is not int32/int64/float16 is reported as float32.
    input_types = [
        "int32"
        if x.dtype in [tf.int32, np.int32]
        else "int64"
        if x.dtype in [tf.int64, np.int64]
        else "float16"
        if x.dtype in [tf.float16, np.float16]
        else "float32"
        for x in input_row
    ]
    if dynamic_axis is None:
        # Only run the model over the data when nothing was provided;
        # the inference step executes the model on up to `max_data` samples.
        dynamic_axis = _extract_dynamic_axis(
            tf_model, dataset, input_sizes, device
        )
    return batch_size, input_sizes, input_types, dynamic_axis
def tensorflow_is_gpu_available():
    """Return True when TensorFlow lists at least one physical GPU."""
    gpu_devices = tf.config.list_physical_devices("GPU")
    return len(gpu_devices) > 0
def tensorflow_get_gpu_name():
    """Return the name of the first GPU seen by TensorFlow.

    Returns:
        The ``device_name`` detail of the first physical GPU, or
        ``"Unknown GPU"`` when no GPU is listed or the detail is missing.
    """
    gpu_devices = tf.config.list_physical_devices("GPU")
    if not gpu_devices:
        return "Unknown GPU"
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    # The details dict is not guaranteed to contain any particular key, so
    # fall back instead of indexing it directly.
    return details.get("device_name", "Unknown GPU")
================================================
FILE: optimization/nebullvm/nebullvm/tools/transformations.py
================================================
import copy
from abc import ABC, abstractmethod
from typing import List, Any, Dict
import numpy as np
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
class BaseTransformation(ABC):
    """Abstract callable transformation.

    Subclasses implement ``_transform``; calling the instance forwards the
    input and keyword arguments to it.
    """

    @abstractmethod
    def _transform(self, _input: Any, **kwargs) -> Any:
        raise NotImplementedError()

    def __call__(self, _input: Any, **kwargs):
        return self._transform(_input, **kwargs)

    def to_dict(self):
        """Return the module and class name identifying this transformation."""
        klass = self.__class__
        description = {"module": klass.__module__, "name": klass.__name__}
        return description

    @classmethod
    def from_dict(cls, tfm_dict: Dict):
        """Build an instance; the dict content is not used here."""
        return cls()
class MultiStageTransformation(BaseTransformation):
    """Apply a list of transformations one after the other."""

    def __init__(self, transformations: List[BaseTransformation]):
        # The list is stored as given (not copied): `append`/`extend`
        # therefore also modify the list passed by the caller.
        self._tfms = transformations

    def _transform(self, _input: Any, **kwargs) -> Any:
        for tfm in self._tfms:
            _input = tfm(_input, **kwargs)
        return _input

    def append(self, __tfm: BaseTransformation):
        """Add one transformation at the end of the pipeline."""
        self._tfms.append(__tfm)

    def extend(self, tfms: List[BaseTransformation]):
        """Add several transformations at the end of the pipeline."""
        self._tfms += tfms

    def to_dict(self) -> Dict:
        """Serialize every stage under the ``"tfms"`` key."""
        return {"tfms": [tfm.to_dict() for tfm in self._tfms]}

    def to_list(self):
        """Return the internal list of stages."""
        return self._tfms

    @classmethod
    def from_dict(cls, tfms_dict: Dict):
        """Rebuild a pipeline from the output of ``to_dict``.

        Each entry names a ``module`` and a class ``name``; the class is
        imported and its own ``from_dict`` is used to build the stage.
        """
        # Resolve classes with importlib instead of exec/eval: nothing from
        # the dict is ever executed as code, and it does not depend on
        # `exec` being able to inject names into the function's locals.
        import importlib

        tfms = []
        for tfm_dict in tfms_dict["tfms"]:
            module = importlib.import_module(tfm_dict["module"])
            tfm_class = getattr(module, tfm_dict["name"])
            tfms.append(tfm_class.from_dict(tfm_dict))
        return cls(tfms)

    def copy(self):
        """Return a new pipeline holding deep copies of the stages."""
        new_list = copy.deepcopy(self._tfms)
        return self.__class__(new_list)

    def __len__(self):
        return len(self._tfms)
class HalfPrecisionTransformation(BaseTransformation):
    """Cast float32 numpy/torch/tf inputs to float16.

    Inputs of a supported type with any other dtype are returned unchanged.
    """

    @staticmethod
    def _transform_numpy(_input: np.ndarray) -> np.ndarray:
        return _input.astype(dtype=np.float16)

    @staticmethod
    def _transform_tf(_input: tf.Tensor) -> tf.Tensor:
        return tf.cast(_input, tf.float16)

    @staticmethod
    def _transform_torch(_input: torch.Tensor) -> torch.Tensor:
        return _input.half()

    def _transform(self, _input: Any, **kwargs) -> Any:
        # Select the float32 marker and the caster for the input's framework.
        if isinstance(_input, np.ndarray):
            full_precision, to_half = np.float32, self._transform_numpy
        elif isinstance(_input, torch.Tensor):
            full_precision, to_half = torch.float32, self._transform_torch
        elif isinstance(_input, tf.Tensor) and _input is not None:
            full_precision, to_half = tf.float32, self._transform_tf
        else:
            raise TypeError(
                f"The given input type is not currently supported. "
                f"Got {type(_input)}, expected one between (np.ndarray, "
                f"torch.Tensor, tf.Tensor)"
            )
        if _input.dtype == full_precision:
            return to_half(_input)
        return _input
class NoOp(BaseTransformation):
    """Transformation that returns its input untouched."""

    def _transform(self, _input: Any, **kwargs):
        unchanged = _input
        return unchanged
class VerifyContiguity(BaseTransformation):
    """Make torch tensors contiguous; pass anything else through."""

    def _transform(self, _input: Any, **kwargs) -> Any:
        if isinstance(_input, torch.Tensor) and not _input.is_contiguous():
            return _input.contiguous()
        return _input
================================================
FILE: optimization/nebullvm/nebullvm/tools/utils.py
================================================
import os
import subprocess
import sys
import uuid
from pathlib import Path
from types import ModuleType
from typing import (
Tuple,
Any,
List,
Dict,
Union,
Iterable,
Sequence,
Optional,
Callable,
)
import numpy as np
from loguru import logger
from packaging import version
from nebullvm.core.models import (
DeepLearningFramework,
Device,
ModelParams,
DeviceType,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.data import DataManager
from nebullvm.tools.onnx import (
extract_info_from_np_data,
get_output_info_onnx,
)
from nebullvm.tools.pytorch import (
extract_info_from_torch_data,
get_output_info_torch,
)
from nebullvm.tools.tf import (
extract_info_from_tf_data,
get_output_info_tf,
)
def get_model_size_mb(model: Any) -> float:
    """Return the model size in MB (10**6 bytes), rounded to 2 decimals.

    Paths (``str`` or ``Path``) are measured on disk, torch modules by the
    bytes of their parameters; any other object is assumed to expose
    ``count_params()`` with 4 bytes per parameter.
    """
    if isinstance(model, (str, Path)):
        location = model if isinstance(model, str) else model.as_posix()
        n_bytes = os.path.getsize(location)
    elif isinstance(model, torch.nn.Module):
        n_bytes = 0
        for param in model.parameters():
            n_bytes += param.nelement() * param.element_size()
    else:
        # we assume it is a tf_model
        # assuming full precision 32 bit
        n_bytes = model.count_params() * 4
    return round(n_bytes * 1e-6, 2)
def get_model_name(model: Any) -> str:
    """Return the path for path-like models, else the class name."""
    if isinstance(model, Path):
        return model.as_posix()
    return model if isinstance(model, str) else model.__class__.__name__
def generate_model_id(model: Any) -> str:
    """Return ``<random uuid4>_<hash of the model name>``."""
    name_hash = hash(get_model_name(model))
    unique_part = str(uuid.uuid4())
    return f"{unique_part}_{name_hash}"
def get_throughput(latency: float, batch_size: int = 1) -> float:
    """Return samples per unit of time, or -1 when the latency is zero."""
    return -1 if latency == 0 else (1 / latency) * batch_size
def ifnone(target, new_value):
    """Return ``new_value`` when ``target`` is None, otherwise ``target``."""
    return new_value if target is None else target
def inspect_dynamic_size(
    tensors: Tuple[Any, ...],
    sizes: List[Tuple[int, ...]],
    axis_list: List[Dict],
):
    """Record in ``axis_list`` every axis whose size differs from ``sizes``.

    For tensor ``i`` and axis ``a`` with a mismatching dimension,
    ``axis_list[i][a]`` is set to ``"batch_size"`` for axis 0 and to
    ``"val_<actual>_<expected>"`` otherwise. ``axis_list`` is modified in
    place and nothing is returned.
    """
    for tensor_idx, (tensor, expected) in enumerate(zip(tensors, sizes)):
        dims = zip(tensor.shape, expected)
        for axis, (actual_dim, expected_dim) in enumerate(dims):
            if actual_dim == expected_dim:
                continue
            tag = (
                "batch_size"
                if axis == 0
                else f"val_{actual_dim}_{expected_dim}"
            )
            axis_list[tensor_idx][axis] = tag
def gpu_is_available():
    """Return True when running ``nvidia-smi`` succeeds, False otherwise."""
    try:
        subprocess.check_output("nvidia-smi")
    except Exception:
        return False
    return True
def neuron_is_available():
    """Return True when running ``neuron-ls`` succeeds, False otherwise."""
    try:
        subprocess.check_output("neuron-ls")
    except Exception:
        return False
    return True
def tpu_is_available():
    """Return True when torch_xla reports its device hardware as "TPU".

    Any failure (including torch_xla not being importable) yields False.
    """
    try:
        import torch_xla.core.xla_model as xm

        hardware = xm.xla_device_hw(xm.xla_device())
        return hardware == "TPU"
    except Exception:
        return False
def check_module_version(
    module: ModuleType, min_version: str = None, max_version: str = None
) -> bool:
    """Check that ``module.__version__`` lies within the given bounds.

    Both bounds are inclusive; a bound left to None is not checked.
    """
    current = module.__version__
    too_old = min_version is not None and version.parse(
        current
    ) < version.parse(min_version)
    if too_old:
        return False
    too_new = max_version is not None and version.parse(
        current
    ) > version.parse(max_version)
    return not too_new
def is_python_version_3_10():
    """Return True when the running interpreter is Python 3.10."""
    running = (sys.version_info.major, sys.version_info.minor)
    return running == (3, 10)
def get_dl_framework(model: Any):
    """Map a model object to its ``DeepLearningFramework`` member.

    Torch modules map to PYTORCH, tf modules to TENSORFLOW and a string
    pointing to an existing file to NUMPY.

    Raises:
        FileNotFoundError: ``model`` is a string that is not an existing file.
        TypeError: ``model`` is of any other type.
    """
    if isinstance(model, torch.nn.Module):
        return DeepLearningFramework.PYTORCH
    if isinstance(model, tf.Module) and model is not None:
        return DeepLearningFramework.TENSORFLOW
    if not isinstance(model, str):
        raise TypeError(f"Model type {type(model)} not supported.")
    if not Path(model).is_file():
        raise FileNotFoundError(
            f"No file '{model}' found, please provide a valid path to "
            f"a model."
        )
    return DeepLearningFramework.NUMPY
def check_input_data(input_data: Union[Iterable, Sequence]):
    """Tell whether ``input_data`` has the ``((inputs,), label)`` layout.

    The data is valid when it is non-empty, its first item is a tuple whose
    first element is a tuple starting with a numpy/torch/tf tensor and,
    when a second element exists, that element is a tensor, an int, a float
    or None. Only the first item is inspected.

    Returns:
        True when the layout matches, False otherwise (including when the
        data cannot be measured or indexed).
    """
    # Explicit checks instead of `assert`: assertions are removed when
    # Python runs with -O, which would make every input look valid.
    try:
        tensor_types = (np.ndarray, torch.Tensor, tf.Tensor)
        if not len(input_data) > 0:
            return False
        sample = input_data[0]
        if not isinstance(sample, tuple):
            return False
        if not isinstance(sample[0], tuple):
            return False
        if not isinstance(sample[0][0], tensor_types):
            return False
        if len(sample) > 1:
            label_types = tensor_types + (int, float, type(None))
            return isinstance(sample[1], label_types)
        return True
    except Exception:
        # Any failure while probing means the layout is not the expected
        # one; KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return False
def is_data_subscriptable(input_data: Union[Iterable, Sequence]):
    """Return True when ``input_data[0]`` can be evaluated without error."""
    try:
        input_data[0]
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return True
def check_dynamic_info_inputs(
    dynamic_info: Optional[Dict], input_sample: Tuple[Any]
):
    """Validate a user supplied dynamic-info dict against a sample input.

    Nothing is checked when ``dynamic_info`` is None.

    Args:
        dynamic_info: Dict expected to hold ``"inputs"`` and ``"outputs"``.
        input_sample: The model inputs of one sample; each must expose
            ``.shape`` (only read to build the error message).

    Raises:
        AssertionError: a key is missing, or the number of entries in
            ``"inputs"`` differs from the number of model inputs.
    """
    if dynamic_info is None:
        return

    # AssertionError is raised explicitly (same type and messages as the
    # former `assert` statements) so the checks still run under `python -O`.
    if dynamic_info.get("inputs") is None:
        raise AssertionError(
            "Dynamic info must contain an 'inputs' key with a list of "
            "dictionaries as value."
        )

    num_dynamic_inputs = len(dynamic_info["inputs"])
    num_model_inputs = len(input_sample)
    if num_dynamic_inputs != num_model_inputs:
        raise AssertionError(
            f"The number of dynamic inputs provided in the dynamic info "
            f"dict ({num_dynamic_inputs}) is not equal to the number "
            f"of inputs of the model ({num_model_inputs}). Detected model "
            f"input shapes are: {[inp.shape for inp in input_sample]} "
        )

    if dynamic_info.get("outputs") is None:
        raise AssertionError(
            "Dynamic info must contain an 'outputs' key with a list of "
            "dictionaries as value."
        )
def extract_info_from_data(
    model: Any,
    input_data: DataManager,
    dl_framework: DeepLearningFramework,
    dynamic_info: Optional[Dict],
    device: Device,
    is_diffusion: bool = False,
):
    """Build the ``ModelParams`` describing a model from sample data.

    The framework specific extractor and output-info function are looked up
    in ``INFO_EXTRACTION_DICT`` and ``OUTPUT_INFO_COMPUTATION_DICT``.
    """
    first_sample = input_data.get_list(1)[0]
    check_dynamic_info_inputs(dynamic_info, first_sample)

    extract_info = INFO_EXTRACTION_DICT[dl_framework]
    batch_size, input_sizes, input_types, dynamic_info = extract_info(
        model,
        input_data,
        dynamic_axis=dynamic_info,
        device=device,
        is_diffusion=is_diffusion,
    )

    compute_output_info = OUTPUT_INFO_COMPUTATION_DICT[dl_framework]
    output_infos = compute_output_info(model, input_data[0][0], device)

    input_infos = []
    for size, dtype in zip(input_sizes, input_types):
        input_infos.append({"size": size, "dtype": dtype})
    output_sizes = [info[0] for info in output_infos]
    output_types = [info[1] for info in output_infos]

    return ModelParams(
        batch_size=batch_size,
        input_infos=input_infos,
        output_sizes=output_sizes,
        output_types=output_types,
        dynamic_info=dynamic_info,
    )
def is_huggingface_data(data_sample: Any) -> bool:
    """Return True for dict-like samples, strings, or sequences of strings.

    Only the first element of a sequence is inspected.
    """
    return (
        is_dict_type(data_sample)
        or isinstance(data_sample, str)
        or isinstance(data_sample[0], str)
    )
def is_dict_type(data_sample: Any):
    """Return True when calling ``data_sample.items()`` works."""
    try:
        data_sample.items()
    except AttributeError:
        return False
    return True
def _get_idx(device: str) -> int:
device_info = device.split(":")
if len(device_info) == 2 and device_info[1].isdigit():
idx = int(device_info[1])
else:
idx = 0
return idx
def _set_device(
    accelerator_is_available: bool, device_type: DeviceType, idx: int
) -> Device:
    """Return the requested device, or a CPU device with a warning.

    The CPU fallback is used when ``accelerator_is_available`` is false.
    """
    if accelerator_is_available:
        return Device(device_type, idx=idx)

    logger.warning(
        f"Selected {device_type.name} device but no available "
        f"{device_type.name} found on this platform. CPU will "
        f"be used instead. Please make sure that the "
        f"{device_type.name} is installed and can be used by your "
        "framework."
    )
    return Device(DeviceType.CPU)
def check_device(device: Optional[str] = None) -> Device:
    """Resolve a device string (or None) into a usable ``Device``.

    With None, the first available accelerator among GPU, Neuron and TPU is
    selected, else the CPU. With a string, the accelerator is picked from
    the keyword it contains ("cuda"/"gpu", "neuron", "tpu"); if that
    accelerator is not available the CPU is used. Any other string selects
    the CPU.
    """
    # Built on every call so the availability probes are looked up, and
    # invoked, lazily and in priority order.
    accelerators = (
        (("cuda", "gpu"), DeviceType.GPU, gpu_is_available),
        (("neuron",), DeviceType.NEURON, neuron_is_available),
        (("tpu",), DeviceType.TPU, tpu_is_available),
    )

    if device is None:
        for _, device_type, is_available in accelerators:
            if is_available():
                return Device(device_type)
        return Device(DeviceType.CPU)

    requested = device.lower()
    for keywords, device_type, is_available in accelerators:
        if any(keyword in requested for keyword in keywords):
            return _set_device(
                accelerator_is_available=is_available(),
                device_type=device_type,
                idx=_get_idx(device),
            )
    return Device(DeviceType.CPU)
def get_gpu_compute_capability(gpu_idx: int) -> float:
    """Return the compute capability ``nvidia-smi`` reports for a GPU.

    ``gpu_idx`` selects the line of the ``nvidia-smi`` output to parse.
    """
    query = ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"]
    report = subprocess.check_output(query).decode("utf-8")
    per_gpu = report.split("\n")
    return float(per_gpu[gpu_idx])
# Framework -> function extracting batch size, input sizes/types and dynamic
# info from the data (used by `extract_info_from_data`).
INFO_EXTRACTION_DICT: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: extract_info_from_torch_data,
    DeepLearningFramework.TENSORFLOW: extract_info_from_tf_data,
    DeepLearningFramework.NUMPY: extract_info_from_np_data,
}
# Framework -> function computing the model outputs' info; each returned item
# is indexed as (size, type) by `extract_info_from_data`.
OUTPUT_INFO_COMPUTATION_DICT: Dict[DeepLearningFramework, Callable] = {
    DeepLearningFramework.PYTORCH: get_output_info_torch,
    DeepLearningFramework.TENSORFLOW: get_output_info_tf,
    DeepLearningFramework.NUMPY: get_output_info_onnx,
}
================================================
FILE: optimization/nebullvm/nebullvm/tools/venv.py
================================================
import subprocess
import tempfile
import venv
from loguru import logger
class EnvBuilder(venv.EnvBuilder):
    """``venv.EnvBuilder`` that keeps the context handed to ``post_setup``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Filled in by `post_setup`.
        self.context = None

    def post_setup(self, context):
        """Store the context so it can be read back from the builder."""
        self.context = context
def run_in_different_venv(
    requirements_file: str,
    script_path: str,
    use_gpu: bool,
    *args,
):
    """Run a python script inside a new temporary virtual environment.

    The environment is created in a temporary directory, a pinned
    torch/torchvision pair is installed, then the requirements file, and
    finally the script is executed with the environment's interpreter:
        > pip install <torch and torchvision pins>
        > pip install -r $requirements_file
        > python script_path *args

    Args:
        requirements_file (str): File (.txt) containing the list of
            requirements.
        script_path (str): Path to the script that must be run.
        use_gpu (bool): If True the ``+cu111`` torch builds are installed
            from the PyTorch wheel index, otherwise ``torch<=1.9.1``.
        args: Arguments of the script.
    """
    logger.debug(f"Debug: Running script {script_path} in a new virtual env.")
    with tempfile.TemporaryDirectory() as target_dir_path:
        logger.debug("Debug: Creating virtual environment...")
        builder = EnvBuilder(with_pip=True)
        builder.create(str(target_dir_path))
        python_exe = builder.context.env_exe

        logger.debug("Debug: Installing requirements...")
        pip_install = [python_exe, "-m", "pip", "install"]
        if use_gpu:
            torch_packages = [
                "torch==1.9.1+cu111",
                "torchvision==0.10.1+cu111",
                "-f",
                "https://download.pytorch.org/whl/torch_stable.html",
            ]
        else:
            torch_packages = ["torch<=1.9.1", "torchvision<=0.10.1"]
        subprocess.check_call(pip_install + torch_packages)
        subprocess.check_call(pip_install + ["-r", requirements_file])

        logger.debug("Debug: Executing script...")
        subprocess.check_call([python_exe, script_path, *args])
================================================
FILE: optimization/nebullvm/nebullvm.toml
================================================
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"
================================================
FILE: optimization/nebullvm/requirements-dev.txt
================================================
pytest
pytest-mock
torchvision
sentencepiece
================================================
FILE: optimization/nebullvm/requirements.txt
================================================
numpy>=1.21.0, <1.24.0
packaging>=21.3
py-cpuinfo==8.0.0
PyYAML>=6.0
psutil>=5.0.0
requests>=2.26.1
tqdm>=4.36.0
loguru>=0.5.3
================================================
FILE: optimization/nebullvm/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages
# Runtime dependencies, passed to `install_requires` below.
REQUIREMENTS = [
    "numpy>=1.21.0, <1.24.0",
    "py-cpuinfo>=8.0.0",
    "PyYAML>=6.0",
    "psutil>=5.0.0",
    "requests>=2.26.0",
    "tqdm>=4.36.0",
    "packaging>=21.3",
    "loguru>=0.5.3",
]

# The README located next to this file is used as the long description.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf8")

setup(
    name="nebullvm",
    version="0.10.0",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    long_description=long_description,
    include_package_data=True,
    # The long description is Markdown.
    long_description_content_type="text/markdown",
)
================================================
FILE: optimization/open_alpha_tensor/README.md
================================================
# 🐉 OpenAlphaTensor
OpenAlphaTensor provides an open-source implementation of Deepmind's AlphaTensor algorithm.
With OpenAlphaTensor, you can increase the computational performance of an AI model with custom-generated matrix multiplication algorithms. You can train your own AlphaTensor algorithm for a specific matrix size or fine-tune a pre-trained AlphaTensor model to produce optimized kernels for a specific hardware.
OpenAlphaTensor is based on Deepmind's paper [Discovering Faster Matrix Multiplication Algorithms with Reinforcement Learning](https://www.nature.com/articles/s41586-022-05172-4).
If you appreciate the project, show it by [leaving a star ⭐](https://github.com/nebuly-ai/nebullvm/stargazers)
## 🧑‍🏫 Installation
You can install the package by cloning the repository and running the following commands:
```bash
git clone https://github.com/nebuly-ai/nebullvm.git
cd nebullvm/apps/accelerate/open_alpha_tensor
pip install -e .
```
## 🚀 Get started
For training your AlphaTensor model, you can execute the following command:
```bash
python main.py
```
Model parameters can be given either as command line arguments or as a JSON file. The `config.json` file contains the default parameters for training a model for matrix size 4x4x4.
Alternatively, if you want more fine-grained control over the training process, you can use the Python API:
```python
from open_alpha_tensor import train_alpha_tensor
cardinality_vector = 5 # The actions can have values in range [-2, 2]
N_bar = 100 # parameter for smoothing the temperature while adjusting the probability distribution
matrix_size = 5
input_size = matrix_size**2
n_steps = 15
n_actions = cardinality_vector ** (3 * input_size // n_steps)
action_memory = 7
train_alpha_tensor(
tensor_length=action_memory + 1,
input_size=input_size,
scalars_size=1,
emb_dim=2048,
n_steps=n_steps,
n_logits=n_actions,
n_samples=32,
device="cuda",
len_data=2048,
n_synth_data=1000000,
pct_synth=0.7,
batch_size=32,
epochs=600000,
lr=1e-4,
lr_decay_factor=0.5,
lr_decay_steps=5000,
weight_decay=1e-5,
optimizer_name="adamw",
loss_params=(1, 1),
limit_rank=150,
checkpoint_dir="path/to/checkpoint/dir",
checkpoint_data_dir="path/where/to/save/data/generated/by/the/model",
n_actors=1,
mc_n_sim=200,
n_cob=100000,
cob_prob=0.9983,
data_augmentation=True,
N_bar=N_bar,
random_seed=42,
extra_devices=None,
save_dir="path/to/save/final/model",
)
```
## 🧪 Missing features
- [ ] Release weights of pre-trained models. **Coming out soon**.
- [ ] Add compilation of Alpha Tensor kernels in OpenAI's Triton and JAX/XLA.
- [ ] Add support for fine-tuning on target hardware.
- [ ] Support training on Multiple GPUs (it allows training on a larger batch size).
- [ ] Add support for other compilers (e.g. llvm).
- [ ] Reduce memory footprint of the Acting Agent.
- [ ] Improve acting speed.
## 💫 Contributing
We welcome contributions of all kinds, including new features, improved infrastructure, and better documentation. If you're interested in contributing, please see the [linked](https://docs.nebuly.com/contributions) page for more information on how to get involved.
A special thanks to [BrianPulfer](https://github.com/BrianPulfer) for his awesome contribution to the OpenAlphaTensor module.
================================================
FILE: optimization/open_alpha_tensor/config.json
================================================
{
"batch_size": 16,
"max_epochs": 600000,
"action_memory": 7,
"optimizer": "adamw",
"weight_decay": 1e-5,
"lr": 1e-4,
"lr_decay_factor": 0.1,
"lr_decay_steps": 500000,
"device": "cuda:0",
"len_data": 2048,
"pct_synth": 0.9,
"n_synth_data": 100000,
"limit_rank": 125,
"alpha": 1.0,
"beta": 1.0,
"matrix_size": 4,
"embed_dim": 1024,
"actions_sampled": 32,
"n_actors": 1,
"mc_n_sim": 200,
"n_cob": 100000,
"cob_prob": 0.9983,
"cardinality_vector": 5,
"n_bar": 100
}
================================================
FILE: optimization/open_alpha_tensor/main.py
================================================
import json
import os
from argparse import ArgumentParser
from pathlib import Path
from open_alpha_tensor import train_alpha_tensor
def _compute_largest_divisor(n: int) -> int:
"""Compute the largest divisor of n."""
for i in range(n // 2, 0, -1):
if n % i == 0:
return i
return 1
def main():
    """Parse the training options and launch an AlphaTensor training run.

    Option values are resolved, in increasing priority, from the argparse
    defaults declared below, from the JSON file named by the ``CONFIG_FILE``
    environment variable (``config.json`` when unset; ignored if the file
    does not exist) and from the command line.
    """
    config_path = Path(os.getenv("CONFIG_FILE", "config.json"))
    config = {}
    if config_path.exists():
        with open(config_path) as config_stream:
            config = json.load(config_stream)

    parser = ArgumentParser()
    add = parser.add_argument
    add("--batch_size", type=int, default=1)
    add("--max_epochs", type=int, default=1)
    add("--action_memory", type=int, default=1)
    add("--optimizer", type=str, default="adamw")
    add("--weight_decay", type=float, default=1e-5)
    add("--lr", type=float, default=1e-4)
    add("--lr_decay_factor", type=float, default=0.5)
    add("--lr_decay_steps", type=int, default=5000)
    add("--device", type=str, default="cuda")
    add("--len_data", type=int, default=100)
    add("--pct_synth", type=float, default=0.5)
    add("--n_synth_data", type=int, default=100)
    add("--limit_rank", type=int, default=15)
    add("--alpha", type=float, default=1.0)
    add("--beta", type=float, default=1.0)
    add("--random_seed", type=int, default=None)
    add("--checkpoint_dir", type=str, default=None)
    add("--checkpoint_data_dir", type=str, default=None)
    add("--matrix_size", type=int, default=3)
    add("--embed_dim", type=int, default=1024)
    add("--actions_sampled", type=int, default=10)
    add("--n_actors", type=int, default=1)
    add("--mc_n_sim", type=int, default=100)
    add("--n_cob", type=int, default=100000)
    add("--cob_prob", type=float, default=0.9983)  # 1 - 0.0017
    add("--data_augmentation", action="store_true")
    add("--cardinality_vector", type=int, default=5)
    add(
        "--n_bar",
        type=int,
        default=100,
        help="N_bar parameter for policy temperature.",
    )
    add("--save_dir", type=str, default=None)
    add("extra_devices", nargs="*", type=str, default=[])
    # Values from the JSON file replace the argparse defaults; anything
    # given explicitly on the command line still wins.
    parser.set_defaults(**config)
    args = parser.parse_args()

    # Quantities derived from the parsed options.
    input_size = args.matrix_size**2
    n_steps = _compute_largest_divisor(input_size)
    tokens_per_step = 3 * input_size // n_steps
    n_actions = args.cardinality_vector**tokens_per_step

    train_alpha_tensor(
        tensor_length=args.action_memory + 1,
        input_size=input_size,
        scalars_size=1,
        emb_dim=args.embed_dim,
        n_steps=n_steps,
        n_logits=n_actions,
        n_samples=args.actions_sampled,
        device=args.device,
        len_data=args.len_data,
        n_synth_data=args.n_synth_data,
        pct_synth=args.pct_synth,
        batch_size=args.batch_size,
        epochs=args.max_epochs,
        lr=args.lr,
        lr_decay_factor=args.lr_decay_factor,
        lr_decay_steps=args.lr_decay_steps,
        weight_decay=args.weight_decay,
        optimizer_name=args.optimizer,
        loss_params=(args.alpha, args.beta),
        limit_rank=args.limit_rank,
        random_seed=args.random_seed,
        checkpoint_dir=args.checkpoint_dir,
        checkpoint_data_dir=args.checkpoint_data_dir,
        n_actors=args.n_actors,
        mc_n_sim=args.mc_n_sim,
        n_cob=args.n_cob,
        cob_prob=args.cob_prob,
        data_augmentation=args.data_augmentation or False,
        N_bar=args.n_bar,
        extra_devices=args.extra_devices,
        save_dir=args.save_dir,
    )


if __name__ == "__main__":
    main()
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/__init__.py
================================================
from open_alpha_tensor.api.functions import train_alpha_tensor # noqa: F401
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/api/__init__.py
================================================
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/api/functions.py
================================================
from typing import List, Tuple
from open_alpha_tensor.root_op import TrainAlphaTensorRootOp
def train_alpha_tensor(
    tensor_length: int,
    input_size: int,
    scalars_size: int,
    emb_dim: int,
    n_steps: int,
    n_logits: int,
    n_samples: int,
    optimizer_name: str,
    lr: float,
    lr_decay_factor: float,
    lr_decay_steps: int,
    weight_decay: float,
    loss_params: Tuple[float, float],
    checkpoint_dir: str,
    checkpoint_data_dir: str,
    epochs: int,
    batch_size: int,
    len_data: int,
    n_synth_data: int,
    pct_synth: float,
    limit_rank: int,
    n_actors: int,
    mc_n_sim: int,
    N_bar: int,
    device: str,
    save_dir: str,
    random_seed: int,
    n_cob: int,
    cob_prob: float,
    data_augmentation: bool,
    extra_devices: List[str],
):
    """Train an AlphaTensor model and return the outcome of the training.

    Every argument is forwarded unchanged, by keyword, to
    ``TrainAlphaTensorRootOp.execute``; the value handed back is whatever
    ``TrainAlphaTensorRootOp.get_result`` returns afterwards.

    Args:
        tensor_length (int): Number of tensors kept as history.
        input_size (int): Flattened size of the matrices to be multiplied.
        scalars_size (int): Size of the scalar vectors fed to the torso
            model.
        emb_dim (int): Embedding dimension.
        n_steps (int): Number of steps used to get a single action out of a
            triplet.
        n_logits (int): Number of logits output by the policy head.
        n_samples (int): Number of samples used by the policy head at
            evaluation time.
        optimizer_name (str): Name of the optimizer used.
        lr (float): Learning rate.
        lr_decay_factor (float): Learning rate's decay factor.
        lr_decay_steps (int): Number of learning rate's decay steps.
        weight_decay (float): Weight decay used by the optimizer.
        loss_params (Tuple[float, float]): Alpha and Beta parameters used in
            the loss function.
        checkpoint_dir (str): Directory used to store model checkpoints.
        checkpoint_data_dir (str): Directory used to store games as JSON
            files.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size.
        len_data (int): Number of training samples used (both actor
            generated and synthetic).
        n_synth_data (int): Number of synthetic training samples.
        pct_synth (float): Initial percentage of synthetic samples used for
            training.
        limit_rank (int): Maximum number of steps per episode and maximum
            rank for synthetically-generated matrices.
        n_actors (int): Number of actors, each playing a single game at each
            training step.
        mc_n_sim (int): Number of simulations during Monte Carlo tree
            search.
        N_bar (int): N_bar parameter used to compute tau when improving the
            policy.
        device (str): The name of the torch device used for training.
        save_dir (str): Directory where the final trained model will be
            stored.
        random_seed (int): Randomizing seed.
        n_cob (int): Number of change of basis (cob) used for a single
            training sample.
        cob_prob (float): Probability of applying a change of basis.
        data_augmentation (bool): Whether to randomly swap the last
            operation of an episode with another operation.
        extra_devices (List[str]): Extra devices names used for multi-GPU
            training.
    """
    # Collect the settings once so the forwarding call stays a one-liner.
    training_settings = dict(
        tensor_length=tensor_length,
        input_size=input_size,
        scalars_size=scalars_size,
        emb_dim=emb_dim,
        n_steps=n_steps,
        n_logits=n_logits,
        n_samples=n_samples,
        optimizer_name=optimizer_name,
        lr=lr,
        lr_decay_factor=lr_decay_factor,
        lr_decay_steps=lr_decay_steps,
        weight_decay=weight_decay,
        loss_params=loss_params,
        checkpoint_dir=checkpoint_dir,
        checkpoint_data_dir=checkpoint_data_dir,
        epochs=epochs,
        batch_size=batch_size,
        len_data=len_data,
        n_synth_data=n_synth_data,
        pct_synth=pct_synth,
        limit_rank=limit_rank,
        n_actors=n_actors,
        mc_n_sim=mc_n_sim,
        N_bar=N_bar,
        device=device,
        save_dir=save_dir,
        random_seed=random_seed,
        n_cob=n_cob,
        cob_prob=cob_prob,
        data_augmentation=data_augmentation,
        extra_devices=extra_devices,
    )
    root_op = TrainAlphaTensorRootOp()
    root_op.execute(**training_settings)
    return root_op.get_result()
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/config.py
================================================
# Relative directory names shared across the package.
# NOTE(review): presumably the fallback locations for model checkpoints and
# for stored games when the caller supplies no directory — confirm against
# the modules that import these constants.
BASE_CHECKPOINT_DIR = "checkpoints"
BASE_CHECKPOINT_DATA_DIR = "games"
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/__init__.py
================================================
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/actors/__init__.py
================================================
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/actors/stage.py
================================================
from typing import Dict, List
import torch
from open_alpha_tensor.core.data.utils import (
get_scalars,
map_action_to_triplet,
)
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
def game_is_finished(state):
    """Check whether every entry of the given state is zero.

    Args:
        state (torch.Tensor): The state of the game, expected with size
            (1, S, S, S).

    Returns:
        torch.Tensor: Zero-dimensional boolean tensor that is ``True`` only
            when the state contains nothing but zeros.
    """
    return torch.all(torch.eq(state, 0))
def remove_duplicates(reducing_tensor: torch.Tensor):
    """Drop repeated slices along dimension 1 of a tensor.

    Two slices are considered equal when their element-wise difference is
    zero everywhere. The first occurrence of each distinct slice is kept.

    Args:
        reducing_tensor (torch.Tensor): The tensor to remove duplicates
            from, with shape (1, N_mc, S, S, S).

    Returns:
        tuple: ``(unique, old_to_new, duplicates_of, kept)`` where
            ``unique`` is ``reducing_tensor`` restricted to the kept slices,
            ``old_to_new`` maps every original slice index to its position
            in ``unique``, ``duplicates_of`` maps each kept index to the list
            of later indices that repeat it, and ``kept`` is the list of kept
            original indices.
    """
    kept = []
    duplicates_of = {}
    for candidate in range(reducing_tensor.shape[1]):
        candidate_slice = reducing_tensor[:, candidate]
        for representative in kept:
            difference = reducing_tensor[:, representative] - candidate_slice
            if (difference == 0).all():
                duplicates_of[representative].append(candidate)
                break
        else:
            # No kept slice matches (this also covers the very first slice).
            kept.append(candidate)
            duplicates_of[candidate] = []

    old_to_new = {}
    for position, (representative, copies) in enumerate(
        duplicates_of.items()
    ):
        old_to_new[representative] = position
        old_to_new.update(dict.fromkeys(copies, position))

    return reducing_tensor[:, kept], old_to_new, duplicates_of, kept
def extract_children_states_from_actions(
    state: torch.Tensor,
    actions: torch.Tensor,
    vec_cardinality: int = 5,
):
    """Extract the children states from the actions.

    Every action is mapped to a triplet (u, v, w); the rank-one tensor
    u ⊗ v ⊗ w is subtracted from the present state (``state[:, 0]``) to
    obtain the child's present state. Each child is laid out along
    dimension 1 as ``[new present state, applied rank-one tensor, history]``.

    Args:
        state (torch.Tensor): The state of the game, shape (1, T, S, S, S).
        actions (torch.Tensor): The actions to apply to the state, shape
            (1, K, N_steps).
        vec_cardinality (int, optional): The cardinality of the vectors,
            i.e. the number of values an entry of u, v or w can take.

    Returns:
        tuple: ``(children, old_idx_to_new_idx, repetition_map,
            not_duplicate_indexes)``: the list of child states followed by
            the three bookkeeping values returned by ``remove_duplicates``.
    """
    # N_logits = |F|^(3S/N_steps): each action is mapped in a unique way to
    # a triplet (u, v, w) where each vector has size S.
    bs, k, n_steps = actions.shape[:3]
    vec_dim = state.shape[2]
    len_token = 3 * vec_dim // n_steps
    actions = map_action_to_triplet(actions, vec_cardinality, len_token)
    actions = actions.reshape(bs, k, n_steps * len_token)
    u = actions[:, :, :vec_dim].reshape(bs, k, vec_dim, 1, 1)
    v = actions[:, :, vec_dim : 2 * vec_dim].reshape(  # noqa E203
        bs, k, 1, vec_dim, 1
    )
    w = actions[:, :, 2 * vec_dim :].reshape(bs, k, 1, 1, vec_dim)  # noqa E203
    reducing_tensor = u * v * w
    (
        reducing_tensor,
        old_idx_to_new_idx,
        repetition_map,
        not_duplicate_indexes,
    ) = remove_duplicates(reducing_tensor)
    old_state = state[:, 0]
    new_state = old_state.unsqueeze(1) - reducing_tensor
    # History carried over to the children: every previous action except the
    # oldest one. The previous ``torch.roll(state, 1)[:, 2:]`` omitted
    # ``dims``, so the tensor was flattened and every element shifted by one
    # position, scrambling the history instead of rolling it along dim 1.
    previous_actions = state[:, 1:-1]
    # NOTE(review): the list is still built over the original ``k`` actions
    # although ``reducing_tensor`` may hold fewer slices after
    # de-duplication; the trailing entries then contain only the history.
    # Kept as is because callers index this list — confirm before changing.
    children = [
        torch.cat(
            [
                new_state[:, i : i + 1],  # noqa E203
                reducing_tensor[:, i : i + 1],  # noqa E203
                previous_actions,
            ],
            dim=1,
        )
        for i in range(k)
    ]
    return (
        children,
        old_idx_to_new_idx,
        repetition_map,
        not_duplicate_indexes,
    )
def _reduce_memory_consumption_before_storing(
possible_states: List[torch.Tensor],
):
"""Reduce the memory consumption before storing the states.
Args:
possible_states (List[torch.Tensor]): The possible states.
"""
final_states = [state[:, 0:2] for state in possible_states]
previous_actions = possible_states[0][:, 2:]
storing_dict = {
"final_states": final_states,
"previous_actions": previous_actions,
}
return storing_dict
def _recompose_possible_states(reduced_memory_states_dict: Dict):
"""Recompose the possible states from the reduced memory states.
Args:
reduced_memory_states_dict (Dict): The reduced memory states.
"""
final_states = reduced_memory_states_dict["final_states"]
previous_actions = reduced_memory_states_dict["previous_actions"]
possible_states = [
torch.cat(
[
final_states[i],
previous_actions,
],
dim=1,
)
for i in range(len(final_states))
]
return possible_states
def extract_present_state(state: torch.Tensor) -> torch.Tensor:
    """Return the slice at index 0 of dimension 1 of ``state``."""
    return state.select(1, 0)
def to_hash(tensor: torch.Tensor) -> str:
    """Serialize a tensor into an underscore-separated string key.

    The tensor is flattened and cast to integers, so any fractional part is
    discarded and the shape is not encoded in the result.

    Args:
        tensor: The tensor to convert.
    """
    flat_integers = tensor.reshape(-1).long().detach().cpu().tolist()
    return "_".join(map(str, flat_integers))
def from_hash(hashable_tensor: str, shape: tuple) -> torch.Tensor:
    """Converts a hash string back to a tensor of the given shape.

    Args:
        hashable_tensor (str): The hash string: numbers separated by ``_``.
        shape (tuple): The shape of the original tensor.

    Returns:
        torch.Tensor: Floating point tensor with the requested shape.

    Raises:
        RuntimeError: If the number of encoded values does not match
            ``shape``.
    """
    # An empty tensor hashes to the empty string; "".split("_") would yield
    # [""] and break float().
    values = (
        [float(x) for x in hashable_tensor.split("_")]
        if hashable_tensor
        else []
    )
    # reshape() replaces the deprecated non-in-place Tensor.resize(), which
    # silently accepted a shape with a different number of elements.
    return torch.tensor(values).reshape(shape)
def record_action(tree_dict: Dict, state: str, action: str):
    """Append an action to the list of actions recorded for a state.

    The list is created on first use of a state.

    Args:
        tree_dict (Dict): The tree dictionary, modified in place.
        state (str): The state as a hash string.
        action (str): The action as a hash string.
    """
    tree_dict.setdefault(state, []).append(action)
def select_future_state(
    possible_states: List[torch.Tensor],
    q_values: torch.Tensor,
    N_s_a: torch.Tensor,
    repetitions: Dict[int, list],
    c_1: float = 1.25,
    c_2: float = 19652,
    return_idx: bool = False,
) -> torch.Tensor:
    """Pick the candidate state with the highest upper confidence bound.

    The bound is ``q + pi * sqrt(sum(N) / (1 + N)) * (c_1 + log((sum(N) +
    c_2 + 1) / c_2))`` where ``pi`` holds, for every index present in
    ``repetitions``, the length of its repetition list.

    Args:
        possible_states (List[torch.Tensor]): Candidate states.
        q_values (torch.Tensor): Action values, shape (1, K, 1).
        N_s_a (torch.Tensor): Visit counts of the candidates.
        repetitions (Dict[int, list]): Repetition list of each candidate.
        c_1 (float): First exploration constant.
        c_2 (float): Second exploration constant.
        return_idx (bool): Return the winning index instead of the state.
    """
    multiplicities = []
    for candidate_idx in range(len(possible_states)):
        if candidate_idx in repetitions:
            multiplicities.append(len(repetitions[candidate_idx]))
    pi = torch.tensor(multiplicities).to(q_values.device)

    n_tracked = N_s_a.shape[1]
    if pi.shape[0] != n_tracked:
        # Diagnostic output for the size mismatch, then truncate.
        print(pi)
        print(pi.shape, q_values.shape, N_s_a.shape)
        pi = pi[:n_tracked]

    total_visits = torch.sum(N_s_a)
    exploration = torch.sqrt(total_visits / (1 + N_s_a))
    scale = c_1 + torch.log((total_visits + c_2 + 1) / c_2)
    ucb = q_values.reshape(-1) + pi * exploration * scale

    best = ucb.argmax()
    if return_idx:
        return best
    return possible_states[best]
@torch.no_grad()
def simulate_game(
    model,
    state: torch.Tensor,
    t_time: int,
    max_steps: int,
    game_tree: Dict,
    states_dict: Dict,
    horizon: int = 5,
):
    """Simulates a game from a given state.

    The simulation walks down the already expanded part of the tree
    (selection), expands the first state that is not in the tree yet by
    querying the model (expansion) and finally propagates the leaf value
    back along the visited trajectory (backup). ``game_tree`` and
    ``states_dict`` are updated in place.

    Args:
        model: The model to use for the simulation.
        state (torch.Tensor): The initial state.
        t_time (int): The current time step.
        max_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree.
        states_dict (Dict): The states dictionary.
        horizon (int): The horizon to use for the simulation.
    """
    idx = t_time
    max_steps = min(max_steps, t_time + horizon)
    state_hash = to_hash(extract_present_state(state))
    trajectory = []
    # Default leaf value. It is the one used when the selected leaf is an
    # already finished game: such a state is never expanded, and without
    # this default the backup below failed with UnboundLocalError.
    leaf_q_value = 0
    # selection
    while state_hash in game_tree:
        (
            possible_states_dict,
            old_idx_to_new_idx,
            repetition_map,
            N_s_a,
            q_values,
            actions,
        ) = states_dict[state_hash]
        possible_states = _recompose_possible_states(possible_states_dict)
        state_idx = select_future_state(
            possible_states, q_values, N_s_a, repetition_map, return_idx=True
        )
        trajectory.append((state_hash, state_idx))  # state_hash, action_idx
        future_state = extract_present_state(possible_states[state_idx])
        state = possible_states[state_idx]
        state_hash = to_hash(future_state)
        idx += 1
    # expansion
    if idx <= max_steps:
        trajectory.append((state_hash, None))
        if not game_is_finished(extract_present_state(state)):
            state = state.to(model.device)
            scalars = get_scalars(state, idx).to(state.device)
            actions, probs, q_values = model(state, scalars)
            (
                possible_states,
                cloned_idx_to_idx,
                repetitions,
                not_dupl_indexes,
            ) = extract_children_states_from_actions(
                state,
                actions,
            )
            # Statistics are tracked per distinct action and kept on CPU.
            not_dupl_actions = actions[:, not_dupl_indexes].to("cpu")
            not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to(
                "cpu"
            )
            N_s_a = torch.zeros_like(not_dupl_q_values).to("cpu")
            present_state = extract_present_state(state)
            states_dict[to_hash(present_state)] = (
                _reduce_memory_consumption_before_storing(possible_states),
                cloned_idx_to_idx,
                repetitions,
                N_s_a,
                not_dupl_q_values,
                not_dupl_actions,
            )
            game_tree[to_hash(present_state)] = [
                to_hash(extract_present_state(fut_state))
                for fut_state in possible_states
            ]
            leaf_q_value = q_values
    else:
        # NOTE(review): no leaf entry is appended to the trajectory on this
        # path, so backward_pass never adds this value to the reward —
        # confirm whether the rank penalty is meant to be applied.
        leaf_q_value = -int(torch.linalg.matrix_rank(state).sum())
    # backup
    backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value)
def backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):
    """Propagate the leaf value back along a simulated trajectory.

    The trajectory is walked from its end to its start. A leaf entry
    (action index ``None``) adds ``leaf_q_value`` to the running reward;
    every other entry decreases the reward by one, folds it into the
    running mean stored in that state's q-values and increments the matching
    visit count. The tensors held in ``states_dict`` are updated in place.

    Args:
        trajectory: List of ``(state_hash, action_idx)`` pairs.
        states_dict: Dictionary holding, per state hash, the tuple whose
            items 1, 3 and 4 are the index map, the visit counts and the
            q-values.
        leaf_q_value (torch.Tensor): Value assigned to the leaf node.
    """
    reward = 0
    for state, action_idx in reversed(trajectory):
        if action_idx is None:  # leaf node
            reward += leaf_q_value
            continue
        _, old_idx_to_new_idx, _, N_s_a, q_values, _ = states_dict[state]
        if isinstance(reward, torch.Tensor):
            reward = reward.to(q_values.device)
        action_idx = int(action_idx)
        # Indices missing from the map are used as they are.
        column = old_idx_to_new_idx.get(action_idx, action_idx)
        reward -= 1
        visits = N_s_a[:, column]
        running_total = visits * q_values[:, column] + reward
        q_values[:, column] = running_total / (visits + 1)
        N_s_a[:, column] += 1
def monte_carlo_tree_search(
    model: torch.nn.Module,
    state: torch.Tensor,
    n_sim: int,
    t_time,
    n_steps: int,
    game_tree: Dict,
    state_dict: Dict,
):
    """Run Monte Carlo tree search from ``state`` and return the next state.

    Simulations already recorded for the root (the sum of its visit counts)
    are deducted from ``n_sim`` before new ones are run. Once the
    simulations are done, the child selected by ``select_future_state`` is
    returned.

    Args:
        model (torch.nn.Module): The model to use for the simulation.
        state (torch.Tensor): The initial state.
        n_sim (int): The number of simulations to run.
        t_time (int): The current time step.
        n_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree.
        state_dict (Dict): The dictionary containing the states.
    """
    # Note that game tree is not the full tree, but just the one having as
    # root the current node (state).
    root_hash = to_hash(extract_present_state(state))
    if root_hash in state_dict:
        with torch.no_grad():
            already_simulated = int(state_dict[root_hash][3].sum())
            n_sim = max(n_sim - already_simulated, 0)

    remaining = n_sim
    while remaining > 0:
        simulate_game(model, state, t_time, n_steps, game_tree, state_dict)
        remaining -= 1

    # Choose the next state among the children of the root.
    (
        possible_states_dict,
        _,
        repetitions,
        N_s_a,
        q_values,
        _,
    ) = state_dict[root_hash]
    possible_states = _recompose_possible_states(possible_states_dict)
    best_idx = select_future_state(
        possible_states, q_values, N_s_a, repetitions, return_idx=True
    )
    return possible_states[best_idx]
@torch.no_grad()
def compute_improved_policy(
    state_dict: Dict,
    states: List[str],
    model_n_steps: int,
    model_n_logits: int,
    N_bar: int,
):
    """Compute the improved policy of every listed state.

    For each state the visit counts ``N_s_a`` are turned into
    ``N_s_a^(1/tau) / sum(N_s_a^(1/tau))`` with
    ``tau = log(sum(N_s_a)) / log(N_bar)`` when the total count exceeds
    ``N_bar`` and ``tau = 1`` otherwise. The weight of each sampled action
    is then accumulated, step by step, on the logits that action uses.

    Returns:
        torch.Tensor: Tensor of shape
            ``(len(states), model_n_steps, model_n_logits)``.
    """
    policies = torch.zeros(len(states), model_n_steps, model_n_logits)
    N_bar = torch.tensor(N_bar)
    for row, state in enumerate(states):
        stored = state_dict[state]
        visit_counts = stored[3]
        actions = stored[5]
        total_visits = visit_counts.sum()
        tau = 1
        if total_visits > N_bar:
            tau = (torch.log(total_visits) / torch.log(N_bar)).item()
        tempered = visit_counts ** (1 / tau)
        improved_policy = tempered / tempered.sum()
        for sample_id in range(actions.shape[1]):
            weight = improved_policy[0, sample_id]
            for step_id, action_id in enumerate(actions[0, sample_id]):
                policies[row, step_id, action_id] += weight
    return policies
def actor_prediction(
    model: AlphaTensorModel,
    input_tensor: torch.Tensor,
    maximum_rank: int,
    mc_n_sim: int,
    N_bar: int,
    return_actions: bool = False,
):
    """Play one game with Monte Carlo tree search.

    Starting from ``input_tensor`` the next state is repeatedly chosen by
    ``monte_carlo_tree_search`` until the game is finished or
    ``maximum_rank`` moves have been played.

    Args:
        model (AlphaTensorModel): The model to use for the simulation.
        input_tensor (torch.Tensor): The initial state, shape
            (1, T, S, S, S).
        maximum_rank (int): The maximum number of steps to simulate.
        mc_n_sim (int): The number of simulations to run.
        N_bar (int): The parameter used to compute the improved policy.
        return_actions (bool): If True, only actions are returned.

    Returns:
        The list of stored actions of the visited states when
        ``return_actions`` is set, otherwise the tuple
        ``(states, policies, rewards)``.
    """
    state = input_tensor
    game_tree = {}
    state_dict = {}
    hash_states = []
    states = []
    for rank in range(maximum_rank):
        states.append(state)
        hash_states.append(to_hash(extract_present_state(state)))
        state = monte_carlo_tree_search(
            model,
            state,
            mc_n_sim,
            rank,
            maximum_rank,
            game_tree,
            state_dict,
        )
        if game_is_finished(extract_present_state(state)):
            break

    final_state = extract_present_state(state)
    policies = compute_improved_policy(
        state_dict, hash_states, model.n_steps, model.n_logits, N_bar
    )
    # NOTE(review): the last term is the (positive) summed matrix rank of an
    # unfinished final state, while every other term is -1 — confirm the
    # intended sign with the consumers of these rewards.
    if game_is_finished(final_state):
        reward = 0
    else:
        reward = int(torch.linalg.matrix_rank(final_state).sum())
    step_rewards = [-1] * (len(policies) - 1) + [reward]
    rewards = torch.cumsum(torch.tensor(step_rewards), dim=0)

    if return_actions:
        return [state_dict[hash_state][5] for hash_state in hash_states]

    # policies do not have the batch size, but states still have it
    states = [visited.squeeze(0) for visited in states]
    return states, policies, rewards
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/__init__.py
================================================
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/basis_change.py
================================================
from pathlib import Path
from typing import Callable
import numpy as np
import torch
def get_change_basis_matrix(
    tensor_size: int,
    n_cob: int,
    entry_distribution: Callable = torch.randn,
    random_seed: int = None,
):
    """Lazily generate change of basis matrices.

    Each matrix is the product of an upper triangular and a lower
    triangular matrix. Both have a random ±1 diagonal, and their
    off-diagonal parts come from a single draw of ``entry_distribution``.

    Args:
        tensor_size (int): Size of the tensor.
        n_cob (int): Number of change of basis matrices.
        entry_distribution (Callable, optional): Distribution of the entries
            of the change of basis matrices. It is called with the shape
            ``(tensor_size, tensor_size)``.
        random_seed (int, optional): Random seed for reproducibility. When
            given, it seeds torch's global random generator.

    Yields:
        torch.Tensor: A ``(tensor_size, tensor_size)`` matrix.
    """
    if random_seed is not None:
        torch.random.manual_seed(random_seed)
    matrix_shape = (tensor_size, tensor_size)
    for _ in range(n_cob):
        upper_signs = 2 * (torch.rand(tensor_size) > 0.5).float() - 1
        lower_signs = 2 * (torch.rand(tensor_size) > 0.5).float() - 1
        off_diagonal = entry_distribution(matrix_shape)
        upper = torch.diag(upper_signs) + torch.triu(off_diagonal, diagonal=1)
        lower = torch.diag(lower_signs) + torch.tril(
            off_diagonal, diagonal=-1
        )
        yield torch.matmul(upper, lower)
def cob_entry_prob_distribution(size):
    """Sample an integer tensor of shape ``size`` with entries in {-1, 0, 1}.

    Every entry is drawn independently by comparing a uniform sample with
    the cumulative sums of the probabilities (0.0075, 0.985, 0.0075)
    attached to -1, 0 and 1 respectively.

    Args:
        size: Shape of the tensor to generate.
    """
    n_entries = int(np.prod(size))
    support = torch.tensor([-1, 0, 1])
    probabilities = torch.tensor([0.0075, 0.985, 0.0075]).unsqueeze(0)
    thresholds = probabilities.cumsum(dim=-1)
    uniform_draws = torch.rand((n_entries, 1))
    # The first threshold not exceeded by the draw selects the value.
    within_threshold = (uniform_draws <= thresholds).int()
    chosen = within_threshold.argmax(dim=1)
    return support[chosen].reshape(size)
class ChangeOfBasis:
    """Randomly applies a pre-generated change of basis to a tensor.

    The change of basis matrices are generated once, at construction time,
    and stored as individual files under
    ``~/.data_alpha_tensor/cob_matrices``; one of them is loaded from disk
    every time a change of basis is applied.
    """

    def __init__(
        self,
        tensor_size: int,
        n_cob: int,
        cob_prob: float,
        device: str,
        random_seed: int = None,
    ):
        """Builds a ChangeOfBasis object.

        Args:
            tensor_size (int): Size of the tensor.
            n_cob (int): Number of change of basis matrices.
            cob_prob (float): Probability of applying a change of basis.
            device (str): Name of the torch device to use.
            random_seed (int, optional): Random seed for reproducibility.
        """
        self.tmp_dir = Path.home() / ".data_alpha_tensor/cob_matrices"
        self.tmp_dir.mkdir(exist_ok=True, parents=True)
        for i, cob_matrix in enumerate(
            get_change_basis_matrix(
                tensor_size, n_cob, cob_entry_prob_distribution, random_seed
            )
        ):
            torch.save(cob_matrix, f"{self.tmp_dir}/cob_matrix_{i}.pt")
        self.tensor_size = tensor_size
        self.n_cob = n_cob
        self.cob_prob = cob_prob
        self.device = device

    @torch.no_grad()
    def __call__(self, tensor: torch.Tensor, return_basis: bool = False):
        """Apply a change of basis to a tensor.

        With probability ``1 - cob_prob`` the tensor is returned untouched.
        Otherwise a randomly chosen matrix is applied along each of the
        three dimensions of ``tensor[0, 0]``, which is overwritten in place.

        Args:
            tensor (torch.Tensor): Tensor to apply the change of basis to.
            return_basis (bool, optional): Whether to return the change of
                basis matrix as well.

        Returns:
            The tensor, or the pair ``(tensor, basis)`` when
            ``return_basis`` is set. If the change of basis is skipped the
            returned basis is the identity matrix.
        """
        if torch.rand(1).item() > self.cob_prob:
            if return_basis:
                # Keep the return type stable for callers that unpack a
                # pair: skipping is the same as applying the identity.
                identity = torch.eye(
                    tensor.shape[-1],
                    dtype=tensor.dtype,
                    device=tensor.device,
                )
                return tensor, identity
            return tensor
        random_cob = torch.randint(low=0, high=self.n_cob, size=(1,))
        cob_matrix = torch.load(
            f"{self.tmp_dir}/cob_matrix_{int(random_cob)}.pt"
        ).to(self.device)
        # apply change of basis to each tensor dimension
        inner_tensor = tensor[0, 0]
        tensor_size = inner_tensor.shape[-1]
        original_shape = inner_tensor.shape
        cob_matrix = cob_matrix.transpose(0, 1)
        # Each matmul contracts the last dimension; the permutations rotate
        # the next dimension into last position and finally restore the
        # original dimension order.
        inner_tensor = torch.matmul(
            inner_tensor.reshape(-1, tensor_size), cob_matrix
        ).reshape(original_shape)
        inner_tensor = inner_tensor.permute(0, 2, 1)
        inner_tensor = torch.matmul(
            inner_tensor.reshape(-1, tensor_size), cob_matrix
        ).reshape(original_shape)
        inner_tensor = inner_tensor.permute(2, 1, 0)
        inner_tensor = torch.matmul(
            inner_tensor.reshape(-1, tensor_size), cob_matrix
        ).reshape(original_shape)
        inner_tensor = inner_tensor.permute(2, 0, 1)
        tensor[0, 0] = inner_tensor
        if return_basis:
            return tensor, cob_matrix.transpose(0, 1)
        return tensor
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/dataset.py
================================================
import json
import os
import shutil
import tempfile
from pathlib import Path
from typing import List, Tuple
import numpy as np
import torch
from torch.utils.data import Dataset
from open_alpha_tensor.core.data.generation import generate_synthetic_data
from open_alpha_tensor.core.data.utils import (
get_scalars,
map_triplet_to_action,
)
SAVE_DIR_SYNT = str(Path.home() / ".data_alpha_tensor/synthetic_data")
def compute_move(triplets: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
    """Build the rank-one tensor u ⊗ v ⊗ w from a triplet of vectors.

    The result is the quantity that gets subtracted from the current state
    when the move is played.

    Args:
        triplets (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Tensors
            u, v, and w.
    """
    u, v, w = triplets
    u_axis = u.reshape(-1)[:, None, None]
    v_axis = v.reshape(-1)[None, :, None]
    w_axis = w.reshape(-1)[None, None, :]
    return u_axis * v_axis * w_axis
class SyntheticDataBuffer(Dataset):
    """Dataset of synthetically generated demonstrations.

    Demonstrations are stored on disk, as one ``output_tensor_<i>.pt`` and
    one ``list_of_triplets_<i>.pt`` file each, in a directory that depends
    only on the tensor size; files already present are reused.
    """

    def __init__(
        self,
        tensor_size,
        n_data,
        limit_rank,
        prob_distr,
        n_prev_actions: int,
        device: str,
        n_steps: int,
        random_seed=None,
    ):
        """Builds a dataset of synthetic demonstrations.

        Args:
            tensor_size (int): Size of the tensor.
            n_data (int): Number of demonstrations to generate.
            limit_rank (int): Maximum rank of the generated tensors.
            prob_distr (Callable): Probability distribution to use to
                generate the tensors.
            n_prev_actions (int): Number of previous actions to use as
                input.
            device (str): Name of the torch device to use.
            n_steps (int): Number of steps to perform in the environment.
            random_seed (int, optional): Random seed to use.
        """
        self.device = device
        self.len_data = 0
        self.n_prev_actions = n_prev_actions
        self.limit_rank = limit_rank
        self.n_steps = n_steps
        self.save_dir = os.path.join(SAVE_DIR_SYNT, f"size_{tensor_size}")
        storage = Path(self.save_dir)
        storage.mkdir(parents=True, exist_ok=True)
        # Two files are written per demonstration.
        n_stored = len(list(storage.glob("*.pt"))) // 2
        if n_stored >= n_data:
            self.len_data = n_data
            return
        # Generate only the missing demonstrations, numbering them after
        # the ones already on disk.
        self.len_data = n_stored
        missing = generate_synthetic_data(
            tensor_size,
            n_data - n_stored,
            limit_rank,
            prob_distr,
            random_seed,
        )
        for output_tensor, list_of_triplets in missing:
            tensor_file = f"output_tensor_{self.len_data}.pt"
            triplets_file = f"list_of_triplets_{self.len_data}.pt"
            torch.save(output_tensor, os.path.join(self.save_dir, tensor_file))
            torch.save(
                list_of_triplets, os.path.join(self.save_dir, triplets_file)
            )
            self.len_data += 1

    def __len__(self):
        """Number of samples: ``limit_rank`` per stored demonstration."""
        return self.limit_rank * self.len_data

    @torch.no_grad()
    def __getitem__(self, idx):
        """Load the sample ``idx`` as (state, scalars, policy, reward).

        ``idx // limit_rank`` selects the demonstration on disk and
        ``idx % limit_rank`` the position ``j`` inside its list of triplets.
        """
        i = idx // self.limit_rank
        j = idx % self.limit_rank
        output_tensor = torch.load(
            os.path.join(self.save_dir, f"output_tensor_{i}.pt")
        )
        list_of_triplets = torch.load(
            os.path.join(self.save_dir, f"list_of_triplets_{i}.pt")
        )
        if j != self.limit_rank - 1:
            # Remove the contribution of every triplet following j.
            later_moves = list_of_triplets[j + 1 :]  # noqa E203
            output_tensor = self._apply_moves(output_tensor, later_moves)
        triplet = list_of_triplets[j]
        history_window = list_of_triplets[
            j + 1 : j + 1 + self.n_prev_actions  # noqa E203
        ]
        history = [compute_move(t) for t in reversed(history_window)]
        output_tensor = torch.stack([output_tensor, *history])
        n_missing = self.n_prev_actions + 1 - len(output_tensor)
        if n_missing > 0:
            # Zero-pad the history up to n_prev_actions entries.
            padding = torch.zeros(n_missing, *output_tensor.shape[1:])
            output_tensor = torch.cat([output_tensor, padding])
        policy = map_triplet_to_action(triplet, base=5, n_steps=self.n_steps)
        reward = torch.tensor([-(j + 1)])
        scalar = get_scalars(output_tensor, self.limit_rank - j, with_bs=False)
        sample = (output_tensor, scalar, policy, reward)
        return tuple(item.to(self.device) for item in sample)

    @staticmethod
    def _apply_moves(
        tensor: torch.Tensor,
        moves: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
    ):
        """Subtract the rank-one tensor of every move from a state.

        Args:
            tensor (torch.Tensor): Initial state (left unmodified).
            moves (List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):
                List of moves.
        """
        for u, v, w in moves:
            rank_one = (
                u.reshape(-1, 1, 1) * v.reshape(1, -1, 1) * w.reshape(1, 1, -1)
            )
            tensor = tensor - rank_one
        return tensor
class GameDataBuffer(Dataset):
    """Buffer to store the data from the games played by the MCTS agent.

    Every game is written to a private temporary directory as three files
    (``states_<i>.pt``, ``policies_<i>.pt`` and ``rewards_<i>.pt``), while
    ``game_data`` maps the game index ``i`` to its number of states. The
    temporary directory is removed when the buffer is garbage collected.
    """

    def __init__(self, device: str, max_buffer_size: int):
        """Initializes the buffer.

        Args:
            device (str): Name of the torch device to use.
            max_buffer_size (int): Maximum size of the buffer.
        """
        self.num_games = 0
        self.temp_dir = tempfile.mkdtemp("game_data_buffer")
        self.game_data = {}
        self.max_buffer_size = max_buffer_size
        self.device = device

    def __del__(self):
        # Best-effort cleanup: the attribute is missing if __init__ failed
        # early, and the directory may already have been removed.
        temp_dir = getattr(self, "temp_dir", None)
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def add_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Adds a played game to the buffer.

        Once ``max_buffer_size`` games have been written the game counter
        restarts from zero, so the following games overwrite the oldest
        ones.

        Args:
            states (List[torch.Tensor]): Observed game states.
            policies (List[torch.Tensor]): List of policies.
            rewards (List[torch.Tensor]): Observed rewards.
        """
        self.game_data[self.num_games] = len(states)
        torch.save(
            states, os.path.join(self.temp_dir, f"states_{self.num_games}.pt")
        )
        torch.save(
            policies,
            os.path.join(self.temp_dir, f"policies_{self.num_games}.pt"),
        )
        torch.save(
            rewards,
            os.path.join(self.temp_dir, f"rewards_{self.num_games}.pt"),
        )
        self.num_games += 1
        if self.num_games >= self.max_buffer_size:
            # remove oldest game. Note that this line is not thread safe. Lock
            # should be added if multiple threads are used.
            self.num_games = 0

    def __len__(self):
        """Total number of stored states over all the games."""
        return sum(self.game_data.values())

    @torch.no_grad()
    def __getitem__(self, idx):
        """Return the (state, scalars, policy, reward) sample at ``idx``.

        ``idx`` is a flat index over the states of all games, in game
        order; it is first converted into a game index and a position
        inside that game.
        """
        i = 0
        while idx >= self.game_data[i]:
            idx -= self.game_data[i]
            i += 1
        states = torch.load(os.path.join(self.temp_dir, f"states_{i}.pt"))
        policies = torch.load(os.path.join(self.temp_dir, f"policies_{i}.pt"))
        rewards = torch.load(os.path.join(self.temp_dir, f"rewards_{i}.pt"))
        return (
            states[idx].to(self.device),
            get_scalars(states[idx], idx, with_bs=False).to(self.device),
            policies[idx].to(self.device).argmax(dim=-1),
            rewards[idx].to(self.device).reshape(1),
        )

    def save_game_data(self, path: str):
        """Copy the content of the temporary directory into ``path`` and
        save ``game_data`` there as ``game_data.json``.
        """
        shutil.copytree(self.temp_dir, path, dirs_exist_ok=True)
        with open(os.path.join(path, "game_data.json"), "w") as f:
            json.dump(self.game_data, f)

    def load_game_data(self, path: str):
        """Load ``game_data`` from ``path/game_data.json`` and copy the
        content of ``path`` into the temporary directory.
        """
        with open(os.path.join(path, "game_data.json"), "r") as f:
            stored_game_data = json.load(f)
        # JSON object keys are always strings, while the rest of the class
        # indexes game_data with integer game ids.
        self.game_data = {
            int(game_id): n_states
            for game_id, n_states in stored_game_data.items()
        }
        # The temporary directory already exists (created in __init__), so
        # copytree must be allowed to copy into it.
        shutil.copytree(path, self.temp_dir, dirs_exist_ok=True)
        self.num_games = len(self.game_data)
        if self.num_games >= self.max_buffer_size:
            # Same wrap-around as in add_game for a buffer loaded full.
            self.num_games = 0
class TensorGameDataset(Dataset):
    """Training dataset mixing synthetic and actor-generated samples.

    Three buffers back the dataset: a synthetic one, one holding every game
    played by the actors and one holding only the best games. Each position
    of the dataset is mapped to one of the buffers through ``synth_bool``
    (True -> synthetic sample) and the index tensors ``synth_idx``,
    ``best_game_idx`` and ``game_idx``. Among the non-synthetic positions,
    the first ``len(best_game_idx)`` ones read from the best-game buffer and
    the remaining ones from the game buffer.
    """

    def __init__(
        self,
        len_data,
        pct_synth,
        tensor_size,
        n_synth_data,
        limit_rank,
        prob_distr,
        action_memory_len: int,
        device: str,
        n_steps: int,
        random_seed=None,
    ):
        """Builds the three buffers and starts with synthetic data only.

        Args:
            len_data: Number of samples exposed by the dataset.
            pct_synth: Probability that a position holds a synthetic sample
                (used once actor games are available).
            tensor_size: Forwarded to the synthetic buffer; also the side of
                the tensor returned by ``input_tensor``.
            n_synth_data: Forwarded to the synthetic buffer.
            limit_rank: Forwarded to the synthetic buffer.
            prob_distr: Forwarded to the synthetic buffer.
            action_memory_len (int): Number of past actions kept in a state.
            device (str): Torch device name.
            n_steps (int): Forwarded to the synthetic buffer.
            random_seed: Forwarded to the synthetic buffer.
        """
        self.synthetic_data_buffer = SyntheticDataBuffer(
            tensor_size,
            n_synth_data,
            limit_rank,
            prob_distr,
            action_memory_len,
            n_steps=n_steps,
            device=device,
            random_seed=random_seed,
        )
        self.game_data_buffer = GameDataBuffer(
            device=device, max_buffer_size=100000
        )
        self.best_game_data_buffer = GameDataBuffer(
            device=device, max_buffer_size=1000
        )
        self.len_data = len_data
        self.pct_synth = pct_synth
        self.pct_best_game = 0
        # no game has been played yet: every position is synthetic
        self.synth_bool = torch.ones(len_data, dtype=torch.bool)
        self.synth_idx = self._sample_indexes(
            len(self.synthetic_data_buffer), len_data
        )
        self.game_idx = None
        self.best_game_idx = None
        self.action_memory_len = action_memory_len
        self.tensor_size = tensor_size
        self.device = device

    @staticmethod
    def _sample_indexes(population: int, size: int) -> torch.Tensor:
        """Draws ``size`` indexes in ``[0, population)``.

        Sampling is done without replacement whenever the population is large
        enough, and with replacement otherwise (instead of failing).
        """
        return torch.from_numpy(
            np.random.choice(population, size, replace=size > population)
        )

    def change_training_split(self, pct_synth, pct_best_game):
        """Updates the synthetic / best-game fractions used by the next
        call to ``recompute_synthetic_indexes``."""
        self.pct_synth = pct_synth
        self.pct_best_game = pct_best_game

    def recompute_synthetic_indexes(self):
        """Redraws which positions are synthetic and which buffer entries
        they point to. Nothing changes while no game has been stored."""
        if len(self.game_data_buffer) == 0:
            return
        self.synth_bool = torch.rand(self.len_data) < self.pct_synth
        len_synth_data = int(self.synth_bool.sum().item())
        self.synth_idx = self._sample_indexes(
            len(self.synthetic_data_buffer), len_synth_data
        )
        n_played = self.len_data - len_synth_data
        if len(self.best_game_data_buffer) > 0 and self.pct_best_game > 0:
            len_game_data = int(
                (1 - self.pct_synth - self.pct_best_game) * self.len_data
            )
            # The synthetic count is random, so the nominal game share can
            # exceed the positions actually left; clamp it so that neither
            # size is negative and the three parts always add up to len_data.
            len_game_data = max(0, min(len_game_data, n_played))
            len_best_game_data = n_played - len_game_data
            self.game_idx = self._sample_indexes(
                len(self.game_data_buffer), len_game_data
            )
            self.best_game_idx = self._sample_indexes(
                len(self.best_game_data_buffer), len_best_game_data
            )
        else:
            self.game_idx = self._sample_indexes(
                len(self.game_data_buffer), n_played
            )

    def __getitem__(self, idx):
        # number of synthetic positions before idx == rank of this position
        # among the synthetic ones (kept as a tensor, as the buffers have
        # always been indexed with tensors)
        n_synth_before = self.synth_bool[:idx].sum()
        if self.synth_bool[idx]:
            return self.synthetic_data_buffer[self.synth_idx[n_synth_before]]
        # rank of this position among the non-synthetic ones
        played_pos = idx - n_synth_before
        if self.pct_best_game > 0 and self.best_game_idx is not None:
            n_best = len(self.best_game_idx)
            if played_pos < n_best:
                return self.best_game_data_buffer[
                    self.best_game_idx[played_pos]
                ]
            return self.game_data_buffer[self.game_idx[played_pos - n_best]]
        return self.game_data_buffer[self.game_idx[played_pos]]

    def __len__(self):
        return self.len_data

    def add_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Stores a game in the buffer holding all the played games."""
        self.game_data_buffer.add_game(states, policies, rewards)

    def add_best_game(
        self,
        states: List[torch.Tensor],
        policies: List[torch.Tensor],
        rewards: List[torch.Tensor],
    ):
        """Stores a game in the buffer reserved to the best games."""
        self.best_game_data_buffer.add_game(states, policies, rewards)

    def save_game_data(self, path):
        """Saves both game buffers under ``path/game_data`` and
        ``path/best_game_data``."""
        self.game_data_buffer.save_game_data(os.path.join(path, "game_data"))
        self.best_game_data_buffer.save_game_data(
            os.path.join(path, "best_game_data")
        )

    def load_game_data(self, path):
        """Loads both game buffers from ``path/game_data`` and
        ``path/best_game_data``."""
        self.game_data_buffer.load_game_data(os.path.join(path, "game_data"))
        self.best_game_data_buffer.load_game_data(
            os.path.join(path, "best_game_data")
        )

    @property
    def input_tensor(self) -> torch.Tensor:
        """Random matrix-multiplication tensor, zero padded to shape
        (1, action_memory_len + 1, tensor_size, tensor_size, tensor_size)
        and moved to ``self.device``."""
        max_matrix_size = int(np.sqrt(self.tensor_size))
        input_tensor = torch.zeros(
            1,
            self.action_memory_len + 1,
            self.tensor_size,
            self.tensor_size,
            self.tensor_size,
        )
        # NOTE(review): the upper bound of randint is exclusive, so the
        # sampled dimensions never reach max_matrix_size. Confirm this is
        # the intended range.
        matrix_dims = torch.randint(1, max_matrix_size, (3,)).tolist()
        operation_tensor = self._build_tensor_game_input(
            *matrix_dims, action_memory_len=self.action_memory_len
        )
        input_tensor[
            0,
            :,
            : operation_tensor.shape[1],
            : operation_tensor.shape[2],
            : operation_tensor.shape[3],
        ] = operation_tensor
        return input_tensor.to(self.device)

    @staticmethod
    def _build_tensor_game_input(
        dim_1: int, dim_k: int, dim_2: int, action_memory_len: int
    ):
        """Builds the tensor of the (dim_1 x dim_k) @ (dim_k x dim_2)
        product.

        The result has shape
        (action_memory_len + 1, dim_1 * dim_k, dim_k * dim_2, dim_1 * dim_2).
        Slice 0 is the matrix multiplication tensor to be decomposed; the
        other slices (the action memory) are left at zero.
        """
        input_tensor = torch.zeros(
            action_memory_len + 1, dim_1 * dim_k, dim_k * dim_2, dim_1 * dim_2
        )
        for r in range(dim_1 * dim_2):
            row, col = divmod(r, dim_2)
            for k in range(dim_k):
                # output entry (row, col) receives A[row, k] * B[k, col]
                input_tensor[0, row * dim_k + k, k * dim_2 + col, r] = 1
        return input_tensor

    def games_are_good(self):
        """Early-stopping hook; currently always False."""
        return False
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/generation.py
================================================
from typing import Callable
import torch
def generate_synthetic_data(
    tensor_size: int,
    n_data: int,
    limit_rank: int,
    prob_distr: Callable = torch.randn,
    random_seed: int = None,
):
    """Lazily yields synthetic demonstrations.

    Every demonstration is a pair ``(tensor, triplets)`` where ``tensor`` is
    the sum of ``limit_rank`` rank-one tensors ``u (x) v (x) w`` and
    ``triplets`` lists the ``(u, v, w)`` factors in generation order.

    Args:
        tensor_size (int): Side of the cubic tensor / length of each factor.
        n_data (int): Number of demonstrations to yield.
        limit_rank (int): Number of rank-one terms in each demonstration.
        prob_distr (Callable, optional): Called as ``prob_distr(tensor_size)``
            to sample one factor.
        random_seed (int, optional): If given, seeds torch before sampling.
    """
    if random_seed is not None:
        torch.random.manual_seed(random_seed)
    for _ in range(n_data):
        # the rank is fixed to limit_rank (it is not sampled)
        rank = limit_rank
        accumulated = torch.zeros(tensor_size, tensor_size, tensor_size)
        factors = []
        for _ in range(rank):
            # resample until the rank-one term is not identically zero
            while True:
                u = prob_distr(tensor_size)
                v = prob_distr(tensor_size)
                w = prob_distr(tensor_size)
                rank_one = (
                    u.reshape(-1, 1, 1)
                    * v.reshape(1, -1, 1)
                    * w.reshape(1, 1, -1)
                )
                if not (rank_one == 0).all():
                    break
            factors.append((u, v, w))
            accumulated += rank_one
        yield accumulated, factors
def f_prob_distribution(size):
    """Samples ``size`` integers from {-2, -1, 0, 1, 2}.

    The values are drawn with probabilities (0.001, 0.099, 0.8, 0.099,
    0.001) by inverting the cumulative distribution on uniform draws.

    Args:
        size (int): Number of values to sample.
    """
    support = torch.tensor([-2, -1, 0, 1, 2])
    weights = torch.tensor([0.001, 0.099, 0.8, 0.099, 0.001]).unsqueeze(0)
    cdf = weights.cumsum(dim=-1)
    draws = torch.rand((size, 1))
    # index of the first cumulative value that is >= the uniform draw
    below_cdf = (draws <= cdf).int()
    return support[below_cdf.argmax(dim=1)]
def z2_prob_distribution(size):
    """Samples a 0/1 integer tensor, each entry being 1 when a uniform draw
    in [0, 1) is greater than 0.5.

    Args:
        size (int): Number of values to sample.
    """
    uniform_draws = torch.rand(size)
    return uniform_draws.gt(0.5).int()
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/data/utils.py
================================================
from typing import Tuple
import torch
def get_scalars(input_tensor: torch.Tensor, t_step: int, with_bs: bool = True):
    """Builds the scalar features (the time step) for a state tensor.

    Args:
        input_tensor (torch.Tensor): Current state tensor; only its first
            dimension is read, and only when ``with_bs`` is True.
        t_step (int): Current time step.
        with_bs (bool, optional): Whether the batch size is present in the
            input tensor.

    Returns:
        A float tensor of shape (batch_size, 1) filled with ``t_step`` when
        ``with_bs`` is True, otherwise ``t_step`` as a float tensor with a
        trailing dimension of size 1.
    """
    if not with_bs:
        return torch.tensor(t_step).unsqueeze(-1).float()
    batch_size = input_tensor.shape[0]
    time_feature = torch.zeros((batch_size, 1))
    time_feature[:, 0] = t_step
    return time_feature
def map_triplet_to_action(
    triplet: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
    base: int,
    n_steps: int,
    add_bias: bool = True,
):
    """Maps a triplet of tensors to an action.

    u, v and w are concatenated along the last dimension and split in
    ``n_steps`` chunks of equal length. Each chunk is read as a number in
    the given base, least significant digit first:
        token = sum_i (element_i + bias) * base**i

    Args:
        triplet (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Triplet of
            tensors u, v, and w.
        base (int): Base used for the conversion.
        n_steps (int): Number of steps in the action.
        add_bias (bool, optional): Whether to add ``base // 2`` to every
            element before the conversion.

    Returns:
        A tensor of shape (n_steps,) when u is 1-D, otherwise of shape
        (-1, n_steps).
    """
    u, v, w = triplet
    n_dim = u.ndim
    action = torch.cat((u, v, w), dim=-1)
    action = action.reshape(-1, n_steps, action.shape[-1] // n_steps)
    if n_dim == 1:
        action = action.squeeze(0)
    if add_bias:
        action = action + base // 2
    # positional weights live on the same device as the digits, otherwise
    # the product fails for inputs that are not on the CPU
    weights = torch.tensor(
        [base**i for i in range(action.shape[-1])], device=action.device
    )
    return (action * weights).sum(dim=-1)
# @torch.jit.script
def _single_action_to_triplet(
    action_val: int,
    basis: int,
    out_dim: int,
    bias: int,
    device: str,
):
    """Converts an action to the original vector that generated it.

    The action is expanded in the given basis, least significant digit
    first, and the bias is subtracted from each of the ``out_dim`` digits
    (leading zero digits included).

    Args:
        action_val (int): Action to convert.
        basis (int): Basis used for the conversion.
        out_dim (int): Output dimension.
        bias (int): Bias to subtract from the action.
        device (str): Name of the torch device to use.

    Raises:
        IndexError: If the action does not fit in ``out_dim`` digits.
    """
    remaining = int(action_val)
    digits = []
    # exact integer arithmetic: no float logarithm is needed to find the
    # most significant digit
    for _ in range(out_dim):
        remaining, digit = divmod(remaining, basis)
        digits.append(digit - bias)
    if remaining != 0:
        raise IndexError(
            f"action {int(action_val)} cannot be represented with "
            f"{out_dim} digits in basis {basis}"
        )
    return torch.tensor(digits, dtype=torch.get_default_dtype()).to(device)


def map_action_to_triplet(
    action_tensor: torch.Tensor,
    cardinality: int = 5,
    vector_size: int = 5,
    add_bias: bool = True,
):
    """Maps a batch of actions to the batch of vectors that generated them.

    Every token is expanded in base ``cardinality`` into ``vector_size``
    digits, undoing the encoding performed by ``map_triplet_to_action``.

    Args:
        action_tensor (torch.Tensor): Batch of actions.
        cardinality (int, optional): Cardinality of the action space.
        vector_size (int, optional): Size of the vector.
        add_bias (bool, optional): Whether to use bias.

    Returns:
        A float tensor of shape (*action_tensor.shape, vector_size) on the
        device of ``action_tensor``.
    """
    action_shape = action_tensor.shape
    flat_actions = action_tensor.reshape(-1)
    bias = cardinality // 2 if add_bias else 0
    if flat_actions.numel() == 0:
        # torch.stack cannot handle an empty list
        return torch.zeros(
            (*action_shape, vector_size), device=action_tensor.device
        )
    triplets = torch.stack(
        [
            _single_action_to_triplet(
                action_val,
                cardinality,
                vector_size,
                bias,
                action_tensor.device,
            )
            for action_val in flat_actions.tolist()
        ]
    )
    final_size = triplets.shape[-1]
    return triplets.reshape((*action_shape, final_size))
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/__init__.py
================================================
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/alpha_tensor.py
================================================
import torch
from open_alpha_tensor.core.modules.extras import (
QuantileLoss,
ValueRiskManagement,
)
from open_alpha_tensor.core.modules.heads import PolicyHead, ValueHead
from open_alpha_tensor.core.modules.torso import TorsoModel
class AlphaTensorModel(torch.nn.Module):
    """Full model: a torso followed by a policy head and a value head.

    Called with the ground-truth actions and values it returns the policy
    and value losses; called without them it returns sampled actions, their
    probabilities and a value estimate.
    """

    def __init__(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        # Notation used in the shape comments below:
        #   scalars_size = s, input_size = S, tensor_length = T, emb_dim = c
        super().__init__()
        self.tensor_length = tensor_length
        self.input_size = input_size
        self.emb_dim = emb_dim
        self.torso = TorsoModel(
            scalars_size=scalars_size,
            input_size=input_size,
            tensor_length=tensor_length,
            out_size=emb_dim,
        )
        print("Build policy head")
        self.policy_head = PolicyHead(
            emb_size=3 * input_size * input_size,
            emb_dim=emb_dim,
            n_steps=n_steps,
            n_logits=n_logits,
            n_samples=n_samples,
        )
        print("Build value head")
        # 2048 depends on the number of heads and projection size used by
        # the policy head
        self.value_head = ValueHead(2048)
        self.policy_loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")
        self.quantile_loss_fn = QuantileLoss()
        self.risk_value_management = ValueRiskManagement()

    @property
    def device(self):
        """Device of the first parameter of the model."""
        return next(self.parameters()).device

    def _train_forward(
        self,
        x: torch.Tensor,
        s: torch.Tensor,
        g_action: torch.Tensor,
        g_value: torch.Tensor,
    ):
        """Computes the policy and value losses.

        Shapes: x = (N, T, S, S, S), s = (N, s), g_action = (N, N_steps),
        g_value = (N, ).
        """
        embedding = self.torso(x, s)
        logits, policy_features = self.policy_head(embedding, g_action)
        flat_logits = logits.reshape(-1, logits.shape[-1])
        policy_loss = self.policy_loss_fn(flat_logits, g_action.reshape(-1))
        quantiles = self.value_head(policy_features)
        value_loss = self.quantile_loss_fn(quantiles, g_value.float())
        return policy_loss, value_loss

    def _eval_forward(self, x: torch.Tensor, s: torch.Tensor):
        """Samples actions and estimates the value of the state."""
        embedding = self.torso(x, s)
        actions, probabilities, policy_features = self.policy_head(embedding)
        value = self.risk_value_management(self.value_head(policy_features))
        return actions, probabilities, value

    def forward(
        self,
        x: torch.Tensor,
        s: torch.Tensor,
        g_action: torch.Tensor = None,
        g_value: torch.Tensor = None,
    ):
        """Dispatches to the training path when ``g_action`` is given
        (``g_value`` is then mandatory) and to the evaluation path
        otherwise."""
        if g_action is not None:
            assert g_value is not None
            return self._train_forward(x, s, g_action, g_value)
        return self._eval_forward(x, s)

    @property
    def n_logits(self):
        return self.policy_head.n_logits

    @property
    def n_steps(self):
        return self.policy_head.n_steps

    @property
    def n_samples(self):
        return self.policy_head.n_samples
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/attention.py
================================================
import torch
from torch.nn import functional as F
class AttentionHead(torch.nn.Module):
    """Single attention head: queries come from ``x``, keys and values
    from ``y``; all three are projected to ``proj_dim`` features."""

    def __init__(self, x_size: int, y_size: int, proj_dim: int):
        # x_size / y_size = size of the last dimension of x / y
        super().__init__()
        self.proj_dim = proj_dim
        self.proj_dim_isqrt = 1 / torch.sqrt(torch.tensor(proj_dim))
        self.queries_proj_layer = torch.nn.Linear(x_size, proj_dim)
        self.keys_proj_layer = torch.nn.Linear(y_size, proj_dim)
        self.values_proj_layer = torch.nn.Linear(y_size, proj_dim)

    def forward(self, x: torch.Tensor, y: torch.Tensor, mask: bool = False):
        """Returns the attention-weighted sum of the projected values.

        When ``mask`` is True only the strictly upper-triangular part of the
        softmax weights is kept. The mask is applied after the softmax, so
        the surviving weights of a row are not renormalised.
        """
        q = self.queries_proj_layer(x)
        k = self.keys_proj_layer(y)
        v = self.values_proj_layer(y)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.proj_dim_isqrt
        weights = F.softmax(scores, dim=-1)
        if mask:
            weights = torch.triu(weights, diagonal=1)
        return torch.matmul(weights, v)
class AttentionDenseBlock(torch.nn.Module):
    """Residual feed-forward block: LayerNorm -> Linear -> GELU -> Linear,
    with the hidden layer ``multiplier`` times wider than the input."""

    def __init__(self, inner_size: int, multiplier: int = 4):
        super().__init__()
        hidden_size = inner_size * multiplier
        self.norm = torch.nn.LayerNorm(inner_size)
        self.linear = torch.nn.Linear(inner_size, hidden_size)
        self.activation = torch.nn.GELU()
        self.linear_final = torch.nn.Linear(hidden_size, inner_size)

    def forward(self, x: torch.Tensor):
        """Returns ``x`` plus the feed-forward transformation of ``x``."""
        hidden = self.linear(self.norm(x))
        hidden = self.activation(hidden)
        return x + self.linear_final(hidden)
class AlphaMultiHeadAttention(torch.nn.Module):
    """Multi-head attention of ``x`` over ``y`` followed by a residual dense
    block. The output has the same shape as ``x``."""

    def __init__(
        self,
        x_dim: int,
        y_dim: int,
        proj_dim: int = 32,
        n_heads: int = 16,
        multiplier: int = 4,
    ):
        """
        Args:
            x_dim (int): Size of the last dimension of x.
            y_dim (int): Size of the last dimension of y.
            proj_dim (int, optional): Projection size of each head.
            n_heads (int, optional): Number of attention heads.
            multiplier (int, optional): Width multiplier of the dense block.
        """
        super().__init__()
        self.norm_layer_x = torch.nn.LayerNorm(x_dim)
        self.norm_layer_y = torch.nn.LayerNorm(y_dim)
        self.module_list = torch.nn.ModuleList(
            [AttentionHead(x_dim, y_dim, proj_dim) for _ in range(n_heads)]
        )
        self.linear = torch.nn.Linear(n_heads * proj_dim, x_dim)
        self.dense = AttentionDenseBlock(x_dim, multiplier)

    def forward(
        self, x: torch.Tensor, y: torch.Tensor, mask: bool = False
    ):
        """Attends from ``x`` to ``y``.

        Args:
            x (torch.Tensor): Query tensor, shape (..., Nx, x_dim).
            y (torch.Tensor): Key/value tensor, shape (..., Ny, y_dim).
            mask (bool, optional): Forwarded to every attention head.
        """
        x_norm = self.norm_layer_x(x)
        y_norm = self.norm_layer_y(y)
        # concatenate the outputs of all heads along the feature dimension
        heads_out = torch.cat(
            [head(x_norm, y_norm, mask) for head in self.module_list], dim=-1
        )
        x = x + self.linear(heads_out)
        return self.dense(x)
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/extras.py
================================================
import torch
class QuantileLoss(torch.nn.Module):
    """Huber loss weighted per quantile and averaged over all entries."""

    def __init__(self, delta: float = 1.0):
        super().__init__()
        self.huber_loss = torch.nn.HuberLoss(reduction="none", delta=delta)

    def forward(self, q: torch.Tensor, g: torch.Tensor):
        """Computes the loss between the predicted quantiles ``q`` (last
        dimension of size n) and the targets ``g``.

        The i-th quantile uses level tau_i = i / n; the Huber term of each
        entry is weighted by ``|tau_i - 1[g - q > 0]|``.
        """
        n_quantiles = q.shape[-1]
        levels = torch.arange(0, n_quantiles).unsqueeze(0).to(q.device)
        tau = levels / n_quantiles
        huber = self.huber_loss(g, q)
        target_above = (g - q > 0).float()
        weight = torch.abs(tau - target_above)
        return torch.mean(huber * weight)
class ValueRiskManagement(torch.nn.Module):
    """Reduces predicted quantiles to one value by averaging the upper
    ones, i.e. those from index ``int(u_q * n)`` onwards."""

    def __init__(self, u_q: float = 0.75):
        super().__init__()
        self.u_q = u_q

    def forward(self, q: torch.Tensor):
        # q shape = (N, n)
        n_quantiles = q.shape[-1]
        first_kept = int(self.u_q * n_quantiles)
        upper_quantiles = q[:, first_kept:]
        return upper_quantiles.mean(dim=-1)
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/heads.py
================================================
import math
import torch
import torch.nn.functional as F
from open_alpha_tensor.core.modules.attention import AlphaMultiHeadAttention
class PositionEncoding(torch.nn.Module):
    """Sinusoidal position encoding added to a sequence-first input.

    The table is stored in the ``pe`` buffer with shape
    (max_len, 1, d_model): even feature indexes hold sines and odd ones
    hold cosines.
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        """
        Args:
            d_model (int): Size of the embedding (even or odd).
            max_len (int, optional): Longest sequence supported.
        """
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # with an odd d_model there is one cosine slot less than sine slots
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[: x.size(0)]
        return x
class PolicyHeadDoubleAttention(torch.nn.Module):
    """Decoder layer of the policy head: masked self-attention on the
    action features followed by attention over the torso embedding, each
    with layer normalisation, dropout and a residual connection."""

    def __init__(
        self,
        n_steps: int,
        n_heads: int,
        n_feat: int,
        emb_size: int,
        emb_dim: int,
    ):
        # n_steps and emb_size are accepted but not used by this layer
        super().__init__()
        model_dim = n_feat * n_heads
        self.layer_norm1 = torch.nn.LayerNorm(model_dim)
        self.attention1 = AlphaMultiHeadAttention(model_dim, model_dim)
        self.drop1 = torch.nn.Dropout()
        self.layer_norm2 = torch.nn.LayerNorm(model_dim)
        self.attention2 = AlphaMultiHeadAttention(model_dim, emb_dim)
        self.drop2 = torch.nn.Dropout()

    def forward(self, x: torch.Tensor, e: torch.Tensor):
        """Updates the action features ``x`` using the embedding ``e``."""
        # masked self-attention; the residual is taken on the normalised x
        x = self.layer_norm1(x)
        x = x + self.drop1(self.attention1(x, x, mask=True))
        # attention from the action features to the embedding
        x = self.layer_norm2(x)
        return x + self.drop2(self.attention2(x, e, mask=False))
class PolicyHeadCore(torch.nn.Module):
    """Core of the policy head: embeds the action tokens, adds the
    position encoding, runs the decoder layers and projects the result to
    logits."""

    def __init__(
        self,
        emb_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_feat: int = 64,
        n_heads: int = 32,
        n_layers: int = 2,
    ):
        """
        Args:
            emb_size (int): Forwarded to the decoder layers.
            emb_dim (int): Size of the last dimension of the embedding ``e``.
            n_steps (int): Forwarded to the decoder layers.
            n_logits (int): Number of distinct action tokens.
            n_feat (int, optional): Features per head.
            n_heads (int, optional): Number of heads; the model width is
                ``n_feat * n_heads``.
            n_layers (int, optional): Number of decoder layers.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(n_logits, n_feat * n_heads)
        self.position_encoding = PositionEncoding(n_feat * n_heads)
        self.decoders = torch.nn.ModuleList(
            [
                PolicyHeadDoubleAttention(
                    n_steps, n_heads, n_feat, emb_size, emb_dim
                )
                for _ in range(n_layers)
            ]
        )
        self.relu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(n_feat * n_heads, n_logits)

    def forward(self, a: torch.Tensor, e: torch.Tensor):
        """
        Args:
            a (torch.Tensor): Action tokens, shape (N, n_steps).
            e (torch.Tensor): Embedding attended by the decoder layers.

        Returns:
            The logits, shape (N, n_steps, n_logits), and the features fed
            to the final projection, shape (N, n_steps, n_feat * n_heads).
        """
        x = self.embedding(a)
        # The embeddings are batch-first while PositionEncoding indexes its
        # table with dimension 0: swap the first two dimensions around the
        # call so that the encoding follows the step, not the batch row.
        x = self.position_encoding(x.transpose(0, 1)).transpose(0, 1)
        for layer in self.decoders:
            x = layer(x, e)
        o = self.linear2(self.relu(x))
        return o, x
def sample_from_logits(a):
    """Samples one class per row from a batch of logits.

    Args:
        a (torch.Tensor): Logits of shape (batch, n_classes).

    Returns:
        The sampled class indexes, shape (batch,), and the softmax
        probability of each sampled class, shape (batch,).
    """
    class_probs = F.softmax(a, dim=-1)
    cum_probs = torch.cumsum(class_probs, dim=-1)
    random_vals = torch.rand(cum_probs.shape[0]).unsqueeze(-1).to(a.device)
    # inverse-CDF sampling: first class whose cumulative probability
    # exceeds the uniform draw
    new_a_idx = torch.argmax(1.0 * (cum_probs > random_vals), dim=-1)
    # read the probability of the sampled class from the softmax output,
    # not from the cumulative sum used for sampling
    probs = class_probs.gather(-1, new_a_idx.unsqueeze(-1)).squeeze(-1)
    return new_a_idx, probs
class PolicyHead(torch.nn.Module):
    """Policy head: returns logits when the ground-truth actions are given
    and samples ``n_samples`` action sequences per input otherwise."""

    def __init__(
        self,
        emb_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        super().__init__()
        self.n_logits = n_logits
        self.n_samples = n_samples
        self.n_steps = n_steps
        self.core = PolicyHeadCore(emb_size, emb_dim, n_steps, n_logits)

    def _train_forward(self, e: torch.Tensor, g: torch.Tensor):
        """Training path.

        Args:
            e (torch.Tensor): Embedding, shape (N, m, c).
            g (torch.Tensor): Ground-truth action tokens, shape
                (N, N_steps). They are rolled by one position along the
                step dimension before being fed to the core.

        Returns:
            The logits for every step and the core features of step 0.
        """
        shifted_actions = torch.roll(g, shifts=-1, dims=1)
        logits, features = self.core(shifted_actions, e)
        return logits, features[:, 0]

    def _eval_forward(self, e: torch.Tensor):
        """Sampling path.

        Returns:
            The sampled tokens, shape (bs, n_samples, n_steps), the product
            of the per-step probabilities, shape (bs, n_samples), and the
            step-0 core features of the last iteration averaged over the
            samples.
        """
        bs = e.shape[0]
        target_device = e.device
        # one row per (input, sample) pair
        tokens = torch.zeros((bs, self.n_samples, self.n_steps)).long()
        tokens = tokens.to(target_device).view(-1, self.n_steps)
        seq_probs = torch.ones((bs, self.n_samples)).to(target_device)
        seq_probs = seq_probs.view(-1)
        tiled_e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)
        tiled_e = tiled_e.view(-1, tiled_e.shape[-2], tiled_e.shape[-1])
        for step in range(self.n_steps):
            logits, features = self.core(tokens[:, : step + 1], tiled_e)
            tokens[:, step], step_prob = sample_from_logits(logits[:, step])
            seq_probs *= step_prob
        tokens = tokens.view(bs, self.n_samples, self.n_steps)
        seq_probs = seq_probs.view(bs, self.n_samples)
        first_step = features[:, 0]
        pooled = first_step.view(
            bs, self.n_samples, *features.shape[2:]
        ).mean(1)
        return tokens, seq_probs, pooled

    def forward(self, e: torch.Tensor, g: torch.Tensor = None):
        """Runs the sampling path when ``g`` is None, the training path
        otherwise."""
        if g is not None:
            return self._train_forward(e, g)
        return self._eval_forward(e)
class ValueHeadCore(torch.nn.Module):
    """Linear layer followed by a ReLU."""

    def __init__(self, input_size: int, output_size: int):
        super().__init__()
        self.linear = torch.nn.Linear(input_size, output_size)
        self.relu = torch.nn.ReLU()

    def forward(self, x: torch.Tensor):
        return self.relu(self.linear(x))


class ValueHead(torch.nn.Module):
    """MLP with three Linear+ReLU layers followed by a linear projection to
    ``output_size`` values.

    NOTE: the two hidden-to-hidden layers are distinct modules. They used
    to be the same instance repeated twice, i.e. they shared their weights.
    Model state dicts saved before this change expose the same keys and
    still load, but optimizer states saved with the shared layer track
    fewer parameters.
    """

    def __init__(
        self, input_size: int, hidden_size: int = 512, output_size: int = 8
    ):
        """
        Args:
            input_size (int): Size of the last dimension of the input.
            hidden_size (int, optional): Width of the hidden layers.
            output_size (int, optional): Number of output values.
        """
        super().__init__()
        # build every layer explicitly: list multiplication would repeat
        # one module instance instead of creating independent layers
        self.layers = torch.nn.Sequential(
            ValueHeadCore(input_size, hidden_size),
            ValueHeadCore(hidden_size, hidden_size),
            ValueHeadCore(hidden_size, hidden_size),
        )
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor):
        return self.linear(self.layers(x))
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/modules/torso.py
================================================
import torch
from open_alpha_tensor.core.modules.attention import AlphaMultiHeadAttention
class TorsoAttentiveModes(torch.nn.Module):
    """Mixes the three mode tensors pairwise with one shared attention
    module."""

    def __init__(self, input_dim: int):
        # input_dim = c
        super().__init__()
        self.attention = AlphaMultiHeadAttention(input_dim, input_dim)

    def forward(self, x1, x2, x3):
        """Updates the three modes in the order (x1, x2), (x3, x1),
        (x2, x3) and returns them as a list.

        x1, x2 and x3 all have shape (N, S, S, c), N being the batch size.
        """
        n_rows = x1.shape[-2]
        modes = [x1, x2, x3]
        for first, second in ((0, 1), (2, 0), (1, 2)):
            # stacked.size = (N, S, 2S, c)
            stacked = torch.cat([modes[first], modes[second]], dim=-2)
            attended = self.attention(stacked, stacked)
            # split the result back into the two modes
            modes[first] = attended[:, :, :n_rows]
            modes[second] = attended[:, :, n_rows:]
        return modes
class TorsoModel(torch.nn.Module):
    """Torso model of OpenAlphaTensor.

    It maps an input tensor of shape (N, T, S, S, S) to (N, 3S*S, c), where:
    N is the batch size;
    T is the context size (size of the history + 1);
    S is the number of elements in each matrix to be multiplied;
    c is the output dimensionality.
    """

    def __init__(
        self,
        scalars_size: int,
        input_size: int,
        tensor_length: int,
        out_size: int,
    ):
        # scalars_size = s, input_size = S, tensor_length = T, out_size = c
        super().__init__()
        self.linears_1 = torch.nn.ModuleList(
            [
                torch.nn.Linear(scalars_size, input_size * input_size)
                for _ in range(3)
            ]
        )
        self.linears_2 = torch.nn.ModuleList(
            [
                torch.nn.Linear(input_size * tensor_length + 1, out_size)
                for _ in range(3)
            ]
        )
        self.attentive_modes = torch.nn.ModuleList(
            [TorsoAttentiveModes(out_size) for _ in range(8)]
        )

    def forward(self, x: torch.Tensor, scalars: torch.Tensor):
        """Embeds the state.

        Args:
            x (torch.Tensor): State, shape (N, T, S, S, S).
            scalars (torch.Tensor): Scalar features, shape (N, s).
        """
        batch_size = x.shape[0]
        S = x.shape[-1]
        T = x.shape[1]
        # one view of the input per mode; the history is folded into the
        # last dimension
        mode_permutations = (
            (0, 2, 3, 4, 1),
            (0, 4, 2, 3, 1),
            (0, 3, 4, 2, 1),
        )
        modes = []
        for perm, scalar_proj, mode_proj in zip(
            mode_permutations, self.linears_1, self.linears_2
        ):
            grid = x.permute(*perm).reshape(batch_size, S, S, S * T)
            # the scalars become one extra feature of every grid cell
            scalar_plane = scalar_proj(scalars).reshape(batch_size, S, S, 1)
            modes.append(mode_proj(torch.cat([grid, scalar_plane], dim=-1)))
        x1, x2, x3 = modes
        for layer in self.attentive_modes:
            x1, x2, x3 = layer(x1, x2, x3)
        stacked = torch.stack([x1, x2, x3], dim=2)
        return stacked.reshape(batch_size, 3 * S * S, -1)
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/core/training.py
================================================
from pathlib import Path
from typing import Tuple, List
import torch.optim
import tqdm
from torch.utils.data import DataLoader
from open_alpha_tensor.config import (
BASE_CHECKPOINT_DATA_DIR,
BASE_CHECKPOINT_DIR,
)
from open_alpha_tensor.core.actors.stage import actor_prediction
from open_alpha_tensor.core.data.basis_change import ChangeOfBasis
from open_alpha_tensor.core.data.dataset import TensorGameDataset
from open_alpha_tensor.core.data.generation import f_prob_distribution
from open_alpha_tensor.core.data.utils import map_action_to_triplet
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
@torch.no_grad()
def _single_act(
    actor_id: int,
    model: torch.nn.Module,
    input_tensor: torch.Tensor,
    device: str,
    mc_n_sim: int,
    N_bar: int,
    cob: ChangeOfBasis,
    max_rank: int,
):
    """Plays one episode with a single actor on the given device.

    The function is meant to be run in parallel, once per actor id.

    Args:
        actor_id (int): The id of the actor.
        model (torch.nn.Module): The model used to take the action; it is
            moved to ``device``.
        input_tensor (torch.Tensor): State of the game.
        device (str): Name of the torch device the episode is run on.
        mc_n_sim (int): Number of simulations during Monte Carlo tree search.
        N_bar (int): N_bar parameter used to compute tau when improving the
            policy.
        cob (ChangeOfBasis): The change of basis applied to the input
            tensor; its ``device`` attribute is set to ``device``.
        max_rank (int): The maximum matrix rank achieved by the actor before
            tree search is stopped.

    Returns:
        The actor id followed by the states, policies and rewards of the
        episode, all moved to the CPU.
    """
    print(f"Acting with actor {actor_id}")
    model.to(device)
    cob.device = device
    game_input = cob(input_tensor.to(device))
    states, policies, rewards = actor_prediction(
        model, game_input, max_rank, mc_n_sim, N_bar
    )
    print(f"Actor {actor_id} finished")
    cpu_states = [state.to("cpu") for state in states]
    return actor_id, cpu_states, policies.to("cpu"), rewards.to("cpu")
def swap_data(
    states: List[torch.Tensor],
    actions: List[torch.Tensor],
):
    """Swaps the last action with a random one and updates the states
    accordingly for a single game.

    Both arguments are modified in place and returned.

    Args:
        states (List[torch.Tensor]): All the states for a single game.
        actions (List[torch.Tensor]): All the actions through the game.
    """
    last_action = actions[-1]
    # the upper bound of randint is exclusive: swap_index is drawn in
    # [0, len(states) - 2], so a game needs at least two states
    swap_index = torch.randint(0, len(states) - 1, (1,)).item()
    # NOTE(review): if `actions` is a tensor rather than a list,
    # `last_action` is a view and is overwritten by the next line, so the
    # two entries would not really be swapped. Confirm the type passed in.
    actions[-1] = actions[swap_index]
    actions[swap_index] = last_action
    actual_state = states[swap_index]
    # replay the actions from the swapped position onwards
    # NOTE(review): the last value of i is len(states), so `states[i]` at
    # the end of the loop indexes one past the last element. Confirm the
    # intended upper bound.
    for i in range(swap_index + 1, len(states) + 1):
        prev_action = actions[i - 1]
        triplet = map_action_to_triplet(
            prev_action, vector_size=actual_state.shape[-1]
        )
        # split the decoded vector into three parts, each as long as one
        # third of the last state dimension
        vector_size = actual_state.shape[-1] // 3
        bs = actual_state.shape[0]
        u = triplet[:, :vector_size].reshape(bs, -1, 1, 1)
        v = triplet[:, vector_size : 2 * vector_size].reshape(  # noqa E203
            bs, 1, -1, 1
        )
        w = triplet[:, 2 * vector_size :].reshape(bs, 1, 1, -1)  # noqa E203
        # rank-one tensor removed from the current target by this action
        reduced_state = u * v * w
        fut_state = actual_state[:, 0] - reduced_state
        # shift the remaining slices by one and store the new rank-one
        # tensor in the first of them
        new_state = actual_state[:, 1:].roll(1, dims=1)
        new_state[:, 0] = reduced_state
        # NOTE(review): `fut_state` has one dimension less than `new_state`
        # (dimension 1 was indexed away), while torch.cat expects tensors
        # with the same number of dimensions. Confirm whether an
        # unsqueeze(1) is missing.
        actual_state = torch.cat([fut_state, new_state], dim=1)
        states[i] = actual_state
    return states, actions
class Trainer:
"""Trainer for the AlphaTensor model. The trainer does not require an
explicit loss since the loss is computed by the model itself. The trainer
is responsible for both the training step and the acting one, storing
acting performance in a buffer.
"""
def __init__(
self,
model: AlphaTensorModel,
tensor_size: int,
n_steps: int,
batch_size: int,
optimizer: torch.optim.Optimizer,
device: str,
len_data: int,
pct_synth: float,
n_synth_data: int,
limit_rank: int,
n_cob: int,
cob_prob: float,
data_augmentation: bool,
loss_params: Tuple[float, float] = None,
random_seed: int = None,
checkpoint_dir: str = None,
checkpoint_data_dir: Path = None,
extra_devices: List[str] = None,
):
"""Initializes the trainer.
Args:
model (AlphaTensorModel): The model to train.
tensor_size (int): Flattened size of the matrices to be multiplied.
n_steps (int): Number of steps used to get a single action out of
a triplet.
batch_size (int): Batch size.
optimizer (torch.optim.Optimizer): The optimizer used to train the
model.
device (str): The name of the torch device used for training.
len_data (int): Number of training samples used (both actor
generated and synthetic).
pct_synth (float): Initial percentage of synthetic samples used
for training.
n_synth_data (int): Number of synthetic training samples.
limit_rank (int): Maximum rank for synthetically-generated
matrices.
n_cob (int): Number of change of basis (cob) used for a single
training sample.
cob_prob (float): Probability of applying a change of basis.
data_augmentation (bool): Whether to randomly swap the last
operation of an episode with another operation.
loss_params (Tuple[float, float]): Alpha and Beta parameters used
in the loss function.
random_seed (int): Randomizing seed.
checkpoint_dir (str): Directory used to store model checkpoints.
checkpoint_data_dir (str): Directory used to store games as JSON
files.
extra_devices (List[str]): Extra devices names used for multi-GPU
training.
"""
self.model = model
self.optimizer = optimizer
self.device = device
self.dataset = TensorGameDataset(
len_data,
pct_synth,
tensor_size,
n_synth_data,
limit_rank,
f_prob_distribution,
device=device,
n_steps=n_steps,
action_memory_len=(model.tensor_length - 1),
random_seed=random_seed,
)
self.batch_size = batch_size
self.max_rank = limit_rank
if loss_params is None:
self.alpha = 1
self.beta = 1
else:
self.alpha, self.beta = loss_params
self.checkpoint_dir = Path(
checkpoint_dir if checkpoint_dir else BASE_CHECKPOINT_DIR
)
self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
self.checkpoint_data_dir = (
checkpoint_data_dir
if checkpoint_data_dir
else Path(BASE_CHECKPOINT_DATA_DIR)
)
self.checkpoint_data_dir.mkdir(exist_ok=True, parents=True)
self.change_of_basis = ChangeOfBasis(
tensor_size, n_cob, cob_prob, device, random_seed
)
self.data_augmentation = data_augmentation
self.extra_devices = extra_devices
def train_step(self):
"""Executes a single training step by optimizing the current model
parameters."""
self.dataset.recompute_synthetic_indexes()
self.model.train()
total_loss = 0
dl = DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)
print("Training AlphaTensor")
for states, scalars, policies, rewards in tqdm.tqdm(dl):
loss_policy, loss_value = self.model(
states, scalars, policies, rewards
)
loss = self.alpha * loss_policy + self.beta * loss_value
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
total_loss += loss.item()
print(f"Total loss: {total_loss}")
@torch.no_grad()
def act_step(
self,
input_tensor: torch.Tensor,
n_games: int,
mc_n_sim: int,
N_bar: int,
):
"""Runs actors in parallel to generate multiple games starting from
the same input tensor.
Args:
input_tensor (torch.Tensor): The input tensor used to generate the
games.
n_games (int): Number of games to generate / actors to be run in
parallel.
mc_n_sim (int): Number of simulations used in the Monte Carlo tree
search.
N_bar (int): N_bar parameter used to compute tau when improving
the policy.
"""
self.model.eval()
best_reward = -1e10
best_game = None
if self.extra_devices:
from joblib import Parallel, delayed
# this means that there is an empty GPU available
# thus we can use it to parallelize the acting step
# use joblib to parallelize the acting step
# we should use _single_act as a function to be parallelized
extra_devices = (
self.extra_devices * (n_games // len(self.extra_devices))
+ self.extra_devices[: n_games % len(self.extra_devices)]
)
self.model.to("cpu")
input_tensor = input_tensor.to("cpu")
print(f"Starting acting phase with {n_games} games")
results = Parallel(n_jobs=len(self.extra_devices))(
delayed(_single_act)(
actor_id,
self.model,
input_tensor,
extra_devices[actor_id],
mc_n_sim,
N_bar,
self.change_of_basis,
self.max_rank,
)
for actor_id in range(n_games)
)
self.model.to(self.device)
for actor_id, states, policies, rewards in results:
if rewards[-1] > best_reward:
print(f"New best actor! Actor: {actor_id}")
best_reward = rewards[-1]
best_game = (states, policies, rewards)
self.dataset.add_game(states, policies, rewards)
if self.data_augmentation:
states, policies = swap_data(states, policies)
self.dataset.add_game(states, policies, rewards)
if best_game is not None:
self.dataset.add_best_game(*best_game)
else:
for actor_id in range(n_games):
input_tensor_cob = self.change_of_basis(input_tensor).to(
self.device
)
print(f"Running actor {actor_id} / {n_games}")
states, policies, rewards = actor_prediction(
self.model,
input_tensor_cob,
self.max_rank,
mc_n_sim,
N_bar,
)
print(
f"Actor {actor_id} finished. Final reward: {rewards[-1]}"
)
if rewards[-1] > best_reward:
print("New best actor!")
best_reward = rewards[-1]
best_game = (states, policies, rewards)
self.dataset.add_game(states, policies, rewards)
if self.data_augmentation:
states, policies = swap_data(states, policies)
self.dataset.add_game(states, policies, rewards)
if best_game is not None:
self.dataset.add_best_game(*best_game)
def train(
    self,
    n_epochs: int,
    n_games: int,
    mc_n_sim: int,
    N_bar: int,
    initial_lr: float,
    lr_decay_factor: float,
    lr_decay_steps: int,
    starting_epoch: int = 0,
):
    """Run the training loop from ``starting_epoch`` up to ``n_epochs``.

    Each epoch performs one ``train_step``; every epoch whose index is a
    multiple of 10 additionally runs an ``act_step`` on the dataset's
    input tensor. A checkpoint (model + optimizer state, plus the game
    data) is written whenever the number of completed epochs is a multiple
    of 100. The loop stops early as soon as
    ``self.dataset.games_are_good()`` returns a truthy value.

    Two milestones are derived from ``n_epochs``:

    * after ``n_epochs // 50`` epochs the dataset split is switched via
      ``change_training_split(0.7, 0.05)``;
    * after ``n_epochs // 10`` epochs ``mc_n_sim`` is multiplied by 4.

    When resuming beyond a milestone, its effect is applied once up front.

    Args:
        n_epochs (int): Total number of training epochs.
        n_games (int): Number of games / actors run in each acting step.
        mc_n_sim (int): Number of simulations used by the Monte Carlo tree
            search at each step (before the x4 boost).
        N_bar (int): N_bar parameter used to compute tau when improving
            the policy.
        initial_lr (float): Learning rate the decay schedule starts from.
        lr_decay_factor (float): Base of the exponential decay.
        lr_decay_steps (int): Number of epochs over which the decay is
            applied.
        starting_epoch (int, optional): Epoch to start / resume from.
    """
    self.model = self.model.to(self.device)

    split_milestone = n_epochs // 50
    boost_milestone = n_epochs // 10
    simulations = mc_n_sim

    # When restarting from a checkpoint, catch up on milestones that the
    # skipped epochs would already have triggered.
    resumed_at = starting_epoch + 1
    if resumed_at > split_milestone:
        self.dataset.change_training_split(0.7, 0.05)
    if resumed_at > boost_milestone:
        simulations = simulations * 4

    for epoch in range(starting_epoch, n_epochs):
        completed = epoch + 1
        if completed == split_milestone:
            self.dataset.change_training_split(0.7, 0.05)
        if completed == boost_milestone:
            simulations = simulations * 4

        # Exponential learning-rate decay, only inside the decay window.
        if 0 < epoch < lr_decay_steps - 1:
            lr = initial_lr * lr_decay_factor ** (epoch / lr_decay_steps)
            for param_group in self.optimizer.param_groups:
                param_group["lr"] = lr

        print(f"Epoch {epoch} / {n_epochs}")
        self.train_step()

        if epoch % 10 == 0:
            self.act_step(
                self.dataset.input_tensor, n_games, simulations, N_bar
            )

        # Periodic checkpoint of weights, optimizer state and played games.
        if completed % 100 == 0:
            torch.save(
                {
                    "model_state_dict": self.model.state_dict(),
                    "optimizer_state_dict": self.optimizer.state_dict(),
                },
                self.checkpoint_dir / f"checkpoint_{completed}.pt",
            )
            self.dataset.save_game_data(self.checkpoint_data_dir)

        # Exit strategy: stop once the dataset reports good enough games.
        if self.dataset.games_are_good():
            break
    print("Training finished")
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/__init__.py
================================================
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/checkpoint_op.py
================================================
from pathlib import Path
from typing import Any
import torch
from nebullvm.operations.base import Operation
from open_alpha_tensor.config import (
BASE_CHECKPOINT_DATA_DIR,
BASE_CHECKPOINT_DIR,
)
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
from open_alpha_tensor.core.training import Trainer
def optimizer_to(optim: torch.optim.Optimizer, device: str):
    """Move every tensor stored in an optimizer's state to ``device``.

    The optimizer is modified in place; nothing is returned. Each entry of
    ``optim.state`` may be a tensor or a dict of values; tensors (and their
    gradients, when present) are moved, any other value is left untouched.

    Args:
        optim: Optimizer whose state tensors must be relocated.
        device: Target torch device name.
    """

    def _relocate(tensor: torch.Tensor) -> None:
        # Swap the underlying storage (and the gradient's, if any) in place.
        tensor.data = tensor.data.to(device)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(device)

    for entry in optim.state.values():
        # Not sure there are any global tensors in the state dict
        if isinstance(entry, torch.Tensor):
            _relocate(entry)
        elif isinstance(entry, dict):
            for value in entry.values():
                if isinstance(value, torch.Tensor):
                    _relocate(value)
class LoadCheckPointOp(Operation):
    """An operation which loads a checkpoint during training of an
    OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._last_epoch = None
        self._model = None
        self._optimizer = None

    @staticmethod
    def _find_latest_checkpoint(checkpoint_dir):
        """Locate the checkpoint with the highest epoch suffix.

        Only ``*.pt`` files whose stem ends in ``_<integer>`` are
        considered; other ``.pt`` files (for instance a ``final_model.pt``
        stored in the same directory) are ignored instead of raising.

        Args:
            checkpoint_dir: Directory to scan.

        Returns:
            A ``(path, epoch)`` tuple, or ``None`` when the directory does
            not exist or holds no usable checkpoint.
        """
        directory = Path(checkpoint_dir)
        if not directory.exists():
            return None
        candidates = []
        for path in directory.glob("*.pt"):
            try:
                epoch = int(path.stem.split("_")[-1])
            except ValueError:
                # Not an epoch-numbered checkpoint: skip it.
                continue
            candidates.append((epoch, path))
        if not candidates:
            return None
        epoch, path = sorted(candidates, key=lambda item: item[0])[-1]
        return path, epoch

    def execute(
        self,
        model: AlphaTensorModel,
        optimizer: torch.optim.Optimizer,
        checkpoint_dir: str,
    ):
        """Load the most recent checkpoint from a directory, if any.

        When no usable checkpoint is found the model and optimizer are
        stored unchanged and the last epoch is set to 0.

        Args:
            model: The model to load the checkpoint into.
            optimizer: The optimizer to load the checkpoint into.
            checkpoint_dir: The directory to load the checkpoint from.
                Falls back to ``BASE_CHECKPOINT_DIR`` when falsy.
        """
        checkpoint_dir = checkpoint_dir or BASE_CHECKPOINT_DIR
        latest = self._find_latest_checkpoint(checkpoint_dir)
        if latest is not None:
            checkpoint_path, last_epoch = latest
            print(f"Loading checkpoint from {checkpoint_path}")
            old_device = model.device
            # Deserialize on CPU so the checkpoint can be read even if the
            # device it was saved from is not available; the state is moved
            # to ``old_device`` right below.
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
            model.load_state_dict(checkpoint["model_state_dict"])
            model.to(old_device)
            print(f"Loaded model to {old_device}")
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            optimizer_to(optimizer, old_device)
        else:
            last_epoch = 0
        self._last_epoch = last_epoch
        self._model = model
        self._optimizer = optimizer

    def get_last_epoch(self) -> int:
        """Returns the last epoch of the loaded checkpoint."""
        return self._last_epoch

    def get_model(self) -> AlphaTensorModel:
        """Returns the model loaded from the checkpoint."""
        return self._model

    def get_optimizer(self) -> torch.optim.Optimizer:
        """Returns the optimizer loaded from the checkpoint."""
        return self._optimizer

    def get_result(self) -> Any:
        # Results are exposed through the dedicated getters above.
        pass
class LoadCheckpointDataOp(Operation):
    """An operation which loads the games played while training an
    OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._loaded = False

    def execute(self, games_store_dir: Path, trainer: Trainer):
        """Load the games played while training an OpenAlphaTensor model.

        Games are loaded only when ``game_data.json`` exists inside the
        directory; otherwise the trainer's dataset is left untouched.

        Args:
            games_store_dir: The directory where the games are stored
                (``Path`` or path string). Falls back to
                ``BASE_CHECKPOINT_DATA_DIR`` when falsy.
            trainer: The trainer to load the games into.
        """
        games_store_dir = Path(games_store_dir or BASE_CHECKPOINT_DATA_DIR)
        # Reset the flag so a previous successful run is not reported as
        # the outcome of this one.
        self._loaded = False
        # if games_store_dir contains games, load them
        if (games_store_dir / "game_data.json").exists():
            trainer.dataset.load_games(games_store_dir)
            self._loaded = True

    def get_result(self) -> bool:
        """Returns whether the games were loaded or not."""
        return self._loaded
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/model_op.py
================================================
import json
from pathlib import Path
from typing import Any
import torch
from nebullvm.operations.base import Operation
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
class BuildModelOp(Operation):
    """Operation wrapping the construction of an ``AlphaTensorModel``."""

    def __init__(self):
        super().__init__()
        self._model = None

    def execute(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
    ):
        """Instantiate the OpenAlphaTensor model and keep it on the op.

        All arguments are forwarded by keyword to ``AlphaTensorModel``.

        Args:
            tensor_length (int): Number of tensors used as history.
            input_size (int): Flattened size of the matrices to be
                multiplied.
            scalars_size (int): Size of the scalar vectors fed to the torso
                model.
            emb_dim (int): Embedding dimension.
            n_steps (int): Number of steps used to get a single action out
                of a triplet.
            n_logits (int): Number of logits output by the policy head.
            n_samples (int): Number of samples used by the policy head at
                evaluation time.
        """
        model_kwargs = dict(
            tensor_length=tensor_length,
            input_size=input_size,
            scalars_size=scalars_size,
            emb_dim=emb_dim,
            n_steps=n_steps,
            n_logits=n_logits,
            n_samples=n_samples,
        )
        self._model = AlphaTensorModel(**model_kwargs)

    def get_model(self) -> AlphaTensorModel:
        """Return the model built by ``execute`` (``None`` before that)."""
        return self._model

    def get_result(self) -> Any:
        # The model is exposed through ``get_model``.
        return None
class BuildOptimizerOp(Operation):
    """Operation creating the torch optimizer for an OpenAlphaTensor model."""

    def __init__(self):
        super().__init__()
        self._optimizer = None

    def execute(
        self,
        optimizer_name: str,
        model: AlphaTensorModel,
        lr: float,
        weight_decay: float,
    ):
        """Create the optimizer selected by ``optimizer_name``.

        Supported names are ``"adam"``, ``"adamw"`` and ``"sgd"``.

        Args:
            optimizer_name (str): Name of the optimizer used.
            model (AlphaTensorModel): OpenAlphaTensor model to be trained.
            lr (float): Learning rate.
            weight_decay (float): Weight decay; only forwarded to the
                optimizer when ``optimizer_name`` is ``"adamw"``.

        Raises:
            ValueError: If ``optimizer_name`` is not one of the supported
                names.
        """
        # Constructors are wrapped in lambdas so that only the selected
        # optimizer is actually instantiated.
        factories = (
            ("adam", lambda: torch.optim.Adam(model.parameters(), lr=lr)),
            (
                "adamw",
                lambda: torch.optim.AdamW(
                    model.parameters(), lr=lr, weight_decay=weight_decay
                ),
            ),
            ("sgd", lambda: torch.optim.SGD(model.parameters(), lr=lr)),
        )
        for supported_name, make_optimizer in factories:
            if optimizer_name == supported_name:
                self._optimizer = make_optimizer()
                return
        raise ValueError(f"Optimizer {optimizer_name} not supported")

    def get_optimizer(self) -> torch.optim.Optimizer:
        """Return the optimizer built by ``execute`` (``None`` before that)."""
        return self._optimizer

    def get_result(self) -> Any:
        # The optimizer is exposed through ``get_optimizer``.
        return None
class SaveModelOp(Operation):
    """An operation which saves an OpenAlphaTensor model.

    The model parameters are stored in a json file, while the model weights
    are stored in a .pt file."""

    def get_result(self) -> Any:
        # Nothing to return: the outcome is the files written by ``execute``.
        pass

    def execute(
        self,
        model: AlphaTensorModel,
        save_dir: str,
    ):
        """Saves the OpenAlphaTensor model.

        Writes ``final_model.pt`` (the state dict) and ``model_params.json``
        (the constructor parameters) inside ``save_dir``, creating the
        directory if needed.

        Args:
            model (AlphaTensorModel): OpenAlphaTensor model to be saved.
            save_dir (str): Directory where the model will be saved. The
                current directory is used when falsy.
        """
        save_dir = Path(save_dir if save_dir else ".")
        save_dir.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), save_dir / "final_model.pt")
        model_params = {
            "input_size": model.input_size,
            "tensor_length": model.tensor_length,
            # Record the model's own value when it exposes one, so the json
            # matches the saved weights; otherwise keep 1, the value this
            # operation has always written.
            "scalars_size": getattr(model, "scalars_size", 1),
            "emb_dim": model.emb_dim,
            "n_steps": model.n_steps,
            "n_logits": model.n_logits,
            "n_samples": model.n_samples,
        }
        # save parameters in a json file
        with open(save_dir / "model_params.json", "w") as f:
            json.dump(model_params, f)
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/operations/training_op.py
================================================
from pathlib import Path
from typing import Tuple, Any, List
import torch.optim
from nebullvm.operations.base import Operation
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
from open_alpha_tensor.core.training import Trainer
from open_alpha_tensor.operations.checkpoint_op import LoadCheckpointDataOp
class TrainingOperation(Operation):
    """Operation that builds a ``Trainer`` around an AlphaTensor model,
    restores previously played games and runs the training loop."""

    def __init__(self):
        super().__init__()
        self._trained_model = None
        self._load_checkpoint_data_op = LoadCheckpointDataOp()

    def execute(
        self,
        model: AlphaTensorModel,
        input_size: int,
        n_steps: int,
        batch_size: int,
        optimizer: torch.optim.Optimizer,
        device: str,
        len_data: int,
        pct_synth: float,
        n_synth_data: int,
        limit_rank: int,
        max_epochs: int,
        n_actors: int,
        mc_n_sim: int,
        N_bar: int,
        last_epoch: int,
        lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        loss_params: Tuple[float, float] = None,
        random_seed: int = None,
        checkpoint_dir: str = None,
        checkpoint_data_dir: str = None,
        n_cob: int = 0,
        cob_prob: float = 0.0,
        data_augmentation: bool = False,
        extra_devices: List[str] = None,
    ):
        """Train ``model`` and keep the trainer's model as the result.

        The method creates a ``Trainer``, asks ``LoadCheckpointDataOp`` to
        restore stored games from the games directory, then calls
        ``Trainer.train`` starting at ``last_epoch``.

        Args:
            model (AlphaTensorModel): The model to be trained.
            input_size (int): Flattened size of the matrices to be
                multiplied.
            n_steps (int): Number of steps used to get a single action out
                of a triplet.
            batch_size (int): Batch size.
            optimizer (torch.optim.Optimizer): The optimizer used for
                training.
            device (str): The name of the torch device used for training.
            len_data (int): Number of training samples used (both actor
                generated and synthetic).
            pct_synth (float): Initial percentage of synthetic samples used
                for training.
            n_synth_data (int): Number of synthetic training samples.
            limit_rank (int): Maximum rank for synthetically-generated
                matrices.
            max_epochs (int): Number of training epochs.
            n_actors (int): Number of actors, each playing a single game at
                each training step.
            mc_n_sim (int): Number of simulations during Monte Carlo tree
                search.
            N_bar (int): N_bar parameter used to compute tau when improving
                the policy.
            last_epoch (int): Latest epoch reached during a previous run;
                training resumes from it.
            lr (float): Learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            loss_params (Tuple[float, float]): Alpha and Beta parameters
                used in the loss function.
            random_seed (int): Randomizing seed.
            checkpoint_dir (str): Directory used to store model
                checkpoints.
            checkpoint_data_dir (str): Directory used to store games as
                JSON files; ``"games"`` is used when falsy.
            n_cob (int): Number of change of basis (cob) used for a single
                training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
                operation of an episode with another operation.
            extra_devices (List[str]): Extra devices names used for
                multi-GPU training.
        """
        games_dir = Path(checkpoint_data_dir or "games")

        # Everything the Trainer constructor needs, gathered in one place.
        trainer_kwargs = dict(
            model=model,
            tensor_size=input_size,
            n_steps=n_steps,
            batch_size=batch_size,
            optimizer=optimizer,
            device=device,
            len_data=len_data,
            pct_synth=pct_synth,
            n_synth_data=n_synth_data,
            limit_rank=limit_rank,
            loss_params=loss_params,
            random_seed=random_seed,
            checkpoint_dir=checkpoint_dir,
            checkpoint_data_dir=games_dir,
            data_augmentation=data_augmentation,
            cob_prob=cob_prob,
            n_cob=n_cob,
            extra_devices=extra_devices,
        )
        trainer = Trainer(**trainer_kwargs)

        # Restore games stored by a previous run, when present.
        self._load_checkpoint_data_op.execute(
            games_store_dir=games_dir,
            trainer=trainer,
        )

        # Schedule handed over to the training loop.
        schedule = dict(
            n_epochs=max_epochs,
            n_games=n_actors,
            mc_n_sim=mc_n_sim,
            N_bar=N_bar,
            starting_epoch=last_epoch,
            initial_lr=lr,
            lr_decay_factor=lr_decay_factor,
            lr_decay_steps=lr_decay_steps,
        )
        trainer.train(**schedule)
        self._trained_model = trainer.model

    def get_trained_model(self):
        """Return the model held by the trainer after ``execute``."""
        return self._trained_model

    def get_result(self) -> Any:
        # The trained model is exposed through ``get_trained_model``.
        return None
================================================
FILE: optimization/open_alpha_tensor/open_alpha_tensor/root_op.py
================================================
from typing import Tuple, List
from nebullvm.operations.base import Operation
from open_alpha_tensor.core.modules.alpha_tensor import AlphaTensorModel
from open_alpha_tensor.operations.checkpoint_op import LoadCheckPointOp
from open_alpha_tensor.operations.model_op import (
BuildModelOp,
SaveModelOp,
BuildOptimizerOp,
)
from open_alpha_tensor.operations.training_op import TrainingOperation
class TrainAlphaTensorRootOp(Operation):
    """Root operation which trains an AlphaTensor model to learn more
    efficient matrix multiplications.

    It chains the model / optimizer builders, the checkpoint loader, the
    training operation and the final model saver."""

    def __init__(self):
        super().__init__()
        self._model = None
        self._optimizer = None
        self._build_model_op = BuildModelOp()
        self._build_optimizer_op = BuildOptimizerOp()
        self._load_checkpoint_op = LoadCheckPointOp()
        self._training_op = TrainingOperation()
        self._save_model_op = SaveModelOp()

    def execute(
        self,
        tensor_length: int,
        input_size: int,
        scalars_size: int,
        emb_dim: int,
        n_steps: int,
        n_logits: int,
        n_samples: int,
        optimizer_name: str,
        lr: float,
        lr_decay_factor: float,
        lr_decay_steps: int,
        weight_decay: float,
        loss_params: Tuple[float, float],
        checkpoint_dir: str,
        checkpoint_data_dir: str,
        epochs: int,
        batch_size: int,
        len_data: int,
        n_synth_data: int,
        pct_synth: float,
        limit_rank: int,
        n_actors: int,
        mc_n_sim: int,
        N_bar: int,
        device: str,
        save_dir: str,
        random_seed: int,
        n_cob: int,
        cob_prob: float,
        data_augmentation: bool,
        extra_devices: List[str],
    ):
        """Trains an AlphaTensor model to learn more efficient matrix
        multiplications.

        Steps: build the model (only if this op does not hold one yet),
        build its optimizer, resume from the latest checkpoint when one is
        available, train, then save the resulting model.

        Args:
            tensor_length (int): Number of step tensors fed to the model
                (history and current state).
            input_size (int): Flattened size of the matrices to be
                multiplied.
            scalars_size (int): Size of the scalar vectors fed to the torso
                model.
            emb_dim (int): Embedding dimension.
            n_steps (int): Number of steps used to get a single action out
                of a triplet.
            n_logits (int): Number of logits output by the policy head.
            n_samples (int): Number of samples used by the policy head at
                evaluation time.
            optimizer_name (str): Name of the optimizer used.
            lr (float): Learning rate.
            lr_decay_factor (float): Learning rate's decay factor.
            lr_decay_steps (int): Number of learning rate's decay steps.
            weight_decay (float): Weight decay used by the optimizer.
            loss_params (Tuple[float, float]): Alpha and Beta parameters
                used in the loss function.
            checkpoint_dir (str): Directory used to store model
                checkpoints.
            checkpoint_data_dir (str): Directory used to store games as
                JSON files.
            epochs (int): Number of training epochs.
            batch_size (int): Batch size.
            len_data (int): Number of training samples used (both actor
                generated and synthetic).
            n_synth_data (int): Number of synthetic training samples.
            pct_synth (float): Initial percentage of synthetic samples used
                for training.
            limit_rank (int): Maximum rank for synthetically-generated
                matrices.
            n_actors (int): Number of actors, each playing a single game at
                each training step.
            mc_n_sim (int): Number of simulations during Monte Carlo tree
                search.
            N_bar (int): N_bar parameter used to compute tau when improving
                the policy.
            device (str): The name of the torch device used for training.
            save_dir (str): Directory where the final trained model will be
                stored.
            random_seed (int): Randomizing seed.
            n_cob (int): Number of change of basis (cob) used for a single
                training sample.
            cob_prob (float): Probability of applying a change of basis.
            data_augmentation (bool): Whether to randomly swap the last
                operation of an episode with another operation.
            extra_devices (List[str]): Extra devices names used for
                multi-GPU training.
        """
        if self._model is None:
            self._build_model_op.execute(
                tensor_length=tensor_length,
                input_size=input_size,
                scalars_size=scalars_size,
                emb_dim=emb_dim,
                n_steps=n_steps,
                n_logits=n_logits,
                n_samples=n_samples,
            )
            self._model = self._build_model_op.get_model().to(device)

        # Build the optimizer on the model this op actually trains, so a
        # model already held by the op gets an optimizer as well.
        if self._model is not None:
            self._build_optimizer_op.execute(
                optimizer_name=optimizer_name,
                model=self._model,
                lr=lr,
                weight_decay=weight_decay,
            )
            self._optimizer = self._build_optimizer_op.get_optimizer()

        # Default to a fresh run; overwritten below when a checkpoint is
        # inspected. Keeps the name bound on every path.
        starting_epoch = 0
        if self._model is not None and self._optimizer is not None:
            self._load_checkpoint_op.execute(
                self._model, self._optimizer, checkpoint_dir
            )
            if self._load_checkpoint_op.get_model() is not None:
                self._model = self._load_checkpoint_op.get_model()
                self._optimizer = self._load_checkpoint_op.get_optimizer()
                starting_epoch = self._load_checkpoint_op.get_last_epoch()

        self._training_op.execute(
            model=self._model,
            input_size=input_size,
            n_steps=n_steps,
            batch_size=batch_size,
            optimizer=self._optimizer,
            device=device,
            len_data=len_data,
            pct_synth=pct_synth,
            n_synth_data=n_synth_data,
            limit_rank=limit_rank,
            max_epochs=epochs,
            n_actors=n_actors,
            mc_n_sim=mc_n_sim,
            N_bar=N_bar,
            last_epoch=starting_epoch,
            lr=lr,
            lr_decay_factor=lr_decay_factor,
            lr_decay_steps=lr_decay_steps,
            loss_params=loss_params,
            random_seed=random_seed,
            checkpoint_dir=checkpoint_dir,
            checkpoint_data_dir=checkpoint_data_dir,
            n_cob=n_cob,
            cob_prob=cob_prob,
            data_augmentation=data_augmentation,
            extra_devices=extra_devices,
        )
        if self._training_op.get_trained_model() is not None:
            self._model = self._training_op.get_trained_model()
            self._save_model_op.execute(
                model=self._model,
                save_dir=save_dir,
            )

    def get_result(self) -> AlphaTensorModel:
        """Returns the trained torch model"""
        return self._model
================================================
FILE: optimization/open_alpha_tensor/resources/open_alpha_tensor.md
================================================
# Open Source Implementation of DeepMind’s AlphaTensor
Matrix multiplication is a fundamental operation used in many systems, from neural networks to scientific computing routines. Finding efficient and provably correct algorithms for matrix multiplication can have a huge impact on making computation faster and more efficient, but is a very challenging task. The space of possible algorithms is enormous, and traditional methods for discovering algorithms, such as human-designed heuristics or combinatorial search, are often suboptimal.
[DeepMind](https://www.deepmind.com/) recently proposed an AI-based solution for automated search that goes far beyond human intuition. The solution consists of a deep reinforcement learning agent called AlphaTensor, built on top of [AlphaZero](https://www.deepmind.com/blog/alphazero-shedding-new-light-on-chess-shogi-and-go). This agent is trained to play a single-player game, TensorGame, where the goal is to discover computationally efficient algorithms for matrix multiplication.
AlphaTensor is particularly good at handling large matrices by decomposing large matrix multiplications into smaller multiplications. Moreover, AlphaTensor can be used to achieve state-of-the-art performance for matrix multiplication once fine-tuned on a specific hardware device.
AlphaTensor has great potential for accelerating deep learning computing. In deep learning, many time-consuming operations can be mapped to matrix multiplications. By using AlphaTensor to optimize these operations, the overall performance of deep learning models can be significantly improved.
In this article, we will explore DeepMind's AlphaTensor architecture and algorithm and how it discovers new efficient algorithms by playing the TensorGame. Next, we will examine the [first open-source implementation of AlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor), and unresolved challenges to potentially revolutionize the computational performance of deep learning models with AlphaTensors.

Photo by [DeepMind](https://unsplash.com/@deepmind?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText) on [Unsplash](https://unsplash.com/photos/4QVqSh4VvP4)
# What is DeepMind’s AlphaTensor?
AlphaTensor is a reinforcement learning algorithm based on the AlphaZero algorithm and trained to play a simple one-player game: the TensorGame. This game consists in finding the tensor decomposition of a three-dimensional tensor representing the matrix multiplication.
### Matrix Multiplication Tensor
For non-experts in Matrix Multiplication optimization, it may not be straightforward to understand how an operation, such as a matrix multiplication, can be mapped in a three-dimensional tensor. I will try to explain it in simple words and with examples.
Let’s consider the product `C = A*B`, where for simplicity both A and B are square matrices of size N. The multiplication operation can be mapped to a 3D tensor of shape `(N^2, N^2, N^2)`. The first tensor dimension represents the flattened matrix A, the second dimension the flattened matrix B and the third dimension the flattened matrix C.
The tensor has only binary values (either 1 or 0) for each entry. Note that the tensor represents the multiplication operation, so it is independent of the values of the matrices A and B.
Every entry of the tensor corresponds to the coefficient of the operation. For example, to compute C[1,1], it is necessary to multiply A[1,1] by B[1,1]. Therefore, the tensor entry [0,0,0], which corresponds to A[1,1], B[1,1] and C[1,1], will have value 1. In contrast, to compute C[1,1], A[2,1] is not needed. Thus, the tensor row T[N, :, 0] will contain only zeros (with row-major flattening, A[2,1] sits at zero-based position N of the flattened matrix A).
The image below from [DeepMind’s paper](https://www.nature.com/articles/s41586-022-05172-4) shows an example of a tensor for N=2.
As shown in (b) and (c) in the figure above, it is possible to implement an algorithm for computing the product using a decomposition of the 3D tensor. More specifically, the algorithm below can be used for converting a tensor decomposition (the matrices U, V, W) in a matrix multiplication algorithm.
## The TensorGame
The problem of finding efficient algorithms for matrix multiplication is extremely challenging because the number of possible algorithms to consider is much larger than the number of atoms in the universe, even for small instances of matrix multiplication.
DeepMind converted this problem into a single-player game, and called it the TensorGame. In this game, the player chooses how to combine different entries of matrices to multiply them. A score is assigned based on the number of operations required to achieve the correct multiplication result. The game ends when the zero tensor is reached or when the maximum number of moves has been made. The final factorization is evaluated based on an estimation of the residual rank and certain optimization criteria, such as asymptotic time complexity or practical runtime.
The initial position in the TensorGame corresponds to the Matrix Multiplication Tensor expressed on some random basis.
In each step $t$ of the game, the player writes down three vectors $\vec{u}(t), \vec{v}(t), \vec{w}(t)$, which specify the rank-1 tensor $\vec{u} \otimes \vec{v} \otimes \vec{w}$. The state of the game is updated by subtracting the rank-1 tensor selected by the player:
$$
\tilde{S}_{t+1} = \tilde{S}_{t} - \vec{u} \otimes \vec{v} \otimes \vec{w}
$$
where $\tilde{S}_0$ is the Matrix Multiplication Tensor.
If the game ends in p steps, this means that the Matrix Multiplication Tensor $\tilde S_0$ can be decomposed into p rank-1 tensors $\vec{u} \otimes \vec{v} \otimes \vec{w}$, i.e. its rank is at most p.
The TensorGame can then be interpreted as a rank decomposition algorithm and AlphaTensor can be seen as an algorithm for estimating the rank of the tensor.
## AlphaTensor Architecture
So far we have learned about the TensorGame and clarified how its solution can be seen as a matrix multiplication algorithm. Let’s now explore the main concepts of AlphaTensor, the algorithm used for the game.
AlphaTensor architecture is basically an encoder-decoder Transformer architecture where:
- the encoder takes as input the game state $\tilde S_t$, the n previous actions taken by the model (usually n=7) and the time index t of the current action. Information is stacked together in a tensor with shape `(n+1, N^2, N^2, N^2)`. This tensor is then reshaped and transformed (using three linear layers) into a tensor of shape `(N^2, N^2, c)` where c is the inner dimension of the model.
- the decoder generates the `n_steps` actions from the embedded vector given by the encoder in an auto-regressive way. Each action corresponds to a token of the triplets $(\vec{u}, \vec{v}, \vec{w})$ representing one of the triplets decomposing the game tensor (i.e. reducing its rank)
The model is trained by alternating back-propagation and model acting. Model acting is used to generate data that is then used to train the model. In practice, the model is trained with a mixture of synthetically generated data and data generated by the model during acting. The acting step is done by taking a 3D tensor corresponding to a matrix operation and playing `n_actors` games on it. Each actor plays a game either on the standard basis or on an alternative basis (the change of basis is applied with a given probability). The results are then collected and can be used in the training step with the synthetic data.
The acting step is based on AlphaZero's Monte Carlo Tree Search (MCTS), modified to support large action spaces. In short, before choosing the action, `n_sims` paths are explored from the model output with a maximum future exploration of 5 steps. The probabilities generated by the model are then adjusted taking into account the generated paths. Then the action with the most promising future path(s) is chosen to continue the game.
While training the model, the reward is actually a negative reward (penalty). Its absolute value increases with each additional step required to solve the game. If the model takes `m` steps to solve a TensorGame, the reward associated with the game is `r=-m`. If the model is not able to solve the TensorGame in `max_rank` steps, the reward is computed by estimating the rank of the remaining tensor. The rank is estimated as the sum of the ranks of the matrices that compose the tensor. The estimate is an upper bound on the true rank of the tensor.
When fine-tuning the model, the penalty reward at the terminal state should also take into account the latency of the algorithm produced by the model. The reward formula becomes `rt'=rt+λbt`, where `rt` is the reward scheme described earlier, `bt` is the benchmark reward (non-zero only at the terminal state), and *`λ`* is a user-specified coefficient.
The image above from DeepMind's paper shows the speed-ups (%) of AlphaTensor-discovered algorithms tailored for a GPU and a TPU, extracted from DeepMind’s paper. Speed-ups are measured relative to standard (e.g. cuBLAS for the GPU) matrix multiplication on the same hardware and compared to the Strassen-square algorithm.
# The Open Source Implementation of DeepMind’s AlphaTensor
[OpenAlphaTensor](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/open_alpha_tensor) is the first open source implementation of AlphaTensor and was developed by [Diego Fiori](https://www.linkedin.com/in/diego-fiori-/).
Let's discover more about the implementation.
As we discussed earlier, the AlphaTensor architecture is fairly straightforward, based on a standard transformer with an encoder-decoder architecture. The most interesting components of AlphaTensor are the first layer in the encoder part and the way the actions are sampled.
Let’s start with the first encoding layer.
```python
# x.size = (N, T, S, S, S)
# scalars.size = (N, s)
batch_size = x.shape[0]
S = x.shape[-1]
T = x.shape[1]
x1 = x.permute(0, 2, 3, 4, 1).reshape(batch_size, S, S, S * T)
x2 = x.permute(0, 4, 2, 3, 1).reshape(batch_size, S, S, S * T)
x3 = x.permute(0, 3, 4, 2, 1).reshape(batch_size, S, S, S * T)
input_list = [x1, x2, x3]
for i in range(3):
temp = self.linears_1[i](scalars).reshape(batch_size, S, S, 1)
input_list[i] = torch.cat([input_list[i], temp], dim=-1)
input_list[i] = self.linears_2[i](input_list[i])
x1, x2, x3 = input_list
```
In the snippet above, we show how the input tensor is decomposed into three tensors, which are then used as query, key and value inputs of the transformer-layer.
1. Across the three tensor dimensions representing the flattened matrices (A, B, C), the input tensor is flattened along each dimension together with the dimension representing the previous actions. In this way, in each flattened-copy of the input tensor, the selected dimension is an aggregation of the last T-1 values and the actual value, for all the S values of the selected dimension, where S=N^2. Philosophically, it is as if, for each dimension, we focus on what happened in the previous actions in that dimension.
2. The scalars are mapped in three different spaces of dimension S^2, and then reshaped to be concatenated with the tensors obtained at the previous point. Conceptually, the scalars are mapped to an embedding space of dimension S^2, and then the embedded information is chunked into S vectors and stacked together, similar to what happens to text when tokenized.
3. Scalar tokens are concatenated with the restructured input tensor and then given as input to a linear layer for mapping the scalars+channel-history focus information in the internal dimension of the model.
These three steps can be interpreted as a way of giving to the model both information about the scalars (as in the TensorGame time step) and the focus on the previous actions for each channel.
Regarding the way the actions are produced, it is interesting to note that AlphaTensor generates as output the triplet u, v, w, which aims to reduce the tensor rank. The three vectors have size S and since they are concatenated the model has to produce a vector of size 3*S. AlphaTensor is trained with a RL algorithm, so all possible actions must be expressed in terms of probabilities in an enumerated space, i.e. the model produces a probability over the different actions. This means that each vector in the 3S space should be mapped to a different action. This results in an action space of size |F|^(3S), where |F| is the number of different values that the element of u, v, w can take. Usually the values are restricted to (-2, -1, 0, 1, 2), resulting in a cardinality of 5 elements.
Here comes a major challenge: to generate the action probabilities for a matrix product of matrices of size 5 we would need a memory of 5^75 * 4 bytes, which would mean `~10^44 GB` of memory. Clearly we cannot manage such a large action space.
How do we solve the problem? To reduce the memory footprint of the action probabilities we can split the triplets into smaller chunks, “tokenize” them, and treat the chunks as generated tokens in the transformer architecture, i.e. the tokens are given as input to the decoder in an auto-regressive way. In the example above we can split the triplets into 15 chunks, reducing the memory consumption to `15 * 5^(75/15) * 4`, i.e. `187.5 KB`.
```python
def _eval_forward(self, e: torch.Tensor):
    """Autoregressively sample ``self.n_samples`` token sequences per input.

    Args:
        e (torch.Tensor): Encoder output. Its first dimension is the batch
            size; its last two dimensions are preserved when the tensor is
            replicated once per sample.

    Returns:
        tuple: ``(tokens, probs, z)`` where ``tokens`` has shape
        ``(bs, n_samples, n_steps)`` and holds the sampled tokens,
        ``probs`` has shape ``(bs, n_samples)`` and holds the product of
        the per-step probabilities returned by ``sample_from_logits``, and
        ``z`` is the second output of the last ``self.core`` call, taken
        at index 0 of its second dimension and averaged over the samples.
    """
    batch_size = e.shape[0]
    device = e.device

    # One row per (batch element, sample) pair; every token starts at zero.
    tokens = torch.zeros((batch_size, self.n_samples, self.n_steps))
    tokens = tokens.long().to(device).view(-1, self.n_steps)
    probs = torch.ones((batch_size, self.n_samples)).to(device).view(-1)

    # Replicate the encoder output so each sample row has its own copy.
    e = e.unsqueeze(1).repeat(1, self.n_samples, 1, 1)
    e = e.view(-1, e.shape[-2], e.shape[-1])

    for step in range(self.n_steps):
        # The decoder is fed the prefix up to and including position
        # ``step``, which is still zero when the call is made.
        logits, z_s = self.core(tokens[:, : step + 1], e)
        sampled, step_prob = sample_from_logits(logits[:, step])
        tokens[:, step] = sampled
        probs *= step_prob

    tokens = tokens.view(batch_size, self.n_samples, self.n_steps)
    probs = probs.view(batch_size, self.n_samples)
    value_logits = z_s[:, 0].view(
        batch_size, self.n_samples, *z_s.shape[2:]
    ).mean(1)
    return tokens, probs, value_logits
```
Above we show the code snippet for generating the full action. In the code, `self.core` contains the decoder layer and the tensor `e` represents the output of the encoder layer. Zero plays the role of the start-of-sequence (`<sos>`) token in NLP models, and the `n_steps` actions representing the `n_steps` chunks are generated in a progressive way.
The model returns three quantities:
1. The generated actions
2. The probability associated with the full action
3. The logits produced for generating the first action (the first chunk) that will be used for computing the model value.
It is worth spending a few words on the `n_samples` parameter. The parameter is used for the acting step and it allows the model to generate different versions of the triplets which will then be used for exploring the action space in the Monte Carlo Tree Search algorithm used in the Acting process. The `n_samples` different actions are sampled according to the policy generated by the model.
## Acting Step
The trickiest part of the whole algorithm is probably the Acting step used for solving the TensorGame. The algorithm is not explained in depth in the AlphaTensor paper, since it is based on several of DeepMind’s previous papers, which are only cited and assumed to be known. Here, I’ll reassemble all the missing pieces and explain our implementation step by step.
We can organize the acting steps in three different components:
- The Monte-Carlo Tree Search
- The game simulation
- The Improved policy computation
### Monte-Carlo Tree Search (MCTS)
Monte Carlo Tree Search (MCTS) is a widely used artificial intelligence technique for game playing, particularly in board games and video games. The algorithm creates a game tree that simulates potential moves and outcomes and uses random sampling to evaluate the expected reward for each move. The algorithm then repeatedly selects the move with the highest expected reward and continues simulating outcomes until it reaches a terminal state or a specified stopping condition. The simulations are used to estimate the probability of winning for each move and guide the decision-making process. MCTS has been shown to be effective in complex games where the number of possible moves and outcomes is large, and it has been used in successful game-playing AI systems, such as AlphaGo.
In AlphaTensor a modified version of the original MCTS is used. In particular, instead of randomly selecting the action from the whole action space, the action is selected among a subset generated directly by the model (through the `n_samples` presented before). The correction to the policy upgrade is then applied in the **Improved Policy computation** step.
In our implementation, we decided to keep all the information about the Monte-Carlo tree in a dictionary having as key the hash-version of the TensorGame state and as values the information associated with the state itself. Each Monte-Carlo step starts from a node and simulates `n_sim` mini-games, exploring the future with a horizon of 5 moves. If the node has already been explored in previous simulations, `n_sim` is reduced by the number of previous explorations. For each node the number of visits is stored in the `N_s_a` tensor, since this tensor contains the number of visits per node child action (among the ones sampled by the model).
```python
def monte_carlo_tree_search(
    model: torch.nn.Module,
    state: torch.Tensor,
    n_sim: int,
    t_time: int,
    n_steps: int,
    game_tree: Dict,
    state_dict: Dict,
):
    """Run the simulations for one node and pick the next state.

    Args:
        model (torch.nn.Module): The model to use for the simulation.
        state (torch.Tensor): The state the search starts from.
        n_sim (int): The simulation budget for this node. Visits already
            recorded for the node are subtracted from it.
        t_time (int): The current time step.
        n_steps (int): The maximum number of steps to simulate.
        game_tree (Dict): The game tree, updated by ``simulate_game``.
        state_dict (Dict): Per-state information keyed by state hash,
            updated by ``simulate_game``.

    Returns:
        The child state chosen by ``select_future_state`` among the
        possible states stored for the root.
    """
    root_hash = to_hash(extract_present_state(state))

    # Only spend the part of the budget not used by earlier searches.
    if root_hash in state_dict:
        with torch.no_grad():
            previous_visits = state_dict[root_hash][3]
            n_sim = max(n_sim - int(previous_visits.sum()), 0)

    for _ in range(n_sim):
        simulate_game(model, state, t_time, n_steps, game_tree, state_dict)

    # Pick the next state from the statistics gathered for the root.
    (
        children_dict,
        _,
        repetitions,
        visit_counts,
        q_values,
        _,
    ) = state_dict[root_hash]
    children = _recompose_possible_states(children_dict)
    best_idx = select_future_state(
        children, q_values, visit_counts, repetitions, return_idx=True
    )
    return children[best_idx]
```
The code above shows our implementation of the algorithm. For a matter of code simplicity the policy correction is performed in the `simulate_game` function.
### Game Simulation
The `simulate_game` function is responsible for exploring the tree composed of nodes representing a particular state of the TensorGame. It also runs the model whenever a leaf node is encountered and it stores all node information in the `state_dict` dictionary. Let’s take a closer look at its implementation:
```python
@torch.no_grad()
def simulate_game(
    model,
    state: torch.Tensor,
    t_time: int,
    max_steps: int,
    game_tree: Dict,
    states_dict: Dict,
    horizon: int = 5,
):
    """Simulates a game from a given state.

    One simulation runs three phases: selection (walk the nodes already in
    ``game_tree``), expansion (run the model on the reached leaf and store
    its children) and backup (propagate the leaf value along the
    trajectory via ``backward_pass``).

    Args:
        model: The model to use for the simulation.
        state (torch.Tensor): The initial state.
        t_time (int): The current time step.
        max_steps (int): The maximum number of steps to simulate. It is
            capped at ``t_time + horizon``.
        game_tree (Dict): The game tree. Maps a state hash to the hashes of
            its child states; updated in place on expansion.
        states_dict (Dict): The states dictionary. Maps a state hash to the
            tuple ``(possible states, cloned_idx_to_idx, repetitions,
            N_s_a, q_values, actions)``; updated in place on expansion.
        horizon (int): The horizon to use for the simulation.
    """
    idx = t_time
    max_steps = min(max_steps, t_time + horizon)
    state_hash = to_hash(extract_present_state(state))
    trajectory = []

    # selection: follow already-expanded nodes
    while state_hash in game_tree:
        (
            possible_states_dict,
            _,
            repetition_map,
            N_s_a,
            q_values,
            _,
        ) = states_dict[state_hash]
        possible_states = _recompose_possible_states(possible_states_dict)
        state_idx = select_future_state(
            possible_states, q_values, N_s_a, repetition_map, return_idx=True
        )
        trajectory.append((state_hash, state_idx))  # state_hash, action_idx
        state = possible_states[state_idx]
        state_hash = to_hash(extract_present_state(state))
        idx += 1

    # expansion
    # ``leaf_q_value`` stays None unless the model is actually run, so that
    # every terminal path below ends up with a bound value.
    leaf_q_value = None
    if idx <= max_steps:
        trajectory.append((state_hash, None))
        if not game_is_finished(extract_present_state(state)):
            state = state.to(model.device)
            scalars = get_scalars(state, idx).to(state.device)
            actions, probs, q_values = model(state, scalars)
            (
                possible_states,
                cloned_idx_to_idx,
                repetitions,
                not_dupl_indexes,
            ) = extract_children_states_from_actions(
                state,
                actions,
            )
            # Statistics are kept on CPU and only for de-duplicated actions.
            not_dupl_actions = actions[:, not_dupl_indexes].to("cpu")
            not_dupl_q_values = torch.zeros(not_dupl_actions.shape[:-1]).to(
                "cpu"
            )
            N_s_a = torch.zeros_like(not_dupl_q_values).to("cpu")
            present_hash = to_hash(extract_present_state(state))
            states_dict[present_hash] = (
                _reduce_memory_consumption_before_storing(possible_states),
                cloned_idx_to_idx,
                repetitions,
                N_s_a,
                not_dupl_q_values,
                not_dupl_actions,
            )
            game_tree[present_hash] = [
                to_hash(extract_present_state(fut_state))
                for fut_state in possible_states
            ]
            leaf_q_value = q_values

    if leaf_q_value is None:
        # Horizon exceeded or game already finished: the model was not
        # run, so use the rank-based value instead of leaving the name
        # unbound (which would raise UnboundLocalError at the call below).
        leaf_q_value = -int(torch.linalg.matrix_rank(state).sum())

    # backup
    backward_pass(trajectory, states_dict, leaf_q_value=leaf_q_value)
```
Each simulation is divided in three parts:
- Selection
- Expansion
- Backup
In the `selection` part the simulation is run on the already generated tree-nodes, and the following node is selected using the following function:
```python
def select_future_state(
    possible_states: List[torch.Tensor],
    q_values: torch.Tensor,
    N_s_a: torch.Tensor,
    repetitions: Dict[int, list],
    c_1: float = 1.25,
    c_2: float = 19652,
    return_idx: bool = False,
) -> torch.Tensor:
    """Pick the candidate with the highest upper confidence bound.

    Args:
        possible_states: Candidate future states.
        q_values: Current value estimate of each candidate.
        N_s_a: Visit count of each candidate.
        repetitions: Maps a candidate index to a list; the list length is
            used as the prior weight of that candidate. Indices missing
            from the mapping contribute no entry.
        c_1: Constant term of the exploration factor.
        c_2: Scale of the logarithmic term of the exploration factor.
        return_idx: If True return the winning index instead of the state.

    Returns:
        The argmax index (as a tensor) when ``return_idx`` is True,
        otherwise the corresponding element of ``possible_states``.
    """
    # q_values (1, K, 1)
    multiplicities = [
        len(repetitions[k])
        for k in range(len(possible_states))
        if k in repetitions
    ]
    pi = torch.tensor(multiplicities).to(q_values.device)

    total_visits = torch.sum(N_s_a)
    visit_ratio = torch.sqrt(total_visits / (1 + N_s_a))
    exploration = c_1 + torch.log((total_visits + c_2 + 1) / c_2)
    ucb = q_values.reshape(-1) + pi * visit_ratio * exploration

    best = ucb.argmax()
    if return_idx:
        return best
    return possible_states[best]
```
In practice, the action maximizing the `ucb` function
$$
Q(a,s) + \pi(a,s) * \sqrt{\frac{\sum_i{N(s, a_i)}}{1+N(s,a)}} * \left[c_1 + \log\left(\frac{1+c_2+\sum_i{N(s, a_i)}}{c_2}\right)\right]
$$
for the given state is selected. Here Q represents the Q values generated by the model and π represents the random distribution over the actions sampled using the model policy. `N(s, a)` represents the number of times action a has been visited from node s.
Once the selection phase reaches a leaf node, if the simulation has not reached a terminal condition (in terms of either maximum exploration, i.e. future horizon, or game ending), the model is then used for selecting `n_samples` alternative nodes (they will be leaf nodes in the successive iteration). This is called the `expansion` phase, since new nodes are added to the tree. Then, no further node is explored in the current simulation, but the leaf q_value is sent to the following simulation step: the `backup`.
Backup is the final stage of each simulation. During backup, if the leaf node was a terminal state the final reward is computed else the leaf q value is used as an estimated reward. Then the reward is back-propagated on the simulation trajectory updating both the states q_values and updating the visit counter `N(s, a)`. In the snippet below we show the code for the reward back-propagation.
```python
def backward_pass(trajectory, states_dict, leaf_q_value: torch.Tensor):
    """Propagate the reward back along a simulated trajectory.

    The trajectory is walked from its last entry to its first. An entry
    whose action index is ``None`` marks the leaf and adds ``leaf_q_value``
    to the running reward. Every other entry lowers the reward by one and
    then updates, in place, the running-mean q value and the visit counter
    of the chosen action in ``states_dict``.

    Args:
        trajectory: Sequence of ``(state_hash, action_idx)`` pairs.
        states_dict: Maps a state hash to a 6-tuple whose items 1, 3 and 4
            are the index remapping, ``N_s_a`` and ``q_values``.
        leaf_q_value: Value added to the reward at the leaf entry.
    """
    reward = 0
    for state_key, chosen in reversed(trajectory):
        if chosen is None:  # leaf node
            reward += leaf_q_value
            continue

        _, index_map, _, visits, q_table, _ = states_dict[state_key]
        if isinstance(reward, torch.Tensor):
            reward = reward.to(q_table.device)

        # Duplicated actions share one slot; others keep their own index.
        chosen = int(chosen)
        slot = index_map.get(chosen, chosen)

        reward -= 1
        previous_total = visits[:, slot] * q_table[:, slot]
        q_table[:, slot] = (previous_total + reward) / (visits[:, slot] + 1)
        visits[:, slot] += 1
```
### Improved Policy Computation
Once all the simulations have been run and the MCTS offers an interesting snapshot of the near future it is time to update the policy associated with the predicted nodes and return them, so that they can be used during training. The improved policy, following the method described in [Hubert et al](https://arxiv.org/pdf/2104.06303.pdf), is used for managing large action spaces. In fact, for a small search space it is possible during MCTS to sample an action randomly from the action space and evaluate its impact. A similar approach in a much larger action space would lead all trajectories to diverge along different paths, and an enormous number of trajectories would be needed to get meaningful statistics and then update the policy. Since here we are using sample-MCTS to avoid this dispersion, i.e. `n_samples` actions are sampled according to the model policy and then MCTS just selects one of the sampled actions while exploring the tree, we need to take into account the sample-correction when computing the final updated policy that will be used while training the model.
In practice the improved policy is computed as
$$
I\pi\left(s, a\right) = \frac{N^{1/\tau(s)}(s, a)}{\sum_iN^{1/\tau(s)}(s, a_i)}
$$
where $\tau(s) = \frac{\log\left(\sum_iN(s, a_i)\right)}{\log\left(\bar{N}\right)}$ if $\sum_iN(s, a_i) > \bar{N}$ else $\tau(s) = 1$.
```python
def compute_improved_policy(
    state_dict: Dict,
    states: List[str],
    model_n_steps: int,
    model_n_logits: int,
    N_bar: int,
):
    """Compute the improved policy for each of the given states.

    For every state the visit counts ``N_s_a`` are tempered and normalised
    as ``N_s_a ** (1 / tau) / (N_s_a ** (1 / tau)).sum()``, where ``tau``
    is ``log(N_s_a.sum()) / log(N_bar)`` if ``N_s_a.sum() > N_bar`` else 1.
    The weight of each sampled action is then added to the logit slot of
    every token of that action.

    Args:
        state_dict: Maps a state to a tuple whose items 3 and 5 are the
            visit counts and the sampled actions.
        states: The states to compute the policy for.
        model_n_steps: Size of the step dimension of the output.
        model_n_logits: Size of the logit dimension of the output.
        N_bar: Visit threshold above which the counts are tempered.

    Returns:
        A tensor of shape ``(len(states), model_n_steps, model_n_logits)``.
    """
    threshold = torch.tensor(N_bar)
    policies = torch.zeros(len(states), model_n_steps, model_n_logits)

    for row, key in enumerate(states):
        entry = state_dict[key]
        visits = entry[3]
        sampled_actions = entry[5]

        total = visits.sum()
        if total > threshold:
            tau = (torch.log(total) / torch.log(threshold)).item()
        else:
            tau = 1

        tempered = visits ** (1 / tau)
        weights = tempered / tempered.sum()

        # Scatter each sample's weight onto the tokens composing its action.
        for sample_id in range(sampled_actions.shape[1]):
            weight = weights[0, sample_id]
            for step_id, token_id in enumerate(sampled_actions[0, sample_id]):
                policies[row, step_id, token_id] += weight
    return policies
```
Note that in our implementation after having computed the policy from the `N_s_a` tensor we have to map it back to the original action tensor. In fact `N_s_a` just considers the actions sampled by the model, while the final policy must contain probabilities also for the not-explored actions.
### Differences with respect to the ChatGPT training algorithm
AlphaTensor is the latest member of the AlphaGo/AlphaZero family of artificial intelligence methods by DeepMind. These methods are based on the Monte Carlo Tree Search (MCTS) algorithm, which has been refined and enhanced by DeepMind to tackle increasingly complex tasks. Another AI system, OpenAI's ChatGPT, which has caused a lot of buzz for its remarkable performance, was trained with a different approach, called Reinforcement Learning with Human Feedback (RLHF).
RLHF is a fine-tuning technique used to tune language models to follow a set of written instructions. It uses human preferences as a reward signal to fine-tune the model, thereby aligning the behavior of the language model with the stated preferences of a specific group of people, rather than some broader notion of ‘human values’.
In contrast, MCTS is a tree-based search algorithm used to determine the optimal moves in games. It simulates potential moves and updates the values of each move based on their outcomes, guiding the selection of the best move.
RLHF collects data from human-written demonstrations and human-labelled comparisons between AI models, and trains a reward model to predict the preferences of a given group of people. The reward model is then used to fine-tune the AI models. MCTS, on the other hand, uses simulations and evaluations to determine the best decision.
Although they are different approaches, RLHF and MCTS also have similarities. Both artificial intelligence techniques use decision-making and problem-solving methods, and both use a trial-and-error approach to explore different options and make decisions based on available information. Both are also iterative processes that improve over time as more information and experience are gathered.
The choice between RLHF and MCTS depends on the task at hand. RLHF is ideal when there is no clear metric for evaluating the model performance, while MCTS has proven effective in game-like tasks where knowledge and exploration of the future give the model a significant advantage.
## Code Optimization for AlphaTensor training
Implementing the AlphaTensor training algorithm requires finding the perfect compromise between training speed and memory consumption. As seen in the Model section, simply considering the action tokenization can save a lot of memory, but an overly aggressive action space reduction can lead to both drop in accuracy and slower performance. The latter happens because all tokens are generated sequentially in an autoregressive way by the model decoder. Therefore, the inference time grows linearly with the number of tokens per action once the softmax on the action space is not the bottleneck anymore.
When setting up AlphaTensor training, the main difficulties were found in dealing with the acting process. If the tensors are not stored in the correct format, the MCTS can easily cause uncontrolled memory usage growth. On the other hand, if the number of tensors stored during each simulation is reduced too much, the MCTS can spend an infinite amount of time re-computing the required states.
Let's take an example of the game simulation step, where the game is explored by looking at possible future scenarios. For each state, if we don't save the actions generated by the model and we decide to save only the random seed used to sample the actions from the policy, then each time we explore a tree node we would have to recompute the policy and then sample the actions. Clearly, we decided to store the sampled actions to save time and to avoid having to manage model sharing between different processes in the case of MCTS exploration parallelization.
However, just saving the actions was not enough to get a sufficiently efficient acting step. In fact, the time for converting the n_steps actions into the (u, v, w) triplet, reducing the game tensor state and creating the new 3D tensors from the n_samples actions would easily be a bottleneck for the whole training.
Secondly, we didn't want to store all possible future states for each sampled action, as this would have a huge impact on the memory used by the algorithm. Suppose we set n_samples=32, n=7 and N=5, and let's remember that N is the size of the square matrix product we want to reduce and n is the number of previous actions remembered by the model. In this situation, each state tensor would have the form (8, 25, 25, 25), which multiplied by 32 would result in 32*8*25*25*25*4 bytes for each node in the graph. Now, considering that each simulation in the expansion phase generates a new node (and n_sim=200), we would have a final memory consumption of 200*32*8*25*25*25*4 = 3.2GB for the first MCTS node alone. In the worst case scenario, while exploring acting max_rank nodes (where `max_rank=150`), this would result in a total memory consumption of 150 * 3.2GB = 480GB in RAM memory (or GPU memory if all tensors were stored on the GPU). We ran the training on our workstation with 128 GB of RAM and 48 GB of GPU memory, so we had to reduce the memory consumption.
Since we didn't want to increase the execution time, we adopted an optimization that exploits the redundancy in the state tensors produced. In fact, the tensors have n-1 previous actions in common, which can then be stored once and not repeated for each stored tensor. This reduces the memory footprint to about 2/7 ≈ 28% of the original, meaning that in the worst case about 137GB (instead of 480GB) has to be stored. At this point, by simply pruning the unused part of the tree (such as the unselected trajectories) and storing the tensors in CPU memory, we were able to avoid any memory error during training.
# Next Steps
With AlphaTensor now being open source, several exciting avenues for further development open up.
A natural next step is to fine-tune AlphaTensor on specific hardware devices and benchmark performance. At the time of writing, fine-tuning was in progress.
Another important advance would be the support for remote compilation, allowing users to build algorithms optimized for edge devices. This can be achieved by storing the AlphaTensor model on a server, while the matrix multiplication algorithm is evaluated on different hardware.
It could also be important to extend support for different compilers to compute the latency-based reward correction. Different compilers can lead to different optimized algorithms on a given hardware. For example, the DeepMind paper showed promising results using JAX and the XLA compiler on TPU and Nvidia GPUs. It would be interesting to evaluate this using NCCL on Nvidia or llvm on CPUs.
Finally, extending the model and training algorithm to support larger matrix sizes remains a major open challenge. Currently, AlphaTensor supports a maximum matrix size of 5, but it can be applied by splitting larger matrix multiplications into groups of tiny MMs with a size smaller than 5. This approach is suboptimal, and performing the reduction directly on the large tensor corresponding to the full MM could theoretically lead to better results.
## Speedster integration of AlphaTensor
AlphaTensor opens the doors for further improvements to Speedster. [Speedster](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/speedster) is an open source module designed to speed up AI inference with just a few lines of code. The library automatically applies the best set of SOTA optimization techniques to achieve maximum inference speed-up.
Within Speedster, AlphaTensor will use its optimized kernels for matrix multiplication to find the optimal set of sub-operations for each layer in the AI model that involve matrix multiplication, including linear layers, attention layers, and convolution layers. The matrix multiplications will be decomposed into sub-matrix multiplications up to the maximum size supported by AlphaTensor, and the fastest decomposition will be selected for each layer. This optimization process will be applied to all layers in the neural network, resulting in a dramatically improved model.
We expect to see significant speed-ups especially in transformer models, where large matrix multiplications become the computational bottleneck at larger sizes. We also plan to support AlphaTensor algorithm generation for reduced precision formats, such as fp16 and int8, in addition to fp32.
================================================
FILE: optimization/open_alpha_tensor/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages
# Runtime dependencies installed together with the package.
REQUIREMENTS = ["nebullvm", "torch", "tqdm"]

# The README located next to this file is used as the long description.
this_directory = Path(__file__).parent
long_description = this_directory.joinpath("README.md").read_text(
    encoding="utf8"
)

setup(
    name="OpenAlphaTensor",
    version="0.0.1",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIREMENTS,
)
================================================
FILE: optimization/optimate/README.md
================================================
# 🧉 OptiMate (WIP)
Interactive tool guiding savvy users in achieving the best inference performance out of a given model / hardware setup.
If you like this library, give us a star to show your support for the project ⭐
## 📖 Description
The OptiMate module is targeted at a sophisticated and savvy type of users, who need to squeeze out every last drop of performance out of a given hardware.
The module is designed to help users to optimize their deep-learning models through the use of profilers and advanced optimization techniques. It also includes a smart assistant that guides the user through the optimization process and provides suggestions to improve the performance of the model.
Each temporary optimization is tracked in a detailed version history, allowing users to revert to their preferred version at the end of the optimization process.
First, the module leverages profilers to gather information about the model, such as the amount of time it takes for the model to make predictions and the amount of memory used. This information helps in identifying bottlenecks and other inefficiencies in the model.
Then, the module uses various optimization techniques to improve inference performances. These techniques include, among others, model compression, pruning, and quantization, which can help reduce the size and computational demand of the model.
Throughout the process, the smart assistant provides guidance and suggestions to the user. For example, it might suggest which optimization techniques to try out or provide guidance on how to adjust the model parameters to improve its performance.
Overall, the module provides a user-friendly but sophisticated interface to get the most out of any model / hardware setup. Try it out today, and reach out if you have any feedback!
================================================
FILE: optimization/speedster/README.md
================================================
# 💥 Speedster
`Speedster` reduces inference costs by leveraging SOTA optimization techniques that best couple your AI models with the underlying hardware (GPUs and CPUs). The idea is to make AI inference way cheaper in just a few lines of code.
`Speedster` makes it easy to combine optimization techniques across the whole software-to-hardware stack, delivering best-in-class cost savings. If you like the idea, give us a star to support the project ⭐

The core `Speedster` workflow consists of 3 steps:
- [x] **Select**: input your model in your preferred DL framework and express your preferences regarding:
- Accuracy loss: do you want to trade off a little accuracy for significant cost savings?
- Optimization time: achieving great savings can be time-consuming. Can you wait, or do you need an instant answer?
- [x] **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware.
- [x] **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just cheaper 🚀).
# Installation
Install `Speedster` and its base requirements:
```
pip install speedster
```
Then make sure to install all the available deep learning compilers.
```
python -m nebullvm.installers.auto_installer --compilers all
```
> :warning: For **MacOS** with **ARM processors**, please use a conda environment.
> Moreover, if you want to optimize a **PyTorch model**, PyTorch must be pre-installed
> on your environment before proceeding to the next step, please install it from this
> [link](https://pytorch.org/get-started/locally/).
For more details on how to install Speedster, please visit our [Installation](https://docs.nebuly.com/Speedster/installation/) guide.
# Quick start
Only one line of code - that’s what you need to accelerate your model! Find below your getting started guide for 5 different input model frameworks:
🔥 PyTorch
In this section, we will learn about the 4 main steps needed to optimize PyTorch models:
1) Input your model and data
2) Run the optimization
3) Save your optimized model
4) Load and run your optimized model in production
```python
import torch
import torchvision.models as models
from speedster import optimize_model, save_model
#1 Provide input model and data (we support PyTorch Dataloaders and custom input, see the docs to learn more)
model = models.resnet50()
# 100 samples, each an ((input_tensor,), label) pair made of a random tensor and a dummy label 0
input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]
#2 Run Speedster optimization
# NOTE(review): optimization_time and metric_drop_ths presumably encode the
# "optimization time" and "accuracy loss" preferences — confirm in the Speedster docs
optimized_model = optimize_model(
    model,
    input_data=input_data,
    optimization_time="constrained",
    metric_drop_ths=0.05
)
#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```
Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.
```python
#4 Load and run your PyTorch accelerated model in production
from speedster import load_model
# Same path that was given to save_model
optimized_model = load_model("model_save_path")
# NOTE: input_sample is not defined in this snippet; supply your own input
output = optimized_model(input_sample)
```
For more details, please visit [Getting Started with PyTorch Optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/).
🤗 Hugging Face Transformers
In this section, we will learn about the 4 main steps needed to optimize 🤗 Hugging Face Transformer models:
1) Input your model and data
2) Run the optimization
3) Save your optimized model
4) Load and run your optimized model in production
* ✅ For Decoder-only or Encoder-only architectures (Bert, GPT, etc)
```python
from transformers import AlbertModel, AlbertTokenizer
from speedster import optimize_model, save_model
#1a. Provide input model: Load Albert as an example
model = AlbertModel.from_pretrained("albert-base-v1")
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
#1b. Dictionary input format (also string format is accepted, see the docs to learn more)
text = "This is an example text for the huggingface model."
input_dict = tokenizer(text, return_tensors="pt")
# The same tokenized sample is repeated 100 times
input_data = [input_dict for _ in range(100)]
#2 Run Speedster optimization (if input data is in string format, also the tokenizer
# should be given as input argument, see the docs to learn more)
# NOTE(review): optimization_time and metric_drop_ths presumably encode the
# "optimization time" and "accuracy loss" preferences — confirm in the Speedster docs
optimized_model = optimize_model(
    model,
    input_data=input_data,
    optimization_time="constrained",
    metric_drop_ths=0.05
)
#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```
Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.
```python
#4 Load and run your Huggingface accelerated model in production
from speedster import load_model
# Same path that was given to save_model
optimized_model = load_model("model_save_path")
# NOTE: input_sample is not defined in this snippet; it is unpacked as keyword arguments
output = optimized_model(**input_sample)
```
For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).
* ✅ For Encoder-Decoder architectures (T5 etc)
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
from speedster import optimize_model, save_model
#1a. Provide input model: Load T5 as an example
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
#1b. Dictionary input format
question = "What's the meaning of life?"
answer = "The answer is:"
input_dict = tokenizer(question, return_tensors="pt")
# The tokenized answer prefix is added as the decoder input
input_dict["decoder_input_ids"] = tokenizer(answer, return_tensors="pt").input_ids
# The same tokenized sample is repeated 100 times
input_data = [input_dict for _ in range(100)]
#2 Run Speedster optimization (if input data is in string format, also the tokenizer
# should be given as input argument, see the docs to learn more)
# NOTE(review): optimization_time and metric_drop_ths presumably encode the
# "optimization time" and "accuracy loss" preferences — confirm in the Speedster docs
optimized_model = optimize_model(
    model,
    input_data=input_data,
    optimization_time="constrained",
    metric_drop_ths=0.05
)
#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```
Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.
```python
#4 Load and run your Huggingface accelerated model in production
from speedster import load_model
# Same path that was given to save_model
optimized_model = load_model("model_save_path")
# NOTE: input_sample is not defined in this snippet; it is unpacked as keyword arguments
output = optimized_model(**input_sample)
```
For more details, please visit [Getting Started with HuggingFace optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/).
🧨 Hugging Face Diffusers
> :warning: In order to work properly, the diffusers optimization requires `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`. For additional details, please refer to the docs [here](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).
In this section, we will learn about the 4 main steps needed to optimize Stable Diffusion models from the Diffusers library:
1) Input your model and data
2) Run the optimization
3) Save your optimized model
4) Load and run your optimized model in production
```python
import torch
from diffusers import StableDiffusionPipeline
from speedster import optimize_model, save_model
#1 Provide input model and data
model_id = "CompVis/stable-diffusion-v1-4"
# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # On GPU we load by default the model in half precision, because it's faster and lighter.
    pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)
else:
    pipe = StableDiffusionPipeline.from_pretrained(model_id)
# Create some example input data
input_data = [
    "a photo of an astronaut riding a horse on mars",
    "a monkey eating a banana in a forest",
    "white car on a road surrounded by palm trees",
    "a fridge full of bottles of beer",
    "madara uchiha throwing asteroids against people"
]
#2 Run Speedster optimization
# The "torch_tensor_rt" and "tvm" compilers are excluded from the search here
optimized_model = optimize_model(
    model=pipe,
    input_data=input_data,
    optimization_time="unconstrained",
    ignore_compilers=["torch_tensor_rt", "tvm"],
    metric_drop_ths=0.1,
)
#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```
Once the optimization is completed, start using the accelerated model (on steroids 🚀).
```python
#4 Load and run your Stable Diffusion accelerated model in production
from speedster import load_model
optimized_model = load_model("model_save_path", pipe=pipe)
test_prompt = "futuristic llama with a cyberpunk city on the background"
output = optimized_model(test_prompt).images[0]
```
For more details, please visit [Getting Started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/).
🌊 TensorFlow/Keras
In this section, we will learn about the 4 main steps needed to optimize TensorFlow/Keras models:
1) Input your model and data
2) Run the optimization
3) Save your optimized model
4) Load and run your optimized model in production
```python
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from speedster import optimize_model, save_model
#1 Provide input model and data (we support Keras dataset and custom input, see the docs to learn more)
model = ResNet50()
input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)]
#2 Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05
)
#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```
Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.
```python
#4 Load and run your TensorFlow accelerated model in production
from speedster import load_model
optimized_model = load_model("model_save_path")
output = optimized_model(input_sample)
```
For more details, please visit [Getting Started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/).
⚡ ONNX
In this section, we will learn about the 4 main steps needed to optimize ONNX models:
1) Input your model and data
2) Run the optimization
3) Save your optimized model
4) Load and run your optimized model in production
```python
import numpy as np
from speedster import optimize_model, save_model
#1 Provide input model and data
# Model was downloaded from here:
# https://github.com/onnx/models/tree/main/vision/classification/resnet
model = "resnet50-v1-12.onnx"
input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)]
#2 Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05
)
#3 Save the optimized model
save_model(optimized_model, "model_save_path")
```
Once the optimization is completed, start using the accelerated model (on steroids 🚀) in your DL framework of choice.
```python
#4 Load and run your ONNX accelerated model in production
from speedster import load_model
optimized_model = load_model("model_save_path")
output = optimized_model(input_sample)
```
For more details, please visit [Getting Started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/).
# **Documentation**
- [Installation](https://docs.nebuly.com/Speedster/installation/)
- [Getting started with PyTorch optimization](https://docs.nebuly.com/Speedster/getting_started/pytorch_getting_started/)
- [Getting started with Hugging Face optimization](https://docs.nebuly.com/Speedster/getting_started/hf_getting_started/)
- [Getting started with Stable Diffusion optimization](https://docs.nebuly.com/Speedster/getting_started/diffusers_getting_started/)
- [Getting started with TensorFlow optimization](https://docs.nebuly.com/Speedster/getting_started/tf_getting_started/)
- [Getting started with ONNX optimization](https://docs.nebuly.com/Speedster/getting_started/onnx_getting_started/)
- [Key concepts](https://docs.nebuly.com/Speedster/key_concepts/)
- [Notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster)
- [Advanced options](https://docs.nebuly.com/Speedster/advanced_options/)
- [Benchmarks](https://docs.nebuly.com/Speedster/benchmarks/)
# **Key concepts**
Speedster's design reflects our mission to automatically master each and every existing AI acceleration technique to deliver the most cost-efficient AI ever. As a result, `Speedster` leverages available enterprise-grade open-source optimization tools. If these tools and communities already exist, and are distributed under a permissive license (Apache, MIT, etc), we integrate them and happily contribute to their communities. However, many tools do not exist yet, in which case we implement them and open-source the code so that our community can benefit from it.
`Speedster` is shaped around **4 building blocks** and leverages a modular design to foster scalability and integration of new acceleration components across the software to hardware stack.
- [x] **Converter:** converts the input model from its original framework to the framework backends supported by `Speedster`, namely PyTorch, ONNX and TensorFlow. This allows the Compressor and Compiler modules to apply any optimization technique to the model.
- [x] **Compressor:** applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training.
- [x] **Compiler:** converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files.
- [x] **Inference Learner:** takes the best performing compiled model and converts it back into the same interface as the original input model.

The **compressor** stage leverages the following open-source projects:
- [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance.
- [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models.
The **compiler stage** leverages the following open-source projects:
- [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators.
- [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads.
- [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs.
- [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference.
- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator
- [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.
- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models.
# **Community**
We’re developing `Speedster` for and together with our community, so please get in touch on GitHub or Discord.
• **[GitHub issues](https://github.com/nebuly-ai/nebullvm/issues)**: suggest new acceleration components, request new features, and report bugs and improvements.
• **[Discord](https://discord.gg/RbeQMu886J)**: learn about AI acceleration, share exciting projects and hang out with our global community.
The best way to get started is to pick a good-first issue. Please read our [contribution guidelines](https://docs.nebuly.com/contributions/) for a deep dive into how to best contribute to our project!
Don't forget to leave a star ⭐ to support the project and happy acceleration 🚀
================================================
FILE: optimization/speedster/docs/en/docs/advanced_options.md
================================================
# Advanced options
If you’re new to the library, you may want to start with the **Getting started** section.
The user guide here shows more advanced workflows and how to use the library in different ways. We are going to show some examples of more advanced usages of `Speedster`, which we hope will give you a deeper insight into how `Speedster` works.
In particular, we will overview:
- [`optimize_model`](#optimize_model-api) API
- [Acceleration suggestions](#acceleration-suggestions)
- [Selecting which device](#selecting-which-device-to-use--cpu-gpu-and-other-accelerators) to use: CPU, GPU and other accelerators
- [Optimization Time: constrained vs unconstrained](#optimization-time--constrained-vs-unconstrained)
- [Selecting specific compilers/compressors](#select-specific-compilerscompressors)
- [Using dynamic shape](#using-dynamic-shape)
- [Enable TensorrtExecutionProvider for ONNXRuntime on GPU](#enable-tensorrtexecutionprovider-for-onnxruntime-on-gpu)
- [Custom models](#custom-models)
- [Store the performances of all the optimization techniques](#store-the-performances-of-all-the-optimization-techniques)
- [Set number of threads](#set-number-of-threads)
## `optimize_model` API
The `optimize_model` function allows you to optimize a model from one of the supported frameworks (PyTorch, HuggingFace, TensorFlow, ONNX), and returns an optimized model that can be used with the same interface as the original model.
```python
def optimize_model(
model: Any,
input_data: Union[Iterable, Sequence],
metric_drop_ths: Optional[float] = None,
metric: Union[str, (...) -> Any, None] = None,
optimization_time: str = "constrained",
dynamic_info: Optional[dict] = None,
config_file: Optional[str] = None,
ignore_compilers: Optional[List[str]] = None,
ignore_compressors: Optional[List[str]] = None,
store_latencies: bool = False,
device: str = None,
**kwargs: Any
) -> Any
```
**Arguments**
`model`: Any
The input model can belong to one of the following frameworks: PyTorch, TensorFlow, ONNX, HuggingFace. In the ONNX case, `model` is a string with the path to the saved onnx model. In the other cases, it is a torch.nn.Module or a tf.Module.
`input_data`: Iterable or Sequence
Input data needed to test the optimization performances (latency, throughput, accuracy loss, etc). It can consist of one or more data samples. Note that if `optimization_time` is set to "unconstrained", it would be preferable to provide at least 100 data samples to also activate `Speedster` techniques that require more data (pruning, etc.). See the Getting started section to learn more about the `input_data` depending on your input framework:
- [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md#1-input-model-and-data)
- [Getting started with 🤗 HuggingFace optimization](getting_started/hf_getting_started.md#1-input-model-and-data)
- [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md#1-input-model-and-data)
- [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md#1-input-model-and-data)
- [Getting started with ONNX optimization](getting_started/onnx_getting_started.md#1-input-model-and-data)
`metric_drop_ths`: float, optional
Maximum drop in your preferred metric (see "metric" section below). All the optimized models having a larger error with respect to the `metric_drop_ths` will be discarded.
Default: 0.
`metric`: Callable, optional
Metric to be used for estimating the error that may arise from using optimization techniques and for evaluating if the error exceeds the `metric_drop_ths`. `metric` accepts as input a string, a user-defined metric, or None. Metric accepts a string containing the name of the metric; it currently supports:
- "numeric_precision"
- "accuracy".
- user-defined metric: function that takes as input the output of the original model and the one of the optimized model, and, if available, the original label. The function calculates and returns the reduction in the metric due to the optimization.
Default: "numeric_precision".
`optimization_time`: OptimizationTime, optional
The optimization time mode. It can be "constrained" or "unconstrained". In "constrained" mode, Speedster takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation. Note that most techniques activated in "unconstrained" mode require fine-tuning, and therefore it is recommended to provide at least 100 samples as input_data.
Default: "constrained".
`dynamic_info`: Dict, optional
Dictionary containing dynamic axis information. It should contain as keys both "inputs" and "outputs" and as values two lists of dictionaries, where each dictionary represents dynamic axis information for an input/output tensor. The inner dictionary should have an integer as a key, i.e. the dynamic axis (also considering the batch size) and a string as a value giving it a tag, e.g., "batch_size".
Default: None.
`config_file`: str, optional
Configuration file containing the parameters needed to define the CompressionStep in the pipeline.
Default: None.
`ignore_compilers`: List[str], optional
List of DL compilers ignored during optimization execution. The compiler name should be one among tvm, tensor_rt, openvino, onnxruntime, deepsparse, tflite, bladedisc, torchscript, intel_neural_compressor.
Default: None.
`ignore_compressors`: List[str], optional
List of DL compressors ignored during the compression stage. The compressor name should be one among sparseml and intel_pruning.
Default: None.
`store_latencies`: bool, optional
Parameter that allows to store the latency for each compiler used by Speedster in a json file. The JSON is created in the working directory.
Default: False.
`device`: str, optional
Device used for inference, it can be cpu or gpu/cuda (both gpu and cuda options are supported). A specific gpu can be selected using notation gpu:1 or cuda:1. gpu will be used if available, otherwise cpu.
Default: None.
**Returns: Inference Learner**
Optimized version with the same interface of the input model. For example, optimizing a PyTorch model will return an InferenceLearner object that can be called exactly like a PyTorch model (either with model.forward(input) or model(input)). The optimized model will therefore take torch.Tensors as input and return torch.Tensors.
## Acceleration suggestions
If the speedup you obtained with the first optimization with `Speedster` is not enough, we suggest the following actions:
- Include more backends for optimization, i.e. set `--backend all`
- Increase the `metric_drop_ths` by 5%, if possible: see [Optimize_model API](#optimize_model-api)
- Verify that your device is supported by your version of speedster: see [Supported hardware](hardware.md)
- Try to accelerate your model on a different hardware or consider using the CloudSurfer module to automatically understand which is the best hardware for your model: see [CloudSurfer](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/cloud_surfer) module.
## Selecting which device to use: CPU, GPU and other accelerators.
Speedster currently supports the following devices: `CPUs`, `GPUs`, `TPUs` and `AWS Inferentia chips`.
The parameter `device` allows to select which device we want to use for inference. By default, `Speedster` will use the accelerator if available on the machine, otherwise it will use cpu. If we are running on a machine with an available accelerator and we want to optimize the model for cpu inference, we can use:
```python
from speedster import optimize_model
optimized_model = optimize_model(
model, input_data=input_data, device="cpu"
)
```
If we are working on a multi-gpu machine and we want to use a specific gpu, we can use:
```python
from speedster import optimize_model
optimized_model = optimize_model(
model, input_data=input_data, device="cuda:1" # also device="gpu:1" is supported
)
```
The same applies also for TPUs and AWS Inferentia chips:
```python
from speedster import optimize_model
optimized_model = optimize_model(
model, input_data=input_data, device="tpu:1" # use tpu #1
)
optimized_model = optimize_model(
model, input_data=input_data, device="neuron:1" # use Inferentia chip #1
)
```
## Optimization Time: constrained vs unconstrained
One of the first options that can be customized in `Speedster` is the `optimization_time` parameter. In order to optimize the model, `Speedster` will try a list of compilers which allow to keep the same accuracy of the original model. In addition to compilers, it can also use other techniques such as pruning, quantization, and other compression techniques which can lead to a little drop in accuracy and may require some time to complete.
We defined two scenarios:
- **constrained**: only compilers and precision reduction techniques are used, so the compression step (the most time consuming one) is skipped. Moreover, in some cases the same compiler could be available for more than one pipeline, for example tensor RT is available both with PyTorch and ONNX backends. In the constrained scenario, each compiler will be used only once, so if for example we optimize a PyTorch model and tensor RT in the PyTorch pipeline manages to optimize the model, it won't be used again in the ONNX pipeline.
- **unconstrained**: in this scenario, `Speedster` will use all the compilers available, even if they appear in more than one backend. It also allows the usage of more time consuming techniques such as pruning and distillation. Note that for using many of the sophisticated techniques in the 'unconstrained' optimization, a small fine-tuning of the model will be needed. Thus, we highly recommend to provide as input_data at least 100 samples when selecting 'unconstrained' optimization.
## Select specific compilers/compressors
The `optimize_model` function also accepts the parameters `ignore_compilers` and `ignore_compressors`, which allow you to skip specific compilers or compressors.
The full list of available options is the following:
- _ignore_compilers_: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `onnx_tensor_rt`, `torchscript`, `onnxruntime`, `tflite`, `tvm`, `onnx_tvm`, `torch_tvm`, `bladedisc`, `openvino`, `intel_neural_compressor`, `torch_xla`, `torch_neuron`.
- _ignore_compressors_: `sparseml`, `intel_pruning`.
Some compilers, such as tensor RT, are available for both PyTorch and ONNX backends. For this reason in the list of compilers we have `tensor_rt` which skips both the PyTorch and ONNX pipelines, and `torch_tensor_rt` and `onnx_tensor_rt` which skip only the PyTorch and ONNX pipelines respectively.
If we want to skip the `tvm` and `bladedisc` optimizers, we could write:
```python
from speedster import optimize_model
optimized_model = optimize_model(
model,
input_data=input_data,
ignore_compilers=["tvm", "bladedisc"]
)
```
## Using dynamic shape
By default, a model optimized with `Speedster` will have a static shape. This means that it can be used in inference only with the same shape of the inputs provided to the `optimize_model` function during the optimization. The dynamic shape however is fully supported, and can be enabled with the `dynamic_info` parameter (see the [optimize_model API](#optimize_model-api) arguments to see how this parameter is defined.)
For each dynamic axis in the inputs, we need to provide the following information:
- the axis number (starting from 0, considering the batch size as the first axis)
- a tag that will be used to identify the axis
- the minimum, optimal and maximum sizes of the axis (some compilers will work also for shapes that are not in the range [min, max], but the performance may be worse)
Let's see an example of a model that takes one input, where the batch size must be dynamic, as well as the size on the third and fourth dimensions.
```python
import torch
import torchvision.models as models
from speedster import optimize_model
# Load a resnet as example
model = models.resnet50()
# Provide an input data for the model
input_data = [((torch.randn(1, 3, 256, 256),), torch.tensor([0])) for _ in range(100)]
# Set dynamic info
dynamic_info = {
"inputs": [
{
0: {
"name": "batch",
"min_val": 1,
"opt_val": 1,
"max_val": 8,
},
2: {
"name": "dim_image",
"min_val": 128,
"opt_val": 256,
"max_val": 512,
},
3: {
"name": "dim_image",
"min_val": 128,
"opt_val": 256,
"max_val": 512,
},
}
],
"outputs": [
{0: "batch", 1: "out_dim"}
]
}
# Run Speedster optimization in one line of code
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
dynamic_info=dynamic_info
)
```
## Enable TensorrtExecutionProvider for ONNXRuntime on GPU
By default, `Speedster` will use the `CUDAExecutionProvider` for ONNXRuntime on GPU. If you want to use the `TensorrtExecutionProvider` instead, you must add the TensorRT installation path to the env variable LD_LIBRARY_PATH.
If you installed TensorRT through the nebullvm auto_installer, you can do it by running the following command in the terminal:
```bash
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"/<PATH_TO_PYTHON_FOLDER>/site-packages/tensorrt"
```
## Custom models
`Speedster` is designed to optimize models that take as inputs and return in output only tensors or np.ndarrays (and dictionaries/strings for huggingface). Some models may require instead a custom input, for example a dictionary where the keys are the names of the inputs and the values are the input tensors, or may return a dictionary as output. We can optimize such models with `Speedster` by defining a model wrapper.
Let's take the example of the detectron2 model which takes as input a tuple of tensors but returns a dictionary as output:
```python
class BaseModelWrapper(torch.nn.Module):
def __init__(self, core_model, output_dict):
super().__init__()
self.core_model = core_model
self.output_names = [key for key in output_dict.keys()]
def forward(self, *args, **kwargs):
res = self.core_model(*args, **kwargs)
return tuple(res[key] for key in self.output_names)
class OptimizedWrapper(torch.nn.Module):
def __init__(self, optimized_model, output_keys):
super().__init__()
self.optimized_model = optimized_model
self.output_keys = output_keys
def forward(self, *args):
res = self.optimized_model(*args)
return {key: value for key, value in zip(self.output_keys, res)}
input_data = [((torch.randn(1, 3, 256, 256)), torch.tensor([0]))]
# Compute the original output of the model (in dict format)
res = model_backbone(torch.randn(1, 3, 256, 256))
# Pass the model and the output sample to the wrapper
backbone_wrapper = BaseModelWrapper(model_backbone, res)
# Optimize the model wrapper
optimized_model = optimize_model(backbone_wrapper, input_data=input_data)
# Wrap the optimized model with a new wrapper to restore the original model output format
optimized_backbone = OptimizedWrapper(optimized_model, backbone_wrapper.output_names)
```
You can find other examples in the [notebooks](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) section available on GitHub.
## Store the performances of all the optimization techniques
`Speedster` internally tries all the techniques available on the target hardware and automatically chooses the fastest one. If you need more details on the inference times of each compiler, you can set the `store_latencies` parameter to `True`. A json file will be created in the working directory, listing all the results of the applied techniques and of the original model itself.
```python
# Run Speedster optimization in one line of code
optimized_model = optimize_model(
model,
input_data=input_data,
store_latencies=True
)
```
## Set number of threads
When running multiple replicas of the model in parallel, it would be useful for CPU-optimized algorithms to limit the number of threads to use for each model. In `Speedster`, it is possible to set the maximum number of threads a single model can use with the environment variable `NEBULLVM_THREADS_PER_MODEL`.
For instance, you can run:
```bash
export NEBULLVM_THREADS_PER_MODEL=2
```
for using just two CPU threads per model at inference time and during optimization.
================================================
FILE: optimization/speedster/docs/en/docs/benchmarks.md
================================================
# Benchmarks
!!! info
In this section you are going to learn how `Speedster` accelerates the inference of various models on different hardware architecture.
Here we provide a preview of the following accelerated models:
- [Bert](#bert)
- [YoloV5](#yolov5)
- [EfficientNet](#efficientnet)
- [GPT2](#gpt2)
- [ResNet](#resnet)
- [Roberta](#roberta)
The above models are tested on very popular hardware architecture and instances:
- AWS - c5n.2xlarge
- AWS - c5.12xlarge
- AWS - c6i.12xlarge
- AWS - m6i.24xlarge
- NVIDIA T4
- NVIDIA V100
- NVIDIA 3090
## Bert

## YoloV5

## EfficientNet

## GPT2

## ResNet

## Roberta

================================================
FILE: optimization/speedster/docs/en/docs/getting_started/diffusers_getting_started.md
================================================
# Getting started with Stable Diffusion optimization
In this section, we will learn about the 5 main steps needed to optimize Stable Diffusion models from the `Diffusers` library:
1. [Environment Setup](#1-environment-setup-gpu-only)
2. [Input your model and data](#2-input-model-and-data)
3. [Run the optimization](#3-run-the-optimization)
4. [Save your optimized model](#4-save-your-optimized-model)
5. [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production)
## 1) Environment Setup (GPU only)
In order to optimize a Stable Diffusion model, you have to ensure that your environment is correctly set up according to these requirements: `CUDA>=12.0`, `tensorrt>=8.6.0` and `torch<=1.13.1`.
From TensorRT 8.6, all the tensorrt pre-built wheels released by nvidia support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then upgrade tensorrt to 8.6.0 or above.
There should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way, you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. Otherwise, we highly suggest to just upgrade to CUDA 12.
For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully.
You can check your CUDA version with the following command:
```bash
nvidia-smi
```
If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads
You can check your TensorRT version with the following command:
```bash
python -c "import tensorrt; print(tensorrt.__version__)"
```
If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:
```
pip install -U tensorrt
```
You can finally check your PyTorch version with the command
```bash
python -c "import torch; print(torch.__version__)"
```
If you have torch>=2.0.0, you can downgrade it by running:
```
pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
```
## 2) Input model and data
!!! info
In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc).
For Stable Diffusion models Speedster expects the input data to be a list of sentences: ```List[str]```
```python
import torch
from speedster import optimize_model
from diffusers import StableDiffusionPipeline
# Load Stable Diffusion 1.4 as example
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
# On GPU we load by default the model in half precision, because it's faster and lighter.
pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)
else:
pipe = StableDiffusionPipeline.from_pretrained(model_id)
# Create some example input data
input_data = [
"a photo of an astronaut riding a horse on mars",
"a monkey eating a banana in a forest",
"white car on a road surrounded by palm trees",
"a fridge full of bottles of beer",
"madara uchiha throwing asteroids against people"
]
```
Now your input model and data are ready, you can move on to [Run the optimization](#3-run-the-optimization) section 🚀.
## 3) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model.
The function takes the following arguments as inputs:
- `model`: model to be optimized in your preferred framework (A Diffusers pipe in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. "unconstrained" optimization_time allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration
and returns the accelerated version of your model 🚀.
``` python
from speedster import optimize_model
# Run Speedster optimization
optimized_model = optimize_model(
pipe,
input_data=input_data,
optimization_time="unconstrained",
metric_drop_ths=0.05
)
```
Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.
At the end of the optimization, you are going to see the results in a summary table like the following:

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#4-save-your-optimized-model) section to save your model and use it in production.
If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.
## 4) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:
```python
from speedster import save_model
save_model(optimized_model, "model_save_path")
```
Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#5-load-and-run-your-optimized-model-in-production) section.
## 5) Load and run your optimized model in production
Once the optimized model has been saved, it can be loaded with the `load_model` function:
```python
from speedster import load_model
optimized_model = load_model("model_save_path", pipe=pipe)
```
In this case we must also provide the original pipe as an argument to the `load_model` function: Speedster will automatically load the optimized model and replace the original UNet inside the pipe.
The optimized model can be used for accelerated inference in the same way as the original model:
```python
# Use the accelerated version of your Stable Diffusion model in production
output = optimized_model(test_prompt).images[0]
```
!!! info
The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.
If you want to know more about how to squeeze more performance out of your models, please visit the [Advanced options](../advanced_options.md) section.
================================================
FILE: optimization/speedster/docs/en/docs/getting_started/hf_getting_started.md
================================================
# Getting started with HuggingFace optimization
In this section, we will learn about the 4 main steps needed to optimize your 🤗 HuggingFace models:
1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)
## 1) Input model and data
!!! info
In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc).
For HuggingFace models we support different types of input data depending on the architecture of your input model.
- [x] For Decoder-only or Encoder-only architectures (Bert, GPT, etc), we support:
- Dictionary
- String
- [x] For Encoder-Decoder architectures (T5 etc), we support:
- Dictionary
=== "Decoder-only or Encoder-only (Bert, GPT, etc)"
**Input as Dictionary**
```python
from transformers import AlbertModel, AlbertTokenizer
# Load Albert as example
model = AlbertModel.from_pretrained("albert-base-v1")
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
# Case 1: dictionary input format
text = "This is an example text for the huggingface model."
input_dict = tokenizer(text, return_tensors="pt")
input_data = [input_dict for _ in range(100)]
```
Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.
**Input as String**
In the string case, the HuggingFace tokenizer must be given as input to the `optimize_model` in addition to the `input_data`, and the arguments for the tokenizer can be passed using the param `tokenizer_args`.
```python
from transformers import AlbertModel, AlbertTokenizer
# Load Albert as example
model = AlbertModel.from_pretrained("albert-base-v1")
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
# Case 2: strings input format
input_data = [
"This is a test.",
"Hi my name is John.",
"The cat is on the table.",
]
tokenizer_args = dict(
return_tensors="pt",
padding="longest",
truncation=True,
)
```
Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.
=== "Encoder-Decoder architectures (T5 etc)"
For encoder-decoder architectures we support only `input_data` as Dictionary:
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Load T5 as example
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Case 1: dictionary input format
question = "What's the meaning of life?"
answer = "The answer is:"
input_dict = tokenizer(question, return_tensors="pt")
input_dict["decoder_input_ids"] = tokenizer(answer, return_tensors="pt").input_ids
input_data = [input_dict for _ in range(100)]
```
Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.
## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model.
The function takes the following arguments as inputs:
- `model`: model to be optimized in your preferred framework (HuggingFace in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration
and returns the accelerated version of your model 🚀.
Depending on the format of your `input_data`, the `optimize_model` call is as follows:
=== "Input as Dictionary"
```python
from speedster import optimize_model
# Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05
)
```
=== "Input as String"
```python
from speedster import optimize_model
# Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05,
tokenizer=tokenizer,
tokenizer_args=tokenizer_args
)
```
Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.
At the end of the optimization, you are going to see the results in a summary table like the following:

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.
If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.
## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:
```python
from speedster import save_model
save_model(optimized_model, "model_save_path")
```
Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.
## 4) Load and run your optimized model in production
Once the optimized model has been saved, it can be loaded with the `load_model` function:
```python
from speedster import load_model
optimized_model = load_model("model_save_path")
```
The optimized model can be used for accelerated inference in the same way as the original model:
```python
# Use the accelerated version of your HuggingFace model in production
output = optimized_model(**input_sample)
```
!!! info
The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.
If you want to know more about how to squeeze more performance out of your models, please visit the [Advanced options](../advanced_options.md) section.
================================================
FILE: optimization/speedster/docs/en/docs/getting_started/onnx_getting_started.md
================================================
# Getting started with ONNX optimization
In this section, we will learn about the 4 main steps needed to optimize your ONNX models:
1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)
## 1) Input model and data
!!! info
In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc).
```python
import numpy as np
# Load a resnet as example
# Model was downloaded from here:
# https://github.com/onnx/models/tree/main/vision/classification/resnet
model = "resnet50-v1-12.onnx"
# Provide input data for the model
input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0])) for _ in range(100)]
```
Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.
## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model.
The function takes the following arguments as inputs:
- `model`: model to be optimized in your preferred framework (ONNX in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration
and returns the accelerated version of your model 🚀.
``` python
from speedster import optimize_model
# Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05
)
```
Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.
At the end of the optimization, you are going to see the results in a summary table like the following:

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.
If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.
## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:
```python
from speedster import save_model
save_model(optimized_model, "model_save_path")
```
Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.
## 4) Load and run your optimized model in production
Once the optimized model has been saved, it can be loaded with the `load_model` function:
```python
from speedster import load_model
optimized_model = load_model("model_save_path")
```
The optimized model can be used for accelerated inference in the same way as the original model:
```python
# Use the accelerated version of your ONNX model in production
output = optimized_model(input_sample)
```
!!! info
The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.
If you want to know more about how to squeeze more performance out of your models, please visit the [Advanced options](../advanced_options.md) section.
================================================
FILE: optimization/speedster/docs/en/docs/getting_started/pytorch_getting_started.md
================================================
# Getting started with PyTorch optimization
In this section, we will learn about the 4 main steps needed to optimize PyTorch models:
1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)
## 1) Input model and data
!!! info
In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc).
For PyTorch models we support two types of input data:
* Custom data format
* PyTorch DataLoader
=== "Custom Data Format"
Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]```
- Each element of the list is a tuple, which represents a batch of the dataset.
- In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. The label is optional, so it can be omitted.
``` python
import torch
import torchvision.models as models
# Load a resnet as example
model = models.resnet50()
# Provide input data for the model
input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]
```
See below further examples with custom format:
``` python
# Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 with labels
input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]
# Dataset for a model that takes 2 inputs, containing 100 batches of data with bs=5 with labels
input_data = [((torch.randn(5, 3, 256, 256), torch.randn(5, 3, 256, 256), ), torch.tensor([0, 1, 0, 1, 1])) for _ in range(100)]
# Dataset for a model that takes 1 input, containing 100 batches of data with bs=1 without labels
input_data = [((torch.randn(1, 3, 256, 256), ), ) for _ in range(100)]
```
Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.
=== "PyTorch DataLoader"
We support the following DataLoader types:
* Tensor only
* Tensor and labels
For models with multiple inputs, we support the following types:
- input_1, input_2, ..., input_n, label
- (input_1, input_2, ..., input_n), label
```python
import torch
import torchvision.models as models
# Load a resnet as example
model = models.resnet50()
# Use your PyTorch DataLoader in any of the standard formats
input_data = <insert your PyTorch DataLoader here>
```
Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section.
## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model.
The function takes the following arguments as inputs:
- `model`: model to be optimized in your preferred framework (PyTorch in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration
and returns the accelerated version of your model 🚀.
``` python
from speedster import optimize_model
# Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05
)
```
Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.
At the end of the optimization, you are going to see the results in a summary table like the following:

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.
If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.
## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:
```python
from speedster import save_model
save_model(optimized_model, "model_save_path")
```
Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.
## 4) Load and run your optimized model in production
Once the optimized model has been saved, it can be loaded with the `load_model` function:
```python
from speedster import load_model
optimized_model = load_model("model_save_path")
```
The optimized model can be used for accelerated inference in the same way as the original model:
```python
# Use the accelerated version of your PyTorch model in production
output = optimized_model(input_sample)
```
!!! info
The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.
If you want to know more about how to squeeze more performance out of your models, please visit the [Advanced options](../advanced_options.md) section.
================================================
FILE: optimization/speedster/docs/en/docs/getting_started/tf_getting_started.md
================================================
# Getting started with TensorFlow optimization
In this section, we will learn about the 4 main steps needed to optimize TensorFlow models:
1. [Input your model and data](#1-input-model-and-data)
2. [Run the optimization](#2-run-the-optimization)
3. [Save your optimized model](#3-save-your-optimized-model)
4. [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production)
## 1) Input model and data
!!! info
In order to optimize a model with `Speedster`, first you should input the model you want to optimize and load some sample data that will be needed to test the optimization performances (latency, throughput, accuracy loss, etc).
For TensorFlow models we support two types of input data:
* Custom data format
* TensorFlow DataLoader
=== "Custom Data Format"
Input data is a ```List[Tuple[Tuple[tensor, ...], tensor]]```
- Each element of the list is a tuple, which represents a batch of the dataset.
- In each tuple, the first element is another tuple containing a value for each input tensor of the model, while the second element is a tensor containing the labels of that batch of data. The label is optional, so it can be omitted.
``` python
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
# Load a resnet as example
model = ResNet50()
# Provide input data for the model
input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for _ in range(100)]
```
Now your input model and data are ready, you can move on to [Run the optimization](#2-run-the-optimization) section 🚀.
=== "TensorFlow DataLoader"
We support the following DataLoader types:
* Tensor only
* Tensor and labels
For models with multiple inputs, we support the following types:
- input_1, input_2, ..., input_n, label
- (input_1, input_2, ..., input_n), label
```python
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
# Load a resnet as example
model = ResNet50()
# Use your TensorFlow DataLoader in any of the standard formats
input_data = <insert your TensorFlow DataLoader here>
```
Now your input `model` and `input_data` are ready, you can move on to the [Run the optimization](#2-run-the-optimization) section.
## 2) Run the optimization
Once the `model` and `input_data` have been defined, everything is ready to use Speedster's `optimize_model` function to optimize your model.
The function takes the following arguments as inputs:
- `model`: model to be optimized in your preferred framework (TensorFlow in this case)
- `input_data`: sample data needed to test the optimization performances (latency, throughput, accuracy loss, etc)
- `optimization_time`: in "constrained" mode, `Speedster` takes advantage only of compilers and precision reduction techniques, such as quantization. The "unconstrained" mode allows it to exploit more time-consuming techniques, such as pruning and distillation
- `metric_drop_ths`: maximum drop in your preferred accuracy metric that you are willing to trade to gain in acceleration
and returns the accelerated version of your model 🚀.
``` python
from speedster import optimize_model
# Run Speedster optimization
optimized_model = optimize_model(
model,
input_data=input_data,
optimization_time="constrained",
metric_drop_ths=0.05
)
```
Internally, `Speedster` tries to use all the compilers and optimization techniques at its disposal along the software to hardware stack to optimize the model. From these, it will choose the ones with the lowest latency on the specific hardware.
At the end of the optimization, you are going to see the results in a summary table like the following:

If the speedup you obtained is good enough for your application, you can move to the [Save your optimized model](#3-save-your-optimized-model) section to save your model and use it in production.
If you want to squeeze out even more acceleration out of the model, please see the [`optimize_model` API](../advanced_options.md#optimize_model-api) section. Consider if in your application you can trade off a little accuracy for much higher performance and use the `metric`, `metric_drop_ths` and `optimization_time` arguments accordingly.
## 3) Save your optimized model
After accelerating the model, it can be saved using the `save_model` function:
```python
from speedster import save_model
save_model(optimized_model, "model_save_path")
```
Now you are all set to use your optimized model in production. To explore how to do it, see the [Load and run your optimized model in production](#4-load-and-run-your-optimized-model-in-production) section.
## 4) Load and run your optimized model in production
Once the optimized model has been saved, it can be loaded with the `load_model` function:
```python
from speedster import load_model
optimized_model = load_model("model_save_path")
```
The optimized model can be used for accelerated inference in the same way as the original model:
```python
# Use the accelerated version of your TensorFlow model in production
output = optimized_model(input_sample)
```
!!! info
The first 1-2 inferences could be a bit slower than expected because some compilers still perform some optimizations during the first iterations. After this warm-up time, the next ones will be faster than ever.
If you want to know more about how to squeeze out more performances from your models, please visit the [Advanced options](../advanced_options.md) section.
================================================
FILE: optimization/speedster/docs/en/docs/hardware.md
================================================
# Supported hardware
`Speedster` has been mostly tested on Nvidia GPUs and Intel/AMD CPUs. The library may also work with other hardware on which it has not been tested. Please let us know if you find out that `Speedster` works well on other hardware or if you find issues.
Fully supported hardware:
- Intel CPU
- Nvidia GPU
Hardware we are currently integrating:
- Apple M1
- AMD CPU
- Intel GPU (open issue 👩💻)
================================================
FILE: optimization/speedster/docs/en/docs/installation.md
================================================
# Installation
In this installation guide we will learn:
- [Quick installation](#quick-installation) of `Speedster` with pip **(Recommended)**
- [Selective installation](#optional-selective-installation-of-speedster-requirements) of the requirements **(Optional)**
- [Installation](#optional-download-docker-images-with-frameworks-and-optimizers) with Docker **(Optional)**
- [Set up Speedster on custom DL devices](#set-up-speedster-on-custom-dl-devices) to run models on Google TPUs and AWS Inferentia Chips
## Quick installation
You can easily install `Speedster` using pip.
pip install speedster
Then make sure to install all the available deep learning compilers:
python -m nebullvm.installers.auto_installer --compilers all
!!! info
If you want to optimize PyTorch or HuggingFace models, PyTorch must be pre-installed in the environment before using the auto-installer, please install it from [this](https://pytorch.org/get-started/locally/) link. Moreover, for Mac computers with M1/M2 processors, please use a conda environment, or you may run into problems when installing some of the deep learning compilers.
Great, now you are ready to accelerate your model 🚀 Please visit the following pages to get started based on the DL framework of your input model:
- [Getting started with PyTorch optimization](getting_started/pytorch_getting_started.md)
- [Getting started with 🤗 Hugging Face optimization](getting_started/hf_getting_started.md)
- [Getting started with Stable Diffusion optimization](getting_started/diffusers_getting_started.md)
- [Getting started with TensorFlow/Keras optimization](getting_started/tf_getting_started.md)
- [Getting started with ONNX optimization](getting_started/onnx_getting_started.md)
## (Optional) Selective installation of Speedster requirements
By default, the `auto_installer` installs all the DL frameworks and compilers supported by `Speedster`. However, some of these may not be relevant to your use case. In this section, we explain how you can customize the installation of these libraries, avoiding those that are not needed.
To customize the libraries installation you have two options:
- [Use the auto-installer (recommended)](#use-the-auto-installer-recommended)
- [Install the libraries manually](#manual-installation)
### Use the auto-installer (recommended)
To understand how to selectively install your preferred libraries, let's examine the auto-installer API:
```bash
python -m nebullvm.installers.auto_installer
--frameworks
--extra-backends
--compilers
```
!!! Description
=== "--frameworks"
`frameworks` is used to specify the deep learning framework of your input model. The supported frameworks are `torch`, `tensorflow`, `onnx`, `huggingface` and `diffusers`.
- if you want to optimize a model with a single DL framework, the code is as follows (example below for HuggingFace):
```python
python -m nebullvm.installers.auto_installer --frameworks huggingface
```
Please remember that for PyTorch optimization, you should pre-install PyTorch from the official [repo](https://pytorch.org/get-started/locally/).
- if you want to optimize models in multiple input frameworks, you can include them separated with a space:
```python
python -m nebullvm.installers.auto_installer --frameworks tensorflow torch
```
- If you want to include all the frameworks, you can use `all` as the argument:
```python
python -m nebullvm.installers.auto_installer --frameworks all
```
Default: `all`.
=== "--extra-backends"
After entering your input model, `Speedster` converts the input model from its original framework into an intermediate framework to be used during the optimization; we call these intermediate frameworks "backends." To learn more, see the section [Model Converter](https://docs.nebuly.com/Speedster/key_concepts/) in the docs. This conversion allows `Speedster` to apply all optimization techniques without being constrained by the input framework of your model.
The supported backends are `torch`, `tensorflow` and `onnx`.
You can specify multiple backends by separating them with a space.
- For example, if you want to install TensorFlow and ONNX as backends of a HuggingFace model, the code is as follows:
```python
python -m nebullvm.installers.auto_installer --frameworks huggingface --extra-backends tensorflow onnx
```
- If you want to install all the backends supported by the selected frameworks, you can use `all` as the argument.
- If you don't want to install extra backends, you can set `--extra-backends none`.
The extra-backends that you choose must be compatible with at least one of the input frameworks you previously selected with the argument `--frameworks`; please see the table below for the compatibility matrix.
Default: `all`.
=== "--compilers"
`compilers` is used to specify the deep learning compilers to be installed. The supported compilers are: `deepsparse`, `tensor_rt`, `torch_tensor_rt`, `openvino` and `intel_neural_compressor`. The compilers must be compatible with at least one of the backends selected with the argument `--extra-backends`; please see the table below for the compatibility matrix.
- You can specify multiple compilers by separating them with a space. For example:
```python
--compilers deepsparse tensor_rt
```
will install DeepSparse and TensorRT.
- If you want to install all the compilers supported by the selected frameworks/backends, you can use `all` as the argument.
Speedster also supports `torchscript`, `tf_lite`, and `onnxruntime` as built-in; these are preinstalled with their respective backends, so there is no need to include them in the list. Speedster also supports `tvm`, which is currently not supported by the automatic installer and must be installed manually; see the next section if you wish to include it.
Default: `all`.
Let's see an example of how to use these three arguments:
```bash
python -m nebullvm.installers.auto_installer
--frameworks torch
--extra-backends all
--compilers all
```
This command will setup your environment to optimize PyTorch models, and will install all PyTorch supported backends and compilers.
The following table shows the supported combinations of frameworks, backends and compilers that you can install with the auto-installer:
| Framework | Extra Backends | Compilers |
|--------------|---------------------------|-------------------------------------------------------------------------|
| PyTorch | ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |
| TensorFlow | ONNX | TensorRT, OpenVINO |
| ONNX | / | TensorRT, OpenVINO |
| Hugging Face | PyTorch, TensorFlow, ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |
| Diffusers | PyTorch, ONNX | DeepSparse, TensorRT, Torch TensorRT, OpenVINO, Intel Neural Compressor |
!!! info
Hugging Face models can be of two types, PyTorch-based or TensorFlow-based. For PyTorch-based models, it is necessary to include `torch` as an extra-backend. For TensorFlow-based models, you must include `tensorflow` as an extra-backend.
### Manual installation
If you want to manually install the requirements, this section collects links to the official installation guides for all frameworks and compilers supported by `Speedster`.
#### Deep Learning frameworks/backends
- PyTorch: https://pytorch.org/get-started/locally/
- TensorFlow: https://www.tensorflow.org/install
- ONNX: https://github.com/onnx/onnx#installation
- HuggingFace: https://huggingface.co/transformers/installation.html
- Diffusers: https://github.com/huggingface/diffusers#installation
#### Deep Learning compilers
- DeepSparse: https://github.com/neuralmagic/deepsparse#installation
- TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html
- Torch TensorRT: https://pytorch.org/TensorRT/getting_started/installation.html#installation
- ONNXRuntime: https://onnxruntime.ai/docs/install/#python-installs
- OpenVINO: https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#step-4-install-the-package
- Intel Neural Compressor: https://github.com/intel/neural-compressor#installation
- Apache TVM: https://tvm.apache.org/docs/install/index.html
#### Other requirements
- tf2onnx: https://github.com/onnx/tensorflow-onnx#installation (Install it if you want to convert TensorFlow models to ONNX)
- polygraphy: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy#installation (Install it if you want to use TensorRT)
- onnx-simplifier: https://github.com/daquexian/onnx-simplifier#python-version (Install it if you want to use TensorRT)
- onnx_graphsurgeon: https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon#installation (Install it if you want to use TensorRT with Stable Diffusion)
- onnxmltools: https://github.com/onnx/onnxmltools#install (Install it if you want to convert models to ONNX)
## (Optional) Download Docker images with frameworks and optimizers
Instead of installing the frameworks and compilers needed for optimization, which can be a time-consuming task, you can simply download a Docker container with all compilers preinstalled.
To pull up the Docker image, run:
docker pull nebulydocker/nebullvm:latest
and then run and access the Docker with:
docker run -ti --gpus=all nebulydocker/nebullvm:latest
After optimizing the model, you may decide to deploy it to production. Note that you need to have the deep learning compiler used to optimize the model and other components inside the production Docker. For this reason, we have created several versions of the Docker nebullvm container in the [Docker Hub](https://hub.docker.com/repository/docker/nebulydocker/nebullvm), each containing only one compiler. Pull the image with the compiler that has optimized your model!
## Set up Speedster on custom DL devices
From version `0.10.0`, Speedster supports optimization of PyTorch models on `Google TPUs` and `AWS Inferentia` chips.
For these devices, the user must ensure that the required libraries are installed on the machine.
The following sections describe how to install the required libraries for each device.
### Google TPUs
In order to use a TPU, you must request a TPU-enabled VM from Google Cloud. You can consult the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en)
for more information about how to create a TPU VM and how to get started with PyTorch on TPUs.
To use Speedster on Google TPUs, we will use the [`torch_xla`](https://github.com/pytorch/xla) library, which is already
preinstalled in all the Google Cloud TPU VMs, you will find it in the base Python3 environment.
After creating the VM, you can follow these steps to set up Speedster:
- Check that the `torch_xla` library is installed in the base Python3 environment. You can do this by running `python3 -c "import torch_xla; print(torch_xla.__version__)"` in the VM console;
- Set TPU runtime configuration as explained in the [official documentation](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#set_tpu_runtime_configuration);
- [Optional] Check that the TPU is working by running the [official example](https://cloud.google.com/tpu/docs/run-calculation-pytorch?hl=en#perform_a_simple_calculation);
- Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on TPUs.
You are now ready to use Speedster on TPUs! Speedster will automatically detect the TPU device and will use the `torch_xla` library to optimize the model, comparing its performances with the original model running on the CPU.
### AWS Inferentia
For AWS Inferentia, you must first create an AWS EC2 instance with the `inf1` instance type.
You can find more information about `inf1` instances in the [official documentation](https://aws.amazon.com/it/ec2/instance-types/inf1/).
!!! info
AWS has recently released the `inf2` instance type, which is a more powerful version of `inf1`. For now `inf2`
instances are only available in private preview, you can request them directly to AWS by filling this [form](https://pages.awscloud.com/EC2-Inf2-Preview.html).
To use Speedster on AWS Inferentia, we will use the [`torch-neuron`](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) library, which must be manually installed on `inf1` instances (on `inf2` instances it's already preinstalled if you use the PyTorch DLAMI provided by AWS).
You can find here the full guides to set up the EC2 instances and install the required libraries:
- `inf1`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/setup/pytorch-install.html#install-neuron-pytorch
- `inf2`: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/setup/pytorch-install.html#pytorch-neuronx-install
After creating the EC2 instance and installing `torch_neuron`, you can follow these steps to set up Speedster:
- Check that the `torch_neuron` library is installed, you can do this by running `python -c "import torch_neuron; print(torch_neuron.__version__)"` in the console (if using `inf1` instances, otherwise change `torch_neuron` with `torch_neuronx`);
- Install Speedster by running `pip install speedster`. It's not required to install the deep learning compilers in this case, since they are not supported on AWS Inferentia.
You are now ready to use Speedster on AWS Inferentia! Speedster will automatically detect the AWS Inferentia device and will use the `torch_neuron` library to optimize the model, comparing its performances with the original model running on the CPU.
================================================
FILE: optimization/speedster/docs/en/docs/key_concepts.md
================================================
# Key concepts
In this section we are going to learn the architectural design of the 4 building blocks of `Speedster`.
- [x] **Converter**: converts the input model from its original framework to the framework backends supported by Speedster, namely PyTorch, TensorFlow, and ONNX. This allows the Compressor and Optimizer modules to apply any optimization technique to the model.
- [x] **Compressor**: applies various compression techniques to the model, such as pruning, knowledge distillation, or quantization-aware training.
- [x] **Optimizer**: converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The compilers apply both post-training quantization techniques and graph optimizations, to produce compiled binary files.
- [x] **Inference Learner**: takes the best performing compiled model and converts it to the same interface as the original input model.

The **compressor** stage leverages the following open-source projects:
- [Intel/neural-compressor](https://github.com/intel/neural-compressor): targeting to provide unified APIs for network compression technologies, such as low precision quantization, sparsity, pruning, knowledge distillation, across different deep learning frameworks to pursue optimal inference performance.
- [SparseML](https://github.com/neuralmagic/sparseml): libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models.
The **compiler stage** leverages the following open-source projects:
- [Apache TVM](https://github.com/apache/tvm): open deep learning compiler stack for cpu, gpu and specialized accelerators.
- [BladeDISC](https://github.com/alibaba/BladeDISC): end-to-end Dynamic Shape Compiler project for machine learning workloads.
- [DeepSparse](https://github.com/neuralmagic/deepsparse): neural network inference engine that delivers GPU-class performance for sparsified models on CPUs.
- [OpenVINO](https://github.com/openvinotoolkit/openvino): open-source toolkit for optimizing and deploying AI inference.
- [ONNX Runtime](https://github.com/microsoft/onnxruntime): cross-platform, high performance ML inferencing and training accelerator
- [TensorRT](https://github.com/NVIDIA/TensorRT): C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.
- [TFlite](https://github.com/tensorflow/tflite-micro) and [XLA](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla): open-source libraries to accelerate TensorFlow models.
## Model converter
!!! Definition
The Converter converts the input model from its original input framework to the framework backends supported by `Speedster`. This conversion enables the Compressor and the Compiler modules to apply all the optimization techniques without being constrained by the framework of your input model.

`Speedster` supports deep learning models in the following input frameworks:
- Hugging Face
- Diffusers
- ONNX
- PyTorch
- TensorFlow
`Speedster` now includes 3 backends:
- **ONNX backend**, which supports models in any input framework.
- **PyTorch backend**, which supports input models in PyTorch, ONNX and Hugging Face.
- **TensorFlow backend**, which supports input models in TensorFlow and ONNX.
As you notice, to date, not all cross-conversions from input frameworks to each `Speedster` backend are supported.
Let's see a couple of examples to better understand the potential of the Converter block:
1. PyTorch model as input: first of all Speedster will try the compilers available in the PyTorch backend pipeline, then it will convert it to ONNX and will try also the ones available in the ONNX backend optimization pipeline. Finally, the best one among them will be chosen and returned as the optimized model in your input framework (in this case PyTorch).
2. HuggingFace model as input: Let's assume that for your specific use case, the best optimization technique is a specific type of dynamic quantization only supported by PyTorch. If you feed a Hugging Face model into Speedster, the Converter will first transform your model into a PyTorch model. Speedster will then quantize it and finally return it as a Hugging Face model.
## Compressor
The compressor applies various compression techniques to the model:
- Block-wise un/structured sparsity (🎉 launched in 0.4.0 🎉)
- Knowledge distillation (to be supported)
- Layer replacement (to be supported)
- Low-rank compression (to be supported)
- Quantization-aware training (to be supported)
- SparseML (🎉 launched in 0.4.0 🎉)

## Compiler
The Compiler block converts the compressed models to the intermediate representation (IR) of the supported deep learning compilers. The different DL compilers perform both the low-level optimizations, which mostly consist of various quantization techniques, and graph optimizations. Finally, the model is compiled into binary.

Supported deep learning compilers:
- Apache TVM
- BladeDISC (🎉 launched in 0.4.0 🎉)
- DeepSparse (🎉 launched in 0.4.0 🎉)
- MLIR (open pull request 👩💻)
- ONNX Runtime
- OpenVINO
- TensorRT
- TF Lite / XLA
- TorchScript
Supported low-level optimizations:
- Static quantization
- Dynamic quantization
- Half-precision
- Low-bit quantization on TVM (to be supported)
## Inference learner
The Learner, or Inference Learner, selects the best performing compiled model on your hardware and converts it to the same interface as the original input model.

================================================
FILE: optimization/speedster/docs/en/docs/notebooks.md
================================================
# Notebooks
In this section you can find optimization notebooks for multiple DL input models:
- HuggingFace
- Diffusers
- ONNX
- Pytorch
- Tensorflow
Please check out notebooks and tutorials on GitHub at [this](https://github.com/nebuly-ai/nebullvm/tree/main/notebooks/speedster) link.
================================================
FILE: optimization/speedster/docs/en/docs/overview.md
================================================
# Overview
`Speedster` is an open-source module designed to accelerate AI inference in just a few lines of code.
The library allows you to seamlessly modulate the inference performance of your AI models in terms of latency, throughput, model size, accuracy and cost, and automatically applies the best set of optimization techniques along the software to hardware stack to meet your targets.
`Speedster` makes it easy to combine optimization techniques across the whole software to hardware stack, delivering best in class speed-ups. If you like the idea, give us a star to support the project ⭐

The core `Speedster` workflow consists of 3 steps:
- [x] **Select**: input your model in your preferred DL framework and express your preferences regarding:
- Accuracy loss: do you want to trade off a little accuracy for much higher performance?
- Optimization time: stellar accelerations can be time-consuming. Can you wait, or do you need an instant answer?
- [x] **Search**: the library automatically tests every combination of optimization techniques across the software-to-hardware stack (sparsity, quantization, compilers, etc.) that is compatible with your needs and local hardware.
- [x] **Serve**: finally, `Speedster` chooses the best configuration of optimization techniques and returns an accelerated version of your model in the DL framework of your choice (just on steroids 🚀).
Now you are ready to start accelerating your models, visit the [Installation](installation.md) section to start right away!
================================================
FILE: optimization/speedster/docs/en/docs/telemetry.md
================================================
# Telemetry
`Speedster` is a young and rapidly evolving open-source project. There is plenty of room for improvement for Speedster to make your model achieve the very best performance on your hardware... and you may still find some bugs in the code 🪲
Contributions to this OSS project are warmly welcomed 🤗. We encourage you to check out the Contribution guidelines to understand how you can become an active contributor of the source code.
## Sharing feedback to improve Speedster
Open source is a unique resource for sharing knowledge and building great projects collaboratively with the OSS community. To support the continued development, upon installation of Speedster you could share the information strictly necessary to improve the performance of this open-source project and facilitate bug detection and fixing.
More specifically, you will foster project enhancement by sharing details of the optimization techniques used with Speedster and the performance achieved on your model and hardware.
**Which data do we collect?**
We make sure to collect as little data as possible to improve the open-source project:
- basic information about the environment
- basic information about the optimization
Please find below an example of telemetry collection:
```python
{
"nebullvm_version": "0.6.0",
"app_version": "0.0.1",
"model_id": "e33a1bbf-fcfd-4f5a-81c9-a9154c7e9343_-7088971112344091114",
"model_metadata": {
"model_name": "ResNet",
"model_size": "102.23 MB",
"framework": "torch"
},
"hardware_setup": {
"cpu": "Apple M1 Pro",
"operative_system": "Darwin",
"ram": "17.18 GB"
},
"optimizations": [
{
"compiler": "torch",
"technique": "original",
"latency": 0.03
},
{
"compiler": "NUMPY_onnxruntime",
"technique": "none",
"latency": 0.01
}
],
"ip_address": "1.1.1.1"
}
```
**How to opt-out?**
You can simply opt out of telemetry collection by setting the environment variable `SPEEDSTER_DISABLE_TELEMETRY` to `1`.
**Should I opt out?**
Being open-source, we have very limited visibility into the use of the tool unless someone actively contacts us or opens an issue on GitHub.
We would appreciate it if you would maintain telemetry, as it helps us improve the source code. In fact, it brings increasing value to the project and helps us to better prioritize feature development.
We understand that you may still prefer not to share telemetry data and we respect that desire. Please follow the steps above to disable data collection.
================================================
FILE: optimization/speedster/docs/en/mkdocs.yaml
================================================
site_name: Speedster
docs_dir: ./docs
nav:
- Overview: overview.md
- Installation: installation.md
- Getting started:
- PyTorch: getting_started/pytorch_getting_started.md
- 🤗 HuggingFace: getting_started/hf_getting_started.md
- 🧨 Stable Diffusion: getting_started/diffusers_getting_started.md
- TensorFlow/Keras: getting_started/tf_getting_started.md
- ONNX: getting_started/onnx_getting_started.md
- Notebooks: notebooks.md
- Key concepts: key_concepts.md
- Supported hardware: hardware.md
- Advanced options: advanced_options.md
- Benchmarks: benchmarks.md
- Telemetry: telemetry.md
================================================
FILE: optimization/speedster/notebooks/README.md
================================================
# **Jupyter notebooks**
This folder contains notebooks showing how to use the `Speedster` app to optimize several models.
The following frameworks are supported:
- PyTorch
- HuggingFace
- Diffusers
- Tensorflow
- ONNX
Examples of how to use `Speedster` are shown for each of these frameworks.
In each folder we provide links to google colab where you can easily test the notebooks.
If you want to test them on your own hardware, you can follow the guide below.
## 1. Setup
To test notebooks, we have to create an environment where all the required dependencies are installed.
First of all, clone the `nebullvm` repository:
```
git clone https://github.com/nebuly-ai/nebullvm.git
```
Next, navigate to the repo's root directory:
```
cd nebullvm
```
After cloning the repository there are two options: we can either install `Speedster` in a local environment or use a ready-to-use docker container.
### a. Using a local environment
Install `Speedster` library:
```
pip install speedster
```
Install deep learning compilers:
```
python -m nebullvm.installers.auto_installer \
--frameworks all --compilers all
```
You can find additional options and details on the official [installation guide](https://docs.nebuly.com/modules/speedster/installation).
After everything has been installed, you can start a jupyter session with the following command:
```
jupyter notebook --allow-root --port 8888
```
And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`
Use the token listed in the output from running the jupyter command to log in, for example:
`http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`
You can finally navigate to the `notebooks/speedster` folder and then to the folder of the framework that you want to try and start a notebook.
### b. Using a Docker container
Another very easy way to test the following notebooks is by using one of the docker containers released on [dockerhub](https://hub.docker.com/r/nebulydocker/nebullvm).
Pull the most up-to-date container image that has all compilers and their dependencies preinstalled:
```
docker pull nebulydocker/nebullvm:latest
```
Once pulled, the container can be launched with the following command:
```
docker run --rm --gpus all -ti -p 8888:8888 -v $PWD:/nebullvm nebulydocker/nebullvm:latest
```
The `-v` option in the command above allows you to persist all the changes made to the notebooks inside the container.
Please note that, in order to enable gpu inside docker, you have to ensure that nvidia docker is installed. Please follow the "Setting up NVIDIA Container Toolkit" part from the
official [installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker).
You can then check that the gpu can be seen inside the container by running `nvidia-smi` inside it, and checking that your gpu appears in the output.
Inside the container, we can then navigate to the notebooks folder:
```
cd /nebullvm/notebooks/speedster
```
We can then run a jupyter session with the following command:
```
jupyter notebook --allow-root --ip 0.0.0.0 --port 8888
```
And navigate a web browser to the IP address or hostname of the host machine at port 8888: `http://[host machine]:8888`
Use the token listed in the output from running the jupyter command to log in, for example:
`http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b`
You can finally navigate to the folder of the framework that you want to try and start a notebook.
## 2. Contributions
At Nebuly we are always eager to see how our library manages to optimise more and more models. If you test nebullvm on your model and this is not already present among the notebooks, feel free to open a PR for us to add your notebook to the repository!
================================================
FILE: optimization/speedster/notebooks/diffusers/Accelerate_Stable_Diffusion_with_Speedster.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"id": "ef331be9",
"metadata": {
"id": "ef331be9"
},
"source": [
""
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f260653a",
"metadata": {
"id": "f260653a"
},
"source": [
"# Accelerate Stable Diffusion with Speedster\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8bdf3af5",
"metadata": {
"id": "8bdf3af5"
},
"source": [
"Hi and welcome 👋\n",
"\n",
"In this notebook we will discover how in just a few steps you can speed up the response time of Stable Diffusion inference using the Speedster module from the open-source library nebullvm. In the first section we will try using `Speedster` with the default configuration, then we will explore a more advanced option that involves the TensorRT plugins, that allow to accelerate Stable Diffusion further on GPU.\n",
"\n",
"Let's jump to the code."
]
},
{
"cell_type": "markdown",
"id": "cXXh1ifQ13mH",
"metadata": {
"id": "cXXh1ifQ13mH"
},
"source": [
"# Installation"
]
},
{
"cell_type": "markdown",
"id": "48aljCHu14-H",
"metadata": {
"id": "48aljCHu14-H"
},
"source": [
"Install Speedster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "QFQh3BVr1-GO",
"metadata": {
"id": "QFQh3BVr1-GO"
},
"outputs": [],
"source": [
"!pip install speedster"
]
},
{
"cell_type": "markdown",
"id": "8a7a86b3",
"metadata": {
"id": "8a7a86b3"
},
"source": [
"Install deep learning compilers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cffbfa32",
"metadata": {
"id": "cffbfa32"
},
"outputs": [],
"source": [
"!python -m nebullvm.installers.auto_installer --frameworks diffusers --compilers all"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c2ab3de7",
"metadata": {},
"source": [
"# Environment check (GPU only)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "61a1a445",
"metadata": {},
"source": [
"**Please skip this section if you don't have a GPU**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "e2784bb8",
"metadata": {},
"source": [
"If you want to optimize Stable Diffusion on a Nvidia GPU, in order to work properly, the following requirements must be installed on your machine:\n",
"- `CUDA>=12.0`\n",
"- `tensorrt>=8.6.0`\n",
"- `torch<=1.13.1`"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "e3bc8b4d",
"metadata": {},
"source": [
"From TensorRT 8.6, all the tensorrt pre-built wheels released by NVIDIA support only `CUDA>=12.0`. Speedster will install `tensorrt>=8.6.0` automatically in the auto-installer only if it detects CUDA>=12.0, otherwise it will install `tensorrt==8.5.3.1`. In that case, you will have to upgrade your CUDA version and then upgrade tensorrt to 8.6.0 or above to execute this notebook.\n",
"\n",
"There should be a way to run TensorRT 8.6 also with CUDA 11, but it requires installing TensorRT in a different way, you can check this issue: https://github.com/NVIDIA/TensorRT/issues/2773. Otherwise, we highly suggest to just upgrade to CUDA 12.\n",
"\n",
"For now PyTorch>=2.0.0 is not supported due to an [issue](https://github.com/pytorch/pytorch/issues/97262) in the conversion to onnx, so until they fix it you must have torch<=1.13.1 to optimize Stable Diffusion successfully."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ec2267f0",
"metadata": {},
"source": [
"First of all, let's check the CUDA version installed on the machine"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82b78585",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import subprocess\n",
"\n",
"if torch.cuda.is_available():\n",
" cuda_version = subprocess.check_output([\"nvidia-smi\"])\n",
" cuda_version = int(cuda_version.decode(\"utf-8\").split(\"\\n\")[2].split(\"|\")[-2].split(\":\")[-1].strip().split(\".\")[0])\n",
" assert cuda_version >= 12, (\"This notebook requires CUDA>=12.0 to be executed, please upgrade your CUDA version.\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "015cfa92",
"metadata": {},
"source": [
"If you have CUDA<12.0, you can upgrade it at this link: https://developer.nvidia.com/cuda-downloads"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "563779e6",
"metadata": {},
"source": [
"Then, let's check the tensorrt version installed on the platform. Stable Diffusion optimization is supported starting from `tensorrt==8.6.0`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e385021d",
"metadata": {},
"outputs": [],
"source": [
"import tensorrt\n",
"from nebullvm.tools.utils import check_module_version\n",
"\n",
"if torch.cuda.is_available():\n",
" assert check_module_version(tensorrt, \"8.6.0\"), (\"This notebook can be run only with tensorrt>=8.6.0, if using an older version you could have issues during the optimization. Please upgrade your version.\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "61da505b",
"metadata": {},
"source": [
"If you have an older version, after ensuring you have `CUDA>=12.0` installed, you can upgrade your TensorRT version by running:\n",
"```\n",
"pip install -U tensorrt\n",
"```"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3876bea4",
"metadata": {},
"source": [
"Finally, let's check the PyTorch version"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db83853f",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"from nebullvm.tools.utils import check_module_version\n",
"\n",
"assert check_module_version(torch, max_version=\"1.13.1+cu117\"), (\"This notebook can be run only with torch<=1.13.1, if using an older version you could have issues during the optimization. Please downgrade your version.\")"
]
},
{
"cell_type": "markdown",
"id": "73072506",
"metadata": {
"id": "73072506"
},
"source": [
"## Model and Dataset setup"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "aeb2c521",
"metadata": {},
"source": [
"Once we have ensured that the required libraries are installed, we have to choose the version of Stable Diffusion we want to optimize. Speedster officially supports the most used versions:\n",
"- `CompVis/stable-diffusion-v1-4`\n",
"- `runwayml/stable-diffusion-v1-5`\n",
"- `stabilityai/stable-diffusion-2-1-base`\n",
"- `stabilityai/stable-diffusion-2-1` (only on gpus with at least 22GB of Memory, if you want to try with a GPU with a lower memory, you have to uncomment `pipe.enable_attention_slicing()` in the cell below)\n",
"\n",
"Other Stable Diffusion versions from the Diffusers library should work but have never been tested. If you try a version not included among these and it works, please feel free to report it to us on [Discord](https://discord.com/invite/RbeQMu886J) so we can add it to the list of supported versions. If you try a version that does not work, you can open an issue and possibly a PR on [GitHub](https://github.com/nebuly-ai/nebullvm/issues)."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "e4d55115",
"metadata": {
"id": "e4d55115"
},
"source": [
"For this notebook, we are going to select Stable Diffusion 1.4. Let's download and load it using the diffusers API:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d633cf21",
"metadata": {
"id": "d633cf21",
"scrolled": true
},
"outputs": [],
"source": [
"import torch\n",
"from diffusers import StableDiffusionPipeline\n",
"\n",
"# Select Stable Diffusion version\n",
"model_id = \"CompVis/stable-diffusion-v1-4\"\n",
"\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"if device == \"cuda\":\n",
" # On GPU we load by default the model in half precision, because it's faster and lighter.\n",
" pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\n",
" # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\n",
"else:\n",
" pipe = StableDiffusionPipeline.from_pretrained(model_id)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "11aa0739",
"metadata": {
"id": "11aa0739"
},
"source": [
"Let's now create an example dataset with some random sentences, that will be used later for the optimization process"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbbfeeb2",
"metadata": {
"id": "cbbfeeb2"
},
"outputs": [],
"source": [
"input_data = [\n",
" \"a photo of an astronaut riding a horse on mars\",\n",
" \"a monkey eating a banana in a forest\",\n",
" \"white car on a road surrounded by palm trees\",\n",
" \"a fridge full of bottles of beer\",\n",
" \"madara uchiha throwing asteroids against people\"\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "17040431",
"metadata": {
"id": "17040431"
},
"source": [
"## Speed up inference with Speedster"
]
},
{
"cell_type": "markdown",
"id": "44ddc21d",
"metadata": {
"id": "44ddc21d"
},
"source": [
"It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9d934f6",
"metadata": {
"id": "f9d934f6"
},
"outputs": [],
"source": [
"from speedster import optimize_model, save_model, load_model"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2799e3e3",
"metadata": {},
"source": [
"Let's move the pipe back to CPU to save up GPU memory, `Speedster` will automatically move it back to GPU when required."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45220cf0",
"metadata": {},
"outputs": [],
"source": [
"import gc\n",
"\n",
"# Move the pipe back to cpu\n",
"pipe.to(\"cpu\")\n",
"\n",
"# Clean memory\n",
"torch.cuda.empty_cache()\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "76248033",
"metadata": {
"id": "76248033"
},
"source": [
"Using Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "75b339c3",
"metadata": {},
"source": [
"**Optimisation of stable diffusion requires a lot of RAM. If you are running this notebook on google colab, make sure to use the high RAM option, otherwise the kernel may crash. If the kernel crashes also when using the high RAM option, please try adding also `\"torchscript\"` to the `ignore_compilers` list. \n",
"If running on GPU, the optimization requires at least 16GB of GPU memory to exploit the best techniques for optimizing the model, otherwise it may fail with a Memory Error**."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "zPC_EDwEJIM0",
"metadata": {
"id": "zPC_EDwEJIM0"
},
"outputs": [],
"source": [
"optimized_model = optimize_model(\n",
" model=pipe,\n",
" input_data=input_data,\n",
" optimization_time=\"unconstrained\",\n",
" ignore_compilers=[\"torch_tensor_rt\", \"tvm\"], # Some compilers have issues with Stable Diffusion, so it's better to skip them.\n",
" metric_drop_ths=0.2,\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "fdae59d2",
"metadata": {},
"source": [
"If running on GPU, here you should obtain a speedup of about 124% on the UNet. We run the optimization on a **3090Ti** and here are our results:\n",
"- **Original Model (PyTorch, fp16): 51,557 ms/batch**\n",
"- **Optimized Model (TensorRT, fp16): 23,055 ms/batch**\n",
"\n",
"If the optimized model you obtained is not a TensorRT one, probably there was an error during the optimization. If running on colab, it could happen that the standard gpu is not enough to run the optimization, so we suggest to select a premium gpu with more memory.\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "af9f86ac",
"metadata": {},
"source": [
"If everything worked correctly, let's check the output of the optimized model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b640885",
"metadata": {},
"outputs": [],
"source": [
"test_prompt = \"futuristic llama with a cyberpunk city on the background\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa443637",
"metadata": {},
"outputs": [],
"source": [
"optimized_model(test_prompt).images[0]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6e5b3b21",
"metadata": {
"id": "6e5b3b21"
},
"source": [
"Let's run the prediction 10 times to calculate the average response time of the original model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09170c78",
"metadata": {},
"outputs": [],
"source": [
"if device == \"cuda\":\n",
" pipe = StableDiffusionPipeline.from_pretrained(model_id, revision='fp16', torch_dtype=torch.float16)\n",
" # pipe.enable_attention_slicing() # Uncomment for stable-diffusion-2.1 on gpus with 16GB of memory like V100-16GB and T4\n",
"else:\n",
" pipe = StableDiffusionPipeline.from_pretrained(model_id)\n",
"\n",
"pipe.to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3bc5c98",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "d3bc5c98",
"outputId": "e0596cf2-fa96-4c50-c012-f5cdab82e681"
},
"outputs": [],
"source": [
"import time\n",
"\n",
"times = []\n",
"\n",
"# Warmup for 2 iterations\n",
"for _ in range(2):\n",
" with torch.no_grad():\n",
" final_out = pipe(test_prompt).images[0]\n",
"\n",
"# Benchmark\n",
"for _ in range(8):\n",
" st = time.time()\n",
" with torch.no_grad():\n",
" final_out = pipe(test_prompt).images[0]\n",
" times.append(time.time()-st)\n",
"original_model_time = sum(times)/len(times)\n",
"print(f\"Average response time for original Stable Diffusion 1.4: {original_model_time} s\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3db0a7a1",
"metadata": {
"id": "3db0a7a1"
},
"source": [
"Let's run the prediction 10 times to calculate the average response time of the optimized model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3e83997",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a3e83997",
"outputId": "7a416b14-f170-4df9-d416-026f06a7d980"
},
"outputs": [],
"source": [
"times = []\n",
"\n",
"for _ in range(2):\n",
" with torch.no_grad():\n",
" final_out = optimized_model(test_prompt).images[0]\n",
"\n",
"# Benchmark\n",
"for _ in range(8):\n",
" st = time.time()\n",
" with torch.no_grad():\n",
" final_out = optimized_model(test_prompt).images[0]\n",
" times.append(time.time()-st)\n",
"optimized_model_time = sum(times)/len(times)\n",
"print(f\"Average response time for optimized Stable Diffusion 1.4: {optimized_model_time} s\")"
]
},
{
"cell_type": "markdown",
"id": "ceb60d8c",
"metadata": {
"id": "ceb60d8c"
},
"source": [
"## Save and reload the optimized model"
]
},
{
"cell_type": "markdown",
"id": "d9eda1a0",
"metadata": {},
"source": [
"We can easily save to disk the optimized model with the following line:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62b6fcbf",
"metadata": {},
"outputs": [],
"source": [
"save_model(optimized_model, \"model_save_path\")"
]
},
{
"cell_type": "markdown",
"id": "3c968d51",
"metadata": {},
"source": [
"We can then load again the model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1340c49",
"metadata": {},
"outputs": [],
"source": [
"optimized_model = load_model(\"model_save_path\", pipe=pipe)"
]
},
{
"cell_type": "markdown",
"id": "cb234e5e",
"metadata": {
"id": "cb234e5e"
},
"source": [
"Great! Was it easy? How are the results? Do you have any comments?\n",
"Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n",
"\n",
"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
"\n",
"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
]
},
{
"cell_type": "markdown",
"id": "b77ff2ac",
"metadata": {
"id": "b77ff2ac"
},
"source": [
"
"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"provenance": []
},
"gpuClass": "premium",
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: optimization/speedster/notebooks/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"id": "ef331be9",
"metadata": {
"id": "ef331be9"
},
"source": [
""
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f260653a",
"metadata": {
"id": "f260653a"
},
"source": [
"# Accelerate Hugging Face T5 with Speedster\n"
]
},
{
"cell_type": "markdown",
"id": "8bdf3af5",
"metadata": {
"id": "8bdf3af5"
},
"source": [
"Hi and welcome 👋\n",
"\n",
"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n",
"\n",
"With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate them up to 20-30 times by setting a self-defined amount of accuracy/precision that you are willing to trade off to get even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half accuracy, and so on (option B).\n",
"\n",
"Let's jump to the code."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d527d63b",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "d527d63b",
"outputId": "57626bac-e458-487f-f4fa-a459627af296"
},
"outputs": [],
"source": [
"%env CUDA_VISIBLE_DEVICES=0"
]
},
{
"cell_type": "markdown",
"id": "cXXh1ifQ13mH",
"metadata": {
"id": "cXXh1ifQ13mH"
},
"source": [
"# Installation"
]
},
{
"cell_type": "markdown",
"id": "48aljCHu14-H",
"metadata": {
"id": "48aljCHu14-H"
},
"source": [
"Install Speedster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "QFQh3BVr1-GO",
"metadata": {
"id": "QFQh3BVr1-GO"
},
"outputs": [],
"source": [
"!pip install speedster"
]
},
{
"cell_type": "markdown",
"id": "8a7a86b3",
"metadata": {
"id": "8a7a86b3"
},
"source": [
"Install deep learning compilers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cffbfa32",
"metadata": {
"id": "cffbfa32"
},
"outputs": [],
"source": [
"!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
]
},
{
"cell_type": "markdown",
"id": "73072506",
"metadata": {
"id": "73072506"
},
"source": [
"## Model and Dataset setup"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "cf24c4c4",
"metadata": {},
"source": [
"Add tensorrt installation path to the LD_LIBRARY_PATH env variable, in order to activate TensorrtExecutionProvider for ONNXRuntime"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cf8ff74",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n",
"\n",
"if os.path.exists(tensorrt_path):\n",
" os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
"else:\n",
" print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
]
},
{
"cell_type": "markdown",
"id": "e4d55115",
"metadata": {
"id": "e4d55115"
},
"source": [
"We chose T5-efficient-base as the pre-trained model that we want to optimize. Let's download both the pre-trained model and the tokenizer from the Hugging Face model hub."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "NOgOmfdY_dav",
"metadata": {
"id": "NOgOmfdY_dav"
},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"import torch\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"model_name = \"google/t5-efficient-base\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torchscript=True).to(device)\n",
"\n",
"# set the model to eval mode\n",
"_ = model.eval()"
]
},
{
"cell_type": "markdown",
"id": "11aa0739",
"metadata": {
"id": "11aa0739"
},
"source": [
"Let's create an example dataset with some random sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ghGcDNFtKt3X",
"metadata": {
"id": "ghGcDNFtKt3X"
},
"outputs": [],
"source": [
"texts = [\n",
" \"\"\"BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts.\"\"\",\n",
" \"\"\"GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was trained to guess the next word in sentences.\"\"\",\n",
" \"\"\"With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.\"\"\",\n",
" \"\"\"LayoutLMv3 is a pre-trained multimodal Transformer for Document AI with unified text and image masking. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model. For example, LayoutLMv3 can be fine-tuned for both text-centric tasks, including form understanding, receipt understanding, and document visual question answering, and image-centric tasks such as document image classification and document layout analysis.\"\"\",\n",
" \"\"\"XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking.\"\"\"\n",
"]\n",
"texts = texts*20"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a09f9424",
"metadata": {
"id": "a09f9424"
},
"outputs": [],
"source": [
"encoded_inputs = [tokenizer(text, padding=\"longest\", return_tensors=\"pt\") for text in texts]"
]
},
{
"cell_type": "markdown",
"id": "17040431",
"metadata": {
"id": "17040431"
},
"source": [
"## Speed up inference with Speedster: no metric drop"
]
},
{
"cell_type": "markdown",
"id": "44ddc21d",
"metadata": {
"id": "44ddc21d"
},
"source": [
"It's now time to improve the performance a bit in terms of speed. Let's use `Speedster`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9d934f6",
"metadata": {
"id": "f9d934f6"
},
"outputs": [],
"source": [
"from speedster import optimize_model, save_model, load_model"
]
},
{
"cell_type": "markdown",
"id": "76248033",
"metadata": {
"id": "76248033"
},
"source": [
"Usually Speedster is very simple and straightforward! Just use the `optimize_model` function and provide as input the model, some input data as example and the optimization time mode. But for this type of model, we need to do some extra steps because the current version of Speedster doesn't have direct support for Encoder-Decoder models. This type of model has both an Encoder and a Decoder. For example, BERT models are Encoder models and GPT models are Decoder models, but T5 has both."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "i7sgUWjePN9i",
"metadata": {
"id": "i7sgUWjePN9i"
},
"outputs": [],
"source": [
"# First, we get the encoder and decoder from the model\n",
"encoder = model.get_encoder()\n",
"decoder = model.get_decoder()"
]
},
{
"cell_type": "markdown",
"id": "O7xaI1drQOQ0",
"metadata": {
"id": "O7xaI1drQOQ0"
},
"source": [
"Optionally a dynamic_info dictionary can be also provided, in order to support inputs with dynamic shape."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "nTUPdDchQLc1",
"metadata": {
"id": "nTUPdDchQLc1"
},
"outputs": [],
"source": [
"dynamic_info = {\n",
" \"inputs\": [\n",
" {0: 'batch', 1: 'num_tokens'},\n",
" {0: 'batch', 1: 'num_tokens'}\n",
" ],\n",
" \"outputs\": [\n",
" {0: 'batch', 1: 'num_tokens'},\n",
" ]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "zPC_EDwEJIM0",
"metadata": {
"id": "zPC_EDwEJIM0"
},
"outputs": [],
"source": [
"# Create the optimized encoder model separately\n",
"optimized_encoder_model = optimize_model(\n",
" model=encoder,\n",
" input_data=encoded_inputs,\n",
" optimization_time=\"constrained\",\n",
" ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n",
" dynamic_info=dynamic_info,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7Oa68a87Qjre",
"metadata": {
"id": "7Oa68a87Qjre"
},
"outputs": [],
"source": [
"# Create the optimized decoder model separately\n",
"optimized_decoder_model = optimize_model(\n",
" model=decoder,\n",
" input_data=encoded_inputs,\n",
" optimization_time=\"constrained\",\n",
" ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n",
" dynamic_info=dynamic_info,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98c6ab09",
"metadata": {
"id": "98c6ab09"
},
"outputs": [],
"source": [
"import time\n",
"\n",
"# Move inputs to gpu if available\n",
"encoded_inputs = [tokenizer(text, padding=\"longest\", return_tensors=\"pt\").to(device) for text in texts]"
]
},
{
"cell_type": "markdown",
"id": "6e5b3b21",
"metadata": {
"id": "6e5b3b21"
},
"source": [
"Let's run the prediction 100 times to calculate the average response time of the original model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3bc5c98",
"metadata": {
"id": "d3bc5c98"
},
"outputs": [],
"source": [
"times = []\n",
"# Warmup for 30 iterations\n",
"for encoded_input in encoded_inputs[:30]:\n",
" with torch.no_grad():\n",
" encoder_out = encoder(**encoded_input)\n",
" decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
"\n",
"# Benchmark\n",
"for encoded_input in encoded_inputs:\n",
" st = time.time()\n",
" with torch.no_grad():\n",
" encoder_out = encoder(**encoded_input)\n",
" decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
" times.append(time.time()-st)\n",
"original_model_time = sum(times)/len(times)*1000\n",
"print(f\"Average response time for original T5: {original_model_time} ms\")"
]
},
{
"cell_type": "markdown",
"id": "GU0SwykMTVAj",
"metadata": {
"id": "GU0SwykMTVAj"
},
"source": [
"In real-world use cases, we would pass the decoder output to `model.lm_head` to get the actual prediction, but here we are only testing the performance improvements, so we skip that step."
]
},
{
"cell_type": "markdown",
"id": "12c2df98",
"metadata": {
"id": "12c2df98"
},
"source": [
"Let's see the output of the original model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4892a905",
"metadata": {
"id": "4892a905"
},
"outputs": [],
"source": [
"encoder(**encoded_input)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "gx0naPVuSVrm",
"metadata": {
"id": "gx0naPVuSVrm"
},
"outputs": [],
"source": [
"decoder(**encoded_input,encoder_hidden_states=encoder_out[0])"
]
},
{
"cell_type": "markdown",
"id": "3db0a7a1",
"metadata": {
"id": "3db0a7a1"
},
"source": [
"Let's run the prediction 100 times to calculate the average response time of the optimized model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3e83997",
"metadata": {
"id": "a3e83997"
},
"outputs": [],
"source": [
"times = []\n",
"\n",
"# Warmup for 30 iterations\n",
"for encoded_input in encoded_inputs[:30]:\n",
" with torch.no_grad():\n",
" encoder_out = optimized_encoder_model(**encoded_input)\n",
" decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
"\n",
"# Benchmark\n",
"for encoded_input in encoded_inputs:\n",
" st = time.time()\n",
" with torch.no_grad():\n",
" encoder_out = optimized_encoder_model(**encoded_input)\n",
" decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
" times.append(time.time()-st)\n",
"optimized_model_time = sum(times)/len(times)*1000\n",
"print(f\"Average response time for optimized T5 (no metric drop): {optimized_model_time} ms\")"
]
},
{
"cell_type": "markdown",
"id": "0d884d61",
"metadata": {
"id": "0d884d61"
},
"source": [
"Let's see the output of the optimized_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75611b2e",
"metadata": {
"id": "75611b2e"
},
"outputs": [],
"source": [
"optimized_encoder_model(**encoded_input)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cpieoDfwS-V7",
"metadata": {
"id": "cpieoDfwS-V7"
},
"outputs": [],
"source": [
"optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])"
]
},
{
"cell_type": "markdown",
"id": "ceb60d8c",
"metadata": {
"id": "ceb60d8c"
},
"source": [
"## Speed up inference with Speedster: metric drop"
]
},
{
"cell_type": "markdown",
"id": "7b1950d5",
"metadata": {
"id": "7b1950d5"
},
"source": [
"This time we will use the `metric_drop_ths` argument to accept a small drop in terms of precision, in order to enable quantization and obtain a higher speedup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "VwOLWZSZUM89",
"metadata": {
"id": "VwOLWZSZUM89"
},
"outputs": [],
"source": [
"optimized_encoder_model = optimize_model(\n",
" model=encoder,\n",
" input_data=encoded_inputs,\n",
" optimization_time=\"constrained\",\n",
" ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n",
" dynamic_info=dynamic_info,\n",
" metric_drop_ths=0.1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "FIKn4V3dUIZB",
"metadata": {
"id": "FIKn4V3dUIZB"
},
"outputs": [],
"source": [
"optimized_decoder_model = optimize_model(\n",
" model=decoder,\n",
" input_data=encoded_inputs,\n",
" optimization_time=\"constrained\",\n",
" ignore_compilers=[\"tensor_rt\", \"tvm\"], # TensorRT does not work for this model\n",
" dynamic_info=dynamic_info,\n",
" metric_drop_ths=0.1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0fbfe6fa",
"metadata": {
"id": "0fbfe6fa"
},
"outputs": [],
"source": [
"times = []\n",
"# Warmup for 30 iterations\n",
"for encoded_input in encoded_inputs[:30]:\n",
" with torch.no_grad():\n",
" encoder_out = encoder(**encoded_input)\n",
" decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
"\n",
"# Benchmark\n",
"for encoded_input in encoded_inputs:\n",
" st = time.time()\n",
" with torch.no_grad():\n",
" encoder_out = encoder(**encoded_input)\n",
" decoder_out = decoder(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
" times.append(time.time()-st)\n",
"original_model_time = sum(times)/len(times)*1000\n",
"print(f\"Average response time for original T5: {original_model_time} ms\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f89b7e6d",
"metadata": {
"id": "f89b7e6d"
},
"outputs": [],
"source": [
"encoder(**encoded_input)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "oI1zjIBSUoIU",
"metadata": {
"id": "oI1zjIBSUoIU"
},
"outputs": [],
"source": [
"decoder(**encoded_input,encoder_hidden_states=encoder_out[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10d17b5c",
"metadata": {
"id": "10d17b5c"
},
"outputs": [],
"source": [
"times = []\n",
"\n",
"# Warmup for 30 iterations\n",
"for encoded_input in encoded_inputs[:30]:\n",
" with torch.no_grad():\n",
" encoder_out = optimized_encoder_model(**encoded_input)\n",
" decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
"\n",
"# Benchmark\n",
"for encoded_input in encoded_inputs:\n",
" st = time.time()\n",
" with torch.no_grad():\n",
" encoder_out = optimized_encoder_model(**encoded_input)\n",
" decoder_out = optimized_decoder_model(**encoded_input,encoder_hidden_states=encoder_out[0])\n",
" times.append(time.time()-st)\n",
"optimized_model_time = sum(times)/len(times)*1000\n",
"print(f\"Average response time for optimized T5 (metric drop): {optimized_model_time} ms\")"
]
},
{
"cell_type": "markdown",
"id": "4XFMC1S6zXTU",
"metadata": {
"id": "4XFMC1S6zXTU"
},
"source": [
"## Save and reload the optimized model"
]
},
{
"cell_type": "markdown",
"id": "OXHVr3EAzbT5",
"metadata": {
"id": "OXHVr3EAzbT5"
},
"source": [
"We can easily save to disk the optimized model with the following line:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3M565P-zzaFB",
"metadata": {
"id": "3M565P-zzaFB"
},
"outputs": [],
"source": [
"save_model(optimized_encoder_model, \"encoder_model_save_path\")\n",
"save_model(optimized_decoder_model, \"decoder_model_save_path\")"
]
},
{
"cell_type": "markdown",
"id": "ee8CS_Evzg1j",
"metadata": {
"id": "ee8CS_Evzg1j"
},
"source": [
"We can then load again the model:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "zOQ88SY_zg-A",
"metadata": {
"id": "zOQ88SY_zg-A"
},
"outputs": [],
"source": [
"optimized_encoder_model = load_model(\"encoder_model_save_path\")\n",
"optimized_decoder_model = load_model(\"decoder_model_save_path\")"
]
},
{
"cell_type": "markdown",
"id": "cb234e5e",
"metadata": {
"id": "cb234e5e"
},
"source": [
"Great! Was it easy? How are the results? Do you have any comments?\n",
"Share your optimization results and thoughts with our community on Discord, where we chat about Speedster and AI acceleration.\n",
"\n",
"Note that the acceleration of Speedster depends very much on the hardware configuration and your AI model. Given the same input model, Speedster can accelerate it by 10 times on some machines and perform poorly on others.\n",
"\n",
"If you want to learn more about how Speedster works, look at other tutorials and performance benchmarks, check out the links below or write to us on Discord."
]
},
{
"cell_type": "markdown",
"id": "b77ff2ac",
"metadata": {
"id": "b77ff2ac"
},
"source": [
"
"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: optimization/speedster/notebooks/onnx/Readme.md
================================================
# **ONNX Optimization**
This section contains all the available notebooks that show how to leverage Speedster to optimize ONNX models.
## Notebooks:
| Notebook | Description | |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate ONNX Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model in ONNX format. | [](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/onnx/Accelerate_ONNX_ResNet50_with_Speedster.ipynb) |
## ONNX API quick view:
```python
import numpy as np
from speedster import optimize_model
# Load a resnet as example
# Model was downloaded from here:
# https://github.com/onnx/models/tree/main/vision/classification/resnet
model = "resnet50-v1-12.onnx"
# Provide an input data for the model
input_data = [((np.random.randn(1, 3, 224, 224).astype(np.float32), ), np.array([0]))]
# Run Speedster optimization
optimized_model = optimize_model(
model, input_data=input_data, optimization_time="unconstrained"
)
# Try the optimized model
x = np.random.randn(1, 3, 224, 224).astype(np.float32)
## Warmup the model
## This step is necessary before the latency computation of the
## optimized model in order to get reliable results.
# for _ in range(10):
# optimized_model(x)
res_optimized = optimized_model(x)
```
================================================
FILE: optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "p5b0PzpW1xJq"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Accelerate PyTorch ResNet50 with Speedster"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T9xuwZEHzN2K"
},
"source": [
"Hi and welcome 👋\n",
"\n",
"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n",
"\n",
"We will\n",
"1. Install Speedster and the deep learning compilers used by the library.\n",
"2. Speed up a PyTorch ResNet50 without any loss of accuracy.\n",
"3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n",
"\n",
"Let's jump to the code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_0ZRCXCR9693",
"outputId": "19096862-5c5c-4f9f-b2ad-3ce084ccf213"
},
"outputs": [],
"source": [
"%env CUDA_VISIBLE_DEVICES=0"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HbFy2Aykz2Qo"
},
"source": [
"### Installation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZPJHVZ74d8r2"
},
"outputs": [],
"source": [
"!pip install speedster"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "b0CLgQqxyrQi"
},
"source": [
"Let's now install the deep learning compilers used by Speedster that are not yet installed on the hardware.\n",
"\n",
"The installation of the compilers may take a few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GvK9mZSjeLU5"
},
"outputs": [],
"source": [
"!python -m nebullvm.installers.auto_installer --frameworks torch --compilers all"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "N5RXHoZl0p3p"
},
"source": [
"## Optimization example with Pytorch"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-Ju-VcRH01Mw"
},
"source": [
"In the following example we will try to optimize a standard resnet50 loaded directly from torchvision.\n",
"\n",
"Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n",
"\n",
"Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then accelerate it further under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "skxEuemn171G"
},
"source": [
"### Scenario 1 - No accuracy drop"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wVRLXrDi2VaG"
},
"source": [
"First we load the model and optimize it using the Speedster API:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2RbgGruAeQcf"
},
"outputs": [],
"source": [
"import torch\n",
"import torchvision.models as models\n",
"from speedster import optimize_model, save_model, load_model\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"# Load a resnet as example\n",
"model = models.resnet50().to(device)\n",
"\n",
"# Provide an input data for the model \n",
"input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]\n",
"\n",
"# Run Speedster optimization\n",
"optimized_model = optimize_model(\n",
" model, input_data=input_data, optimization_time=\"unconstrained\"\n",
")\n",
"\n",
"# Try the optimized model\n",
"x = torch.randn(1, 3, 256, 256).to(device)\n",
"model.eval()\n",
"res_optimized = optimized_model(x)\n",
"res_original = model(x)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JMiuufyu2gD3"
},
"source": [
"We can print the type of the optimized model to see which compiler was faster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ifuLyQsM9697",
"outputId": "c1534e0d-e5bb-4d44-91e9-652593751d52"
},
"outputs": [],
"source": [
"optimized_model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4WxcxrUC9698"
},
"source": [
"In our case, the optimized model type was PytorchTensorRTInferenceLearner, so this means that Pytorch-TensorRT was the faster compiler."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iwHKfT349698"
},
"source": [
"After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-IMJpfcb9698"
},
"source": [
"First of all, let's print the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uI8Kd1Z49698",
"outputId": "832d3053-d6c8-4cc2-9b48-a59dfaa45d33"
},
"outputs": [],
"source": [
"res_original"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0I_zSpv29698",
"outputId": "a0ba566d-6730-4954-8dd0-eb47b549cbf1"
},
"outputs": [],
"source": [
"res_optimized"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hBEtrYOd9699"
},
"source": [
"Then, let's compare the performances:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GqxiCAbpfcwV"
},
"outputs": [],
"source": [
"from nebullvm.tools.benchmark import benchmark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_0b0Bzwq-czD"
},
"outputs": [],
"source": [
"# Set the model to eval mode and move it to the available device\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"model.eval()\n",
"model.to(device)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UqxzStjD2v0r"
},
"source": [
"Here we compute the average throughput for the baseline model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dkt67_Orwlv4",
"outputId": "fc10c03c-c3ad-44d4-9fd6-c9b6dc0256c7"
},
"outputs": [],
"source": [
"benchmark(model, input_data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AgOv-GqQ3KIC"
},
"source": [
"Here we compute the average throughput for the optimized model:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4PodpaDVfwzT",
"outputId": "27a42560-93a2-4c19-e68d-360093fe914c"
},
"outputs": [],
"source": [
"benchmark(optimized_model, input_data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tBeRKNTI3iyK"
},
"source": [
"### Scenario 2 - Accuracy drop"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "w3wutIzfAMe_"
},
"source": [
"In this scenario, we set a max threshold for the accuracy drop to 2%"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fO1nGqpj3p7z"
},
"outputs": [],
"source": [
"import torch\n",
"import torchvision.models as models\n",
"from speedster import optimize_model\n",
"\n",
"# Load a resnet as example\n",
"model = models.resnet50().to(device)\n",
"\n",
"# Provide 100 random input data for the model \n",
"input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0])) for _ in range(100)]\n",
"\n",
"# Run Speedster optimization\n",
"optimized_model = optimize_model(\n",
" model, input_data=input_data, optimization_time=\"unconstrained\", metric=\"accuracy\", metric_drop_ths=0.02\n",
")\n",
"\n",
"# Try the optimized model\n",
"x = torch.randn(1, 3, 256, 256).to(device)\n",
"res = optimized_model(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qFKHaHM6-GKm"
},
"outputs": [],
"source": [
"# Set the model to eval mode and move it to the available device\n",
"\n",
"model.eval()\n",
"model.to(device)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yfW9kmHX-pGi"
},
"source": [
"Here we compute the average throughput for the baseline model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0MMrL3959hli",
"outputId": "2e8d27ec-a9f3-4f70-8c75-a0df974f2653"
},
"outputs": [],
"source": [
"benchmark(model, input_data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "i3GqasOM-u8f"
},
"source": [
"Here we compute the average throughput for the optimized model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_IbAW0KA4Fm5",
"outputId": "48d83c89-5687-42aa-a3b8-6989bcb66aa6"
},
"outputs": [],
"source": [
"benchmark(optimized_model, input_data)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ceb60d8c",
"metadata": {
"id": "ceb60d8c"
},
"source": [
"## Save and reload the optimized model"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d9eda1a0",
"metadata": {},
"source": [
"We can easily save to disk the optimized model with the following line:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62b6fcbf",
"metadata": {},
"outputs": [],
"source": [
"save_model(optimized_model, \"model_save_path\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3c968d51",
"metadata": {},
"source": [
"We can then load again the model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1340c49",
"metadata": {},
"outputs": [],
"source": [
"optimized_model = load_model(\"model_save_path\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b77ff2ac",
"metadata": {
"id": "b77ff2ac"
},
"source": [
"
"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: optimization/speedster/notebooks/pytorch/Readme.md
================================================
# **PyTorch Optimization**
This section contains all the available notebooks that show how to leverage Speedster to optimize PyTorch models.
## Notebooks:
| Notebook | Description | |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate Torchvision Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from Torchvision. | [Open in Colab](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ResNet50_with_Speedster.ipynb) |
| [Accelerate Fast AI Resnet34](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet34 model loaded from Fast AI. | [Open in Colab](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_fast_ai_Resnet34_with_Speedster.ipynb) |
| [Accelerate PyTorch ViT](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb) | Show how to optimize with Speedster a PyTorch ViT model. | [Open in Colab](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_ViT_with_Speedster.ipynb) |
| [Accelerate Ultralytics YOLOv5](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb) | Show how to optimize with Speedster a YOLOv5 model from Ultralytics. | [Open in Colab](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb) |
| [Accelerate Ultralytics YOLOv8](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb) | Show how to optimize with Speedster a YOLOv8 model from Ultralytics. | [Open in Colab](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb) |
## PyTorch API quick view:
``` python
import torch
import torchvision.models as models
from speedster import optimize_model
# Load a resnet as example
model = models.resnet50()
# Provide an input data for the model
input_data = [((torch.randn(1, 3, 256, 256), ), torch.tensor([0]))]
# Run Speedster optimization
optimized_model = optimize_model(
model, input_data=input_data, optimization_time="unconstrained"
)
# Try the optimized model
x = torch.randn(1, 3, 256, 256)
## Warmup the model
## This step is necessary before the latency computation of the
## optimized model in order to get reliable results.
# for _ in range(10):
# optimized_model(x)
res = optimized_model(x)
```
================================================
FILE: optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "p5b0PzpW1xJq"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-KdJPm7M05Jc"
},
"source": [
"# Accelerate Tensorflow ResNet50 with Speedster"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T9xuwZEHzN2K"
},
"source": [
"Hi and welcome 👋\n",
"\n",
"In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library `nebullvm`.\n",
"\n",
"We will\n",
"1. Install Speedster and the deep learning compilers used by the library.\n",
"2. Speed up a Tensorflow ResNet50 without any loss of accuracy.\n",
"3. Achieve faster acceleration on the same model by applying more aggressive optimization techniques (e.g. pruning, quantization) under the constraint of sacrificing up to 2% accuracy.\n",
"\n",
"Let's jump to the code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KIeIvBPVLQuq"
},
"outputs": [],
"source": [
"%env CUDA_VISIBLE_DEVICES=0"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HbFy2Aykz2Qo"
},
"source": [
"### Installation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "48aljCHu14-H"
},
"source": [
"Install Speedster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QFQh3BVr1-GO"
},
"outputs": [],
"source": [
"!pip install speedster"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8a7a86b3"
},
"source": [
"Install deep learning compilers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cffbfa32"
},
"outputs": [],
"source": [
"!python -m nebullvm.installers.auto_installer --frameworks tensorflow --compilers all"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "N5RXHoZl0p3p"
},
"source": [
"## Optimization example with Tensorflow"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-Ju-VcRH01Mw"
},
"source": [
"In the following example we will try to optimize a standard resnet50 loaded directly from keras.\n",
"\n",
"Speedster can accelerate neural networks without loss of a user-defined precision metric, e.g. accuracy, or can achieve faster acceleration by applying more aggressive optimization techniques, such as pruning and quantization, that may have a negative impact on the selected metric. The maximum threshold value for accuracy loss is determined by the metric_drop_ths parameter. Read more in the [docs](https://docs.nebuly.com/modules/speedster/getting-started).\n",
"\n",
"Let's first test the optimization without accuracy loss (metric_drop_ths=0, default value), and then accelerate it further under the constraint of losing up to 2% of accuracy (metric = \"accuracy\", metric_drop_ths = 0.02)."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "skxEuemn171G"
},
"source": [
"### Scenario 1 - No accuracy drop"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wVRLXrDi2VaG"
},
"source": [
"First we load the model and optimize it using the Speedster API:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2RbgGruAeQcf"
},
"outputs": [],
"source": [
"# If you encountered any error, run the cell again\n",
"import tensorflow as tf\n",
"from tensorflow.keras.applications.resnet50 import ResNet50\n",
"from speedster import optimize_model, save_model, load_model\n",
"\n",
"# Load a resnet as example\n",
"model = ResNet50()\n",
"\n",
"# Provide an input data for the model \n",
"input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]\n",
"\n",
"# Run Speedster optimization\n",
"optimized_model = optimize_model(\n",
" model, input_data=input_data, optimization_time=\"unconstrained\"\n",
")\n",
"\n",
"# Try the optimized model\n",
"x = tf.random.normal([1, 224, 224, 3])\n",
"res_original = model.predict(x)\n",
"res_optimized = optimized_model.predict(x)[0]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NGrk6_jwRubP"
},
"source": [
"We can print the type of the optimized model to see which compiler was faster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cVMn6erJLQuu"
},
"outputs": [],
"source": [
"optimized_model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aT0BhdIKR7gY"
},
"source": [
"In our case, the optimized model type was TensorflowNvidiaInferenceLearner, so this means that TensorRT was the fastest compiler."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JMiuufyu2gD3"
},
"source": [
"After the optimization step, we can compare the optimized model with the baseline one in order to verify that the output is the same and to measure the speed improvement"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Swpr-Wi5Si9a"
},
"source": [
"First of all, let's print the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MjGtKkeZSOc7"
},
"outputs": [],
"source": [
"res_original"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dhe94Tk3SSfn"
},
"outputs": [],
"source": [
"res_optimized"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UqxzStjD2v0r"
},
"source": [
"Then, let's compute the average latency of the baseline model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ELyTjg6_S4Us"
},
"outputs": [],
"source": [
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dkt67_Orwlv4"
},
"outputs": [],
"source": [
"num_iters = 100\n",
"\n",
"# Warmup\n",
"for i in range(10):\n",
" model.predict(x)\n",
"\n",
"start = time.time()\n",
"for i in range(num_iters):\n",
" model.predict(x)\n",
"stop = time.time()\n",
"\n",
"print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AgOv-GqQ3KIC"
},
"source": [
"Finally we compute the average latency for the optimized model:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4PodpaDVfwzT"
},
"outputs": [],
"source": [
"# Warmup\n",
"for i in range(10):\n",
" optimized_model.predict(x)\n",
"\n",
"start = time.time()\n",
"for i in range(num_iters):\n",
" optimized_model.predict(x)\n",
"stop = time.time()\n",
"\n",
"print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tBeRKNTI3iyK"
},
"source": [
"### Scenario 2 - Accuracy drop"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "w3wutIzfAMe_"
},
"source": [
"In this scenario, we set a max threshold for the accuracy drop to 2%"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fO1nGqpj3p7z"
},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.applications.resnet50 import ResNet50\n",
"from speedster import optimize_model\n",
"\n",
"# Load a resnet as example\n",
"model = ResNet50()\n",
"\n",
"# Provide an input data for the model \n",
"# Note that in this case we should provide the model at least 100 data samples\n",
"input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0])) for i in range(100)]\n",
"\n",
"# Run Speedster optimization\n",
"optimized_model = optimize_model(\n",
" model, input_data=input_data, optimization_time=\"unconstrained\", metric = \"accuracy\", metric_drop_ths = 0.02\n",
")\n",
"\n",
"# Try the optimized model\n",
"x = tf.random.normal([1, 224, 224, 3])\n",
"res_original = model.predict(x)\n",
"res_optimized = optimized_model.predict(x)[0]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yfW9kmHX-pGi"
},
"source": [
"Here we compute the average latency for the baseline model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0MMrL3959hli"
},
"outputs": [],
"source": [
"num_iters = 100\n",
"\n",
"# Warmup\n",
"for i in range(10):\n",
" model.predict(x)\n",
"\n",
"start = time.time()\n",
"for i in range(num_iters):\n",
" model.predict(x)\n",
"stop = time.time()\n",
"\n",
"print(\"Average latency original model: {:.4f} seconds\".format((stop - start) / num_iters))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "i3GqasOM-u8f"
},
"source": [
"Here we compute the average latency for the optimized model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_IbAW0KA4Fm5"
},
"outputs": [],
"source": [
"# Warmup\n",
"for i in range(10):\n",
" optimized_model.predict(x)\n",
"\n",
"start = time.time()\n",
"for i in range(num_iters):\n",
" optimized_model.predict(x)\n",
"stop = time.time()\n",
"\n",
"print(\"Average latency optimized model: {:.4f} seconds\".format((stop - start) / num_iters))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4XFMC1S6zXTU"
},
"source": [
"## Save and reload the optimized model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OXHVr3EAzbT5"
},
"source": [
"We can easily save to disk the optimized model with the following line:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3M565P-zzaFB"
},
"outputs": [],
"source": [
"save_model(optimized_model, \"model_save_path\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ee8CS_Evzg1j"
},
"source": [
"We can then load again the model:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zOQ88SY_zg-A"
},
"outputs": [],
"source": [
"optimized_model = load_model(\"model_save_path\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b77ff2ac",
"metadata": {
"id": "b77ff2ac"
},
"source": [
"
"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: optimization/speedster/notebooks/tensorflow/Readme.md
================================================
# **Tensorflow Optimization**
This section contains all the available notebooks that show how to leverage Speedster to optimize Tensorflow models.
## Notebooks:
| Notebook | Description | |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Accelerate Keras Resnet50](https://github.com/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) | Show how to optimize with Speedster a Resnet50 model loaded from keras. | [Open in Colab](https://colab.research.google.com/github/nebuly-ai/nebuly/blob/main/optimization/speedster/notebooks/tensorflow/Accelerate_Tensorflow_ResNet50_with_Speedster.ipynb) |
## Tensorflow API quick view:
``` python
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from speedster import optimize_model
# Load a resnet as example
model = ResNet50()
# Provide an input data for the model
input_data = [((tf.random.normal([1, 224, 224, 3]),), tf.constant([0]))]
# Run Speedster optimization
optimized_model = optimize_model(
model, input_data=input_data, optimization_time="unconstrained"
)
# Try the optimized model
x = tf.random.normal([1, 224, 224, 3])
res_original = model.predict(x)
## Warmup the model
## This step is necessary before the latency computation of the
## optimized model in order to get reliable results.
# for _ in range(10):
# optimized_model.predict(x)
res_optimized = optimized_model.predict(x)[0]
```
================================================
FILE: optimization/speedster/requirements.txt
================================================
nebullvm>=0.10.0
tabulate>=0.8.0
================================================
FILE: optimization/speedster/setup.py
================================================
from pathlib import Path
from setuptools import setup, find_packages
# Runtime dependencies installed together with the package.
# The nebullvm floor matches the one declared in requirements.txt
# (nebullvm>=0.10.0) so both install paths resolve the same minimum version.
REQUIREMENTS = [
    "nebullvm>=0.10.0",
    "tabulate>=0.8.0",
]

# The README next to this file is used as the package's long description.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf8")

setup(
    name="speedster",
    version="0.4.0",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
    long_description=long_description,
    include_package_data=True,
    long_description_content_type="text/markdown",
)
================================================
FILE: optimization/speedster/speedster/__init__.py
================================================
from speedster.api.functions import optimize_model # noqa: F401
from nebullvm.operations.inference_learners.utils import ( # noqa: F401
load_model,
save_model,
)
================================================
FILE: optimization/speedster/speedster/api/__init__.py
================================================
================================================
FILE: optimization/speedster/speedster/api/functions.py
================================================
import logging
from typing import (
Union,
Iterable,
Sequence,
Callable,
Dict,
List,
Optional,
)
from nebullvm.config import DEFAULT_METRIC_DROP_THS
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from nebullvm.tools.logger import debug_mode_enabled, LoggingContext
from speedster.root_op import SpeedsterRootOp
from nebullvm.tools.utils import check_device
def optimize_model(
    model: Union[torch.nn.Module, tf.Module, str],
    input_data: Union[Iterable, Sequence],
    metric_drop_ths: float = DEFAULT_METRIC_DROP_THS,
    metric: Optional[Union[str, Callable]] = None,
    optimization_time: str = "constrained",
    dynamic_info: Optional[Dict] = None,
    config_file: Optional[str] = None,
    ignore_compilers: Optional[List[str]] = None,
    ignore_compressors: Optional[List[str]] = None,
    store_latencies: bool = False,
    device: Optional[str] = None,
    **kwargs,
):
    """Optimize the input model regardless of the framework it was used for
    implementing it. The optimized model given as output will share with the
    input one the same API, i.e. the optimized model will have the same
    interface as the original one.

    Args:
        model (Union[torch.Module, tf.Module, str]): The input model. It can be
            a torch or tensorflow model or a path to an onnx saved model.
        input_data (Iterable or Sequence): Input data to be used for
            optimizing the model. Note that if 'unconstrained' is selected as
            `optimization_time`, it would be beneficial to provide at least 100
            data samples in order to use all the techniques supported by
            Nebullvm. The data can be given either as a sequence (data can be
            accessed by "element", e.g. `data[i]`) or as an iterable (data
            needs to be accessed with a loop, e.g. `for x in data`). PyTorch,
            TensorFlow and Onnx respectively accept input tensors in
            `torch.Tensor`, `tf.Tensor` and `np.ndarray` formats. Note that
            each input sample must be a tuple containing a tuple as first
            element, the `inputs`, and the `label` as second element. The
            `inputs` needs to be passed as tuple even if a single input is
            needed by the model (in this case the `inputs` tuple will contain
            just an element). HuggingFace models can take as data samples both
            dictionaries or strings. Strings will then be converted into data
            samples using the HuggingFace tokenizer, which must be given as
            input when just a list of strings is provided as input_data
            (tokenizers can be passed as extra arguments of this function
            using the keyword `tokenizer`).
        metric_drop_ths (float, optional): Maximum reduction in the
            selected metric accepted. No model with a higher error will be
            accepted, i.e. all optimized models having a larger error with
            respect to the original one will be discarded, without even
            considering their possible speed-up.
            Default: `nebullvm.config.DEFAULT_METRIC_DROP_THS`.
        metric (Union[Callable, str], optional): The metric to
            be used for accepting or refusing a precision-reduction
            optimization proposal. If none is given but a `metric_drop_ths` is
            received, the `nebullvm.measure.compute_relative_difference`
            metric will be used as default one. A user-defined metric can
            be passed as function accepting as inputs two tuples of tensors
            (produced by the baseline and the optimized model) and the related
            original labels.
            For more information see
            `nebullvm.measure.compute_relative_difference` and
            `nebullvm.measure.compute_accuracy_drop`. `metric`
            accepts as value also a string containing the metric name. At the
            current stage the supported metrics are `"numeric_precision"` and
            `"accuracy"`. Default: None (see above for the fallback metric).
        optimization_time (str, optional): The optimization time
            mode. It can be either 'constrained' or 'unconstrained'. For
            'constrained' mode just compilers and precision reduction
            techniques are used (no compression). 'Unconstrained' optimization
            allows the usage of more time-consuming techniques as pruning and
            distillation. Note that for using many of the sophisticated
            techniques in the 'unconstrained' optimization, a small fine-tuning
            of the model will be needed. Thus we highly recommend to give as
            input_data at least 100 samples when selecting 'unconstrained'
            optimization. Default: 'constrained'.
        dynamic_info (Dict, optional): Dictionary containing info about the
            dynamic axis. It should contain as keys both "inputs" and "outputs"
            and as values two lists of dictionaries where each dictionary
            represents the dynamic axis information for an input/output tensor.
            The inner dictionary should have as key an integer, i.e. the
            dynamic axis (considering also the batch size) and as value a
            string giving a "tag" to it, e.g. "batch_size". Default: None
        config_file (str, optional): Configuration file containing the
            parameters needed for defining the CompressionStep in the pipeline.
            Default: None.
        ignore_compilers (List, optional): List containing the compilers to be
            ignored during the OptimizerStep. The compiler name should be one
            among tvm, tensor_rt, openvino, onnxruntime, deepsparse, tflite,
            bladedisc, torchscript, intel_neural_compressor (see
            `nebullvm.config.COMPILER_LIST`). Default: None.
        ignore_compressors (List, optional): List containing the compressors
            to be ignored during the CompressionStep. The compressor names are
            the ones listed in `nebullvm.config.COMPRESSOR_LIST`.
            Default: None.
        store_latencies (bool, optional): Parameter that allows to save the
            latency for each compiler used by nebullvm. Default: False.
        device (str, optional): Device used, can be 'cpu' or 'gpu'. If not
            set, gpu will be used if available, otherwise cpu. Default: None
        **kwargs: Extra keyword arguments forwarded unchanged to the root
            operation, e.g. `tokenizer` and `tokenizer_args` for HuggingFace
            models.

    Returns:
        InferenceLearner: Optimized version of the input model having the same
            interface, imported by its original framework. For instance a
            Pytorch model, when optimized, will return an InferenceLearner
            object that can be called exactly as a PyTorch model (either
            with `model.forward(input)` or `model(input)`), i.e. it will
            take as input and it will return `torch.Tensor`s.
    """
    root_op = SpeedsterRootOp()
    device = check_device(device)

    # The root logger is silenced unless debug mode has been enabled.
    disable_log = not debug_mode_enabled()

    with LoggingContext(logging.getLogger(), disabled=disable_log):
        return root_op.to(device).execute(
            model=model,
            input_data=input_data,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            optimization_time=optimization_time,
            dynamic_info=dynamic_info,
            config_file=config_file,
            ignore_compilers=ignore_compilers,
            ignore_compressors=ignore_compressors,
            store_latencies=store_latencies,
            **kwargs,
        )
================================================
FILE: optimization/speedster/speedster/api/tests/__init__.py
================================================
================================================
FILE: optimization/speedster/speedster/api/tests/test_huggingface.py
================================================
from tempfile import TemporaryDirectory
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.huggingface import (
HuggingFaceInferenceLearner,
)
from nebullvm.optional_modules.tensorflow import tensorflow as tf
from nebullvm.optional_modules.torch import torch
from transformers import AlbertModel, TFAlbertModel, AlbertTokenizer
from speedster import optimize_model, load_model
def test_torch_huggingface_ort_input_text():
    """Optimize a PyTorch ALBERT model from raw strings with onnxruntime only.

    The optimized model is saved and reloaded, then its outputs on a new
    sentence are compared with the ones of the original model.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1")

    # Move the model to gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]
    tokenizer_options = {
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_tensors": "pt",
        "return_token_type_ids": None,  # Sets to model default
        "padding": "longest",
        "truncation": True,
    }
    # Every compiler except onnxruntime and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        tokenizer_args=tokenizer_options,
    )

    # save and load
    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, HuggingFaceInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

        x = ["this is a test input to see if the optimized model works."]
        inputs = tokenizer(x, return_tensors="pt").to(device)
        model.to(device)
        res_original = model(**inputs)
        res_optimized = optimized_model(**inputs)

        assert isinstance(optimized_model, HuggingFaceInferenceLearner)
        # Both named outputs must stay close to the original ones on average.
        for output_name in ("last_hidden_state", "pooler_output"):
            gap = res_original[output_name] - res_optimized[output_name]
            assert torch.mean(abs(gap)) < 1e-2
def test_torch_huggingface_ort_input_tensors():
    """Optimize a PyTorch ALBERT model from tokenized inputs with onnxruntime.

    Dynamic axes are declared for the inputs and outputs, and the optimized
    model is then evaluated on a sentence different from the one used as
    input data.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1")

    # Move the model to gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sample = tokenizer("hi my name is Valerio", return_tensors="pt").to(
        device
    )

    token_axes = {0: "batch", 1: "num_tokens"}
    dynamic_info = {
        "inputs": [dict(token_axes) for _ in range(3)],
        "outputs": [dict(token_axes), {0: "batch"}],
    }

    # Every compiler except onnxruntime and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["this is a test input to see if the optimized model works."]
    inputs = tokenizer(x, return_tensors="pt").to(device)
    model.to(device)
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # Both named outputs must stay close to the original ones on average.
    for output_name in ("last_hidden_state", "pooler_output"):
        gap = res_original[output_name] - res_optimized[output_name]
        assert torch.mean(abs(gap)) < 1e-2
def test_torch_huggingface_torchscript_input_tensors():
    """Optimize a torchscript-enabled ALBERT model with torchscript only.

    The model is loaded with `torchscript=True`, so its outputs are compared
    by position rather than by name.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = AlbertModel.from_pretrained("albert-base-v1", torchscript=True)

    # Move the model to gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sample = tokenizer("hi my name is Valerio", return_tensors="pt").to(
        device
    )

    token_axes = {0: "batch", 1: "num_tokens"}
    dynamic_info = {
        "inputs": [dict(token_axes) for _ in range(3)],
        "outputs": [dict(token_axes), {0: "batch"}],
    }

    # Every compiler except torchscript and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "torchscript"
    ]
    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["this is a test input to see if the optimized model works."]
    inputs = tokenizer(x, return_tensors="pt").to(device)
    model.to(device)
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # Compare the first two positional outputs.
    for position in (0, 1):
        gap = res_original[position] - res_optimized[position]
        assert torch.mean(abs(gap)) < 1e-2
def test_tensorflow_huggingface_ort_input_text_np():
    """Optimize a TensorFlow ALBERT model from raw strings with onnxruntime.

    The tokenizer is asked to return numpy tensors ("np"), both while
    optimizing and when building the evaluation input.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]

    token_axes = {0: "batch", 1: "num_tokens"}
    dynamic_info = {
        "inputs": [dict(token_axes) for _ in range(3)],
        "outputs": [dict(token_axes), {0: "batch"}],
    }
    tokenizer_options = {
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_tensors": "np",
        "return_token_type_ids": None,  # Sets to model default
        "padding": "longest",
        "truncation": True,
    }
    # Every compiler except onnxruntime and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        tokenizer_args=tokenizer_options,
        dynamic_info=dynamic_info,
    )

    x = ["this is a test input to see if the optimized model works."]
    inputs = tokenizer(x, return_tensors="np")
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # The largest element-wise deviation of each named output is bounded.
    for output_name in ("last_hidden_state", "pooler_output"):
        gap = res_original[output_name] - res_optimized[output_name]
        assert tf.math.reduce_max(abs(gap)) < 1e-2
def test_tensorflow_huggingface_ort_input_tensors_np():
    """Optimize a TensorFlow ALBERT model from numpy-tokenized inputs.

    Only onnxruntime is left enabled; the optimized model is evaluated on a
    sentence different from the one used as input data.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sample = tokenizer("hi my name is Valerio", return_tensors="np")

    token_axes = {0: "batch", 1: "num_tokens"}
    dynamic_info = {
        "inputs": [dict(token_axes) for _ in range(3)],
        "outputs": [dict(token_axes), {0: "batch"}],
    }

    # Every compiler except onnxruntime and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["Test to see if it works with a different output"]
    inputs = tokenizer(x, return_tensors="np")
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # The largest element-wise deviation of each named output is bounded.
    for output_name in ("last_hidden_state", "pooler_output"):
        gap = res_original[output_name] - res_optimized[output_name]
        assert tf.math.reduce_max(abs(gap)) < 1e-2
def test_tensorflow_huggingface_ort_input_text_tf():
    """Optimize a TensorFlow ALBERT model from raw strings with onnxruntime.

    The tokenizer is asked to return TensorFlow tensors ("tf"), both while
    optimizing and when building the evaluation input.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sentences = [
        "this is a test",
        "hi my name is Valerio",
        "india is very far from italy",
    ]

    token_axes = {0: "batch", 1: "num_tokens"}
    dynamic_info = {
        "inputs": [dict(token_axes) for _ in range(3)],
        "outputs": [dict(token_axes), {0: "batch"}],
    }
    tokenizer_options = {
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_tensors": "tf",
        "return_token_type_ids": None,  # Sets to model default
        "padding": "longest",
        "truncation": True,
    }
    # Every compiler except onnxruntime and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]

    optimized_model = optimize_model(
        model=model,
        input_data=sentences,
        optimization_time="constrained",
        tokenizer=tokenizer,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        tokenizer_args=tokenizer_options,
        dynamic_info=dynamic_info,
    )

    x = ["this is a test input to see if the optimized model works."]
    inputs = tokenizer(x, return_tensors="tf")
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # The largest element-wise deviation of each named output is bounded.
    for output_name in ("last_hidden_state", "pooler_output"):
        gap = res_original[output_name] - res_optimized[output_name]
        assert tf.math.reduce_max(abs(gap)) < 1e-2
def test_tensorflow_huggingface_ort_input_tensors_tf():
    """Optimize a TensorFlow ALBERT model from tf-tokenized inputs.

    Only onnxruntime is left enabled; the optimized model is evaluated on a
    sentence different from the one used as input data.
    """
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
    model = TFAlbertModel.from_pretrained("albert-base-v1")

    sample = tokenizer("hi my name is Valerio", return_tensors="tf")

    token_axes = {0: "batch", 1: "num_tokens"}
    dynamic_info = {
        "inputs": [dict(token_axes) for _ in range(3)],
        "outputs": [dict(token_axes), {0: "batch"}],
    }

    # Every compiler except onnxruntime and every compressor is switched off.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    optimized_model = optimize_model(
        model=model,
        input_data=[sample] * 10,
        optimization_time="constrained",
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        dynamic_info=dynamic_info,
    )

    x = ["Test to see if it works with a different output"]
    inputs = tokenizer(x, return_tensors="tf")
    res_original = model(**inputs)
    res_optimized = optimized_model(**inputs)

    assert isinstance(optimized_model, HuggingFaceInferenceLearner)
    # The largest element-wise deviation of each named output is bounded.
    for output_name in ("last_hidden_state", "pooler_output"):
        gap = res_original[output_name] - res_optimized[output_name]
        assert tf.math.reduce_max(abs(gap)) < 1e-2
================================================
FILE: optimization/speedster/speedster/api/tests/test_onnx.py
================================================
import cpuinfo
from tempfile import TemporaryDirectory
import numpy as np
import pytest
import torch
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.onnx import (
NumpyONNXInferenceLearner,
)
from nebullvm.operations.inference_learners.openvino import (
NumpyOpenVinoInferenceLearner,
)
from nebullvm.operations.inference_learners.tensor_rt import (
NumpyONNXTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
NumpyApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import tvm_is_available
from torchvision import models
from speedster import optimize_model, load_model
from speedster.api.tests.utils import torch_to_onnx
def test_onnx_ort():
    """Optimize an ONNX export of resnet18 with onnxruntime only.

    The optimized model is saved and reloaded, then its prediction on a
    random input is compared with the one of the original torch model.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]

        # Run nebullvm optimization in one line of code
        skipped_compilers = [
            name for name in COMPILER_LIST if name != "onnxruntime"
        ]
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # A second temporary directory receives the saved optimized model.
        with TemporaryDirectory() as save_dir:
            optimized_model.save(save_dir)
            loaded_model = load_model(save_dir)
            assert isinstance(loaded_model, NumpyONNXInferenceLearner)
            assert isinstance(loaded_model.get_size(), int)

            # Try the optimized model
            device_name = "cuda" if torch.cuda.is_available() else "cpu"
            device = torch.device(device_name)
            x = torch.randn(1, 3, 256, 256, requires_grad=False)
            model.to(device).eval()

            with torch.inference_mode():
                res_original = model(x.to(device))
            res_optimized = optimized_model(x.numpy())[0]

            reference = res_original.detach().cpu().numpy()
            assert abs(reference - res_optimized).max() < 1e-2
def test_onnx_ort_quant():
    """Optimize an ONNX export of resnet18 with onnxruntime, allowing a drop.

    `metric_drop_ths=2` is passed to the optimizer and the final comparison
    with the original torch model uses a tolerance of 1.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]

        # Run nebullvm optimization in one line of code
        skipped_compilers = [
            name for name in COMPILER_LIST if name != "onnxruntime"
        ]
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
            metric_drop_ths=2,
        )

        # Try the optimized model
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(device_name)
        model.to(device).eval()
        x = torch.randn(1, 3, 256, 256, requires_grad=False)

        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyONNXInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
def test_onnx_tensorrt():
    """Optimize an ONNX export of resnet18 with tensor_rt only.

    Skipped when cuda is not available.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]

        # Run nebullvm optimization in one line of code
        skipped_compilers = [
            name for name in COMPILER_LIST if name != "tensor_rt"
        ]
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # Try the optimized model
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(device_name)
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()

        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyONNXTensorRTInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_onnx_openvino():
    """Optimize an ONNX export of resnet18 with openvino only, on cpu.

    Skipped when the cpu brand string does not contain "intel".
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]

        # Run nebullvm optimization in one line of code
        skipped_compilers = [
            name for name in COMPILER_LIST if name != "openvino"
        ]
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
            device="cpu",
        )

        # Try the optimized model
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(device_name)
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()

        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyOpenVinoInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_onnx_tvm():
    """Optimize an ONNX export of resnet18 with tvm only.

    Skipped when `tvm_is_available()` returns a falsy value.
    """
    with TemporaryDirectory() as tmp_dir:
        model = models.resnet18()
        torch_samples = [
            ((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)
        ]
        model_path = torch_to_onnx(model, torch_samples, tmp_dir)

        numpy_samples = [
            ((np.random.randn(1, 3, 256, 256).astype(np.float32),), 0)
            for _ in range(100)
        ]

        # Run nebullvm optimization in one line of code
        skipped_compilers = [name for name in COMPILER_LIST if name != "tvm"]
        optimized_model = optimize_model(
            model_path,
            input_data=numpy_samples,
            ignore_compilers=skipped_compilers,
            ignore_compressors=list(COMPRESSOR_LIST),
        )

        # Try the optimized model
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(device_name)
        x = torch.randn(1, 3, 256, 256, requires_grad=False)
        model.to(device).eval()

        with torch.inference_mode():
            res_original = model(x.to(device))
        res_optimized = optimized_model(x.numpy())[0]

        assert isinstance(optimized_model, NumpyApacheTVMInferenceLearner)
        reference = res_original.detach().cpu().numpy()
        assert abs(reference - res_optimized).max() < 1e-2
================================================
FILE: optimization/speedster/speedster/api/tests/test_pytorch.py
================================================
import cpuinfo
from tempfile import TemporaryDirectory
import pytest
import torch
import torchvision.models as models
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.blade_disc import (
BladeDISCInferenceLearner,
)
from nebullvm.operations.inference_learners.onnx import (
PytorchONNXInferenceLearner,
)
from nebullvm.operations.inference_learners.openvino import (
PytorchOpenVinoInferenceLearner,
)
from nebullvm.operations.inference_learners.tensor_rt import (
PytorchTensorRTInferenceLearner,
PytorchONNXTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.torch_dynamo import (
TorchDynamoInferenceLearner,
)
from nebullvm.operations.inference_learners.torchscript import (
TorchScriptInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
PytorchApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import (
tvm_is_available,
bladedisc_is_available,
)
from speedster import optimize_model, load_model
from nebullvm.tools.utils import check_module_version
def test_torch_ort():
    """Optimize a torch resnet18 with onnxruntime only.

    The optimized model is saved and reloaded, then its prediction on a
    random input is compared with the one of the original model.
    """
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, PytorchONNXInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

        # Try the optimized model
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(device_name)
        x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
        model.to(device).eval()
        res_original = model(x)
        res_optimized = optimized_model(x)[0]

        assert isinstance(optimized_model, PytorchONNXInferenceLearner)
        gap = res_original - res_optimized
        assert torch.max(abs(gap)) < 1e-2
def test_torch_ort_quant():
    """Optimize a torch resnet18 with onnxruntime, allowing a metric drop.

    `metric_drop_ths=2` is passed to the optimizer and the final comparison
    with the original model uses a tolerance of 2.
    """
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        metric_drop_ths=2,
    )

    # Try the optimized model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchONNXInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 2
def test_torch_torchscript():
    """Optimize a torch resnet18 with torchscript only and compare outputs."""
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "torchscript"
    ]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, TorchScriptInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2
# NOTE(review): the trailing `or True` makes the skip condition always true,
# so this test is skipped unconditionally whatever the torch version is.
# Presumably a deliberate temporary disable; confirm before removing it.
@pytest.mark.skipif(
    not check_module_version(torch, min_version="2.0.0") or True,
    reason="Torch version is not supported",
)
def test_torch_torch_dynamo():
    """Optimize a torch resnet18 with torch_dynamo only and compare outputs."""
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "torch_dynamo"
    ]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, TorchDynamoInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2
@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="Skip because cuda is not available.",
)
@pytest.mark.skipif(
    not check_module_version(torch, max_version="1.13.1+cu117"),
    reason="Skip because torch version is not supported.",
)
def test_torch_tensorrt():
    """Optimize a torch resnet18 with tensor_rt only, on cuda.

    Either of the two TensorRT learner classes is accepted as the result.
    """
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "tensor_rt"
    ]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    x = torch.randn(1, 3, 256, 256).cuda()
    model.cuda().eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    accepted_learners = (
        PytorchTensorRTInferenceLearner,
        PytorchONNXTensorRTInferenceLearner,
    )
    assert isinstance(optimized_model, accepted_learners)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2
@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_torch_openvino():
    """Optimize a torch resnet18 with openvino only, on cpu.

    Skipped when the cpu brand string does not contain "intel".
    """
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [name for name in COMPILER_LIST if name != "openvino"]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
        device="cpu",
    )

    # Try the optimized model
    x = torch.randn(1, 3, 256, 256)
    model.eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchOpenVinoInferenceLearner)
    gap = res_original.cpu() - res_optimized
    assert torch.max(abs(gap)) < 1e-2
@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_torch_tvm():
    """Optimize a torch resnet18 with tvm only and compare outputs.

    Skipped when `tvm_is_available()` returns a falsy value.
    """
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [name for name in COMPILER_LIST if name != "tvm"]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, PytorchApacheTVMInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2
@pytest.mark.skipif(
    not bladedisc_is_available(),
    reason="Can't test bladedisc if it's not installed.",
)
def test_torch_bladedisc():
    """Optimize a torch resnet18 with bladedisc only and compare outputs.

    Skipped when `bladedisc_is_available()` returns a falsy value.
    """
    model = models.resnet18()
    samples = [((torch.randn(1, 3, 256, 256),), 0) for _ in range(100)]

    # Run nebullvm optimization in one line of code
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "bladedisc"
    ]
    optimized_model = optimize_model(
        model,
        input_data=samples,
        ignore_compilers=skipped_compilers,
        ignore_compressors=list(COMPRESSOR_LIST),
    )

    # Try the optimized model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    x = torch.randn(1, 3, 256, 256, requires_grad=False).to(device)
    model.to(device).eval()
    res_original = model(x)
    res_optimized = optimized_model(x)[0]

    assert isinstance(optimized_model, BladeDISCInferenceLearner)
    gap = res_original - res_optimized
    assert torch.max(abs(gap)) < 1e-2
================================================
FILE: optimization/speedster/speedster/api/tests/test_tensorflow.py
================================================
from tempfile import TemporaryDirectory
import cpuinfo
import pytest
import tensorflow as tf
from keras.applications import ResNet50
from nebullvm.config import COMPILER_LIST, COMPRESSOR_LIST
from nebullvm.operations.inference_learners.onnx import (
TensorflowONNXInferenceLearner,
)
from nebullvm.operations.inference_learners.openvino import (
TensorflowOpenVinoInferenceLearner,
)
from nebullvm.operations.inference_learners.tensor_rt import (
TensorflowONNXTensorRTInferenceLearner,
)
from nebullvm.operations.inference_learners.tensorflow import (
TensorflowBackendInferenceLearner,
TFLiteBackendInferenceLearner,
)
from nebullvm.operations.inference_learners.tvm import (
TensorflowApacheTVMInferenceLearner,
)
from nebullvm.operations.optimizations.compilers.utils import tvm_is_available
from nebullvm.tools.utils import gpu_is_available
from speedster import optimize_model, load_model
# Limit tensorflow gpu memory usage
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Only the first physical GPU is made visible. The call does not
        # depend on the loop variable below, so it is issued once instead of
        # once per GPU.
        tf.config.set_visible_devices(gpus[0], "GPU")
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(
            len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs"
        )
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
def test_tensorflow_ort():
    """Optimize ResNet50 with only ONNX Runtime enabled and check the result.

    The optimized model is saved and reloaded, and its predictions are
    compared with those of the original Keras model.
    """
    model = ResNet50()
    # 100 samples, each an ((input_tensor,), label) pair.
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]

    # Leave "onnxruntime" as the only compiler and skip every compressor.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "onnxruntime"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
    )

    # All checks run while the temporary directory still exists.
    with TemporaryDirectory() as tmp_dir:
        optimized_model.save(tmp_dir)
        loaded_model = load_model(tmp_dir)
        assert isinstance(loaded_model, TensorflowONNXInferenceLearner)
        assert isinstance(loaded_model.get_size(), int)

        # Compare original and optimized predictions on a fresh sample.
        sample = tf.random.normal([1, 224, 224, 3])
        res_original = model.predict(sample)
        res_optimized = optimized_model.predict(sample)[0]

        assert isinstance(optimized_model, TensorflowONNXInferenceLearner)
        assert abs(res_original - res_optimized).max() < 1e-2
def test_tensorflow_tf_backend():
    """Optimize ResNet50 with only the "xla" compiler enabled.

    Checks the type of the returned learner and that its predictions stay
    within 1e-2 of the original model's.
    """
    model = ResNet50()
    # 100 samples, each an ((input_tensor,), label) pair.
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]

    # Leave "xla" as the only compiler and skip every compressor.
    skipped_compilers = [name for name in COMPILER_LIST if name != "xla"]
    skipped_compressors = list(COMPRESSOR_LIST)
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
    )

    # Compare original and optimized predictions on a fresh sample.
    sample = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(sample)
    res_optimized = optimized_model.predict(sample)[0]

    assert isinstance(optimized_model, TensorflowBackendInferenceLearner)
    assert abs(res_original - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    gpu_is_available(),
    reason="TFLite does not support Nvidia GPUs",
)
def test_tensorflow_tflite():
    """Optimize ResNet50 with only the "tflite" compiler enabled.

    Runs with ``metric_drop_ths=0.1`` and checks the type of the returned
    learner and that its predictions stay within 1e-2 of the original's.
    """
    model = ResNet50()
    # 100 samples, each an ((input_tensor,), label) pair.
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]

    # Leave "tflite" as the only compiler and skip every compressor.
    skipped_compilers = [name for name in COMPILER_LIST if name != "tflite"]
    skipped_compressors = list(COMPRESSOR_LIST)
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        metric_drop_ths=0.1,
    )

    # Compare original and optimized predictions on a fresh sample.
    sample = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(sample)
    res_optimized = optimized_model.predict(sample)[0]

    assert isinstance(optimized_model, TFLiteBackendInferenceLearner)
    assert abs(res_original - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    not gpu_is_available(),
    reason="Skip because cuda is not available.",
)
def test_tensorflow_tensorrt():
    """Optimize ResNet50 with only the "tensor_rt" compiler enabled.

    Checks the type of the returned learner and that its predictions stay
    within 1e-2 of the original model's.
    """
    model = ResNet50()
    # 100 samples, each an ((input_tensor,), label) pair.
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]

    # Leave "tensor_rt" as the only compiler and skip every compressor.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "tensor_rt"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
    )

    # Compare original and optimized predictions on a fresh sample.
    sample = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(sample)
    res_optimized = optimized_model.predict(sample)[0]

    assert isinstance(optimized_model, TensorflowONNXTensorRTInferenceLearner)
    assert abs(res_original - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    "intel" not in cpuinfo.get_cpu_info()["brand_raw"].lower(),
    reason="Openvino is only available for intel processors.",
)
def test_tensorflow_openvino():
    """Optimize ResNet50 on CPU with only the "openvino" compiler enabled.

    Checks the type of the returned learner and that its predictions stay
    within 1e-2 of the original model's.
    """
    model = ResNet50()
    # 100 samples, each an ((input_tensor,), label) pair.
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]

    # Leave "openvino" as the only compiler and skip every compressor.
    skipped_compilers = [
        name for name in COMPILER_LIST if name != "openvino"
    ]
    skipped_compressors = list(COMPRESSOR_LIST)
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
        device="cpu",
    )

    # Compare original and optimized predictions on a fresh sample.
    sample = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(sample)
    res_optimized = optimized_model.predict(sample)[0]

    assert isinstance(optimized_model, TensorflowOpenVinoInferenceLearner)
    assert abs(res_original - res_optimized).max() < 1e-2
@pytest.mark.skipif(
    not tvm_is_available(), reason="Can't test tvm if it's not installed."
)
def test_tensorflow_tvm():
    """Optimize ResNet50 with only the "tvm" compiler enabled.

    Checks the type of the returned learner and that its predictions stay
    within 1e-2 of the original model's.
    """
    model = ResNet50()
    # 100 samples, each an ((input_tensor,), label) pair.
    input_data = [
        ((tf.random.normal([1, 224, 224, 3]),), 0) for _ in range(100)
    ]

    # Leave "tvm" as the only compiler and skip every compressor.
    skipped_compilers = [name for name in COMPILER_LIST if name != "tvm"]
    skipped_compressors = list(COMPRESSOR_LIST)
    optimized_model = optimize_model(
        model,
        input_data=input_data,
        ignore_compilers=skipped_compilers,
        ignore_compressors=skipped_compressors,
    )

    # Compare original and optimized predictions on a fresh sample.
    sample = tf.random.normal([1, 224, 224, 3])
    res_original = model.predict(sample)
    res_optimized = optimized_model.predict(sample)[0]

    assert isinstance(optimized_model, TensorflowApacheTVMInferenceLearner)
    assert abs(res_original - res_optimized).max() < 1e-2
================================================
FILE: optimization/speedster/speedster/api/tests/utils.py
================================================
import os
from pathlib import Path
from nebullvm.core.models import ModelParams, Device, DeviceType
from nebullvm.operations.conversions.pytorch import convert_torch_to_onnx
from nebullvm.tools.data import DataManager
from nebullvm.tools.utils import gpu_is_available
def torch_to_onnx(model, input_data, output_path):
    """Convert a PyTorch model to ONNX and return the path of the file.

    Args:
        model: Model handed to ``convert_torch_to_onnx``.
        input_data: Data wrapped in a ``DataManager`` for the conversion.
        output_path: Directory in which ``model.onnx`` is written.

    Returns:
        The string path ``<output_path>/model.onnx``.
    """
    # Use the GPU when one is available, otherwise fall back to the CPU.
    target_device = Device(
        DeviceType.GPU if gpu_is_available() else DeviceType.CPU
    )
    onnx_file = os.path.join(output_path, "model.onnx")
    convert_torch_to_onnx(
        model,
        DataManager(input_data),
        ModelParams(1, [], [], []),
        Path(onnx_file),
        target_device,
    )
    return onnx_file
================================================
FILE: optimization/speedster/speedster/root_op.py
================================================
import json
import pickle
import sys
from typing import (
Any,
Union,
Iterable,
Sequence,
Dict,
Callable,
List,
)
from loguru import logger
from nebullvm import setup_logger
from nebullvm.config import MIN_NUMBER
from nebullvm.core.models import OptimizeInferenceResult, DeviceType
from nebullvm.operations.base import Operation
from nebullvm.operations.optimizations.optimize_inference import (
OptimizeInferenceOp,
)
from nebullvm.tools.data import DataManager
from nebullvm.tools.feedback_collector import FeedbackCollector
from tabulate import tabulate
from nebullvm.tools.hardware_utils import get_hw_setup
from nebullvm.tools.utils import (
get_model_size_mb,
get_model_name,
generate_model_id,
)
# Module-level feedback collector; SpeedsterRootOp registers it in its
# constructor via set_feedback_collector().
# NOTE(review): the environment variable name suggests telemetry can be
# switched off by setting it -- confirm against FeedbackCollector.
SPEEDSTER_FEEDBACK_COLLECTOR = FeedbackCollector(
    url="https://nebuly.cloud/v1/store_speedster_results",
    disable_telemetry_environ_var="SPEEDSTER_DISABLE_TELEMETRY",
    app_version="0.4.0",
)
def _convert_technique(technique: str):
    """Translate an optimization technique name into a precision label.

    ``"none"`` is matched case-insensitively and reported as ``"fp32"``.
    ``"HALF"`` and ``"STATIC"`` are matched exactly and map to ``"fp16"``
    and ``"int8"``. Any other value yields ``"int8_dynamic"``.
    """
    # use fp32 instead of none
    if technique.lower() == "none":
        return "fp32"
    exact_labels = {"HALF": "fp16", "STATIC": "int8"}
    return exact_labels.get(technique, "int8_dynamic")
def _get_model_len(model: Any):
    """Return the size in bytes of the pickled model, or -1 on failure.

    The model is serialized with the highest pickle protocol. If
    serialization raises, a warning is logged and -1 is returned.
    """
    try:
        serialized = pickle.dumps(model, -1)
    except Exception:
        # Model is not pickable
        logger.warning(
            "Cannot pickle input model. Unable to "
            "extract original model size"
        )
        return -1
    return len(serialized)
class SpeedsterRootOp(Operation):
    """Root operation that runs the inference optimization and reports it.

    The optimization itself is delegated to an ``OptimizeInferenceOp``.
    This class then sends the collected information through the feedback
    collector and logs a table comparing the original and optimized model.
    """

    def __init__(self):
        super().__init__()
        self.optimize_inference_op = OptimizeInferenceOp()
        self.set_feedback_collector(SPEEDSTER_FEEDBACK_COLLECTOR)

    def _send_feedback(
        self,
        optimization_result: OptimizeInferenceResult,
        store_latencies: bool = False,
    ):
        """Store run metadata in the feedback collector and send it.

        Args:
            optimization_result: Result of the optimization; only its
                ``original_model`` is read here.
            store_latencies: When true, the collected optimizations are
                also dumped to
                ``{model_name}_latencies_{model_id[:10]}.json``.
        """
        model_orig = optimization_result.original_model.model
        model_name = get_model_name(model_orig)
        model_info = {
            "model_name": model_name,
            "model_size": f"{get_model_size_mb(model_orig)} MB",
            "framework": optimization_result.original_model.framework.value,
        }
        self.feedback_collector.store_info(
            key="model_id", value=generate_model_id(model_orig)
        )
        self.feedback_collector.store_info(
            key="model_metadata", value=model_info
        )
        self.feedback_collector.store_info(
            key="hardware_setup", value=get_hw_setup(self.device).__dict__
        )

        # Put the original model first in the list of optimizations, so it
        # is part of both the sent feedback and the optional JSON dump.
        optimizations = self.feedback_collector.get("optimizations")
        original_model_dict = {
            "compiler": optimization_result.original_model.framework.value,
            "technique": "original",
            "latency": optimization_result.original_model.latency_seconds,
        }
        optimizations.insert(0, original_model_dict)
        self.feedback_collector.send_feedback()

        if store_latencies:
            model_id = self.feedback_collector.get("model_id", "")
            with open(
                f"{model_name}_latencies_{model_id[:10]}.json", "w"
            ) as f:
                json.dump(
                    {
                        "optimizations": optimizations,
                    },
                    f,
                )

        # Clear the per-run entries of the collector.
        self.feedback_collector.reset("optimizations")
        self.feedback_collector.reset("model_id")
        self.feedback_collector.reset("model_metadata")

    def execute(
        self,
        model: Any,
        input_data: Union[Iterable, Sequence, DataManager],
        metric_drop_ths: float = None,
        metric: Union[str, Callable] = None,
        optimization_time: str = "constrained",
        dynamic_info: Dict = None,
        config_file: str = None,
        ignore_compilers: List[str] = None,
        ignore_compressors: List[str] = None,
        store_latencies: bool = False,
        **kwargs,
    ):
        """Optimize ``model`` and log a summary of the outcome.

        Every argument is forwarded unchanged to
        ``OptimizeInferenceOp.execute``. ``store_latencies`` is also passed
        to ``_send_feedback``.

        Returns:
            The inference learner of the optimized model, or ``None`` when
            the optimization produced no optimized model.
        """
        self.logger.info(
            "Running Speedster on {}{}".format(
                self.device.type.name,
                f":{self.device.idx}"
                if self.device.type is not DeviceType.CPU
                else "",
            )
        )

        result = self.optimize_inference_op.to(self.device).execute(
            model=model,
            input_data=input_data,
            metric_drop_ths=metric_drop_ths,
            metric=metric,
            optimization_time=optimization_time,
            dynamic_info=dynamic_info,
            config_file=config_file,
            ignore_compilers=ignore_compilers,
            ignore_compressors=ignore_compressors,
            store_latencies=store_latencies,
            **kwargs,
        )

        if result.optimized_model is None:
            return None

        opt_metric_drop = (
            f"{result.metric_drop:.4f}"
            if result.metric_drop > MIN_NUMBER
            else "0"
        )

        self._send_feedback(result, store_latencies=store_latencies)

        original = result.original_model
        optimized = result.optimized_model

        # Computed once and reused by the table and the warning below.
        speedup = original.latency_seconds / optimized.latency_seconds

        # The size column is capped at 0%, so a larger optimized model is
        # never reported as a positive change.
        if original.size_mb > 0:
            size_change = int(
                (optimized.size_mb - original.size_mb)
                / original.size_mb
                * 100
            )
            size_improvement = f"{min(size_change, 0)}%"
        else:
            size_improvement = "NA"

        table = [
            [
                "backend",
                original.framework.name,
                optimized.inference_learner.name,
                "",
            ],
            [
                "latency",
                f"{original.latency_seconds:.4f} sec/batch",
                f"{optimized.latency_seconds:.4f} sec/batch",
                f"{speedup:.2f}x",
            ],
            [
                "throughput",
                f"{original.throughput:.2f} data/sec",
                f"{optimized.throughput:.2f} data/sec",
                f"{optimized.throughput / original.throughput:.2f}x",
            ],
            [
                "model size",
                f"{original.size_mb:.2f} MB",
                f"{optimized.size_mb:.2f} MB",
                size_improvement,
            ],
            ["metric drop", "", opt_metric_drop, ""],
            [
                "techniques",
                "",
                f"{_convert_technique(optimized.technique)}",
                "",
            ],
        ]
        headers = [
            "Metric",
            "Original Model",
            "Optimized Model",
            "Improvement",
        ]

        # change format to the logger, avoiding printing verbose info
        # to the console (as date, time, etc.)
        self.logger.remove()
        handler_id = self.logger.add(sys.stdout, format="{message}")
        try:
            hw_info = get_hw_setup(self.device)
            hw_name = (
                hw_info.cpu
                if self.device.type is DeviceType.CPU
                else hw_info.accelerator
            )
            self.logger.info(
                (
                    f"\n[Speedster results on {hw_name}]\n"
                    f"{tabulate(table, headers, tablefmt='heavy_outline')}"
                )
            )

            if speedup < 2:
                self.logger.warning(
                    f"\nMax speed-up with your input parameters is "
                    f"{speedup:.2f}x. "
                    f"If you want to get a faster optimized model, "
                    f"see the following link for some suggestions: "
                    f"https://docs.nebuly.com/Speedster/advanced_"
                    f"options/#acceleration-suggestions\n"
                )
        finally:
            # Restore the regular logger setup even when reporting fails,
            # so later log calls are not left on the temporary handler.
            self.logger.remove(handler_id)
            setup_logger()

        return result.optimized_model.inference_learner
================================================
FILE: optimization/speedster/speedster/speedster.py
================================================
from nebullvm.apps.base import App
from speedster.root_op import SpeedsterRootOp
class SpeedsterApp(App):
    """Application wrapper that delegates its work to a SpeedsterRootOp."""

    def __init__(self):
        super().__init__()
        # Root operation that every execute() call is forwarded to.
        self.root_op = SpeedsterRootOp()

    def execute(self, *args, **kwargs):
        """Forward all arguments to the root operation's ``execute``.

        Returns whatever the root operation returns.
        """
        return self.root_op.execute(*args, **kwargs)
================================================
FILE: optimization/speedster/speedster/tests/__init__.py
================================================
================================================
FILE: optimization/speedster/speedster/tests/test_root_op.py
================================================
from nebullvm.core.models import OptimizeInferenceResult
from speedster.root_op import SpeedsterRootOp
def test_root_op_no_optim_model(mocker):
    """``execute`` returns ``None`` when there is no optimized model."""
    root_op = SpeedsterRootOp()

    # Result the patched optimization step reports: no optimized model.
    empty_result = OptimizeInferenceResult(
        original_model=mocker.MagicMock(),
        optimized_model=None,
        hardware_setup=mocker.MagicMock(),
    )
    mocker.patch.object(
        root_op.optimize_inference_op,
        "execute",
        return_value=empty_result,
    )

    execute_kwargs = {
        "model": None,
        "input_data": mocker.MagicMock(),
        "metric_drop_ths": None,
        "metric": "latency",
        "optimization_time": mocker.MagicMock(),
        "dynamic_info": None,
        "config_file": None,
        "ignore_compilers": None,
        "ignore_compressors": None,
        "store_latencies": False,
    }
    res = root_op.execute(**execute_kwargs)

    assert res is None
def test_root_op_optim_model(mocker):
    """``execute`` returns a value when an optimized model is available."""
    root_op = SpeedsterRootOp()

    # Mocked models expose the numeric attributes set here.
    original_model = mocker.MagicMock(
        latency_seconds=1, throughput=1, size_mb=1
    )
    optimized_model = mocker.MagicMock(
        metric_drop=0.1, latency_seconds=1, size_mb=1, throughput=1
    )
    fake_result = OptimizeInferenceResult(
        original_model=original_model,
        optimized_model=optimized_model,
        hardware_setup=mocker.MagicMock(),
    )
    mocker.patch.object(
        root_op.optimize_inference_op,
        "execute",
        return_value=fake_result,
    )
    # Replace the feedback step with a mock.
    mocker.patch.object(root_op, "_send_feedback")

    execute_kwargs = {
        "model": None,
        "input_data": mocker.MagicMock(),
        "metric_drop_ths": None,
        "metric": "latency",
        "optimization_time": mocker.MagicMock(),
        "dynamic_info": None,
        "config_file": None,
        "ignore_compilers": None,
        "ignore_compressors": None,
        "store_latencies": False,
    }
    res = root_op.execute(**execute_kwargs)

    assert res is not None
================================================
FILE: optimization/speedster/speedster/utils.py
================================================
================================================
FILE: optimization/speedster/speedster.toml
================================================
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"